## AiDA Lab Tutorial Part 7a--Transfer Learning  Preprocessing (Special Thanks to Dr. Kyle Hasenstab)

By now, you should have trained a simple CNN to accomplish the super-resolution task with either a 955 (part 5) or UNet (part 6).  As you have seen, while results are good, it can take a long time to train these models depending on the complexity of the task.  What if instead, you can for example, take a CNN trained to identify soccer balls in an image, and "transfer" its knowledge to a new task to identify basketballs in an image?  This is the core of the concept of transfer learning.  Transfer learning essentially entails training a model to perform one task, saving the model parameters and weights, then loading those saved weights as an initial starting point when training either the same or modified CNN for a different task!  https://www.tensorflow.org/tutorials/images/transfer_learning is a good reference

Here, we will use the model developed by Dr. Kang Wang, one of the stellar T32 Residents who worked for Albert in around 2018.  His paper is located here and in the repo (Kang_Radiology_AI_Paper): https://pubs.rsna.org/doi/full/10.1148/ryai.2019180022

As you are now acquianted with coding in python, I will preface the CNN training with pseudo-code you will need done in pre-training, as this will differ based on your specific task:

### Preprocessing Pseudo-Code

In [1]:
import numpy as np
import h5py
import matplotlib.pyplot as plt
import os
from scipy import ndimage
import traceback
#import seaborn as sns
import pandas as pd
#import tensorflow as tf
import cv2
import PIL
from PIL import Image
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import csv
#print(tf.config.list_physical_devices('GPU'))

In [2]:
#Reading in all hdf5 files
WORKING_DIR = os.getcwd()

HDF_PATH = "C:/Users/david/Desktop/DSC 180A - FA22/Code/data/"
if not os.path.exists(HDF_PATH):
    print("HDF_PATH does not exist. Please change the path to the data folder.")

SAVE_PATH = os.path.join(WORKING_DIR,"data")
if not os.path.exists(SAVE_PATH):
    os.mkdir(SAVE_PATH)
if not os.path.exists(os.path.join(SAVE_PATH,"1024_images")):
    os.mkdir(os.path.join(SAVE_PATH,"1024_images"))
if not os.path.exists(os.path.join(SAVE_PATH,"256_images")):
    os.mkdir(os.path.join(SAVE_PATH,"256_images"))

file0 = h5py.File(HDF_PATH + "bnpp_frontalonly_1024_0.hdf5", 'r')
file1 = h5py.File(HDF_PATH + "bnpp_frontalonly_1024_1.hdf5", 'r')
file2 = h5py.File(HDF_PATH + "bnpp_frontalonly_1024_2.hdf5", 'r')
file3 = h5py.File(HDF_PATH + "bnpp_frontalonly_1024_3.hdf5", 'r')
file4 = h5py.File(HDF_PATH + "bnpp_frontalonly_1024_4.hdf5", 'r')
file5 = h5py.File(HDF_PATH + "bnpp_frontalonly_1024_5.hdf5", 'r')
file6 = h5py.File(HDF_PATH + "bnpp_frontalonly_1024_6.hdf5", 'r')
#file7 = h5py.File("bnpp_frontalonly_1024_7.hdf5", 'r')
file10 = h5py.File(HDF_PATH + "bnpp_frontalonly_1024_10.hdf5", 'r')

files = [file0, file1, file2, file3, file4, file5, file6, file10]

In [4]:
#saving all images to 1024_images folder
# i = 0
# for file in files:
#     for key in file.keys():
#         im = np.asarray(file[key])
#         if not os.path.exists(os.path.join(SAVE_PATH,'1024_images',str(key),'.png')):
#             plt.imsave(SAVE_PATH + '/1024_images/' + key + '.png', arr = im, cmap = 'gray')
#         i += 1
#         if i % 500 == 0:
#             print(i)

In [5]:
#print('# of 1024 Images: ',len([name for name in os.listdir(os.getcwd()+'/data/1024_images') if os.path.isfile(os.path.join(os.getcwd()+'/data/1024_images', name))]))

In [6]:
#saving all images to 256_images folder
# i=0
# for file in files:
#     for key in file.keys():
#         im = Image.open(SAVE_PATH + '/1024_images/' + key + '.png')
#         #print(im.size)
#         im = im.resize((256,256))
#         #print(im.size)
#         if not os.path.exists(os.path.join(SAVE_PATH, '/256_images/', key, '.png')):
#             im.save(SAVE_PATH + '/256_images/' + key + '.png')
#         i += 1
#         if i % 500 == 0:
#             print(i)

In [3]:
df1 = pd.read_csv(WORKING_DIR+'\\BNPP_data_frontalonly_AgesBMI_06242021_dsc180.csv')
df1.drop(columns=['phonetic_id','Sample_Collection_TM','age_at_sampletime'], inplace=True)
df1

Unnamed: 0,unique_key,bmi
0,Abachug_50267230_img1,25.51
1,Abadik_50217497_img1,31.38
2,Abafouck_52403307_img1,33.81
3,Abagash_52691625_img1,30.64
4,Abakleem_50725934_img1,34.81
...,...,...
26762,Zufosloo_50189474_img1,44.06
26763,Zuliquep_52986445_img1,26.07
26764,Zunakot_51932665_img1,22.73
26765,Zuplouke_51797661_img1,27.66


In [4]:
df2 = pd.read_csv(WORKING_DIR+'\\BNPPlabs_dcmlist_merged_noMRN_frontal_only_dsc180a.csv')
df2.drop(columns=['phonetic_id','unique_key.1','ref_unit','cr_unit','bnpp_value'], inplace=True)
df2

Unnamed: 0,unique_key,bnpp_value_num,cr_value_num,Has_PNA,Has_AcuteHF
0,Abachug_50267230_img1,418.0,0.61,1,0
1,Abadik_50217497_img1,2161.0,1.31,0,0
2,Abafouck_52403307_img1,118.0,0.66,0,0
3,Abagash_52691625_img1,49.9,0.64,0,0
4,Abakleem_50725934_img1,20029.0,10.54,0,0
...,...,...,...,...,...
26662,Zufosloo_50189474_img1,2988.0,1.29,0,1
26663,Zuliquep_52986445_img1,5684.0,0.50,0,1
26664,Zunakot_51932665_img1,123.0,0.94,0,0
26665,Zuplouke_51797661_img1,1290.0,1.77,0,0


In [5]:
data = pd.merge(df1, df2, on='unique_key', how='inner')
data.index = data['unique_key']
data.drop(columns=['unique_key'], inplace=True)
data.dropna(inplace=True)
print(data.dtypes)
data

bmi               float64
bnpp_value_num    float64
cr_value_num      float64
Has_PNA             int64
Has_AcuteHF         int64
dtype: object


Unnamed: 0_level_0,bmi,bnpp_value_num,cr_value_num,Has_PNA,Has_AcuteHF
unique_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Abachug_50267230_img1,25.51,418.0,0.61,1,0
Abadik_50217497_img1,31.38,2161.0,1.31,0,0
Abafouck_52403307_img1,33.81,118.0,0.66,0,0
Abagash_52691625_img1,30.64,49.9,0.64,0,0
Abakleem_50725934_img1,34.81,20029.0,10.54,0,0
...,...,...,...,...,...
Zufosloo_50189474_img1,44.06,2988.0,1.29,0,1
Zuliquep_52986445_img1,26.07,5684.0,0.50,0,1
Zunakot_51932665_img1,22.73,123.0,0.94,0,0
Zuplouke_51797661_img1,27.66,1290.0,1.77,0,0


In [3]:
filenum = 1
for file in files:
    images = []#np.empty((256, 256, 4))
    ids = []
    # bnpp = []
    # no_values = []
    # cardio_edema = []
    # other = []
    for key in tqdm(file.keys()):
        im = Image.open(SAVE_PATH + '/256_images/' + key + '.png')
        im = np.asarray(im)
        im = (im - np.min(im))/(np.max(im) - np.min(im))
        dim1 = []
        for i in range(256):
            dim2 = []
            for j in range(256):
                dim3 = [im[i][j][0]]
                dim2.append(dim3)
            dim1.append(dim2)
        # try:
        #     row = data.loc[key].values
        #     bnpp.append(row[1])
        #     if row[2] >= 400:
        #         cardio_edema.append(1)
        #     else:
        #         cardio_edema.append(0)
        #     other.append(np.array([row[0], row[2], row[3], row[4]], dtype='object'))
        # except:
        #     no_values.append(key)
        #     continue
        images.append(dim1)
        ids.append(key)
    images = np.array(images).astype('float32')
    np.save(WORKING_DIR + '\\data\\256_images_np\\file' + str(filenum), images, allow_pickle=True)
    break
    filenum += 1
        

100%|██████████| 2700/2700 [02:19<00:00, 19.30it/s]


In [4]:
INH5 = 'data/PreProcessed_Marked_images4.hdf5' #dynamic range fixed data w/ val > 0 == 1 for labels

inh5 = h5py.File(INH5,'r')
inh5['training_images'][0]

array([[[0.01514009],
        [0.00557355],
        [0.00408146],
        ...,
        [0.06558593],
        [0.06027119],
        [0.06692618]],

       [[0.01707189],
        [0.00812512],
        [0.00254875],
        ...,
        [0.10878366],
        [0.09779954],
        [0.11870433]],

       [[0.02050634],
        [0.0060581 ],
        [0.00359129],
        ...,
        [0.15888216],
        [0.12038266],
        [0.13424452]],

       ...,

       [[0.15407875],
        [0.07660754],
        [0.04781844],
        ...,
        [0.5070376 ],
        [0.4492944 ],
        [0.23345134]],

       [[0.17041118],
        [0.08624604],
        [0.05902913],
        ...,
        [0.56176186],
        [0.52686906],
        [0.27182052]],

       [[0.18336068],
        [0.10985743],
        [0.07847337],
        ...,
        [0.650262  ],
        [0.599939  ],
        [0.35178334]]], dtype=float32)

In [16]:
df = pd.DataFrame(data=list(zip(new_images, other, bnpp, cardio_edema)), index = ids,columns=['images', 'other','bnpp', 'cardio_edema'])
new = df.other.apply(pd.Series) \
    .merge(df, right_index = True, left_index = True) \
    .drop(["other"], axis = 1)
new.rename(columns={0:'bmi', 2:'cr', 3:'Has_PNA',4:'Has_AcuteHF'},inplace=True)
display(new)
new.to_csv(WORKING_DIR+'\\data\\all_data.tsv',sep = '\t')

Y = new['bnpp']
new.drop(columns=['bnpp'], inplace=True)
X = new
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)
print(len(X_train), len(X_test), len(X_val))

ValueError: Length of values (500) does not match length of index (16567)

In [None]:
#if not os.path.exists(os.path.join(WORKING_DIR, 'data', 'train.tsv')):
X_train.to_csv(WORKING_DIR+'\\data\\X_train.tsv', sep='\t')
#if not os.path.exists(os.path.join(WORKING_DIR, 'data', 'test.tsv')):
X_test.to_csv(WORKING_DIR+'\\data\\X_test.tsv', sep='\t')
#if not os.path.exists(os.path.join(WORKING_DIR, 'data', 'val.tsv')):
X_val.to_csv(WORKING_DIR+'\\data\\X_val.tsv', sep='\t')
#if not os.path.exists(os.path.join(WORKING_DIR, 'data', 'y_train.tsv')):
y_train.to_csv(WORKING_DIR+'\\data\\y_train.tsv', sep='\t')
#if not os.path.exists(os.path.join(WORKING_DIR, 'data', 'y_test.tsv')):
y_test.to_csv(WORKING_DIR+'\\data\\y_test.tsv', sep='\t')
#if not os.path.exists(os.path.join(WORKING_DIR, 'data', 'y_val.tsv')):
y_val.to_csv(WORKING_DIR+'\\data\\y_val.tsv', sep='\t')

In [5]:
f = h5py.File('images.hdf5','w')
images = np.load(WORKING_DIR + '\\data\\256_images_np\\file1.npy', allow_pickle=True)
f.create_dataset('training_images',data=images,maxshape=(None,256,256,1))
f.close()

<HDF5 dataset "training_images": shape (2700, 256, 256, 1), type "<f4">

In [4]:
with h5py.File('images.hdf5','a') as hf:
    for i in range(2,9):
        print(hf['training_images'].shape)
        images = np.load(WORKING_DIR + f'\\data\\256_images_np\\file{i}.npy', allow_pickle=True)
        hf['training_images'].resize((hf['training_images'].shape[0] + images.shape[0]), axis = 0)
        hf['training_images'][-images.shape[0]:] = images
    print(hf['training_images'].shape)
    hf.close()

(2700, 256, 256, 1)
(5400, 256, 256, 1)
(8100, 256, 256, 1)
(10800, 256, 256, 1)
