## AiDA Lab Tutorial Part 7a--Transfer Learning  Preprocessing (Special Thanks to Dr. Kyle Hasenstab)

By now, you should have trained a simple CNN to accomplish the super-resolution task with either a 955 (part 5) or UNet (part 6).  As you have seen, while results are good, it can take a long time to train these models depending on the complexity of the task.  What if instead, you can for example, take a CNN trained to identify soccer balls in an image, and "transfer" its knowledge to a new task to identify basketballs in an image?  This is the core of the concept of transfer learning.  Transfer learning essentially entails training a model to perform one task, saving the model parameters and weights, then loading those saved weights as an initial starting point when training either the same or modified CNN for a different task!  https://www.tensorflow.org/tutorials/images/transfer_learning is a good reference

Here, we will use the model developed by Dr. Kang Wang, one of the stellar T32 Residents who worked for Albert in around 2018.  His paper is located here and in the repo (Kang_Radiology_AI_Paper): https://pubs.rsna.org/doi/full/10.1148/ryai.2019180022

As you are now acquianted with coding in python, I will preface the CNN training with pseudo-code you will need done in pre-training, as this will differ based on your specific task:

### Preprocessing Pseudo-Code

In [1]:
import numpy as np
import h5py
import matplotlib.pyplot as plt
import os
from scipy import ndimage
import traceback
import seaborn as sns
import pandas as pd
import tensorflow as tf
import cv2
import PIL
from PIL import Image
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [2]:
#Reading in all hdf5 files
WORKING_DIR = os.getcwd()

HDF_PATH = "C:/Users/david/Desktop/DSC 180A - FA22/Code/data/"
if not os.path.exists(HDF_PATH):
    print("HDF_PATH does not exist. Please change the path to the data folder.")

SAVE_PATH = os.path.join(WORKING_DIR,"data")
if not os.path.exists(SAVE_PATH):
    os.mkdir(SAVE_PATH)
if not os.path.exists(os.path.join(SAVE_PATH,"1024_images")):
    os.mkdir(os.path.join(SAVE_PATH,"1024_images"))
if not os.path.exists(os.path.join(SAVE_PATH,"256_images")):
    os.mkdir(os.path.join(SAVE_PATH,"256_images"))

file0 = h5py.File(HDF_PATH + "bnpp_frontalonly_1024_0.hdf5", 'r')
file1 = h5py.File(HDF_PATH + "bnpp_frontalonly_1024_1.hdf5", 'r')
file2 = h5py.File(HDF_PATH + "bnpp_frontalonly_1024_2.hdf5", 'r')
file3 = h5py.File(HDF_PATH + "bnpp_frontalonly_1024_3.hdf5", 'r')
file4 = h5py.File(HDF_PATH + "bnpp_frontalonly_1024_4.hdf5", 'r')
file5 = h5py.File(HDF_PATH + "bnpp_frontalonly_1024_5.hdf5", 'r')
file6 = h5py.File(HDF_PATH + "bnpp_frontalonly_1024_6.hdf5", 'r')
#file7 = h5py.File("bnpp_frontalonly_1024_7.hdf5", 'r')
file10 = h5py.File(HDF_PATH + "bnpp_frontalonly_1024_10.hdf5", 'r')

files = [file0, file1, file2, file3, file4, file5, file6, file10]

In [3]:
#saving all images to 1024_images folder
# i = 0
# for file in files:
#     for key in file.keys():
#         im = np.asarray(file[key])
#         if not os.path.exists(os.path.join(SAVE_PATH,'1024_images',str(key),'.png')):
#             plt.imsave(SAVE_PATH + '/1024_images/' + key + '.png', arr = im, cmap = 'gray')
#         i += 1
#         if i % 500 == 0:
#             print(i)

In [4]:
#print('# of 1024 Images: ',len([name for name in os.listdir(os.getcwd()+'/data/1024_images') if os.path.isfile(os.path.join(os.getcwd()+'/data/1024_images', name))]))

In [5]:
#saving all images to 256_images folder
# i=0
# for file in files:
#     for key in file.keys():
#         im = Image.open(SAVE_PATH + '/1024_images/' + key + '.png')
#         #print(im.size)
#         im = im.resize((256,256))
#         #print(im.size)
#         if not os.path.exists(os.path.join(SAVE_PATH, '/256_images/', key, '.png')):
#             im.save(SAVE_PATH + '/256_images/' + key + '.png')
#         i += 1
#         if i % 500 == 0:
#             print(i)

In [6]:
df1 = pd.read_csv(WORKING_DIR+'\\BNPP_data_frontalonly_AgesBMI_06242021_dsc180.csv')
df1.drop(columns=['phonetic_id','Sample_Collection_TM'], inplace=True)
df1.dtypes

unique_key            object
age_at_sampletime    float64
bmi                  float64
dtype: object

In [7]:
df2 = pd.read_csv(WORKING_DIR+'\\BNPPlabs_dcmlist_merged_noMRN_frontal_only_dsc180a.csv')
df2.drop(columns=['phonetic_id','unique_key.1','ref_unit','cr_unit','bnpp_value'], inplace=True)
df2.dtypes

unique_key         object
bnpp_value_num    float64
cr_value_num      float64
Has_PNA             int64
Has_AcuteHF         int64
dtype: object

In [8]:
data = pd.merge(df1, df2, on='unique_key', how='inner')
data.index = data['unique_key']
data.drop(columns=['unique_key'], inplace=True)
data.dropna(inplace=True)
print(data.dtypes)
data

age_at_sampletime    float64
bmi                  float64
bnpp_value_num       float64
cr_value_num         float64
Has_PNA                int64
Has_AcuteHF            int64
dtype: object


Unnamed: 0_level_0,age_at_sampletime,bmi,bnpp_value_num,cr_value_num,Has_PNA,Has_AcuteHF
unique_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Abachug_50267230_img1,59.0,25.51,418.0,0.61,1,0
Abadik_50217497_img1,58.0,31.38,2161.0,1.31,0,0
Abafouck_52403307_img1,58.0,33.81,118.0,0.66,0,0
Abagash_52691625_img1,60.0,30.64,49.9,0.64,0,0
Abakleem_50725934_img1,59.0,34.81,20029.0,10.54,0,0
...,...,...,...,...,...,...
Zufosloo_50189474_img1,54.0,44.06,2988.0,1.29,0,1
Zuliquep_52986445_img1,68.0,26.07,5684.0,0.50,0,1
Zunakot_51932665_img1,62.0,22.73,123.0,0.94,0,0
Zuplouke_51797661_img1,85.0,27.66,1290.0,1.77,0,0


In [9]:
images = []
ids = []
bnpp = []
no_values = []
cardio_edema = []
other = []
for file in tqdm(files):
    for key in file.keys():
        im = Image.open(SAVE_PATH + '/256_images/' + key + '.png')
        im = np.asarray(im)
        im = (im - np.min(im))/(np.max(im) - np.min(im))
        try:
            row = data.loc[key].values
            bnpp.append(row[2])
            if row[2] >= 400:
                cardio_edema.append(1)
            else:
                cardio_edema.append(0)
            d = np.array([row[0], row[1], row[3], row[4], row[5]], dtype='object')
            other.append(d)
        except:
            no_values.append(key)
            continue
        images.append(im)
        ids.append(key)
        

100%|██████████| 8/8 [01:18<00:00,  9.75s/it]


In [10]:
print('# Non-null images: ',len(images))
images = np.array(images)
images[0]

# Non-null images:  16567


MemoryError: Unable to allocate 32.4 GiB for an array with shape (16567, 256, 256, 4) and data type float64

In [None]:
df = pd.DataFrame(data=list(zip(images, other, bnpp, cardio_edema)), index = ids,columns=['images', 'other','bnpp', 'cardio_edema'])
new = df.other.apply(pd.Series) \
    .merge(df, right_index = True, left_index = True) \
    .drop(["other"], axis = 1)
new.rename(columns={0:'age', 1:'bmi', 2:'cr', 3:'Has_PNA',4:'Has_AcuteHF'},inplace=True)
display(new)
new.to_csv(WORKING_DIR+'\\data\\all_data.tsv',sep = '\t')

Y = new['bnpp']
new.drop(columns=['bnpp'], inplace=True)
X = new
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)
print(len(X_train), len(X_test), len(X_val))
X_train.iloc[0]

Unnamed: 0,age,bmi,cr,Has_PNA,Has_AcuteHF,images,bnpp,cardio_edema
Abachug_50267230_img1,59.0,25.51,0.61,1.0,0.0,"[[[104, 104, 104, 255], [88, 88, 88, 255], [73...",418.0,1
Abadik_50217497_img1,58.0,31.38,1.31,0.0,0.0,"[[[196, 196, 196, 255], [191, 191, 191, 255], ...",2161.0,1
Abafouck_52403307_img1,58.0,33.81,0.66,0.0,0.0,"[[[41, 41, 41, 255], [32, 32, 32, 255], [27, 2...",118.0,0
Abagash_52691625_img1,60.0,30.64,0.64,0.0,0.0,"[[[5, 5, 5, 255], [4, 4, 4, 255], [4, 4, 4, 25...",49.9,0
Abakleem_50725934_img1,59.0,34.81,10.54,0.0,0.0,"[[[22, 22, 22, 255], [16, 16, 16, 255], [13, 1...",20029.0,1
...,...,...,...,...,...,...,...,...
Zufosloo_50189474_img1,54.0,44.06,1.29,0.0,1.0,"[[[1, 1, 1, 255], [0, 0, 0, 255], [3, 3, 3, 25...",2988.0,1
Zuliquep_52986445_img1,68.0,26.07,0.50,0.0,1.0,"[[[0, 0, 0, 255], [0, 0, 0, 255], [0, 0, 0, 25...",5684.0,1
Zunakot_51932665_img1,62.0,22.73,0.94,0.0,0.0,"[[[201, 201, 201, 255], [177, 177, 177, 255], ...",123.0,0
Zuplouke_51797661_img1,85.0,27.66,1.77,0.0,0.0,"[[[141, 141, 141, 255], [141, 141, 141, 255], ...",1290.0,1


13253 1657 1657


age                                                          78.0
bmi                                                         31.22
cr                                                           1.77
Has_PNA                                                       0.0
Has_AcuteHF                                                   1.0
images          [[[58, 58, 58, 255], [42, 42, 42, 255], [25, 2...
cardio_edema                                                    1
Name: Nigomu_52681339_img1, dtype: object

In [None]:
if not os.path.exists(os.path.join(WORKING_DIR, 'data', 'train.tsv')):
    X_train.to_csv(WORKING_DIR+'\\data\\X_train.tsv', sep='\t')
if not os.path.exists(os.path.join(WORKING_DIR, 'data', 'test.tsv')):
    X_test.to_csv(WORKING_DIR+'\\data\\X_test.tsv', sep='\t')
if not os.path.exists(os.path.join(WORKING_DIR, 'data', 'val.tsv')):
    X_val.to_csv(WORKING_DIR+'\\data\\X_val.tsv', sep='\t')
if not os.path.exists(os.path.join(WORKING_DIR, 'data', 'y_train.tsv')):
    y_train.to_csv(WORKING_DIR+'\\data\\y_train.tsv', sep='\t')
if not os.path.exists(os.path.join(WORKING_DIR, 'data', 'y_test.tsv')):
    y_test.to_csv(WORKING_DIR+'\\data\\y_test.tsv', sep='\t')
if not os.path.exists(os.path.join(WORKING_DIR, 'data', 'y_val.tsv')):
    y_val.to_csv(WORKING_DIR+'\\data\\y_val.tsv', sep='\t')

In [11]:
f = h5py.File('images.hdf5','w')
f.create_dataset('images', data=images)
f.close()

MemoryError: Unable to allocate 32.4 GiB for an array with shape (16567, 256, 256, 4) and data type float64