In [3]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import os
import tensorflow as tf
import sklearn
import sklearn.preprocessing
import sklearn.model_selection
import datetime

In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)

    except RuntimeError as e:
        print(e)

In [3]:
# def init_model():
#     input_tensor = tf.keras.Input(shape=(68,68,3))
#     #input_tensor = tf.keras.Input(shape=(224,224,3))
#     #to_res = (224, 224)
#     model_res = tf.keras.applications.ResNet50V2(include_top = False, weights = None, input_tensor = input_tensor)
    
#     # for layer in model_res.layers[:143]:
#     #     layer.trainable = False
    
#     model = tf.keras.models.Sequential()
#     #model.add(tf.keras.layers.Lambda(lambda image: tf.image.resize(image, to_res)))
#     model.add(model_res)
#     model.add(tf.keras.layers.Flatten())
    
    
#     # for i in range(100):
#     #     model.add(tf.keras.layers.Dense(128,activation='relu'))
#     #     model.add(tf.keras.layers.Dropout(0.2))
#     #     model.add(tf.keras.layers.BatchNormalization())

#     model.add(tf.keras.layers.Dense(12, activation = 'softmax'))
    
    
#     return model

In [None]:
# def init_model(n_dense_layers):
#     input_tensor = tf.keras.Input(shape=(68,68,3))
#     model_res = tf.keras.applications.ResNet50V2(include_top = False, weights = None, input_tensor = input_tensor)
#     model = tf.keras.models.Sequential()

#     model.add(model_res) #Resnet
#     model.add(tf.keras.layers.Flatten()) 
    
    
#     for i in range(n_dense_layers):
#         model.add(tf.keras.layers.Dense(128,activation='relu'))
#         model.add(tf.keras.layers.Dropout(0.2))
#         model.add(tf.keras.layers.BatchNormalization())

#     model.add(tf.keras.layers.Dense(12, activation = 'softmax'))
    
    
#     return model

In [4]:
class DataLoader(tf.keras.utils.Sequence):
    #def __init__(self, df, model = None, one_hot_encoder, learning_rate_schedule = None, batch_size, input_size = (68,68,3), shuffle = True):
    def __init__(self, df, one_hot_encoder, batch_size, model = None, input_size = (68,68,3), shuffle = True):
        self.df = df
        self.dmsomean = np.load('F:\Programming\DTU\Human MCF7\Segmented\Inspection\ClassMean\DMSO.npy')
        self.batch_size = batch_size
        self.input_size = input_size
        self.shuffle = shuffle
        
        self.n = len(self.df)
        self.n_classes = self.df['moa'].nunique()
        
        self.one_hot_encoder = one_hot_encoder
        
        #self.learning_rate_schedule = iter(learning_rate_schedule)
        
        self.on_epoch_end()
        
        
    def on_epoch_end(self):
        if self.shuffle:
            self.df = self.df.sample(frac=1).reset_index(drop=True)
        
        # lr = self.get_next_learning_rate()
        # if lr:
        #     tf.keras.backend.set_value(model.optimizer.learning_rate, lr)
        #     print(f"Learning rate adjusted to: {lr}")
            
    # def get_next_learning_rate(self):
    #     try:
    #         lr = next(self.learning_rate_schedule)
    #     except StopIteration:
    #         lr = None
    #     return lr

    def __get_img(self, path):
        image_arr = np.load(path)
        image_arr = image_arr/255
        image_arr -= self.dmsomean #Normalize by dmso

        image_arr = tf.image.resize(image_arr, (self.input_size[0], self.input_size[1]))
        return image_arr
    
    def __get_label(self, moa):
        #print(moa)
        label = self.one_hot_encoder.transform(moa.to_numpy().reshape(-1, 1))
        return label
    
    def __get_batch(self, batch):

        img_batch = batch['path'].apply(self.__get_img)
        img_batch = np.array([img for img in img_batch])
        img_batch = tf.keras.applications.resnet50.preprocess_input(img_batch)
        

        #label_batch = batch['moa'].apply(self.__get_label)
        label_batch = self.one_hot_encoder.transform(batch.moa.to_numpy().reshape(-1, 1))
        
        return img_batch, label_batch
    
    def __getitem__(self, index):
        batch = self.df[index * self.batch_size:(index+1)*self.batch_size]
        X, Y = self.__get_batch(batch)
        return X, Y
        
    def __len__(self):
        return self.n // self.batch_size
        
                

In [5]:
path = 'Data_paths.csv'
df = pd.read_csv(path)
df = df[df.moa != 'DMSO']
#df = df.drop(df[df['moa'] == 'Microtubule stabilizers'].sample(frac=0.75).index)
#df.reset_index(drop=True, inplace=True)

In [6]:
df.moa.value_counts()

Microtubule stabilizers      80089
Microtubule destabilizers    15055
Epithelial                   14929
Aurora kinase inhibitors     12821
Eg5 inhibitors               12545
Kinase inhibitors            11622
DNA damage                    9391
Actin disruptors              7412
Protein degradation           6611
DNA replication               6019
Cholesterol-lowering          5415
Protein synthesis             3764
Name: moa, dtype: int64

In [7]:


# indices = np.arange(len(df))
# X_train, X_test, y_train, y_test, idx_train, idx_test = sklearn.model_selection.train_test_split(df['path'], df['moa'], indices, test_size = 0.5, random_state=0, shuffle = True, stratify=df['moa'])

In [8]:
moa_one_hot_encoder = sklearn.preprocessing.OneHotEncoder(sparse=False)
moa_one_hot_encoder.fit(df['moa'].to_numpy().reshape(-1, 1))

OneHotEncoder(sparse=False)

In [9]:
indices = np.arange(len(df))
X_train, X_test, y_train, y_test, idx_train, idx_test = sklearn.model_selection.train_test_split(df['path'], df['moa'], indices, test_size = 0.5, random_state=0, shuffle = True, stratify=df['moa'])

df_train = df.loc[idx_train]
df_test = df.loc[idx_test]
print(df_train.moa.value_counts())

Microtubule stabilizers      40044
Microtubule destabilizers     7527
Epithelial                    7464
Aurora kinase inhibitors      6410
Eg5 inhibitors                6272
Kinase inhibitors             5811
DNA damage                    4696
Actin disruptors              3706
Protein degradation           3306
DNA replication               3010
Cholesterol-lowering          2708
Protein synthesis             1882
Name: moa, dtype: int64


In [10]:
# df_distribution = pd.DataFrame(y_test.value_counts())
# max_count = df_distribution.max().moa
# moa_mean = np.mean(df_distribution.moa)
# moa_class_weights = {}
# for moa, i in df_distribution.iterrows():
#     label = moa_one_hot_encoder.transform(np.array([moa]).reshape(-1,1))
#     moa_class_weights[np.argmax(label)] = np.floor(np.sqrt(max_count//i.moa))
# print(df_distribution)
# moa_class_weights

                             moa
Microtubule stabilizers    40045
Microtubule destabilizers   7528
Epithelial                  7465
Aurora kinase inhibitors    6411
Eg5 inhibitors              6273
Kinase inhibitors           5811
DNA damage                  4695
Actin disruptors            3706
Protein degradation         3305
DNA replication             3009
Cholesterol-lowering        2707
Protein synthesis           1882


{9: 1.0,
 8: 2.0,
 6: 2.0,
 1: 2.0,
 5: 2.0,
 7: 2.0,
 3: 2.0,
 0: 3.0,
 10: 3.0,
 4: 3.0,
 2: 3.0,
 11: 4.0}

2022.06.13_00.58.58


In [None]:
# model = init_model()


# #train_dataloader = DataLoader(df = df_train, model = model, one_hot_encoder = moa_one_hot_encoder, learning_rate_schedule, batch_size=32, input_size=(68,68,3))
# train_dataloader = DataLoader(df = df_train, model = model, one_hot_encoder = moa_one_hot_encoder, batch_size=32, input_size=(68,68,3))
# test_dataloader = DataLoader(df = df_test, one_hot_encoder = moa_one_hot_encoder, batch_size = 32, input_size=(68,68,3))


# rlrop = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',factor = 0.1, patience = 5, min_lr=1e-6)
# check_point = tf.keras.callbacks.ModelCheckpoint(filepath='checkpoints/test.h5' , monitor = "val_acc", mode = "max", save_best_only=True)
# model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-2, momentum=0.9, decay = 0.01),metrics=['accuracy'])



# #history=model.fit(train_dataloader, validation_data=test_dataloader, epochs= 1, callbacks = [check_point], class_weight = moa_class_weights)
# #history=model.fit(train_dataloader, validation_data=test_dataloader, epochs= 20, callbacks = [check_point])
# history=model.fit(train_dataloader, validation_data=test_dataloader, epochs= 20, callbacks = [check_point, rlrop], class_weight = moa_class_weights)
# model.summary()
# model.save('models/test2.h5')
        
    

'train_results\\2022.06.13_01.15.11_checkpoints.h5'

In [11]:
import init_models
for i in range(4):
    model_id = datetime.datetime.now().strftime("%Y.%m.%d_%H.%M.%S")
    base_path = 'train_results'
    path_checkpoint = os.path.join(base_path, model_id+'_checkpoint.h5')
    path_model = os.path.join(base_path, model_id+'_model.h5')  
    path_log = os.path.join(base_path, model_id+'_log.csv')  
    
    model = init_models.init_model(i)


    #train_dataloader = DataLoader(df = df_train, model = model, one_hot_encoder = moa_one_hot_encoder, learning_rate_schedule, batch_size=32, input_size=(68,68,3))
    train_dataloader = DataLoader(df = df_train, model = model, one_hot_encoder = moa_one_hot_encoder, batch_size=32, input_size=(68,68,3))
    test_dataloader = DataLoader(df = df_test, one_hot_encoder = moa_one_hot_encoder, batch_size = 32, input_size=(68,68,3))


    rlrop = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',factor = 0.1, patience = 5, min_lr=1e-6)
    check_point = tf.keras.callbacks.ModelCheckpoint(filepath=path_checkpoint , monitor = "val_acc", mode = "max", save_best_only=True)
    csv_logger = tf.keras.callbacks.CSVLogger(path_log, append = True, separator=',')
    model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-2, momentum=0.9, decay = 0.01),metrics=['accuracy'])



    #history=model.fit(train_dataloader, validation_data=test_dataloader, epochs= 1, callbacks = [check_point], class_weight = moa_class_weights)
    #history=model.fit(train_dataloader, validation_data=test_dataloader, epochs= 20, callbacks = [check_point])
    history=model.fit(train_dataloader, validation_data=test_dataloader, epochs= 20, callbacks = [check_point, rlrop, csv_logger], class_weight = moa_class_weights)
    model.summary()
    model.save(path_model)
        
    

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
 646/2901 [=====>........................] - ETA: 6:16 - loss: 2.5040 - accuracy: 0.5938