### Import Libraries

In [18]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'tensorflow'

### Load dataset

In [2]:
# Load the pre-processed dataset. The CSV's first column is an unnamed saved
# index (it shows up as "Unnamed: 0" when read with the defaults), so use it as
# the DataFrame index instead of letting it leak into the feature matrix.
data = pd.read_csv('./processed_data/processed_data.csv', index_col=0)

### Import Dataset if Kaggle Notebook

In [None]:
# !kaggle datasets download -d 'msk-impact'
# !7za x msk-impact.zip -o/content
# !rm msk-impact.zip

In [9]:
# Preview the first two rows of the loaded frame.
data.head(n=2)

Unnamed: 0,SAMPLE_ID,SAMPLE_COLLECTION_SOURCE,SPECIMEN_TYPE,SAMPLE_COVERAGE,TUMOR_PURITY,SAMPLE_TYPE,SAMPLE_CLASS,ONCOTREE_CODE,CANCER_TYPE_DETAILED,SOMATIC_STATUS,...,TumorTypeCode,ExonicMutationCount,SNVCount,INDELCount,SCNACount,SVCount,SilentMutationCount,TissueAge_yrs,N,Cancer_Type
0,P-0000023-T01-IM3,In-House,Biopsy,454,30.0,0,Tumor,PEMESO,Peritoneal Mesothelioma,Matched,...,PEMESO,5,3,2,1,0,0,0.52,18,Mesothelioma
1,P-0001126-T01-IM3,In-House,Resection,574,30.0,1,Tumor,PEMESO,Peritoneal Mesothelioma,Matched,...,PEMESO,2,1,1,1,0,1,0.22,18,Mesothelioma


#### Convert Binary Target Variable to Numeric

In [8]:
# Encode the binary target: 'Primary' -> 0, 'Metastasis' -> 1.
# The result is assigned back to the column rather than using
# replace(..., inplace=True) on the column selection: that chained in-place
# form is deprecated in pandas and does not update `data` under Copy-on-Write.
# Re-running this cell is harmless because already-encoded values are untouched.
data['SAMPLE_TYPE'] = data['SAMPLE_TYPE'].replace(['Primary', 'Metastasis'], [0, 1])

In [10]:
# Preview the first two rows again to check that SAMPLE_TYPE now holds 0/1.
data.head(n=2)

Unnamed: 0,SAMPLE_ID,SAMPLE_COLLECTION_SOURCE,SPECIMEN_TYPE,SAMPLE_COVERAGE,TUMOR_PURITY,SAMPLE_TYPE,SAMPLE_CLASS,ONCOTREE_CODE,CANCER_TYPE_DETAILED,SOMATIC_STATUS,...,TumorTypeCode,ExonicMutationCount,SNVCount,INDELCount,SCNACount,SVCount,SilentMutationCount,TissueAge_yrs,N,Cancer_Type
0,P-0000023-T01-IM3,In-House,Biopsy,454,30.0,0,Tumor,PEMESO,Peritoneal Mesothelioma,Matched,...,PEMESO,5,3,2,1,0,0,0.52,18,Mesothelioma
1,P-0001126-T01-IM3,In-House,Resection,574,30.0,1,Tumor,PEMESO,Peritoneal Mesothelioma,Matched,...,PEMESO,2,1,1,1,0,1,0.22,18,Mesothelioma


### Train and Validation Split

In [14]:
# Target: SAMPLE_TYPE as an integer, kept as a single-column DataFrame.
# Features: every remaining column of `data`.
# NOTE(review): the preview above shows string columns (e.g. SAMPLE_ID) still
# present in `data` - presumably they are encoded or dropped before the model
# consumes X; confirm.
y = data[['SAMPLE_TYPE']].astype(int)
X = data.drop(columns=['SAMPLE_TYPE'])

# 70/30 split, stratified on the target and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=45,
    stratify=y,
)

### Define Callbacks

In [None]:
# Callback for micro-f1 score
class LossHistory(tf.keras.callbacks.Callback):
    """Record per-epoch loss/accuracy and a micro-averaged F1 score on held-out data.

    After every epoch the callback runs ``self.model.predict`` on the supplied
    features, takes the argmax over axis 1 as the predicted class, and scores
    it against the supplied labels with ``f1_score(..., average='micro')``.

    Args:
        validation_data: Indexable pair ``(x, y)``. ``x`` is passed to
            ``self.model.predict``; ``y`` holds the true class labels.

    Attributes:
        history: Dict of lists keyed by ``loss``, ``accuracy``, ``val_loss``,
            ``val_accuracy`` and ``val_f1``; created in ``on_train_begin``.
    """

    def __init__(self, validation_data):
        # Run the base-class initialiser so the Keras Callback attributes exist.
        super().__init__()
        self.x_test = validation_data[0]
        self.y_test = validation_data[1]

    def on_train_begin(self, logs=None):
        """Start each training run with an empty ``history`` dict."""
        self.history = {'loss': [], 'accuracy': [], 'val_loss': [], 'val_accuracy': [], 'val_f1': []}

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's metrics from ``logs`` and the freshly computed F1."""
        # `logs=None` replaces the shared mutable default `logs={}`.
        logs = logs or {}

        self.history['loss'].append(logs.get('loss'))
        self.history['accuracy'].append(logs.get('accuracy'))

        # Validation metrics are recorded only when they are present in `logs`.
        for key in ('val_loss', 'val_accuracy'):
            if key in logs:
                self.history[key].append(logs[key])

        y_pred = self.model.predict(self.x_test)
        y_label_pred = np.argmax(y_pred, axis=1)
        # Flatten the labels so a single-column frame/array is scored as a 1-D
        # vector. Assumes integer class labels (not one-hot) - TODO confirm.
        y_true = np.asarray(self.y_test).ravel()

        F1 = f1_score(y_true, y_label_pred, average='micro')
        self.history['val_f1'].append(F1)
        print(' Val F1 Score : ', np.round(F1, 4))

In [None]:
# Reduce-LR-on-plateau callback (kept under its original name `earlyStopper`).
# It does not stop training: it is configured to scale the learning rate by
# 0.9 when val_accuracy has not improved for 2 epochs, with a floor of 0.001.
# Referenced through `tf.keras.callbacks` because the bare class name is not
# imported anywhere in this notebook.
earlyStopper = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy', factor=0.9, patience=2, min_lr=0.001)

# Learning Rate Scheduler
def ruleBasedScheduler(epoch, lr):
    """Return the learning rate to use for `epoch`.

    Args:
        epoch: Epoch index; the rule is applied to ``epoch + 1``.
        lr: Current learning rate.

    Returns:
        ``0.95 * lr`` when ``epoch + 1`` is a multiple of 3, otherwise ``lr``
        unchanged.
    """
    decay_now = (epoch + 1) % 3 == 0
    return 0.95 * lr if decay_now else lr

# Wrap the rule-based schedule in a Keras callback. `verbose` is an integer
# level (0 = silent, 1 = print each update); the previous value 0.1 is not a
# valid level but is treated as "on", so 1 keeps the same logging. The class
# is referenced through `tf.keras.callbacks` because the bare name is not
# imported anywhere in this notebook.
learningRateScheduler = tf.keras.callbacks.LearningRateScheduler(ruleBasedScheduler, verbose=1)

### Training Parameter Settings

In [15]:
# Hyperparameters consumed by the optimizer and training cells below.
batch_size = 64        # passed to fit() and used to derive steps_per_epoch
epochs = 20            # passed to fit()
num_classes = 2        # number of target classes
learning_rate = 0.01   # passed to the Adam optimizer

### Initialize Optimizers

In [17]:
# Adam optimizer with the configured initial step size. The keyword is
# `learning_rate`; the old `lr` alias is deprecated and rejected by current
# Keras optimizers.
adamopt = tf.keras.optimizers.Adam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-8)

NameError: name 'tf' is not defined

### Define Model Architecture

### Initialize History Callback

In [None]:
# Score the F1 callback on the held-out split. Features and labels must come
# from the same split; the earlier pairing of X_train with y_test mixed rows
# from different splits (and different lengths).
history_callback = LossHistory(validation_data=(X_test, y_test))

### Model Persistence

In [None]:
# To Save Best Model subject to Validation Accuracy improvement
# (save_best_only=True keeps only the best checkpoint by the monitored metric).
# The class is referenced through `tf.keras.callbacks` because the bare
# `ModelCheckpoint` name is not imported anywhere in this notebook.
filepath = "./Model1/best_model_1.hdf5"
checkpoint1 = tf.keras.callbacks.ModelCheckpoint(
    filepath=filepath,
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    mode='auto',
)

### TensorBoard Setup

In [None]:
# Set Path for Tensorboard Data
log_dir = os.path.join("logs", "fits", "Model")

# TensorBoard callback writing to `log_dir`, with per-epoch histograms and the
# model graph enabled.
tfcb = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
    write_graph=True,
)

### Model Compilation

In [None]:
# Compile with Adam and sparse categorical cross-entropy, tracking accuracy.
# NOTE(review): the model definition is not part of this notebook section; the
# name `model1` is used here so that the object being compiled is the same one
# the training cell fits (`model1.fit`). Confirm against the architecture cell.
model1.compile(
    optimizer=adamopt,
    loss=tf.keras.losses.sparse_categorical_crossentropy,
    metrics=['accuracy'],
)
model1.summary()

In [None]:
# Combining all Callbacks
# The checkpoint callback is defined as `checkpoint1` in the Model Persistence
# cell; the previous reference to `checkpoint` was an undefined name.
# NOTE(review): `learningRateScheduler` is defined above but not included here -
# confirm the omission is intended.
mycallbacks = [history_callback, checkpoint1, earlyStopper, tfcb]

### Model Training

In [19]:
# Train on the training split and validate on the held-out split each epoch.
# steps_per_epoch is the number of batches needed to cover X_train once.
history1 = model1.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    steps_per_epoch=int(np.ceil(X_train.shape[0] / batch_size)),
    validation_data=(X_test, y_test),
    callbacks=mycallbacks,
)

NameError: name 'model1' is not defined

### TensorBoard Visualization

In [20]:
# Load the TensorBoard notebook extension, then open TensorBoard on
# logs/fits - the parent of the "logs/fits/Model" directory the TensorBoard
# callback writes to. Requires the `tensorboard` package in the kernel's
# environment.
%load_ext tensorboard
%tensorboard --logdir logs/fits

ModuleNotFoundError: No module named 'tensorboard'