### Import Libraries

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
from keras.models import load_model
from tensorflow.keras.utils import plot_model
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Function to Compute Performance Metrics

In [2]:
#Source : Self-case study 1
def performanceResults(y_actual, y_hat):
    TP = 0
    FP = 0
    TN = 0
    FN = 0
    for i in range(len(y_hat)):
        if y_actual[i] == y_hat[i] == 1:
            TP += 1
        if y_hat[i] == 1 and y_actual[i] != y_hat[i]:
            FP += 1
        if y_actual[i] == y_hat[i] == 0:
            TN += 1
        if y_hat[i] == 0 and y_actual[i] != y_hat[i]:
            FN += 1
    acc = (TP + TN) / (TP + FP + TN + FN)
    sensitivity = TP / (TP + FN)
    specificity = TN / (TN + FP)
    fpr = FP / (FP + TN)
    return acc, sensitivity, specificity, fpr

In [3]:
def final_fun_1(X_raw, best_model_path):
    best_model = load_model(best_model_path, compile=False)
#     best_model.summary()
    y_pred= best_model.predict(X_raw)
    return  np.argmax(y_pred,axis=1)

In [4]:
def final_fun_2(X_emb,y_raw):
    best_model_path =  '../input/msk-impact-trained-dnn/Saved_Models/best_model_3.hdf5'
    y_pred = final_fun_1(X_emb, best_model_path)
    PR = performanceResults(y_raw, y_pred)
    (lrfpr, lrtpr, lrthresholds) = metrics.roc_curve(y_raw, y_pred)
    roc_auc = metrics.auc(lrfpr, lrtpr)
    return PR, roc_auc

In [5]:
def load_and_process_dataset():
    master_data = pd.read_csv('../input/msk-impact-basic/processed_data.csv')
    master_data = master_data.drop(['TissueAge_yrs','AssayPerformance','SpecimenType'],axis=1)
    master_data['SAMPLE_TYPE'].replace(['Primary', 'Metastasis'],[0, 1], inplace=True)
    master_data['SPECIMEN_TYPE'].replace(['Resection', 'Biopsy','Cytology','CUSA'],[0, 1, 2, 3], inplace=True)
    master_data['SOMATIC_STATUS'].replace(['Matched', 'Unmatched'],[0, 1], inplace=True)
    master_data['SEX'].replace(['Male', 'Female'],[0, 1], inplace=True)
    master_data['SMOKING_HISTORY'].replace(['Prev/Curr Smoker', 'Never','Unknown'],[0, 1, 2], inplace=True)
    master_data['SampleType'].replace(['FFPE', 'DNA', 'Cell Pellet','FNA', 'Other'],[0, 1, 2, 3, 4], inplace=True)
    master_data['SAMPLE_COLLECTION_SOURCE'].replace(['In-House','Outside'],[0, 1], inplace=True)
    master_data['VITAL_STATUS'].replace(['ALIVE','DECEASED'],[0, 1], inplace=True)
    sample_ids = master_data['SAMPLE_ID'].tolist()
    data_mutations = pd.read_csv('../input/msk-impact-basic/data_mutations.txt',\
                                 sep='\t',\
                                 skiprows=1,\
                                 low_memory=False)
    t_ref_counts = []
    t_alt_counts = []
    protein_poss = []
    codon_counts = []
    pos_diffs = []
    for sample_id in tqdm(sample_ids):
        segment = data_mutations[data_mutations['Tumor_Sample_Barcode']==sample_id]
        pos_diff = segment['End_Position']- segment['Start_Position']
        pos_diff = np.sum(pos_diff)
        codons  = segment['Codons'].values
        codons = [x for x in codons if str(x) != 'nan']
        t_ref = np.mean(segment['t_ref_count'])
        t_alt = np.mean(segment['t_alt_count'])
        protein_pos = np.mean(segment['Protein_position'])
        t_ref_counts.append(t_ref) 
        t_alt_counts.append(t_alt) 
        protein_poss.append(protein_pos)
        codon_counts.append(len(codons))
        pos_diffs.append(pos_diff)
        
    master_data['t_ref_counts'] = t_ref_counts
    master_data['t_alt_counts'] = t_alt_counts
    master_data['protein_pos'] = protein_poss
    master_data['codon_counts'] = codon_counts
    master_data['pos_diffs'] = pos_diffs
    return master_data

In [6]:
processed_data = load_and_process_dataset()
X = processed_data.drop(['SAMPLE_TYPE'], axis=1)
y = processed_data['SAMPLE_TYPE']

numericalFeats = X.select_dtypes(include=[np.number]).columns

X_num = X[numericalFeats]
X_num = X_num.fillna(X_num.mean())
standard_scaler = StandardScaler()
X_num = standard_scaler.fit_transform(X_num)

100%|██████████| 8121/8121 [00:45<00:00, 178.02it/s]


In [7]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X['DetailedTumorType'])
X_dtumort = tokenizer.texts_to_sequences(X['DetailedTumorType'])
X_dtumort = pad_sequences(X_dtumort, dtype='uint8')

In [8]:
nrows = X_num.shape[0]
pickidx = np.random.choice(nrows, size=10, replace=False)
test_input = [X_num[pickidx, :],X_dtumort[pickidx, :]]
test_labels = y[pickidx].to_numpy()

In [9]:
best_model_path =  '../input/msk-impact-trained-dnn/Saved_Models/best_model_3.hdf5'
y_pred = final_fun_1(test_input, best_model_path)

2022-10-31 15:50:07.695609: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2022-10-31 15:50:07.884799: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


In [10]:
print(test_labels)

[0 0 1 0 1 1 1 0 1 1]


In [11]:
print(y_pred)

[0 0 1 0 1 1 0 0 1 1]


In [12]:
PR, roc_auc = final_fun_2(test_input, test_labels)

In [13]:
# Source: Self-case study 1
print('Best Model Performance Metrics on Test Input Data')
print ('Accuracy: ', PR[0])
print ('Sensitivity: ', PR[1])
print ('Specificity: ', PR[2])
print ('False Positive Rate: ', PR[3])
print('Area Under the Curve (AUC):', roc_auc)

Best Model Performance Metrics on Test Input Data
Accuracy:  0.9
Sensitivity:  0.8333333333333334
Specificity:  1.0
False Positive Rate:  0.0
Area Under the Curve (AUC): 0.9166666666666667
