# Final networks

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

import helper_functions as my

### Initialize Data

In [None]:
#directory holding the per-institution Excel spreadsheets; file names are appended directly to this string
INPUT_PATH = "../Conversion/"

In [None]:
#define which institution will be kept separate ("" keeps none separate)
testInst = "y"
insts = [("a", "MDACC"), ("m", "MSKCC"), ("t", "Tokyo"), ("y", "Yale")]

#stays None unless testInst matches one of the institution codes above
df_testInst = None
#per-institution frames are collected here and concatenated once after the loop
training_frames = []
for inst in insts:
    inst_code, inst_name = inst
    #excel sheet with all data for this institution
    df_i = pd.read_excel(INPUT_PATH + inst_name + "_final_sigPCs.xlsx")
    #prefix every patient number with the institution code
    df_i['patient number'] = inst_code + "_" + df_i['patient number'].astype(str)
    df_i = df_i.drop(["days_PVE_follow_up", "FLR_increase"], axis=1)
    #separate the testing institution
    if inst_code == testInst:
        df_testInst = df_i
        continue
    training_frames.append(df_i.dropna(axis=0, how='any'))

#fail early with a clear message instead of a NameError further down
if testInst != "" and df_testInst is None:
    raise ValueError("testInst " + repr(testInst) + " does not match any institution code in insts")

#format all
df_original = pd.concat(training_frames, axis=0)
#columns missing from some sheets would be NaN after concatenation, so drop incomplete rows again
df_original = df_original.dropna(axis=0, how='any')

#format the separate institution
if testInst != "":
    df_testInst = df_testInst.dropna(axis=0, how='any')
    df_testInst = df_testInst.set_index('patient number', drop=False)
    #keep the target columns before they are removed from the inputs
    df_ess_inst = df_testInst[["patient number", "follow_up_FLR%", "follow_up_TLV", "KGR"]]
    df_testInst = my.dropFutureValues(df_testInst)

#index by patient number (the column is kept as well); this replaces the concatenated index
df_original = df_original.set_index('patient number', drop=False)

#keep Y value before throwing it away
df_ess = df_original[["patient number", "follow_up_FLR%", "follow_up_TLV", "KGR"]]

#get all X input features
df_X = my.dropFutureValues(df_original)

## FLR volume classification (> 30%)

In [None]:
#predict FLR percentage classification
cutOff = 0
df_input = df_X
df_inputInst = df_testInst[df_input.columns]

#initialize dataframes
df_important = pd.Series(0.0, index=df_input.columns, name="Average feature importance")
#start the usage counter at 0: a Series built from an index alone is all-NaN, and NaN + 1 stays NaN
df_used = pd.Series(0, index=df_input.columns, dtype='int', name="Features frequency used")
df_avgGuessF = pd.DataFrame(index=df_ess.index)
df_avgGuessP = pd.DataFrame(index=df_ess.index)
numRuns = 100
#per-run results; collected in lists and converted to arrays after the loop
avg_tpr = []
avg_fpr = []
avg_inst_tpr = []
avg_inst_fpr = []
allAcc = []
allAuc = []
allInstAcc = []
allInstAuc = []
for i in range(numRuns):
    #Create a random forest classifier
    clf = RandomForestClassifier(n_estimators=100)

    # Split dataset into training set and test set
    x_train, x_test, y_train, y_test = my.splitXY(df_input, df_ess, col="follow_up_FLR%", classes=[0.30])

    #exclude highly correlated features
    x_train, to_drop = my.exclude_correlated_features_trainingset(x_train)
    x_test = my.drop_correlated_features_evaluation(x_test, to_drop)

    #Train the model using the training set
    clf.fit(x_train, y_train)

    #get prediction
    y_pred = clf.predict(x_test)

    # Model Accuracy, how often is the classifier correct?
    tScore = clf.score(x_test, y_test)

    #accumulate feature importances (divided by numRuns after the loop)
    feature_imp = pd.Series(clf.feature_importances_, index=x_test.columns)
    for feat in feature_imp.index:
        df_important[feat] += feature_imp[feat]

    #save guesses; a per-run column prefix keeps names unique so the join never collides
    df_thisGuessP = pd.DataFrame(y_pred, index=x_test.index).add_prefix("run" + str(i) + "_")
    df_avgGuessP = df_avgGuessP.join(df_thisGuessP)

    # calculate the fpr and tpr for all thresholds of the classification
    # NOTE(review): indexing probs[1] and y_test[:,1] assumes a two-column target,
    # where predict_proba returns one array per output column - confirm against splitXY
    probs = clf.predict_proba(x_test)
    preds = probs[1][:,1]
    fpr, tpr, threshold = metrics.roc_curve(y_test[:,1], preds)
    roc_auc = metrics.auc(fpr, tpr)

    #save fpr, tpr, and auc
    avg_fpr.append(fpr)
    avg_tpr.append(tpr)
    allAuc.append(roc_auc)
    allAcc.append(tScore)

    #keep track of which features survived the correlation filter
    for newName in x_train.columns:
        if newName in df_used.index:
            df_used[newName] += 1
        else:
            df_used = df_used.reindex(np.append(df_used.index, newName), fill_value=1)

    #SEPARATE INSTITUTION
    df_inst = my.drop_correlated_features_evaluation(df_inputInst, to_drop)
    #set x and y like other insts
    df_instX, df_instY = my.setXY(df_inst, df_ess_inst, col="follow_up_FLR%", classes=[0.30])

    #test
    probs = clf.predict_proba(df_instX)
    pred = probs[1]
    test_score = clf.score(df_instX, df_instY)

    #auc curve
    fpr, tpr, threshold = metrics.roc_curve(df_instY[:,1], pred[:,1])
    inst_auc = metrics.auc(fpr, tpr)
    avg_inst_fpr.append(fpr)
    avg_inst_tpr.append(tpr)
    allInstAuc.append(inst_auc)
    allInstAcc.append(test_score)

#flatten the per-run curves into single 1-D arrays (all runs back to back)
avg_fpr = np.concatenate(avg_fpr)
avg_tpr = np.concatenate(avg_tpr)
avg_inst_fpr = np.concatenate(avg_inst_fpr)
avg_inst_tpr = np.concatenate(avg_inst_tpr)
allAcc = np.array(allAcc)
allAuc = np.array(allAuc)
allInstAcc = np.array(allInstAcc)
allInstAuc = np.array(allInstAuc)

#turn the summed importances into an average over all runs
df_important = df_important/numRuns

#get averages for quick results
avgScore = sum(allAcc)/numRuns
avgInstScore = sum(allInstAcc)/numRuns
avg_roc_auc = sum(allAuc)/numRuns
avgInst_auc = sum(allInstAuc)/numRuns
print("Test acc: ", avgScore, "\tInst acc: ", avgInstScore)
print("Test auc: ", avg_roc_auc, "\tInst auc: ", avgInst_auc, "\n")

#export auc curve stats to excel spreadsheet
fpr_tpr = np.stack((avg_fpr, avg_tpr), axis=1)
inst_fpr_tpr = np.stack((avg_inst_fpr, avg_inst_tpr), axis=1)
df_auc = pd.DataFrame(fpr_tpr, columns=["Test FPR", "Test TPR"])
df_instAuc = pd.DataFrame(inst_fpr_tpr, columns=["Inst FPR", "Inst TPR"])
df_allAuc = pd.concat([df_auc, df_instAuc], axis=1)

#export all to excel sheet
df_score = pd.DataFrame({'Testing Set Accuracy':allAcc, 'Inst Accuracy':allInstAcc})
df_avgAuc = pd.DataFrame({'Testing Set AUC':allAuc, 'Inst AUC':allInstAuc})
df_features = pd.concat([df_used, df_important], axis=1)
df_bothScores = pd.concat([df_score, df_avgAuc], axis=1)
df_export = pd.concat([df_bothScores, df_allAuc, df_features.sort_values(by="Average feature importance", ascending=False)], axis=0)
excelName = "FLRclass_" + testInst + "_" + str(cutOff) + ".xlsx"
df_export.to_excel(excelName)


## TLV prediction

In [None]:
#choose the input features for the follow_up_TLV model
cutOff = 0.1
#a positive cutoff hands feature selection to getCorrelatedInputs (linearly correlated features);
#otherwise every feature in df_X is used
df_input = (
    my.getCorrelatedInputs(df_X, df_ess, cutOff, "follow_up_TLV", verbose=2, heatmap = False)
    if cutOff > 0
    else df_X
)

In [None]:
#predict TLV
#get columns used in fit
df_inputInst = df_testInst[df_input.columns]

#initialize dataframes and lists
df_important = pd.Series(0.0, index=df_input.columns, name="Average feature importance")
df_avgGuessV = pd.DataFrame(index=df_ess.index)
predInstV = pd.DataFrame(0.0, index=df_ess_inst.index, columns=["predicted TLV"])
#start the usage counter at 0: a Series built from an index alone is all-NaN, and NaN + 1 stays NaN
df_used = pd.Series(0, index=df_input.columns, dtype='int', name="Features frequency used")
numRuns = 100
#save max value to get error in original units
vol_maxY = abs(np.amax(df_ess["follow_up_TLV"]))
#per-run RMSE values; collected in lists and converted to arrays after the loop
allScores = []
allInstScores = []
for i in range(numRuns):
    #Create a Random Forest Regressor
    clf = RandomForestRegressor(n_estimators=400, max_depth=4)

    # Split dataset into training set and test set
    x_train_vol, x_test_vol, y_train_vol, y_test_vol = my.splitXY(df_input, df_ess, col = "follow_up_TLV", maxY = vol_maxY)

    #exclude highly correlated features
    x_train_vol, to_drop = my.exclude_correlated_features_trainingset(x_train_vol)
    x_test_vol = my.drop_correlated_features_evaluation(x_test_vol, to_drop)

    #Train the model using the training set
    clf.fit(x_train_vol, y_train_vol)
    y_pred = clf.predict(x_test_vol)

    # RMSE, multiplied by vol_maxY
    # NOTE(review): presumably splitXY scales the target by maxY, so this restores original units - confirm
    tScore = np.sqrt(metrics.mean_squared_error(y_test_vol, y_pred))*vol_maxY
    allScores.append(tScore)

    #accumulate feature importances (divided by numRuns after the loop)
    feature_imp = pd.Series(clf.feature_importances_, index=x_train_vol.columns)
    for feat in feature_imp.index:
        df_important[feat] += feature_imp[feat]

    #save guesses; a per-run column prefix keeps names unique so the join never collides
    df_thisGuessV = pd.DataFrame(y_pred, index=x_test_vol.index).add_prefix("run" + str(i) + "_")
    df_avgGuessV = df_avgGuessV.join(df_thisGuessV)

    #SEPARATE INSTITUTION
    #get used features from other institution
    df_inst = my.drop_correlated_features_evaluation(df_inputInst, to_drop)
    #set x and y like other insts
    df_instX, df_instY = my.setXY(df_inst, df_ess_inst, col = "follow_up_TLV", maxY = vol_maxY)

    #test
    y_pred = clf.predict(df_instX)
    # RMSE
    instRMSE = np.sqrt(metrics.mean_squared_error(df_instY, y_pred))*vol_maxY
    allInstScores.append(instRMSE)
    #added positionally; assumes setXY keeps the row order of df_ess_inst - TODO confirm
    predInstV["predicted TLV"] += y_pred

    #keep track of which features survived the correlation filter
    for newName in x_train_vol.columns:
        if newName in df_used.index:
            df_used[newName] += 1
        else:
            df_used = df_used.reindex(np.append(df_used.index, newName), fill_value=1)

allScores = np.array(allScores)
allInstScores = np.array(allInstScores)

#change from total to average
df_important = df_important/numRuns
predInstV = predInstV/numRuns * vol_maxY

#average results
avgScore = sum(allScores)/numRuns
avgInstRMSE = sum(allInstScores)/numRuns
print("Test score: ", avgScore, "\tInst score: ", avgInstRMSE, "\n")

#export TLV predictions to spreadsheet
#row-wise mean over the runs in which each patient landed in the test split (NaN entries are skipped)
predV = df_avgGuessV.mean(axis=1)*vol_maxY
df_predV = pd.DataFrame(predV, index=df_ess.index, columns=["predicted TLV"])
df_bothPredV = pd.concat([df_predV, predInstV], axis=0)
df_bothEss = pd.concat([df_ess["follow_up_TLV"], df_ess_inst["follow_up_TLV"]], axis=0)
df_allV = pd.concat([df_bothPredV, df_bothEss], axis=1)
df_RMSE = pd.Series([avgScore, avgInstRMSE], index = ["Testing Set", "Separate Inst"], name="RMSE")
df_allRMSE = pd.DataFrame({"Test RMSE":allScores, "Inst RMSE":allInstScores})
df_features = pd.concat([df_used, df_important], axis=1)
df_export = pd.concat([pd.concat([df_RMSE], axis=1), df_allRMSE, df_allV, df_features.sort_values(by="Average feature importance", ascending=False)], axis=0)
excelName = "TLVpred_" + testInst + "_" + str(cutOff) + ".xlsx"
df_export.to_excel(excelName)


## Kinetic Growth Rate classification (> 2%)

In [None]:
#choose the input features for the KGR model
cutOff = 0.13
#a positive cutoff hands feature selection to getCorrelatedInputs (linearly correlated features);
#otherwise every feature in df_X is used
df_input = (
    my.getCorrelatedInputs(df_X, df_ess, cutOff, "KGR", verbose=2, heatmap = False)
    if cutOff > 0
    else df_X
)

In [None]:
#predict KGR
#predict above or below this value
# NOTE(review): the section heading says "> 2%" but the boundary passed here is 0.00 - confirm which is intended
classBounds = [0.00]

#initialize dataframes
df_inputInst = df_testInst[df_input.columns]
df_avgGuessK = pd.DataFrame(index=df_ess.index)
#start the usage counter at 0: a Series built from an index alone is all-NaN, and NaN + 1 stays NaN
df_used = pd.Series(0, index=df_input.columns, dtype='int', name="Features frequency used")

#DEFINE NUMBER OF RUNS HERE
numRuns = 100

#per-run results; collected in lists and converted to arrays after the loop
avg_tpr = []
avg_fpr = []
avg_inst_fpr = []
avg_inst_tpr = []

allAcc = []
allAuc = []
allInstAcc = []
allInstAuc = []
for i in range(numRuns):
    #Create a random forest classifier
    clf = RandomForestClassifier(n_estimators=100, max_depth=4)

    # Split dataset into training set and test set
    x_train, x_test, y_train, y_test = my.splitXY(df_input, df_ess, col="KGR", classes=classBounds)

    #exclude highly correlated features
    x_train, to_drop = my.exclude_correlated_features_trainingset(x_train)
    x_test = my.drop_correlated_features_evaluation(x_test, to_drop)

    #Train the model using the training set
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Model Accuracy, how often is the classifier correct?
    tScore = clf.score(x_test, y_test)
    # NOTE(review): indexing probs[1] and y_test[:,1] assumes a two-column target,
    # where predict_proba returns one array per output column - confirm against splitXY
    probs = clf.predict_proba(x_test)
    preds = probs[1][:,1]

    # calculate the fpr and tpr for all thresholds of the classification
    fpr, tpr, threshold = metrics.roc_curve(y_test[:,1], preds)
    roc_auc = metrics.auc(fpr, tpr)
    avg_fpr.append(fpr)
    avg_tpr.append(tpr)
    allAuc.append(roc_auc)
    allAcc.append(tScore)

    #keep track of which features survived the correlation filter
    for newName in x_train.columns:
        if newName in df_used.index:
            df_used[newName] += 1
        else:
            df_used = df_used.reindex(np.append(df_used.index, newName), fill_value=1)

    #record this run's predicted class; a per-run column prefix keeps names unique so the join never collides
    df_thisGuessK = pd.DataFrame(y_pred.argmax(axis=1), index=x_test.index).add_prefix("run" + str(i) + "_")
    df_avgGuessK = df_avgGuessK.join(df_thisGuessK)

    #SEPARATE INSTITUTION
    df_inst = my.drop_correlated_features_evaluation(df_inputInst, to_drop)
    #set x and y like other insts
    df_instX, df_instY = my.setXY(df_inst, df_ess_inst, col="KGR", classes=classBounds)

    #test
    tInstScore = clf.score(df_instX, df_instY)
    probs = clf.predict_proba(df_instX)
    preds = probs[1][:,1]

    #auc curve
    fpr, tpr, threshold = metrics.roc_curve(df_instY[:,1], preds)
    inst_auc = metrics.auc(fpr, tpr)
    avg_inst_fpr.append(fpr)
    avg_inst_tpr.append(tpr)
    allInstAuc.append(inst_auc)
    allInstAcc.append(tInstScore)

#flatten the per-run curves into single 1-D arrays (all runs back to back)
avg_fpr = np.concatenate(avg_fpr)
avg_tpr = np.concatenate(avg_tpr)
avg_inst_fpr = np.concatenate(avg_inst_fpr)
avg_inst_tpr = np.concatenate(avg_inst_tpr)
allAcc = np.array(allAcc)
allAuc = np.array(allAuc)
allInstAcc = np.array(allInstAcc)
allInstAuc = np.array(allInstAuc)

#calculate average results
avgAcc = sum(allAcc)/numRuns
avgInstAcc = sum(allInstAcc)/numRuns
avg_roc_auc = sum(allAuc)/numRuns
avgInst_auc = sum(allInstAuc)/numRuns
print("Test acc: ", avgAcc, "\tInst acc: ", avgInstAcc)
print("Test auc: ", avg_roc_auc, "\tInst auc: ", avgInst_auc)

#export auc curve stats to excel spreadsheet
fpr_tpr = np.stack((avg_fpr, avg_tpr), axis=1)
inst_fpr_tpr = np.stack((avg_inst_fpr, avg_inst_tpr), axis=1)
df_auc = pd.DataFrame(fpr_tpr, columns=["Test FPR", "Test TPR"])
df_instAuc = pd.DataFrame(inst_fpr_tpr, columns=["Inst FPR", "Inst TPR"])
df_allFprTpr = pd.concat([df_auc, df_instAuc], axis=1)

#export all saved data to spreadsheet
df_score = pd.DataFrame([avgAcc, avgInstAcc], index = ["Testing Set", "Separate Inst"], columns=["Avg Accuracy"])
df_avgAuc = pd.DataFrame([avg_roc_auc, avgInst_auc], index = ["Testing Set", "Separate Inst"], columns=["Avg AUC"])
df_allScores = pd.DataFrame({'Testing Set Accuracy':allAcc, 'Inst Accuracy':allInstAcc})
df_allAuc = pd.DataFrame({'Testing Set AUC':allAuc, 'Inst AUC':allInstAuc})
df_bothScores = pd.concat([df_score, df_avgAuc], axis=1)
df_allAccAuc = pd.concat([df_allScores, df_allAuc], axis=1)

df_temp = pd.concat([df_bothScores, df_allAccAuc, df_allFprTpr], axis=0)
df_export = pd.concat([df_temp, pd.concat([df_used.sort_values(ascending=False)], axis=1)], axis=0)
excelName = "KGRclass_" + testInst + "_" + str(cutOff) + ".xlsx"
df_export.to_excel(excelName)