In [None]:
import os
import pandas as pd
import numpy as np
import sys
import win32com.client
import getpass
import datetime
import pywintypes
import matplotlib.pyplot as plt
import pickle
%matplotlib inline
#ann model
import tensorflow as tf
from tensorflow.keras import layers
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sksurv.preprocessing import OneHotEncoder
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler


import matplotlib.ticker as mtick
import math
from sklearn.ensemble import IsolationForest
# Notebook-wide display settings: never wrap, truncate columns or truncate rows.
# The exact option names are used (not abbreviations) so the lookup cannot become
# ambiguous if pandas adds another option with the same prefix.
# NOTE(review): max_rows=None prints whole frames; prefer .head()/.describe() on big tables.
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


<h2>ANN</h2>

In [None]:
def read_data():
    """
    Load the master table and split it into the three frames used by the ANN.

    Returns a tuple of three dataframes, each with a fresh 0..n-1 index:
      1. the selected clinical columns,
      2. the same clinical columns with the categorical ones one-hot encoded
         (a NaN indicator column is added for every encoded column),
      3. NRIC plus the bill amounts grouped by time period.
    """
    # Columns that are identifiers, dates or numeric and therefore not one-hot encoded.
    not_encoded = ("NRIC", 'Age_@_Dx', 'size_precise', 'nodespos', 'dx_date')
    clinical_columns = ['NRIC', 'dx_date', 'tstage', 'nstage', 'Mstage', 'ER', 'PR',
                        'Her2', 'size_precise', 'nodespos', 'Age_@_Dx']
    bill_columns = ["NRIC", "before_6m", "after_6m", "after_1y", "after_2y", "after_3y",
                    "after_4y", "after_5y", "after_6y", "after_7y", "after_8y",
                    "after_9y", "after_10y"]

    master = pd.read_pickle("C:\\SMU_v2\\price_timeperiod.pkl")

    clinical = master[clinical_columns]
    categorical = [column for column in clinical.columns if column not in not_encoded]
    clinical_encoded = pd.get_dummies(clinical, columns=categorical, dummy_na=True)
    bills = master[bill_columns]

    return (
        clinical.reset_index(drop=True),
        clinical_encoded.reset_index(drop=True),
        bills.reset_index(drop=True),
    )

def scale_data(data,scale_obj):
    """
    Fit `scale_obj` on `data` and return the scaled values as a new dataframe.

    data: table of values to scale
    scale_obj: scaler exposing fit_transform (it is fitted as a side effect)

    The returned frame has default integer column labels. No log transform is
    applied before scaling.
    """
    scaled_values = scale_obj.fit_transform(data)
    return pd.DataFrame(scaled_values)

def scale_data_reverse(data,scale_obj):
    """
    Undo a previous scaling step.

    data: scaled values
    scale_obj: already-fitted scaler exposing inverse_transform

    Returns the values in their original range as a dataframe with default
    integer column labels. No inverse log transform is applied.
    """
    restored_values = scale_obj.inverse_transform(data)
    return pd.DataFrame(restored_values)

def ann_structure(input_shape,output_units):
    """
    Assemble the (uncompiled) fully connected network used by every cost model.

    input_shape: number of input features per sample
    output_units: number of units in the final Dense layer

    Returns a tf.keras.Sequential model. The layer order below is significant:
    weight files saved from this architecture are reloaded layer by layer.
    """
    act = tf.nn.leaky_relu
    stack = [
        layers.Dense(64, input_shape=(input_shape,)),   # input layer (no activation)
        layers.BatchNormalization(),
        layers.Dense(64, activation=act),
        layers.Dropout(.5),
        layers.Dense(32, activation=act),
        layers.Dense(32, activation=act),
        layers.Dropout(.5),
        layers.BatchNormalization(),
        layers.Dense(32, activation=act),
        layers.Dense(32, activation=act),
        layers.Dropout(.5),
        layers.Dense(32, activation=act),
        layers.Dense(32, activation=act),
        layers.Dropout(.5),
        layers.BatchNormalization(),
        layers.Dense(32, activation=act),
        layers.Dense(16, activation=act),
        layers.Dense(16, activation=act),
        layers.Dropout(.5),
        layers.Dense(8, activation=act),
        layers.Dense(8, activation=act),
        layers.Dropout(.5),
        layers.BatchNormalization(),
        layers.Dense(8, activation=act),
        layers.Dense(8, activation=act),
        layers.Dropout(.5),
        layers.Dense(output_units, activation=act),     # output layer
    ]

    model = tf.keras.Sequential()
    for layer in stack:
        model.add(layer)
    return model

def remove_out_of_range(data):
    """
    For each prediction scope, find the rows that have no bill data for that scope.

    data: bills frame containing the columns after_1y, after_2y, after_5y, after_10y

    Returns a dict keyed by scope ("y1", "y2", "y5", "y10"). Each value is a
    two-item list: [column cut-off position used when slicing the bills frame,
    index of the rows whose bill for that scope is null].
    """
    # (scope key, column cut-off, column inspected for missing values)
    layout = (
        ("y1", 4, "after_1y"),
        ("y2", 5, "after_2y"),
        ("y5", 8, "after_5y"),
        ("y10", 13, "after_10y"),
    )
    return {
        scope: [cutoff, data.loc[data[column].isnull()].index]
        for scope, cutoff, column in layout
    }

def remove_meaningless_data(data):
    """
    Return the index of every row whose values sum to 0, i.e. rows that carry
    no information (missing values are ignored by the sum).
    """
    row_totals = data.sum(axis=1)
    return data.loc[row_totals == 0].index

def drop_by_index(X,y,indexes):
    """
    Remove the rows labelled `indexes` from both frames.

    Returns a tuple (X, y) of new dataframes without those rows, each with its
    index reset to 0..n-1. The inputs are not modified.
    """
    y_kept = y.drop(indexes).reset_index(drop=True)
    X_kept = X.drop(indexes).reset_index(drop=True)
    return X_kept, y_kept

def scheduler(epoch):
    """
    Learning-rate schedule for the LearningRateScheduler callback.

    epoch: zero-based epoch number
    Returns 0.001 for the first 100 epochs, then decays it exponentially
    (factor e^-0.1 per epoch).

    The decay is anchored at the same epoch as the hold threshold so the rate is
    continuous at epoch 100; anchoring it at 10 made it collapse by e^-9 there.
    """
    if epoch < 100:
        return 0.001
    return 0.001 * math.exp(0.1 * (100 - int(epoch)))
    
def process_time_period(data,scope):
    """
    Collapse per-period healthcare costs into the reporting periods for `scope`.

    data: frame whose columns are read by position: 0, 1, 2, 3 are copied as-is,
          4-6 are summed into "5 years after", 7 onwards into "10 years after"
    scope: "y1", "y2" or "y5" stops after that period; any other value yields
           all six periods

    Returns a new dataframe with one column per reporting period.
    """
    periods = pd.DataFrame()
    periods["6 months before"] = data.iloc[:,0]
    periods["6 months after"] = data.iloc[:,1]
    periods["1 year after"] = data.iloc[:,2]
    if scope == "y1":
        return periods

    periods["2 years after"] = data.iloc[:,3]
    if scope == "y2":
        return periods

    periods["5 years after"] = data.iloc[:,4:7].sum(axis=1)
    if scope == "y5":
        return periods

    periods["10 years after"] = data.iloc[:,7:].sum(axis=1)
    return periods
    
def make_prediction(all_users,user,model,mms):
    """
    Predict the costs for one user with a trained model.

    all_users: clinical frame of known users (must contain NRIC and dx_date)
    user: dataframe holding the user to predict, same columns as all_users
    model: trained model exposing predict
    mms: fitted scaler exposing inverse_transform (used to undo target scaling)

    Returns a one-row dataframe (index 0) with one column per predicted period.

    The user is encoded together with all known users so that the dummy columns
    come out in the same layout; this only works if every categorical value of
    the new user has appeared at least once before.
    """
    period_labels = ["6 months before","6 months after",
                     "1 year after","2 years after",
                     "5 years after","10 years after"]
    not_encoded = ["NRIC", 'Age_@_Dx', 'size_precise', 'nodespos','dx_date']

    # drop=True also copes with a named index, which the reset/drop("index") pair did not.
    all_users = all_users.reset_index(drop=True)
    last_row = all_users.shape[0]   # position the appended user will occupy

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    combined = pd.concat([all_users, user]).drop(columns=["NRIC","dx_date"])
    OHE = [column for column in combined.columns if column not in not_encoded]
    usersOHE = pd.get_dummies(combined,columns=OHE,dummy_na=True).reset_index(drop=True)

    pred = model.predict(usersOHE)
    predictions = pd.DataFrame(mms.inverse_transform(pred),
                               columns=period_labels[:pred.shape[1]])
    return predictions.iloc[[last_row]].reset_index(drop=True)

def make_comparison(all_users,all_users_OHE,NRIC,bills,model,mms):
    """
    Given a specific user, put his actual costs next to his predicted costs.

    all_users: clinical frame of all users (must contain NRIC)
    all_users_OHE: unused; kept so existing callers keep working
    NRIC: identifier of the user to compare
    bills: per-period bills, row-aligned with all_users
           NOTE(review): process_time_period reads columns by position; confirm
           that `bills` carries no leading NRIC column when passed here.
    model, mms: trained model and fitted target scaler (see make_prediction)

    Returns a dataframe with the "True data" row(s) followed by the "Prediction"
    row, labelled in the "Status" column.
    Raises ValueError if the model predicts an unsupported number of periods.
    """
    period_labels = ["6 months before","6 months after","1 year after",
                     "2 years after","5 years after","10 years after"]
    # process_time_period selects periods by scope name, so translate the
    # number of predicted periods into the matching scope.
    scope_by_period_count = {3: "y1", 4: "y2", 5: "y5", 6: "y10"}

    is_user = all_users["NRIC"] == NRIC
    pred = make_prediction(all_users,all_users[is_user],model,mms)

    n_periods = pred.shape[1]
    if n_periods not in scope_by_period_count:
        raise ValueError("model predicts {} periods; expected 3 to 6".format(n_periods))
    pred["Status"] = "Prediction"

    y_test = process_time_period(bills[is_user],scope_by_period_count[n_periods])
    y_test["Status"] = "True data"
    y_test.columns = period_labels[:n_periods] + ["Status"]

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat([y_test, pred])

def get_percentage(df1,df2,percentage):
    """
    Share of values in df1 that lie within a relative tolerance of df2.

    df1: predictions
    df2: ground truth, same shape and labels as df1
    percentage: tolerance as a fraction (0.05 means within 5%)

    Eg: df1 contains [1,2,3,4,5], df2 contains [1,2,9,4,5]. 4 out of 5 values in
    df1 fall within 5% of the value in the same position in df2, so the function
    returns 4/5 = 0.8.

    Ground-truth values equal to 0 have no defined relative error; they are left
    out of both the hit count and the total. Returns NaN when nothing is left.
    """
    within_tolerance = lambda s1,s2: abs(s1-s2)/s2 < percentage
    hits = df1.combine(df2, within_tolerance)

    # Count the zeros directly: looking them up through value_counts()[0] raised
    # KeyError for any column without a zero.
    zero_truth_count = int((df2 == 0).sum().sum())
    total_count = df1.shape[0] * df1.shape[1] - zero_truth_count
    if total_count == 0:
        return float("nan")
    return hits.sum().sum() / total_count

def display_graph(scope,predictions_scaled_reverse,y_test_scaled_reverse):
    """
    Plot how many predictions fall within each relative tolerance of the truth.

    scope: label shown in the plot title
    predictions_scaled_reverse: predictions in original units
    y_test_scaled_reverse: ground truth in original units

    For every tolerance from 0% to 199% (1% steps) the share returned by
    get_percentage is drawn as an area chart.
    """
    graph = pd.DataFrame(np.arange(0,2,.01),columns=["Percentage"])
    # Series.map replaces DataFrame.applymap, which is deprecated since pandas 2.1.
    graph["viz"] = graph["Percentage"].map(
        lambda tolerance: get_percentage(predictions_scaled_reverse,y_test_scaled_reverse,tolerance))
    show = graph.plot.area(x="Percentage")
    show.set_title("Model performance ({})".format(scope))
    show.set_xlabel("Percentage Difference from Ground Truth")
    show.set_ylabel("Percentage of all our predictions")
    show.xaxis.set_major_formatter(mtick.PercentFormatter(1.0))
    show.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
    
def get_data(scope,clinicalOHE, bills_grouped, outlier=False):
    """
    Build the model inputs and the scaled targets for one prediction scope.

    scope: "y1", "y2", "y5" or "y10"
    clinicalOHE: one-hot-encoded clinical frame; its first two columns are dropped
    bills_grouped: bills per time period; its first column is dropped
    outlier: when True, rows flagged as outliers by an IsolationForest fitted on
             the targets are removed as well

    Returns (X, y_scaled, mms): the features, the min-max-scaled targets and the
    fitted MinMaxScaler needed to undo the scaling.

    Relies on the notebook-level `remove_indexes` mapping, which must have been
    created with remove_out_of_range before this function is called.
    """
    index = remove_indexes[scope]

    X = clinicalOHE
    y = bills_grouped.iloc[:,:index[0]]

    X = X.iloc[:,2:]
    y = process_time_period(y.iloc[:,1:],scope)

    print("Data shape original: {}".format(X.shape[0]))
    X,y = drop_by_index(X,y,index[1])

    print("Data shape removing data out of scope: {}".format(X.shape[0]))

    meaningless = remove_meaningless_data(y)
    X,y = drop_by_index(X,y,meaningless)

    print("Data shape meaningless data: {}".format(X.shape[0]))

    if outlier:
        # `behaviour` was removed in scikit-learn 0.24 (TypeError there), but
        # releases before 0.22 need behaviour="new" for contamination="auto".
        try:
            clf = IsolationForest(contamination="auto",behaviour="new",random_state=42)
        except TypeError:
            clf = IsolationForest(contamination="auto",random_state=42)
        out = clf.fit_predict(y)
        # y was re-indexed 0..n-1 above, so positions in `out` match its row labels.
        out_df = pd.DataFrame(out,columns=["outlier"])
        remove = out_df[out_df["outlier"] ==-1].index
        X,y = drop_by_index(X,y,remove)

        print("Data shape after removing outliers: {}".format(X.shape[0]))

    mms = MinMaxScaler()
    y_scaled = scale_data(y,mms)
    return X,y_scaled,mms

def loadOHE(df,OHE_LOCATION = "C:\\SMU_v2\\OHE\\", name=""):
    '''
    One-hot encode new raw data with a previously saved encoder.

    df: raw feature frame; it is NOT modified
    OHE_LOCATION: folder prefix (including the trailing separator) holding the encoder
    name: encoder name; the file read is "<OHE_LOCATION><name>_encoder.pickle"

    Returns whatever the encoder's transform returns for the frame.
    '''
    encoder_path = "{}{}{}".format(OHE_LOCATION, name, '_encoder.pickle')
    # NOTE(security): pickle.load can execute arbitrary code; only load encoder
    # files that were produced by this project.
    with open(encoder_path, 'rb') as f:
        enc = pickle.load(f)

    # Object columns are cast to category before encoding. The cast is done on a
    # copy so the caller's frame (often a slice of another frame) is left untouched.
    typeCastList = list(df.select_dtypes(include=[object]).columns)
    if typeCastList:
        df = df.copy()
        df[typeCastList] = df[typeCastList].astype("category")

    return enc.transform(df)

def get_running_total(data):
    """
    Turn per-period costs into cumulative costs.

    data: frame whose columns are, by position, the "before" period followed by
          after_6m, after_1y, ..., after_10y

    Returns a copy in which every after_* column holds the sum of all columns up
    to and including it (missing values count as 0 in the sum). Cells that were
    missing in the input stay missing. The input frame is not modified.
    """
    original = data.copy(deep=True)
    running = data.copy(deep=True)
    plot_attr_list = ['after_6m','after_1y','after_2y','after_3y','after_4y','after_5y',
                      'after_6y','after_7y',"after_8y","after_9y","after_10y"]
    # Walk from the last period to the first so each sum only reads columns that
    # have not been overwritten with a running total yet.
    for i in range(len(plot_attr_list)-1,-1,-1):
        current_attr = plot_attr_list[i]
        running.loc[:,current_attr] = running.iloc[:,:i+2].sum(axis=1)
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
        running.loc[original[current_attr].isna(),current_attr] = np.nan
    return running

def get_scoring(groundtruth,predictions,previous):
    """
    Compare the model against a "predict the median" baseline.

    groundtruth: iterable of true values
    predictions: indexable of rows; the first element of each row is the prediction
    previous: values whose median is used as the baseline prediction

    Returns (baseline_score, prediction_score), each the square root of the summed
    squared errors over groundtruth (the sums are not divided by the sample count).
    """
    baseline = np.median(previous)
    baseline_sq = 0
    model_sq = 0
    for row, actual in enumerate(groundtruth):
        baseline_sq += ((actual - baseline) ** 2)
        model_sq += (actual - predictions[row][0]) ** 2
    return baseline_sq ** 0.5, model_sq ** 0.5

In [None]:
# Load the clinical frame, its one-hot-encoded form and the bills per time period.
clinical, clinicalOHE, bills_grouped = read_data()

# Per-scope lookup that get_data() reads through the global name `remove_indexes`.
remove_indexes = remove_out_of_range(bills_grouped)


In [None]:
# Read in the feature frame of each patient grouping.
# The last two columns of every pickle are dropped.
# NOTE(review): confirm which two columns these are; the pickles are not visible here.
x_group1 = pd.read_pickle("C:\\SMU_v2\\Layered Folder\\group 1_layer 4.pkl").iloc[:,:-2]
x_group2 = pd.read_pickle("C:\\SMU_v2\\Layered Folder\\group 2_layer 1.pkl").iloc[:,:-2]
x_group3 = pd.read_pickle("C:\\SMU_v2\\Layered Folder\\group 3_layer 5.pkl").iloc[:,:-2]

In [None]:
# Preview the first rows of the group 1 features.
x_group1.head()

In [None]:
# Preview the first rows of the group 2 features.
x_group2.head()

In [None]:
# One-hot encode each grouping with the encoder saved under the same name as its pickle.
x_group1_OHE = loadOHE(x_group1,name="group 1_layer 4")
x_group2_OHE = loadOHE(x_group2,name="group 2_layer 1")
x_group3_OHE = loadOHE(x_group3,name="group 3_layer 5")


In [None]:
# Preview the first rows of the group 3 features.
x_group3.head()

In [None]:
# Combine with bills: for the patients of each grouping (matched on the row index),
# take the last 12 columns of the bills table and convert them to running totals.
# NOTE(review): get_running_total assumes those 12 columns are before_6m followed by
# after_6m ... after_10y, in that order; confirm against the pickle.
bills = pd.read_pickle("C:\\SMU_v2\\price_timeperiod.pkl")
y_group1 = get_running_total(bills.loc[x_group1.index.values].iloc[:,-12:])
y_group2 = get_running_total(bills.loc[x_group2.index.values].iloc[:,-12:])
y_group3 = get_running_total(bills.loc[x_group3.index.values].iloc[:,-12:])

In [None]:
# Preview the running totals of group 1.
y_group1.head()

In [None]:
# Work on group 1 from here on: drop the patients whose bills sum to 0 and
# re-index features and targets to 0..n-1 so they stay row-aligned.
X_all,y_all = drop_by_index(x_group1_OHE,y_group1,remove_meaningless_data(y_group1))

In [None]:
def scheduler(epoch):
    """
    Learning-rate schedule: 0.001 for the first 1000 epochs, then exponential decay.
    Only takes effect if the LearningRateScheduler callback below is enabled.
    """
    if epoch < 1000:
        return 0.001
    return 0.001 * math.exp(0.1 * (1000 - int(epoch)))

# Train only on patients whose 10-year running total is known.
has_target = ~y_all["after_10y"].isna()
X = X_all[has_target]
y = y_all[has_target][["after_10y"]]
mms = MinMaxScaler()
y_scaled = mms.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_scaled, test_size=0.2, random_state=42)

model = ann_structure(X.shape[1],y.shape[1])
model.compile(optimizer=tf.keras.optimizers.Adam(),
          loss='mean_squared_error')
epochs = 2000
filepath="C:\\SMU_v2\\ann\\model_group1_10y.h5"

# Save the fitted target scaler next to the model so predictions can be mapped
# back to costs later; the `with` block guarantees the file is flushed and closed.
with open(filepath[:-3]+"_mms.sav", 'wb') as mms_file:
    pickle.dump(mms, mms_file)

# Keep only the weights with the lowest validation loss.
callbacks_list = []
callbacks_list.append(ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True))
# callbacks_list.append(LearningRateScheduler(scheduler))

model.fit(X_train, y_train, epochs=epochs, callbacks = callbacks_list, validation_data=(X_test, y_test))

In [None]:
# Evaluate the model trained in the previous cell (uses its model, filepath, mms and split).
# Restore the checkpoint with the lowest validation loss before predicting.
model.load_weights(filepath)
predictions = model.predict(X_test)
# Undo the min-max scaling so errors are expressed in the original target units.
y_pred_unscaled = mms.inverse_transform(predictions)
y_test_unscaled = mms.inverse_transform(y_test)
# Training targets; get_scoring uses their median as the baseline prediction.
y_actual_unscaled = mms.inverse_transform(y_train)
# Displays (baseline_score, prediction_score).
# NOTE(review): X_test was also the validation set used to pick the checkpoint,
# so this score is likely optimistic.
get_scoring(y_test_unscaled,y_pred_unscaled,y_actual_unscaled)

In [None]:
# Display every unscaled test-set prediction.
y_pred_unscaled

In [None]:
#5year
def scheduler(epoch):
    """
    Learning-rate schedule: 0.001 for the first 50 epochs, then exponential decay.
    The decay is anchored at epoch 50 so the rate is continuous at the switch.
    Only takes effect if the LearningRateScheduler callback below is enabled.
    """
    if epoch < 50:
        return 0.001
    return 0.001 * math.exp(0.1 * (50 - int(epoch)))

# Train only on patients whose 5-year running total is known.
has_target = ~y_all["after_5y"].isna()
X = X_all[has_target]
y = y_all[has_target][["after_5y"]]

mms = MinMaxScaler()
y_scaled = mms.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_scaled, test_size=0.2, random_state=42)

model = ann_structure(X.shape[1],y.shape[1])
model.compile(optimizer=tf.keras.optimizers.Adam(),
          loss='mean_squared_error')
epochs = 500
filepath="C:\\SMU_v2\\ann\\model_group1_5y.h5"

# Save the fitted target scaler next to the model so predictions can be mapped
# back to costs later; the `with` block guarantees the file is flushed and closed.
with open(filepath[:-3]+"_mms.sav", 'wb') as mms_file:
    pickle.dump(mms, mms_file)

# Keep only the weights with the lowest validation loss.
callbacks_list = []
callbacks_list.append(ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True))
# callbacks_list.append(LearningRateScheduler(scheduler))

model.fit(X_train, y_train, epochs=epochs, callbacks = callbacks_list, validation_data=(X_test, y_test))

In [None]:
# Evaluate the model trained in the previous cell (uses its model, filepath, mms and split).
# Restore the checkpoint with the lowest validation loss before predicting.
model.load_weights(filepath)
predictions = model.predict(X_test)
# Undo the min-max scaling so errors are expressed in the original target units.
y_pred_unscaled = mms.inverse_transform(predictions)
y_test_unscaled = mms.inverse_transform(y_test)
# Training targets; get_scoring uses their median as the baseline prediction.
y_actual_unscaled = mms.inverse_transform(y_train)
# Displays (baseline_score, prediction_score).
get_scoring(y_test_unscaled,y_pred_unscaled,y_actual_unscaled)

In [None]:
# Summary statistics of the unscaled test-set predictions.
pd.DataFrame(y_pred_unscaled).describe()

In [None]:
# Summary statistics of the unscaled training targets, for comparison with the predictions.
pd.DataFrame(y_actual_unscaled).describe()

In [None]:
#2year
def scheduler(epoch):
    """
    Learning-rate schedule: 0.001 for the first 10 epochs, then exponential decay.
    Only takes effect if the LearningRateScheduler callback below is enabled.
    """
    if epoch < 10:
        return 0.001
    return 0.001 * math.exp(0.1 * (10 - int(epoch)))

# Train only on patients whose 2-year running total is known.
has_target = ~y_all["after_2y"].isna()
X = X_all[has_target]
y = y_all[has_target][["after_2y"]]

mms = MinMaxScaler()
y_scaled = mms.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_scaled, test_size=0.2, random_state=42)

model = ann_structure(X.shape[1],y.shape[1])
model.compile(optimizer=tf.keras.optimizers.Adam(),
          loss='mean_squared_error')
epochs = 300
filepath="C:\\SMU_v2\\ann\\model_group1_2y.h5"

# Save the fitted target scaler next to the model so predictions can be mapped
# back to costs later; the `with` block guarantees the file is flushed and closed.
with open(filepath[:-3]+"_mms.sav", 'wb') as mms_file:
    pickle.dump(mms, mms_file)

# Keep only the weights with the lowest validation loss.
callbacks_list = []
callbacks_list.append(ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True))
# callbacks_list.append(LearningRateScheduler(scheduler))

model.fit(X_train, y_train, epochs=epochs, callbacks = callbacks_list, validation_data=(X_test, y_test))

In [None]:
# Evaluate the model trained in the previous cell (uses its model, filepath, mms and split).
# Restore the checkpoint with the lowest validation loss before predicting.
model.load_weights(filepath)
predictions = model.predict(X_test)
# Undo the min-max scaling so errors are expressed in the original target units.
y_pred_unscaled = mms.inverse_transform(predictions)
y_test_unscaled = mms.inverse_transform(y_test)
# Training targets; get_scoring uses their median as the baseline prediction.
y_actual_unscaled = mms.inverse_transform(y_train)
# Displays (baseline_score, prediction_score).
get_scoring(y_test_unscaled,y_pred_unscaled,y_actual_unscaled)

In [None]:
#1year
def scheduler(epoch):
    """
    Learning-rate schedule: 0.001 for the first 20 epochs, then exponential decay.
    The decay is anchored at epoch 20 so the rate is continuous at the switch.
    Only takes effect if the LearningRateScheduler callback below is enabled.
    """
    if epoch < 20:
        return 0.001
    return 0.001 * math.exp(0.1 * (20 - int(epoch)))

# Train only on patients whose 1-year running total is known.
has_target = ~y_all["after_1y"].isna()
X = X_all[has_target]
y = y_all[has_target][["after_1y"]]

mms = MinMaxScaler()
y_scaled = mms.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_scaled, test_size=0.2, random_state=42)

model = ann_structure(X.shape[1],y.shape[1])
model.compile(optimizer=tf.keras.optimizers.Adam(),
          loss='mean_squared_error')
epochs = 300
filepath="C:\\SMU_v2\\ann\\model_group1_1y.h5"

# Save the fitted target scaler next to the model so predictions can be mapped
# back to costs later; the `with` block guarantees the file is flushed and closed.
with open(filepath[:-3]+"_mms.sav", 'wb') as mms_file:
    pickle.dump(mms, mms_file)

# Keep only the weights with the lowest validation loss.
callbacks_list = []
callbacks_list.append(ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True))
# callbacks_list.append(LearningRateScheduler(scheduler))

model.fit(X_train, y_train, epochs=epochs, callbacks = callbacks_list, validation_data=(X_test, y_test))

In [None]:
# Evaluate the model trained in the previous cell (uses its model, filepath, mms and split).
# Restore the checkpoint with the lowest validation loss before predicting.
model.load_weights(filepath)
predictions = model.predict(X_test)
# Undo the min-max scaling so errors are expressed in the original target units.
y_pred_unscaled = mms.inverse_transform(predictions)
y_test_unscaled = mms.inverse_transform(y_test)
# Training targets; get_scoring uses their median as the baseline prediction.
y_actual_unscaled = mms.inverse_transform(y_train)
# Displays (baseline_score, prediction_score).
get_scoring(y_test_unscaled,y_pred_unscaled,y_actual_unscaled)

In [None]:
#6month
def scheduler(epoch):
    """
    Learning-rate schedule: 0.001 for the first 20 epochs, then exponential decay.
    The decay is anchored at epoch 20 so the rate is continuous at the switch.
    Only takes effect if the LearningRateScheduler callback below is enabled.
    """
    if epoch < 20:
        return 0.001
    return 0.001 * math.exp(0.1 * (20 - int(epoch)))

# Train only on patients whose 6-month running total is known.
has_target = ~y_all["after_6m"].isna()
X = X_all[has_target]
y = y_all[has_target][["after_6m"]]

mms = MinMaxScaler()
y_scaled = mms.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y_scaled, test_size=0.2, random_state=42)

model = ann_structure(X.shape[1],y.shape[1])
model.compile(optimizer=tf.keras.optimizers.Adam(),
          loss='mean_squared_error')
epochs = 400
filepath="C:\\SMU_v2\\ann\\model_group1_6m.h5"

# Save the fitted target scaler next to the model so predictions can be mapped
# back to costs later; the `with` block guarantees the file is flushed and closed.
with open(filepath[:-3]+"_mms.sav", 'wb') as mms_file:
    pickle.dump(mms, mms_file)

# Keep only the weights with the lowest validation loss.
callbacks_list = []
callbacks_list.append(ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True))
# callbacks_list.append(LearningRateScheduler(scheduler))

model.fit(X_train, y_train, epochs=epochs, callbacks = callbacks_list, validation_data=(X_test, y_test))

In [None]:
# Evaluate the model trained in the previous cell (uses its model, filepath, mms and split).
# Restore the checkpoint with the lowest validation loss before predicting.
model.load_weights(filepath)
predictions = model.predict(X_test)
# Undo the min-max scaling so errors are expressed in the original target units.
y_pred_unscaled = mms.inverse_transform(predictions)
y_test_unscaled = mms.inverse_transform(y_test)
# Training targets; get_scoring uses their median as the baseline prediction.
y_actual_unscaled = mms.inverse_transform(y_train)
# Displays (baseline_score, prediction_score).
get_scoring(y_test_unscaled,y_pred_unscaled,y_actual_unscaled)

In [None]:
# Switch to group 2: drop the patients whose bills sum to 0 and re-index
# features and targets to 0..n-1 so they stay row-aligned.
X_all,y_all = drop_by_index(x_group2_OHE,y_group2,remove_meaningless_data(y_group2))

In [None]:
# Group 2: train one model per horizon (10year, 5year, 2year, 1year, 6month).
def scheduler(epoch):
    """
    Learning-rate schedule: 0.001 for the first 20 epochs, then exponential decay.
    The decay is anchored at epoch 20 so the rate is continuous at the switch.
    """
    if epoch < 20:
        return 0.001
    return 0.001 * math.exp(0.1 * (20 - int(epoch)))

# (horizon suffix, number of epochs); the last horizon's variables remain in the
# kernel after the loop, as they did when the sections were written out one by one.
for horizon, epochs in [("10y", 500), ("5y", 50), ("2y", 50), ("1y", 50), ("6m", 50)]:
    target = "after_" + horizon

    # Train only on patients whose running total for this horizon is known.
    has_target = ~y_all[target].isna()
    X = X_all[has_target]
    y = y_all[has_target][[target]]

    # Scale the target to [0, 1] like the group 1 and group 3 models do, so the
    # three groups are trained (and must be un-scaled) the same way.
    mms = MinMaxScaler()
    y_scaled = mms.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y_scaled, test_size=0.2, random_state=42)

    model = ann_structure(X.shape[1],y.shape[1])
    model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss='mean_squared_error')
    filepath = "C:\\SMU_v2\\ann\\model_group2_{}.h5".format(horizon)

    # Save the fitted target scaler next to the model so predictions can be
    # mapped back to costs later; `with` guarantees the file is closed.
    with open(filepath[:-3]+"_mms.sav", 'wb') as mms_file:
        pickle.dump(mms, mms_file)

    # Keep only the weights with the lowest validation loss; decay the learning rate.
    callbacks_list = []
    callbacks_list.append(ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True))
    callbacks_list.append(LearningRateScheduler(scheduler))

    model.fit(X_train, y_train, epochs=epochs, callbacks = callbacks_list, validation_data=(X_test, y_test))


In [None]:
# Switch to group 3: drop the patients whose bills sum to 0 and re-index
# features and targets to 0..n-1 so they stay row-aligned.
X_all,y_all = drop_by_index(x_group3_OHE,y_group3,remove_meaningless_data(y_group3))

In [None]:
# Group 3: train one model per horizon (10year, 5year, 2year, 1year, 6month).
def scheduler(epoch):
    """
    Learning-rate schedule: 0.001 for the first 100 epochs, then exponential decay.
    The decay is anchored at epoch 100 so the rate is continuous at the switch.
    """
    if epoch < 100:
        return 0.001
    return 0.001 * math.exp(0.1 * (100 - int(epoch)))

# (horizon suffix, number of epochs, use the learning-rate schedule?)
# Only the 6-month model was trained with the schedule enabled. The last horizon's
# variables remain in the kernel after the loop.
training_plan = [
    ("10y", 300, False),
    ("5y", 100, False),
    ("2y", 100, False),
    ("1y", 50, False),
    ("6m", 100, True),
]

for horizon, epochs, use_lr_schedule in training_plan:
    target = "after_" + horizon

    # Train only on patients whose running total for this horizon is known.
    has_target = ~y_all[target].isna()
    X = X_all[has_target]
    y = y_all[has_target][[target]]

    mms = MinMaxScaler()
    y_scaled = mms.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y_scaled, test_size=0.2, random_state=42)

    model = ann_structure(X.shape[1],y.shape[1])
    model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss='mean_squared_error')
    filepath = "C:\\SMU_v2\\ann\\model_group3_{}.h5".format(horizon)

    # Save the fitted target scaler next to the model so predictions can be
    # mapped back to costs later; `with` guarantees the file is closed.
    with open(filepath[:-3]+"_mms.sav", 'wb') as mms_file:
        pickle.dump(mms, mms_file)

    # Keep only the weights with the lowest validation loss.
    callbacks_list = []
    callbacks_list.append(ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True))
    if use_lr_schedule:
        callbacks_list.append(LearningRateScheduler(scheduler))

    model.fit(X_train, y_train, epochs=epochs, callbacks = callbacks_list, validation_data=(X_test, y_test))


In [None]:
def get_patient_prediction(patient_df,group):
    """
    Predict the cumulative cost of one patient at every horizon.

    patient_df: one-hot-encoded features of the patient (one row), encoded with
                the encoder that belongs to `group`
    group: patient grouping, 1, 2 or 3; selects which saved models are used

    Returns a one-row dataframe with the columns "6 months after", "1 year after",
    "2 year after", "5 years after" and "10 years after".
    Raises ValueError for any other group.

    When a target scaler was saved next to a model ("<model>_mms.sav"), the raw
    network output is mapped back to the original cost range with it; otherwise
    the raw output is returned unchanged.
    """
    if group not in (1, 2, 3):
        raise ValueError("group must be 1, 2 or 3, got {!r}".format(group))

    # All horizons share one architecture: load the full model once (10 year),
    # then only swap the weights for the remaining horizons.
    horizon_labels = [("10y", "10 years after"), ("5y", "5 years after"),
                      ("2y", "2 year after"), ("1y", "1 year after"),
                      ("6m", "6 months after")]
    model = None
    predicted = {}
    for horizon, label in horizon_labels:
        model_path = 'C:\\SMU_v2\\ann\\model_group{}_{}.h5'.format(group, horizon)
        if model is None:
            model = tf.keras.models.load_model(model_path, custom_objects={'leaky_relu': tf.nn.leaky_relu})
        else:
            model.load_weights(model_path)

        # Predict for the patient that was passed in, not for a notebook global.
        output = model.predict(patient_df)

        scaler_path = model_path[:-3] + "_mms.sav"
        if os.path.exists(scaler_path):
            # NOTE(security): pickle.load can execute arbitrary code; the scaler
            # files are expected to be the ones written by the training cells.
            with open(scaler_path, 'rb') as scaler_file:
                output = pickle.load(scaler_file).inverse_transform(output)
        predicted[label] = output[0][0]

    columns = ["6 months after","1 year after","2 year after","5 years after","10 years after"]
    to_return = pd.DataFrame([[predicted[label] for label in columns]],columns = columns)
    return to_return

In [None]:
# Example patient for one grouping. Each grouping uses its own feature set and
# its own saved encoder; any value other than 1 or 3 selects the group 2 example.
group = 3

if group == 1:
    encoder_name = "group 1_layer 4"
    patient_features = {
        'ER': ['positive'],
        'PR': ['positive'],
        'Her2': ['negative'],
        'size_precise': [1.3],
        'nodespos': [0],
        'Age_@_Dx': [21],
        'diff': ['grade 3'],
    }
elif group == 3:
    encoder_name = "group 3_layer 5"
    patient_features = {
        'ER': ['negative'],
        'PR': ['negative'],
        'Her2': ['positive'],
        'size_precise': [8.0],
        'nodespos': [1],
        'Age_@_Dx': [60],
        'T': ['tis'],
        'N': ['n0'],
        'M': ['m0'],
    }
else:
    encoder_name = "group 2_layer 1"
    patient_features = {
        'ER': ['positive'],
        'PR': ['positive'],
        'Her2': ['negative'],
        'Size': ["1.01 - 2 cm"],
        'Age_@_Dx': [21],
        'diff': ['grade 2'],
    }

# One-hot encode the example with the encoder of the selected grouping.
raw_data = loadOHE(pd.DataFrame.from_dict(patient_features),
                   OHE_LOCATION = "C:\\SMU_v2\\OHE\\", name=encoder_name)
raw_data

In [None]:
# Predict with the models of the grouping chosen in the previous cell, so the
# encoded features and the models always belong to the same group.
to_return = get_patient_prediction(raw_data,group)

In [None]:
# Display the prediction table.
# NOTE(review): the trailing note suggests this output was recorded with Her2 = negative,
# i.e. after manually editing and re-running the example-patient cell; confirm.
to_return#her2 negative

In [None]:
# Display the prediction table.
# NOTE(review): the trailing note suggests this output was recorded with Her2 = positive;
# on a fresh Run All this cell shows the same frame as the previous one.
to_return#her2 positive

In [None]:
# Plot the predicted cost against the horizon (transposed so horizons run along the x axis).
to_return.T.plot()

In [None]:
# TODO: calculation of the RMS error per group (outline only, not implemented yet)

#for each group

    #get median
    
    #for each patient
        
        #get actual cost
        
        #get predicted cost
        
        #calculate baseline error
        
        #calculate prediction error
        