In [None]:
import os
import pandas as pd
import numpy as np
import sys
import win32com.client
import getpass
import datetime
import pywintypes
import matplotlib.pyplot as plt
import pickle
%matplotlib inline
#ann model
import tensorflow as tf
from tensorflow.keras import layers
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler

import math
from sklearn.ensemble import IsolationForest

<h2>ANN</h2>

In [None]:
def read_data(data_dir="C:\\SMU_v2"):
    """
    Load every dataset used by the ANN workflow.

    Parameters
    ----------
    data_dir : str, optional
        Folder that contains ``bills_output.pkl``, ``clinical_output.pkl`` and
        ``price_timeperiod.pkl``. Defaults to the folder that used to be hard-coded,
        so existing ``read_data()`` calls behave as before.

    Returns
    -------
    tuple of 4 DataFrames
        (all bills, clinical data, clinical data one-hot encoded, bills grouped by time period)
    """
    # NOTE: unpickling can execute arbitrary code -- only point data_dir at trusted files.
    bills_clean = pd.read_pickle(os.path.join(data_dir, "bills_output.pkl"))
    CDM = pd.read_pickle(os.path.join(data_dir, "clinical_output.pkl")).reset_index(drop=True)

    # These columns are dropped from the clinical frame before it is encoded.
    clinical = CDM.drop(['dob', 'cause_of_death', 'death_age',
                         'Date_for_DFS', 'Date_for_OS', 'Date_for_CSS',
                         'Count_as_DFS', 'Count_as_OS', 'Count_as_CSS'], axis=1)

    # Everything except these five columns is one-hot encoded (with an extra NaN indicator).
    not_encoded = ["NRIC", 'Age_@_Dx', 'size_precise', 'nodespos', 'dx_date']
    OHE = [col for col in clinical.columns if col not in not_encoded]
    x_clinical = pd.get_dummies(clinical, columns=OHE, dummy_na=True).reset_index(drop=True)

    prices_grouped = pd.read_pickle(os.path.join(data_dir, "price_timeperiod.pkl")).reset_index(drop=True)
    return bills_clean, clinical, x_clinical, prices_grouped

def scale_data(data,scale_obj):
    """
    Fit ``scale_obj`` on ``data`` and return the transformed values wrapped in a new DataFrame.

    The scaler is fitted in place (``fit_transform``), so the same object can later be used
    to undo the scaling. The notebook passes a MinMaxScaler here.
    """
    scaled_values = scale_obj.fit_transform(data)
    return pd.DataFrame(scaled_values)

def scale_data_reverse(data,scale_obj):
    """
    Undo a previous scaling: apply ``scale_obj.inverse_transform`` to ``data`` and
    return the result as a new DataFrame.
    """
    restored_values = scale_obj.inverse_transform(data)
    return pd.DataFrame(restored_values)

def ann_structure(input_shape,output_units):
    """
    Build (but do not compile) the fully connected Keras network used by the notebook.

    input_shape  -- number of input features; passed to the first Dense layer as ``(input_shape,)``
    output_units -- width of the final Dense layer

    Returns the ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential()

    def add_hidden(units, repeat):
        # Append `repeat` Dense layers of `units` neurons with leaky-ReLU activation.
        for _ in range(repeat):
            model.add(layers.Dense(units, activation=tf.nn.leaky_relu))

    # First layer: 32 units, no activation argument given.
    model.add(layers.Dense(32, input_shape=(input_shape,)))

    add_hidden(32, 1)
    model.add(layers.Dropout(.5))
    add_hidden(32, 3)

    model.add(layers.BatchNormalization(
        axis=-1,
        momentum=0.99,
        epsilon=0.001,
        center=True,
        scale=True,
        beta_initializer='zeros',
        gamma_initializer='ones',
        moving_mean_initializer='zeros',
        moving_variance_initializer='ones',
        beta_regularizer=None,
        gamma_regularizer=None,
        beta_constraint=None,
        gamma_constraint=None,
    ))

    add_hidden(32, 4)
    add_hidden(16, 1)
    model.add(layers.Dropout(.5))
    add_hidden(16, 4)
    model.add(layers.Dropout(.5))
    add_hidden(16, 2)

    # Output layer also uses leaky-ReLU.
    model.add(layers.Dense(output_units, activation=tf.nn.leaky_relu))
    return model

def remove_out_of_range(data):
    """
    For each horizon, find the rows of ``data`` whose ``after_<N>y`` column is null.

    Returns a dict keyed "y1", "y2", "y5", "y10". Each value is a two-item list:
    an integer (3, 4, 7 or 12 -- the notebook cells use it as the number of leading
    columns of the grouped bills to keep) and the index of the rows with a null value
    in the matching column.
    """
    horizons = (
        ("y1", 3, "after_1y"),
        ("y2", 4, "after_2y"),
        ("y5", 7, "after_5y"),
        ("y10", 12, "after_10y"),
    )
    return {
        key: [n_columns, data[data[column].isnull()].index]
        for key, n_columns, column in horizons
    }

def remove_meaningless_data(data):
    """
    Return the index labels of the rows whose values sum to exactly 0 across the columns.
    """
    row_totals = data.sum(axis=1)
    sums_to_zero = row_totals == 0
    return data[sums_to_zero].index

def drop_by_index(X,y,indexes):
    """
    Drop the rows labelled ``indexes`` from both ``X`` and ``y`` and renumber what is left.

    Returns the tuple ``(X, y)``; each frame gets a fresh 0..n-1 index.
    """
    y_remaining = y.drop(indexes)
    X_remaining = X.drop(indexes)
    # Renumber X first, then y: reset_index() exposes the old labels as an
    # "index" column, which is then discarded.
    renumbered = [
        frame.reset_index().drop(columns="index")
        for frame in (X_remaining, y_remaining)
    ]
    return tuple(renumbered)

def scheduler(epoch):
    """
    Learning-rate schedule: constant 0.001 for epochs below 30, exponential decay afterwards.
    """
    base_rate = 0.001
    if epoch < 30:
        return base_rate
    # NOTE(review): the decay is anchored at epoch 10, not 30, so the rate steps down
    # by a factor of exp(-2) when epoch reaches 30 -- confirm this jump is intended.
    return base_rate * math.exp(0.1 * (10 - int(epoch)))
    
def process_time_period(data,limit=0):
    """
    Collapse per-period cost columns into the target columns used for training.

    Columns 0-3 of ``data`` are copied as "6 months before", "6 months after",
    "1 year after" and "2 years after". Columns 4-6 are summed into "5 years after"
    and columns 7 onward into "10 years after".

    limit -- how far out the targets go:
        0 (default): all six columns, up to "10 years after"
        1          : five columns, up to "5 years after"
        2 or more  : four columns, up to "2 years after"

    Returns a new DataFrame; ``data`` is not modified.
    """
    y = pd.DataFrame()
    base_labels = ("6 months before", "6 months after", "1 year after", "2 years after")
    for position, label in enumerate(base_labels):
        y[label] = data.iloc[:, position]

    # The two checks used to be nested the wrong way round, so limit=1 lost the
    # "5 years after" column although callers expect five columns for it.
    if limit < 2:
        y["5 years after"] = data.iloc[:, 4:7].sum(axis=1)
    if limit < 1:
        y["10 years after"] = data.iloc[:, 7:].sum(axis=1)
    return y
    
def make_prediction(all_users,user,model,mms):
    """
    Predict the cost columns for one user with a trained model.

    all_users -- clinical DataFrame (not encoded) containing "NRIC" and "dx_date" columns
    user      -- DataFrame holding the row to predict for, with the same columns as all_users
    model     -- object with a ``predict`` method (the trained network)
    mms       -- fitted scaler; its ``inverse_transform`` maps predictions back to costs

    The user row is appended to ``all_users`` before one-hot encoding so that the encoded
    columns are derived from the whole population. This only works if every category value
    in the new row has appeared at least once before; otherwise extra dummy columns appear.

    Returns a one-row DataFrame (index 0) with one column per model output, named after
    the time periods.
    """
    period_names = ["6 months before", "6 months after", "1 year after",
                    "2 years after", "5 years after", "10 years after"]
    not_encoded = ["NRIC", 'Age_@_Dx', 'size_precise', 'nodespos', 'dx_date']

    all_users = all_users.reset_index(drop=True)
    # Position the appended row will have once the combined frame is renumbered.
    last_row = all_users.shape[0]

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
    combined = pd.concat([all_users, user]).drop(columns=["NRIC", "dx_date"])

    OHE = [col for col in combined.columns if col not in not_encoded]
    prediction_x = pd.get_dummies(combined, columns=OHE, dummy_na=True).reset_index(drop=True)
    print(prediction_x.shape)

    pred = model.predict(prediction_x)
    predictions_scaled_reverse = pd.DataFrame(
        mms.inverse_transform(pred),
        columns=period_names[:pred.shape[1]],
    )
    # Keep only the prediction for the appended user.
    return predictions_scaled_reverse.iloc[[last_row]].reset_index(drop=True)

def make_comparison(all_users,all_users_OHE,NRIC,bills,model,mms):
    """
    Put the actual and predicted costs of one patient side by side.

    all_users     -- clinical DataFrame (not encoded) with an "NRIC" column
    all_users_OHE -- accepted for interface compatibility; not used here
    NRIC          -- identifier of the patient to compare
    bills         -- per-period costs; filtered with a boolean mask built from all_users,
                     so its rows must line up with all_users by index
    model, mms    -- trained model and fitted scaler, forwarded to make_prediction

    Returns a DataFrame with the "True data" row(s) followed by the "Prediction" row;
    the "Status" column tells them apart.
    """
    period_names = ["6 months before", "6 months after", "1 year after",
                    "2 years after", "5 years after", "10 years after"]

    is_patient = all_users["NRIC"] == NRIC
    x = all_users[is_patient]

    pred = make_prediction(all_users,x,model,mms)
    pred["Status"] = "Prediction"

    # Column count (including "Status") tells how far out the model predicts:
    # 7 -> up to 10 years, 6 -> up to 5 years, anything else -> up to 2 years.
    limit = {7: 0, 6: 1}.get(pred.shape[1], 2)

    y_test = process_time_period(bills[is_patient],limit)
    y_test["Status"] = "True data"
    y_test.columns = period_names[:pred.shape[1]-1] + ["Status"]

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
    return pd.concat([y_test, pred])

In [None]:
# Load the four frames returned by read_data().
bills, clinical, clinicalOHE, bills_grouped = read_data()

# Rows whose dx_date equals the string "NA" are dropped from both the grouped bills
# and the encoded clinical data; both frames are renumbered afterwards.
to_drop = clinicalOHE[clinicalOHE["dx_date"] == "NA"].index
bills_grouped,clinicalOHE = drop_by_index(bills_grouped,clinicalOHE,to_drop)
# Per horizon ("y1", "y2", "y5", "y10"): [number of leading columns, rows lacking that horizon].
remove_indexes = remove_out_of_range(bills_grouped)

In [None]:
# 10-year model: six target columns, from "6 months before" up to "10 years after".
outlier = True
scope = "y10"
# index[0] = number of leading bills_grouped columns to keep, index[1] = rows with no 10-year value
index = remove_indexes[scope]

y = bills_grouped.iloc[:,:index[0]]  
X = clinicalOHE.drop(columns=["NRIC","dx_date"])

print("Data shape original: {}".format(X.shape[0]))

# Drop patients whose after_10y value is null.
X,y_small = drop_by_index(X,y,index[1])

print("Data shape removing data out of scope: {}".format(X.shape[0]))
# Default limit=0: all six period columns are produced.
y = process_time_period(y_small)

# Rows with a missing size_precise or nodespos are removed.
to_drop = X[X["size_precise"].isnull() | X["nodespos"].isnull()].index
X,y = drop_by_index(X,y,to_drop)
print("Data shape remove data that will cause errors: {}".format(X.shape[0]))

# Rows whose targets sum to zero are removed.
meaningless = remove_meaningless_data(y)
X,y = drop_by_index(X,y,meaningless)

print("Data shape meaningless data: {}".format(X.shape[0]))



if outlier:
    # Rows that the isolation forest labels -1 (fitted on the targets only) are removed.
    # NOTE(review): the `behaviour` argument was deprecated and later removed from
    # scikit-learn's IsolationForest -- confirm the installed version still accepts it.
    clf = IsolationForest(contamination="auto",behaviour="new",random_state=42)
    out = clf.fit_predict(y)
    out_df = pd.DataFrame(out,columns=["outlier"])
    remove = out_df[out_df["outlier"] ==-1].index
    X,y = drop_by_index(X,y,remove)
    
    print("Data shape after removing outliers: {}".format(X.shape[0]))

# The scaler is fitted on all remaining rows, i.e. before the train/test split.
mms = MinMaxScaler()
y_scaled = scale_data(y,mms)

X_train, X_test, y_train, y_test = train_test_split(X, y_scaled, test_size=0.33, random_state=42)

model = ann_structure(X.shape[1],y_scaled.shape[1])
model.compile(optimizer=tf.keras.optimizers.Adam(), 
          loss='mean_squared_error')
# Train with the Adam optimizer for the specified number of epochs
epochs = 100
filepath="weights.best.{}.h5".format(scope)
callbacks_list = []
# Checkpoint whenever val_loss improves; the held-out split doubles as validation data below.
callbacks_list.append(ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True))
# Learning-rate scheduler is disabled for this run:
# callbacks_list.append(LearningRateScheduler(scheduler))

model.fit(X_train, y_train, epochs=epochs, callbacks = callbacks_list, validation_data=(X_test, y_test))

In [None]:
# Predict on the held-out split, then map predictions and true targets back to the
# original cost scale with the scaler fitted in the training cell.
pred = model.predict(X_test)
predictions = pd.DataFrame(pred)
predictions_scaled_reverse = pd.DataFrame(mms.inverse_transform(predictions),columns=["6 months before","6 months after","1 year after","2 years after","5 years after","10 years after"])
y_test_scaled_reverse = pd.DataFrame(mms.inverse_transform(y_test),columns=["6 months before","6 months after","1 year after","2 years after","5 years after","10 years after"])


In [None]:
# Display the un-scaled predictions (whole frame; consider .head() to keep the output short).
predictions_scaled_reverse

In [None]:
# Display the un-scaled true targets (whole frame; consider .head() to keep the output short).
y_test_scaled_reverse

In [None]:
# 5-year model: targets run from "6 months before" up to "5 years after".
scope = "y5"
# index[0] = number of leading bills_grouped columns to keep, index[1] = rows with no 5-year value
index = remove_indexes[scope]
outlier = True

y = bills_grouped.iloc[:,:index[0]]  
X = clinicalOHE.drop(columns=["NRIC","dx_date"])

print("Data shape original: {}".format(X.shape[0]))

# Drop patients whose after_5y value is null.
X,y_small = drop_by_index(X,y,index[1])

print("Data shape removing data out of scope: {}".format(X.shape[0]))
# NOTE(review): the follow-up cell labels five target columns (through "5 years after");
# confirm process_time_period(..., 1) returns that many.
y = process_time_period(y_small,1)

# Rows with a missing size_precise or nodespos are removed.
to_drop = X[X["size_precise"].isnull() | X["nodespos"].isnull()].index
X,y = drop_by_index(X,y,to_drop)
print("Data shape remove data that will cause errors: {}".format(X.shape[0]))

# Rows whose targets sum to zero are removed.
meaningless = remove_meaningless_data(y)
X,y = drop_by_index(X,y,meaningless)

print("Data shape meaningless data: {}".format(X.shape[0]))

if outlier:
    # Rows that the isolation forest labels -1 (fitted on the targets only) are removed.
    # NOTE(review): the `behaviour` argument was deprecated and later removed from
    # scikit-learn's IsolationForest -- confirm the installed version still accepts it.
    clf = IsolationForest(contamination="auto",behaviour="new",random_state=42)
    out = clf.fit_predict(y)
    out_df = pd.DataFrame(out,columns=["outlier"])
    remove = out_df[out_df["outlier"] ==-1].index
    X,y = drop_by_index(X,y,remove)    
    print("Data shape after removing outliers: {}".format(X.shape[0]))

# The scaler is fitted on all remaining rows, i.e. before the train/test split.
mms = MinMaxScaler()
y_scaled = scale_data(y,mms)

X_train, X_test, y_train, y_test = train_test_split(X, y_scaled, test_size=0.33, random_state=42)

model = ann_structure(X.shape[1],y_scaled.shape[1])
model.compile(optimizer=tf.keras.optimizers.Adam(), 
          loss='mean_squared_error')
# Train with the Adam optimizer for the specified number of epochs
epochs = 200
filepath="weights.best.{}.h5".format(scope)
callbacks_list = []
# Checkpoint whenever val_loss improves; the held-out split doubles as validation data below.
callbacks_list.append(ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True))
# Learning rate follows scheduler(): constant at first, decaying from epoch 30 on.
callbacks_list.append(LearningRateScheduler(scheduler))

model.fit(X_train, y_train, epochs=epochs, callbacks = callbacks_list, validation_data=(X_test, y_test))

In [None]:
# Predict on the held-out split, then map predictions and true targets back to the
# original cost scale; five target columns are expected here.
pred = model.predict(X_test)
predictions = pd.DataFrame(pred)
predictions_scaled_reverse = pd.DataFrame(mms.inverse_transform(predictions),columns=["6 months before","6 months after","1 year after","2 years after","5 years after"])
y_test_scaled_reverse = pd.DataFrame(mms.inverse_transform(y_test),columns=["6 months before","6 months after","1 year after","2 years after","5 years after"])


In [None]:
# First rows of the un-scaled predictions.
predictions_scaled_reverse.head()

In [None]:
# Patients with 10-year data, but limit=2 below keeps only the four targets through
# "2 years after". No outlier removal in this run.
# NOTE(review): scope is "y10" again, so the checkpoint file name is the same as in the
# first 10-year cell and that file gets overwritten -- confirm this is intended.
outlier = False
scope = "y10"
# index[0] = number of leading bills_grouped columns to keep, index[1] = rows with no 10-year value
index = remove_indexes[scope]

y = bills_grouped.iloc[:,:index[0]]  
X = clinicalOHE.drop(columns=["NRIC","dx_date"])

# Drop patients whose after_10y value is null.
X,y_small = drop_by_index(X,y,index[1])

y = process_time_period(y_small,2)

# Rows with a missing size_precise or nodespos are removed.
to_drop = X[X["size_precise"].isnull() | X["nodespos"].isnull()].index
X,y = drop_by_index(X,y,to_drop)

# Rows whose targets sum to zero are removed.
meaningless = remove_meaningless_data(y)
X,y = drop_by_index(X,y,meaningless)

# The scaler is fitted on all remaining rows, i.e. before the train/test split.
mms = MinMaxScaler()
y_scaled = scale_data(y,mms)

if outlier:
    # Unlike the other cells, scaling happens first: the forest is fitted on the unscaled
    # y and the flagged rows are dropped from X and y_scaled.
    clf = IsolationForest(random_state=42)
    out = clf.fit_predict(y)
    out_df = pd.DataFrame(out,columns=["outlier"])
    remove = out_df[out_df["outlier"] ==-1].index
    X,y_scaled = drop_by_index(X,y_scaled,remove)
    

X_train, X_test, y_train, y_test = train_test_split(X, y_scaled, test_size=0.33, random_state=42)

model = ann_structure(X.shape[1],y_scaled.shape[1])
model.compile(optimizer=tf.keras.optimizers.Adam(), 
          loss='mean_squared_error')
# Train with the Adam optimizer for the specified number of epochs
epochs = 200
filepath="weights.best.{}.h5".format(scope)
callbacks_list = []
# Checkpoint whenever val_loss improves; the held-out split doubles as validation data below.
callbacks_list.append(ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True))
# Learning rate follows scheduler(): constant at first, decaying from epoch 30 on.
callbacks_list.append(LearningRateScheduler(scheduler))

model.fit(X_train, y_train, epochs=epochs, callbacks = callbacks_list, validation_data=(X_test, y_test))

In [None]:
# NOTE(review): the training cell just above does not recompute y_test_scaled_reverse, so on
# a top-to-bottom run this shows the values left over from the 5-year cell -- confirm.
y_test_scaled_reverse.head()

In [None]:
# 2-year model: four target columns, from "6 months before" up to "2 years after".
scope = "y2"
# index[0] = number of leading bills_grouped columns to keep, index[1] = rows with no 2-year value
index = remove_indexes[scope]
outlier = True

y = bills_grouped.iloc[:,:index[0]]
X = clinicalOHE.drop(columns=["NRIC","dx_date"])

print("Data shape original: {}".format(X.shape[0]))

# Drop patients whose after_2y value is null.
X,y_small = drop_by_index(X,y,index[1])

print("Data shape removing data out of scope: {}".format(X.shape[0]))
# limit=2 keeps exactly the four columns that used to be copied by hand here,
# which matches how the other training cells build their targets.
y = process_time_period(y_small,2)

# Rows with a missing size_precise or nodespos are removed.
to_drop = X[X["size_precise"].isnull() | X["nodespos"].isnull()].index
X,y = drop_by_index(X,y,to_drop)
print("Data shape remove data that will cause errors: {}".format(X.shape[0]))

# Rows whose targets sum to zero are removed.
meaningless = remove_meaningless_data(y)
X,y = drop_by_index(X,y,meaningless)

print("Data shape meaningless data: {}".format(X.shape[0]))


if outlier:
    # Rows that the isolation forest labels -1 (fitted on the targets only) are removed.
    # NOTE(review): the `behaviour` argument was deprecated and later removed from
    # scikit-learn's IsolationForest -- confirm the installed version still accepts it.
    clf = IsolationForest(contamination="auto",behaviour="new",random_state=42)
    out = clf.fit_predict(y)
    out_df = pd.DataFrame(out,columns=["outlier"])
    remove = out_df[out_df["outlier"] ==-1].index
    X,y = drop_by_index(X,y,remove)

    print("Data shape after removing outliers: {}".format(X.shape[0]))

# The scaler is fitted on all remaining rows, i.e. before the train/test split.
mms = MinMaxScaler()
y_scaled = scale_data(y,mms)

X_train, X_test, y_train, y_test = train_test_split(X, y_scaled, test_size=0.33, random_state=42)

model = ann_structure(X.shape[1],y_scaled.shape[1])
model.compile(optimizer=tf.keras.optimizers.Adam(),
          loss='mean_squared_error')
# Train with the Adam optimizer for the specified number of epochs
epochs = 100
filepath="weights.best.{}.h5".format(scope)
callbacks_list = []
# Checkpoint whenever val_loss improves; the held-out split doubles as validation data below.
callbacks_list.append(ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True))
# Learning rate follows scheduler(): constant at first, decaying from epoch 30 on.
callbacks_list.append(LearningRateScheduler(scheduler))

model.fit(X_train, y_train, epochs=epochs, callbacks = callbacks_list, validation_data=(X_test, y_test))

In [None]:
# Predict on the held-out split, map predictions and true targets back to the original
# cost scale, and print the first rows of each (columns keep their default integer names).
pred = model.predict(X_test)
predictions = pd.DataFrame(pred)
predictions_scaled_reverse = pd.DataFrame(mms.inverse_transform(predictions))
y_test_scaled_reverse = pd.DataFrame(mms.inverse_transform(y_test))
print(predictions_scaled_reverse.head())
print(y_test_scaled_reverse.head())

# Assuming model is chosen and trained