In [17]:
import os
from IPython.display import Image

import pandas as pd
import numpy as np
import itertools
import timeit
import time

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')

from sklearn import tree
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Display all columns
pd.set_option('display.max_columns', None)
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV

from collections import Counter
from numpy import where
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from numpy import mean
from sklearn.decomposition import FastICA, PCA
from scipy.stats import kurtosis 
from sklearn import random_projection
from sklearn.decomposition import TruncatedSVD

from sklearn.neural_network import MLPClassifier

In [13]:
df_patient = pd.read_csv("data/patientSurvivalPredication.csv")

assert df_patient.shape[0] > 0
assert df_patient.shape[1] > 0

print(f"Data has {df_patient.shape[0]} rows and {df_patient.shape[1]} columns.")

df_patient.drop(['encounter_id', 'patient_id', 'hospital_id', 'Unnamed: 83'], inplace=True, axis=1)

df_patient.fillna(-1, inplace=True)

# Replace -1 with empty string in string columns types
df_patient['ethnicity'].replace(-1, '', inplace=True)
df_patient['gender'].replace(-1, '', inplace=True)
df_patient['icu_stay_type'].replace(-1, "", inplace=True)
df_patient['icu_admit_source'].replace(-1, '', inplace=True)
df_patient['icu_type'].replace(-1, '', inplace=True)
df_patient['apache_3j_bodysystem'].replace(-1, '', inplace=True)
df_patient['apache_2_bodysystem'].replace(-1, '', inplace=True)

# Encode ethinicity as decision tree can't handle string values
enc = LabelEncoder()
enc.fit(df_patient['ethnicity'])

# Correct different spelling for same values
df_patient['apache_2_bodysystem'].replace("Undefined diagnoses", "Undefined Diagnoses", inplace=True)

# Create dummy columns for string columns so decision tree can proccess them
df_patient = pd.get_dummies(df_patient, columns=["ethnicity", "gender", "icu_stay_type", "icu_admit_source", "icu_type", "apache_3j_bodysystem", "apache_2_bodysystem"], prefix=["ethnicity_is", "gender_is", "icu_stay_type_is", "icu_admit_source_is", "icu_type_is", "apache_3j_bodysystem_is", "apache_2_bodysystem_is"] )

print(df_patient.shape)

Data has 91713 rows and 85 columns.
(91713, 123)


In [14]:
df_insurance = pd.read_csv('data/insurance/train.csv', nrows=100000)

assert df_insurance.shape[0] > 0
assert df_insurance.shape[1] > 0

print(f"Data has {df_insurance.shape[0]} rows and {df_insurance.shape[1]} columns.")

def encode_labels(df):
    for column in df.columns:
        if df[column].dtype=='object':
            label = LabelEncoder()
            label.fit(list(df[column].values))
            df[column] = label.transform(list(df[column].values))
    return df

df_insurance = df_insurance.drop(['QuoteNumber'], axis=1)

# Now convert the date to day, month and week and drop the date
df_insurance['Date'] = pd.to_datetime(pd.Series(df_insurance['Original_Quote_Date']))
df_insurance = df_insurance.drop('Original_Quote_Date', axis=1)
df_insurance['Year'] = df_insurance['Date'].apply(lambda x: int(str(x)[:4]))
df_insurance['Month'] = df_insurance['Date'].apply(lambda x: int(str(x)[5:7]))
df_insurance['weekday'] = df_insurance['Date'].dt.dayofweek
df_insurance = df_insurance.drop('Date', axis=1)
df_insurance = encode_labels(df_insurance)
df_insurance = df_insurance.fillna(-1)

insurance_X = df_insurance.loc[:, df_insurance.columns != 'QuoteConversion_Flag']
insurance_Y = df_insurance['QuoteConversion_Flag']

Data has 100000 rows and 299 columns.


In [15]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline 


over = SMOTE(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('over', over), ('under', under)]

patient_Y = df_patient['hospital_death']
patient_X = df_patient.loc[:, df_patient.columns != 'hospital_death']
patient_X_resample, patient_Y_resample = Pipeline(steps=steps).fit_resample(patient_X, patient_Y)

patient_X_resample_scaled = StandardScaler().fit_transform(patient_X_resample) 

over = SMOTE(sampling_strategy=0.3)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('over', over), ('under', under)]

insurance_X = df_insurance.loc[:, df_insurance.columns != 'QuoteConversion_Flag']
insurance_Y = df_insurance['QuoteConversion_Flag']
insurance_X_resample, insurance_Y_resample = Pipeline(steps=steps).fit_resample(insurance_X, insurance_Y)

insurance_X_resample_scaled = StandardScaler().fit_transform(insurance_X_resample) 

patient_X_train, patient_X_test, patient_y_train, patient_y_test = train_test_split(np.array(patient_X_resample),np.array(patient_Y_resample), test_size=0.15)
insurance_X_train, insurance_X_test, insurance_y_train, insurance_y_test = train_test_split(np.array(insurance_X_resample),np.array(insurance_Y_resample), test_size=0.15)


In [25]:
clf = MLPClassifier(max_iter= 5000, hidden_layer_sizes=(5,2), activation='logistic', verbose=False, learning_rate_init=0.01)
X_train, X_test, y_train, y_test = train_test_split(insurance_X_resample_scaled, insurance_Y_resample, test_size=0.2)

cv_score = cross_val_score(clf, X_train, y_train, cv=20, scoring="roc_auc").mean()
print("Cross validation score: " + str(cv_score))

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start
print("Train time: " + str(train_time))

start = time.time()
y_pred = clf.predict(X_test)
query_time = time.time() - start
print("Query time: " + str(query_time))

score = roc_auc_score(y_test, y_pred)
print("Test ROC AUC: " + str(score))

Cross validation score: 0.9594078030316471
Train time: 19.47288703918457
Query time: 0.02012801170349121
Test ROC AUC: 0.8655171070064688


In [21]:
pca = PCA(n_components=80).fit(insurance_X_resample_scaled)
pca_insurance_X_resample_scaled = pca.transform(insurance_X_resample_scaled)

clf = MLPClassifier(max_iter= 5000, hidden_layer_sizes=(5,2), activation='logistic', verbose=False, learning_rate_init=0.01)
X_train, X_test, y_train, y_test = train_test_split(pca_insurance_X_resample_scaled, insurance_Y_resample, test_size=0.2)

cv_score = cross_val_score(clf, X_train, y_train, cv=20, scoring="roc_auc").mean()
print("Cross validation score: " + str(cv_score))

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start
print("Train time: " + str(train_time))

start = time.time()
y_pred = clf.predict(X_test)
query_time = time.time() - start
print("Query time: " + str(query_time))

score = roc_auc_score(y_test, y_pred)
print("Test ROC AUC: " + str(score))

Cross validation score: 0.9417885449940547
Train time: 7.147889852523804
Query time: 0.003490924835205078
Test ROC AUC: 0.8454013654562001


In [22]:
ica = FastICA(n_components=59, max_iter=10000, tol=0.1).fit(insurance_X_resample_scaled)
ica_insurance_X_resample_scaled = ica.transform(insurance_X_resample_scaled)

clf = MLPClassifier(max_iter= 5000, hidden_layer_sizes=(5,2), activation='logistic', verbose=False, learning_rate_init=0.01)
X_train, X_test, y_train, y_test = train_test_split(ica_insurance_X_resample_scaled, insurance_Y_resample, test_size=0.2)

cv_score = cross_val_score(clf, X_train, y_train, cv=20, scoring="roc_auc").mean()
print("Cross validation score: " + str(cv_score))

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start
print("Train time: " + str(train_time))

start = time.time()
y_pred = clf.predict(X_test)
query_time = time.time() - start
print("Query time: " + str(query_time))

score = roc_auc_score(y_test, y_pred)
print("Test ROC AUC: " + str(score))


Cross validation score: 0.860120087646347
Train time: 5.598322868347168
Query time: 0.0029449462890625
Test ROC AUC: 0.766790758240345


In [23]:
rp = random_projection.SparseRandomProjection(n_components=24)
rp_insurance_X_resample_scaled=rp.fit_transform(insurance_X_resample_scaled)

clf = MLPClassifier(max_iter= 5000, hidden_layer_sizes=(5,2), activation='logistic', verbose=False, learning_rate_init=0.01)
X_train, X_test, y_train, y_test = train_test_split(rp_insurance_X_resample_scaled, insurance_Y_resample, test_size=0.2)

cv_score = cross_val_score(clf, X_train, y_train, cv=20, scoring="roc_auc").mean()
print("Cross validation score: " + str(cv_score))

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start
print("Train time: " + str(train_time))

start = time.time()
y_pred = clf.predict(X_test)
query_time = time.time() - start
print("Query time: " + str(query_time))

score = roc_auc_score(y_test, y_pred)
print("Test ROC AUC: " + str(score))

Cross validation score: 0.7655831858223455
Train time: 3.5977699756622314
Query time: 0.003618955612182617
Test ROC AUC: 0.6484668704798361


In [24]:
tsvd = TruncatedSVD(n_components=94)
tsvd_insurance_X_resample_scaled = tsvd.fit_transform(insurance_X_resample_scaled)

clf = MLPClassifier(max_iter= 5000, hidden_layer_sizes=(5,2), activation='logistic', verbose=False, learning_rate_init=0.01)
X_train, X_test, y_train, y_test = train_test_split(tsvd_insurance_X_resample_scaled, insurance_Y_resample, test_size=0.2)

cv_score = cross_val_score(clf, X_train, y_train, cv=20, scoring="roc_auc").mean()
print("Cross validation score: " + str(cv_score))

start = time.time()
clf.fit(X_train, y_train)
train_time = time.time() - start
print("Train time: " + str(train_time))

start = time.time()
y_pred = clf.predict(X_test)
query_time = time.time() - start
print("Query time: " + str(query_time))

score = roc_auc_score(y_test, y_pred)
print("Test ROC AUC: " + str(score))

Cross validation score: 0.9461288144325737
Train time: 9.098109006881714
Query time: 0.00372314453125
Test ROC AUC: 0.8580776281818915
