In [11]:
import pandas as pd

# import dataset

df = pd.read_excel(r'C:\Users\djzil\Documents\UTSA\Fall 2020\ME 6543 Machine Learning\Project\ICU_Prediction.xlsx')
# patients 199 and 287 have little to no data besides age and gender. This code gets rid of him
df = df.drop([995,996,997,998,999,1435,1436,1437,1438,1439])

# little bit of data manipulation

import numpy as np

#create new column for target variable, set = 0
df['ICU_Target'] = 0

# go through each patient and determine whether they were sent to ICU at all
# or not. Then create new column that represents actual Target value
for i in range(0,len(df),5):
    if df.loc[i:i+4, 'ICU'].any() == 1:
        df.loc[i:i+4, 'ICU_Target'] = 1

# this block of code fills every NaN for every patient that is missing data for WINDOW ABOVE_12
df.loc[214, 'ALBUMIN_MEDIAN':'DIMER_DIFF'] = df.loc[211, 'ALBUMIN_MEDIAN':'DIMER_DIFF']
df.loc[374, 'ALBUMIN_MEDIAN':'DIMER_DIFF'] = df.loc[370, 'ALBUMIN_MEDIAN':'DIMER_DIFF']
df.loc[939, 'ALBUMIN_MEDIAN':'DIMER_DIFF'] = df.loc[938, 'ALBUMIN_MEDIAN':'DIMER_DIFF']
df.loc[1184, 'ALBUMIN_MEDIAN':'DIMER_DIFF'] = df.loc[1180, 'ALBUMIN_MEDIAN':'DIMER_DIFF']
df.loc[1194, 'ALBUMIN_MEDIAN':'DIMER_DIFF'] = df.loc[1190, 'ALBUMIN_MEDIAN':'DIMER_DIFF']
df.loc[1239, 'ALBUMIN_MEDIAN':'DIMER_DIFF'] = df.loc[1235, 'ALBUMIN_MEDIAN':'DIMER_DIFF']
df.loc[1664, 'ALBUMIN_MEDIAN':'DIMER_DIFF'] = df.loc[1661, 'ALBUMIN_MEDIAN':'DIMER_DIFF']
df.loc[1854, 'ALBUMIN_MEDIAN':'DIMER_DIFF'] = df.loc[1851, 'ALBUMIN_MEDIAN':'DIMER_DIFF']
df.loc[1909, 'ALBUMIN_MEDIAN':'DIMER_DIFF'] = df.loc[1906, 'ALBUMIN_MEDIAN':'DIMER_DIFF']

# this code gets rid of all redundant columns. For example, ALBUMIN_MEDIAN
# through ALBUMIN_MAX are the same. Therefore, ALBUMIN_MEAN was kept and 
# the rest were dropped. Additionally, ALBUMIN_DIFF = -1 always which
# provides no value to the model.
df2 = df.iloc[:, 14:194:5]
df1 = df.iloc[:, 0:13]
df3 = df.iloc[:, 193:233]
df = pd.concat([df1,df2,df3], axis=1)

# this block of code goes through the whole dataset and backfills NaNs for
# each patient
for column in df:
    for i in range(0,len(df),5):
        df[column][i:i+5].bfill(inplace=True)


# remove all rows where ICU = 1
df = df[df.ICU == 0]

from sklearn.preprocessing import OrdinalEncoder

# this code converts AGE_PERCENTIL from string to double and scales it btwn
# -1 and 1
df_cat = df[["AGE_PERCENTIL"]]
ordinal_encoder = OrdinalEncoder()
df_cat_encoded = ordinal_encoder.fit_transform(df_cat)
b, a = 1, -1
cat_min = df_cat_encoded.min()
cat_max = df_cat_encoded.max()
df_cat_encoded_scaled = (b-a)*((df_cat_encoded - cat_min)/(cat_max - cat_min)) + a
df = df.assign(AGE_PERCENTIL=df_cat_encoded_scaled)

df = df.drop(columns=['PATIENT_VISIT_IDENTIFIER','WINDOW','ICU',])
print(df)
print("NaNs in dataset = ", df.isnull().sum().sum())
# there should be zero NaN values

      AGE_ABOVE65  AGE_PERCENTIL  GENDER  DISEASE GROUPING 1  \
0               1       0.111111       0                 0.0   
1               1       0.111111       0                 0.0   
2               1       0.111111       0                 0.0   
3               1       0.111111       0                 0.0   
10              0      -1.000000       0                 0.0   
...           ...            ...     ...                 ...   
1920            0      -0.111111       1                 0.0   
1921            0      -0.111111       1                 0.0   
1922            0      -0.111111       1                 0.0   
1923            0      -0.111111       1                 0.0   
1924            0      -0.111111       1                 0.0   

      DISEASE GROUPING 2  DISEASE GROUPING 3  DISEASE GROUPING 4  \
0                    0.0                 0.0                 0.0   
1                    0.0                 0.0                 0.0   
2                    0.0   

In [2]:
from sklearn.model_selection import train_test_split

# split data into training and test sets
train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)
# split training set into predictor data and target values
predictors = train_set.drop("ICU_Target", axis=1)
target = train_set["ICU_Target"].copy()
# split test set into predictor data and target values
predictors_test = test_set.drop("ICU_Target", axis=1)
target_test = test_set["ICU_Target"].copy()

In [12]:
##### SVM with RBF Kernel #####

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score
from sklearn.pipeline import Pipeline

## Finding Best Hyperparameters

param_grid = [{'gamma': [.01, .1, 1, 10, 100], 'C':[.01,.1,1,10,100]},]
gaussian_svm_clf = SVC(kernel="rbf")
grid_search = GridSearchCV(gaussian_svm_clf, param_grid, cv=3, return_train_score=True)
grid_search.fit(predictors, target)

## Finding Best Hyperparameters Part 2

param_grid2 = [{'gamma': [.01,.055, .1, .55, 1], 'C':[1,5.5,10,55,100]},]
grid_search2 = GridSearchCV(gaussian_svm_clf, param_grid2, cv=3, return_train_score=True)
grid_search2.fit(predictors, target)

## Creating model
gaussian_svm_clf = Pipeline([("svm_clf", SVC(kernel="rbf", gamma=grid_search2.best_params_['gamma'], C=grid_search2.best_params_['C']))])
gaussian_svm_clf.fit(predictors,target)

model = gaussian_svm_clf

## Cross-Validate model
y_train_pred = cross_val_predict(model, predictors, target, cv=3)
recall = recall_score(target, y_train_pred)
print("Validation Recall =", recall)

## Evaluate test set
test_predictions = model.predict(predictors_test)
test_recall = recall_score(target_test, test_predictions)
print("Test Recall =", test_recall)

Validation Recall = 0.8092643051771117
Test Recall = 0.8651685393258427


In [9]:
##### SVM with Linear Kernel #####

## Finding best Hyperparameters
param_grid = [{'C':[.01,.1,1,10,100]},]
lin_svm_clf = SVC(kernel="linear")
grid_search = GridSearchCV(lin_svm_clf, param_grid, cv=3, return_train_score=True)
grid_search.fit(predictors, target)

## Finding Best Hyperparameters Part 2
param_grid2 = [{'C':[10,55,100,550,1000]},]
grid_search2 = GridSearchCV(lin_svm_clf, param_grid2, cv=3, return_train_score=True)
grid_search2.fit(predictors, target)

## Creating model
lin_svm_clf = Pipeline([("lin_svc", SVC(kernel="linear", C=grid_search2.best_params_['C'])),])
lin_svm_clf.fit(predictors, target)

model = lin_svm_clf

## Cross-Validate model
y_train_pred = cross_val_predict(model, predictors, target, cv=3)
recall = recall_score(target, y_train_pred)
print("Validation Recall =", recall)

## Evaluate test set
test_predictions = model.predict(predictors_test)
test_recall = recall_score(target_test, test_predictions)
print("Test Recall =", test_recall)

Validation Recall = 0.6485013623978202
Test Recall = 0.5393258426966292


In [8]:
from sklearn.ensemble import GradientBoostingClassifier

param_grid = [{'max_depth':[3,4,5,6,7], 'n_estimators':[3,4,5,6,7], 'learning_rate':[.01,.1,1,10,100]},]
tree_clf = GradientBoostingClassifier()
grid_search = GridSearchCV(tree_clf, param_grid, cv=3, return_train_score=True)
grid_search.fit(predictors, target)

param_grid2 = [{'max_depth':[6,7,8,9,10], 'n_estimators':[6,7,8,9,10], 'learning_rate':[.1,.55,1,5.5,10]},]
grid_search2 = GridSearchCV(tree_clf, param_grid2, cv=3, return_train_score=True)
grid_search2.fit(predictors, target)

tree_clf = GradientBoostingClassifier(max_depth=grid_search2.best_params_['max_depth'], n_estimators=grid_search2.best_params_['n_estimators'], learning_rate=grid_search2.best_params_['learning_rate'])
tree_clf.fit(predictors, target)

model = tree_clf

y_train_pred = cross_val_predict(model, predictors, target, cv=3)
recall = recall_score(target, y_train_pred)
print("Validation Recall =", recall)

## Evaluate test set
test_predictions = model.predict(predictors_test)
test_recall = recall_score(target_test, test_predictions)
print("Test Recall =", test_recall)

{'learning_rate': 1, 'max_depth': 7, 'n_estimators': 7}
Validation Recall = 0.7384196185286104
Test Recall = 0.8314606741573034
