In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
import seaborn as sns
from itertools import combinations
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import numpy as np

## Prep Data for training

In [14]:
conditions_diabetes = pd.read_csv('conditions_diabetes.csv')
# conditions_cancer = pd.read_csv('conditions_cancer.csv')
observations = pd.read_csv('observations_pivot.csv')
patients = pd.read_csv('patients.csv')

In [15]:
le = LabelEncoder()

def prep_data(patients, conditions, illness_descriptions, observations):
    patients.rename(columns={'patient':'PATIENT'}, inplace=True)
    patients = patients.drop(columns=['birthdate', 'marital','deathdate', 'address','ssn', 'drivers', 'passport', 'prefix', 'first', 'last', 'suffix', 'maiden'])
    
    patients = patients.dropna()
    conditions = conditions.dropna()

    # MERGE DATASETS
    merged_df = pd.merge(patients, conditions, on='PATIENT', how='left')
    merged_df = pd.merge(merged_df, observations, on='PATIENT', how='left')

    merged_df["y"] = (merged_df[illness_descriptions] == 1).any(axis=1).astype(int)
    
    merged_df = merged_df.drop(columns=illness_descriptions)
    merged_df["race"] = le.fit_transform(merged_df["race"])  
    merged_df["ethnicity"] = le.fit_transform(merged_df["ethnicity"])
    merged_df["gender"] = le.fit_transform(merged_df["gender"])  
    merged_df["birthplace"] = le.fit_transform(merged_df["birthplace"]) 
    
    # split into test and train
    train, test = train_test_split(merged_df, test_size=0.2, random_state=42)
    
    # Y column to predict is diabetes
    X_train = train.drop(columns=['y'])
    y_train = train['y']
    
    X_test = test.drop(columns=['y'])
    y_test = test['y']
    
    return X_train, y_train, X_test, y_test

illness_descriptions = ['PATIENT','Diabetes_CONDITIONS','Prediabetes_CONDITIONS','Diabetic retinopathy associated with type II diabetes mellitus (disorder)_CONDITIONS', 
                        'Nonproliferative diabetic retinopathy due to type 2 diabetes mellitus (disorder)_CONDITIONS', 'Macular edema and retinopathy due to type 2 diabetes mellitus (disorder)_CONDITIONS', 
                        'Microalbuminuria due to type 2 diabetes mellitus (disorder)_CONDITIONS', 'Diabetic renal disease (disorder)_CONDITIONS', 'Neuropathy due to type 2 diabetes mellitus (disorder)_CONDITIONS']
X_train, y_train, X_test, y_test = prep_data(patients, conditions_diabetes, illness_descriptions, observations)

In [21]:
#LogisticRegression
LR = LogisticRegression(max_iter=10000000000000000000)
LRScore = cross_val_score(LR, X_train, y_train, cv=5).mean()

# keep track of best Logistic Regression Score

#DecisionTreeClassifier
param_grid = { 'max_depth': [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None ]}

tree = DecisionTreeClassifier()
grid_search = GridSearchCV(tree, param_grid, cv=5)
grid_search.fit(X_train, y_train)
DTCScore  = grid_search.best_score_
bestDTCDepth = grid_search.best_params_


# Random Forrest Classifier    
forrest = RandomForestClassifier(random_state=0)
grid_search = GridSearchCV(forrest, param_grid, cv=5)
grid_search.fit(X_train, y_train)

RFCScore  = grid_search.best_score_
bestRFCDepth = grid_search.best_params_

#SVC
SVM = SVC()

# use grid search to find best gamma for SVM
g = {'gamma': 10.0 ** np.arange(-5, 5) }
grid_search = GridSearchCV(SVM, g, cv=5)
grid_search.fit(X_train, y_train)

SVMScore  = grid_search.best_score_   


print("best LR :", LRScore)
print("best DTC:", DTCScore)
print("best max depth: ", bestDTCDepth)
print("best RFC: ", RFCScore)
print("best max depth: ", bestRFCDepth)
print("best SVM: ", SVMScore)

max_score = 0
max_model = ""
if LRScore > max_score:
    max_score = LRScore
    max_model = "LR"
if DTCScore > max_score:
    max_score = DTCScore
    max_model = "DTC"
if RFCScore > max_score:
    max_score = RFCScore
    max_model = "RFC"
if SVMScore > max_score:
    max_score = SVMScore
    max_model = "SVM"

print("best score overall is: ", max_score, " with model: ", max_model)

best LR : 0.9067532372253403
best DTC: 0.9178790213124979
best max depth:  {'max_depth': 3}
best RFC:  0.9195847547778879
best max depth:  {'max_depth': 5}
best SVM:  0.9144492131616596
best score overall is:  0.9195847547778879  with model:  RFC


Next would compute risk scores!

In [18]:


SVM.predict_proba()

AttributeError: predict_proba is not available when  probability=False