In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
import seaborn as sns
from itertools import combinations
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import numpy as np

## Prep Data for training

In [21]:
# Read the two raw tables from CSV files in the working directory.
conditions_diabetes, patients = (
    pd.read_csv(csv_name)
    for csv_name in ('conditions_diabetes.csv', 'patients.csv')
)

In [22]:
# Module-level scikit-learn LabelEncoder instance (turns category labels into integer codes).
le = LabelEncoder()

def prep_data(patients, conditions_diabetes):
    """Build the feature frame and the binary diabetes label from the raw tables.

    Parameters
    ----------
    patients : pandas.DataFrame
        Patient table. Must contain the identifier column (``'patient'`` or
        ``'PATIENT'``), the personal-detail columns dropped below, and the
        categorical columns ``race``, ``ethnicity``, ``gender``, ``birthplace``.
        The caller's frame is not modified.
    conditions_diabetes : pandas.DataFrame
        One ``'PATIENT'`` column plus the diabetes-related ``*_CONDITIONS``
        indicator columns listed in ``condition_cols``.

    Returns
    -------
    X_train : pandas.DataFrame
        The merged frame without the identifier and condition columns; the
        four categorical columns are replaced by integer codes assigned in
        sorted order of their distinct values.
    y_train : pandas.Series
        Series named ``'y'``: 1 when any condition column equals 1 for the
        row, otherwise 0 (including patients with no match in
        ``conditions_diabetes``).

    Raises
    ------
    KeyError
        If an expected column is missing from either input.
    """
    id_col = 'PATIENT'
    condition_cols = [
        'Diabetes_CONDITIONS',
        'Prediabetes_CONDITIONS',
        'Diabetic retinopathy associated with type II diabetes mellitus (disorder)_CONDITIONS',
        'Nonproliferative diabetic retinopathy due to type 2 diabetes mellitus (disorder)_CONDITIONS',
        'Macular edema and retinopathy due to type 2 diabetes mellitus (disorder)_CONDITIONS',
        'Microalbuminuria due to type 2 diabetes mellitus (disorder)_CONDITIONS',
        'Diabetic renal disease (disorder)_CONDITIONS',
        'Neuropathy due to type 2 diabetes mellitus (disorder)_CONDITIONS',
    ]
    unused_patient_cols = [
        'birthdate', 'marital', 'deathdate', 'address', 'ssn', 'drivers',
        'passport', 'prefix', 'first', 'last', 'suffix', 'maiden',
    ]
    categorical_cols = ['race', 'ethnicity', 'gender', 'birthplace']

    # Non-inplace rename: the caller's DataFrame must stay untouched so the
    # cell can be re-run and the raw table remains available.
    patients = (
        patients
        .rename(columns={'patient': id_col})
        .drop(columns=unused_patient_cols)
        .dropna()
    )
    conditions_diabetes = conditions_diabetes.dropna()

    # Left merge keeps every patient; those without a conditions row get NaN
    # in the condition columns and therefore a label of 0.
    # NOTE(review): assumes at most one conditions row per PATIENT, otherwise
    # the merge repeats patients -- confirm against the source data.
    merged_df = pd.merge(patients, conditions_diabetes, on=id_col, how='left')

    # The label looks at the condition columns only. The identifier is kept
    # out of this comparison so an id that happens to equal 1 cannot mark a
    # patient as diabetic.
    y_train = (merged_df[condition_cols] == 1).any(axis=1).astype(int).rename('y')

    X_train = merged_df.drop(columns=[id_col] + condition_cols)

    # Encode each categorical column independently; sort=True numbers the
    # distinct values in sorted order.
    for col in categorical_cols:
        codes, _ = pd.factorize(X_train[col], sort=True)
        X_train[col] = codes

    return X_train, y_train

# Run the preparation step on the loaded tables to get the feature frame and the 0/1 label.
X_train, y_train = prep_data(patients, conditions_diabetes)

In [23]:
# Display the prepared feature frame.
X_train

Unnamed: 0,race,ethnicity,gender,birthplace
0,1,5,0,182
1,3,1,0,139
2,1,0,1,78
3,3,6,0,107
4,2,15,1,246
...,...,...,...,...
1457,1,5,0,213
1458,3,13,0,27
1459,3,13,1,135
1460,3,10,0,131


In [28]:
# Compare four classifiers on the same feature columns using 5-fold
# cross-validated accuracy, keeping the best score, column set and tuned
# hyper-parameter for each model family.

bestLRScore = 0.0
bestLRCols = []
col_combos = []  # never read in this cell; kept so the name stays defined

bestDTCScore = 0.0
bestDTCCols = []
bestDTCDepth = None  # defined up front so the summary prints cannot hit a NameError

bestSVMScore = 0.0
bestSVMCols = []
bestSVMGamma = None  # best gamma found by the SVM grid search

bestRFCScore = 0.0
bestRFCCols = []
bestRFCDepth = None  # defined up front so the summary prints cannot hit a NameError

cols = list(X_train.columns)

# LogisticRegression
# Finite iteration cap: the previous value (10**19) does not fit in a signed
# 64-bit integer. With a realistic cap the solver stops and warns if it fails
# to converge.
LR = LogisticRegression(max_iter=10000)
LRScore = cross_val_score(LR, X_train[cols], y_train, cv=5).mean()

# keep track of best Logistic Regression score
if LRScore >= bestLRScore:
    bestLRScore = LRScore
    bestLRCols = cols

# DecisionTreeClassifier -- tune max_depth with a grid search
param_grid = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None]}

# Seeded, like the random forest below, so repeated runs give the same score.
tree = DecisionTreeClassifier(random_state=0)
grid_search = GridSearchCV(tree, param_grid, cv=5)
grid_search.fit(X_train[cols], y_train)
DTCScore = grid_search.best_score_

# keep track of best DTC score
if DTCScore > bestDTCScore:
    bestDTCScore = DTCScore
    bestDTCCols = cols
    bestDTCDepth = grid_search.best_params_

# Random Forest Classifier -- same max_depth grid
forrest = RandomForestClassifier(random_state=0)
grid_search = GridSearchCV(forrest, param_grid, cv=5)
grid_search.fit(X_train[cols], y_train)
RFCScore = grid_search.best_score_

# keep track of best RFC score
if RFCScore > bestRFCScore:
    bestRFCScore = RFCScore
    bestRFCCols = cols
    bestRFCDepth = grid_search.best_params_

# SVC
SVM = SVC()

# use grid search to find best gamma for SVM (powers of ten from 1e-5 to 1e4)
g = {'gamma': 10.0 ** np.arange(-5, 5)}

grid_search = GridSearchCV(SVM, g, cv=5)
grid_search.fit(X_train[cols], y_train)
SVMScore = grid_search.best_score_

# keep track of best SVM score, and of the gamma that produced it
if SVMScore > bestSVMScore:
    bestSVMScore = SVMScore
    bestSVMCols = cols
    bestSVMGamma = grid_search.best_params_

print("best LR", bestLRCols, ":", bestLRScore)
print("best DTC", bestDTCCols, ":", bestDTCScore)
print("best max depth:", bestDTCDepth)
print("best RFC", bestRFCCols, ":", bestRFCScore)
print("best max depth:", bestRFCDepth)
print("best SVM", bestSVMCols, ":", bestSVMScore)

best LR ['race', 'ethnicity', 'gender', 'birthplace'] : 0.6593716396278461
best DTC ['race', 'ethnicity', 'gender', 'birthplace'] : 0.6593716396278461
best max depth: {'max_depth': 1}
best RFC ['race', 'ethnicity', 'gender', 'birthplace'] : 0.6600565711346953
best max depth: {'max_depth': 4}
best SVM ['race', 'ethnicity', 'gender', 'birthplace'] : 0.6600565711346954


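The most accurate model (the SVM, essentially tied with the random forest) reaches just over 66% cross-validated accuracy at predicting diabetes from race, ethnicity, gender, and birthplace. Note that all four models score within about 0.1 percentage point of each other (65.9–66.0%) and the best decision tree has depth 1, so this figure should be compared against a majority-class baseline before concluding that these features are predictive.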

Next step: compute risk scores.