In [44]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
import seaborn as sns
from itertools import combinations
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [45]:
# Per-condition tables, one CSV per condition, read via relative paths
# (i.e. resolved against the kernel's current working directory).
# NOTE(review): only conditions_pregnancy is consumed by the cells visible in
# this part of the notebook; the other frames are presumably modeled the same
# way elsewhere — confirm.
conditions_diabetes = pd.read_csv('conditions_diabetes.csv')
conditions_pregnancy = pd.read_csv('conditions_pregnancy.csv')
conditions_cancer = pd.read_csv('conditions_cancer.csv')
conditions_heart = pd.read_csv('conditions_heart.csv')
conditions_lungs = pd.read_csv('conditions_lungs.csv')

# Additional tables loaded alongside the condition data.
# NOTE(review): neither frame is referenced in the visible cells — confirm they
# are used later, otherwise these reads can be dropped.
observations = pd.read_csv('observations_pivot.csv')
patients = pd.read_csv('patient_clean.csv')

In [46]:
le = LabelEncoder()
# our data-prepping function for modeling
def prep_data(df):
    """Label-encode the categorical columns of ``df`` and split it 80/20.

    The input frame is NOT modified: encoding happens on a copy, so calling
    this function again on the same frame (e.g. re-running the cell) still
    sees the original labels and returns meaningful code -> label dicts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``race``, ``ethnicity``, ``gender``,
        ``birthplace``, ``curr_town`` and the target column ``y``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, race_code, eth_code, gen_code,
        bp_code, curr_code)`` where each ``*_code`` dict maps the integer
        code written into the frame back to the original label.
    """
    # work on a copy so the caller's frame keeps its original labels
    df = df.copy()

    # label encode all categorical vars, in the same order as before
    categorical_cols = ["race", "ethnicity", "gender", "birthplace", "curr_town"]
    code_maps = {}
    for col in categorical_cols:
        # the shared module-level encoder is refit for every column, so its
        # classes_ must be captured right after each fit
        df[col] = le.fit_transform(df[col])
        code_maps[col] = dict(enumerate(le.classes_))

    # split data into test and train (fixed seed for a reproducible split)
    train, test = train_test_split(df, test_size=0.2, random_state=42)

    X_train = train.drop(columns=['y'])
    y_train = train['y']

    X_test = test.drop(columns=['y'])
    y_test = test['y']

    # return split x, y, and all of the code tracking dicts
    return (
        X_train, y_train, X_test, y_test,
        code_maps["race"],
        code_maps["ethnicity"],
        code_maps["gender"],
        code_maps["birthplace"],
        code_maps["curr_town"],
    )

In [47]:
# encode + split the pregnancy table; the *_code dicts translate integer codes
# back to the original labels for reporting
(
    X_train, y_train, X_test, y_test,
    race_code, eth_code, gen_code, bp_code, curr_code,
) = prep_data(conditions_pregnancy)

In [48]:
# our model-finding function
def train_model(X_train, y_train, cv=5, lr_max_iter=10000, random_state=0):
    """Cross-validate four classifier families and report the best one.

    Logistic regression is scored directly with ``cross_val_score``; the
    decision tree and random forest are grid-searched over ``max_depth`` and
    the SVM over ``gamma``. Scores are whatever the estimators' default
    scorer returns (nothing overrides ``scoring``).

    Parameters
    ----------
    X_train, y_train
        Training features and target, passed straight to scikit-learn.
    cv : int, default 5
        Number of cross-validation folds used for every model.
    lr_max_iter : int, default 10000
        Iteration cap for logistic regression. This replaces a hard-coded
        literal of 10**19, which is larger than a 64-bit integer can hold.
    random_state : int, default 0
        Seed for the tree and the forest so repeated runs pick the same
        hyper-parameters.

    Returns
    -------
    tuple
        ``(max_model, max_score)``: the short name of the winning model
        (``"LR"``, ``"DTC"``, ``"RFC"`` or ``"SVM"``) and its mean CV score.
        On ties the model listed first wins.
    """
    # LogisticRegression
    LR = LogisticRegression(max_iter=lr_max_iter)
    LRScore = cross_val_score(LR, X_train, y_train, cv=cv).mean()

    # depth grid shared by the tree and the forest
    param_grid = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None]}

    # DecisionTreeClassifier (seeded, consistent with the forest below)
    tree_search = GridSearchCV(
        DecisionTreeClassifier(random_state=random_state), param_grid, cv=cv
    )
    tree_search.fit(X_train, y_train)
    DTCScore = tree_search.best_score_
    bestDTCDepth = tree_search.best_params_

    # Random Forest Classifier
    forest_search = GridSearchCV(
        RandomForestClassifier(random_state=random_state), param_grid, cv=cv
    )
    forest_search.fit(X_train, y_train)
    RFCScore = forest_search.best_score_
    bestRFCDepth = forest_search.best_params_

    # SVC: use grid search to find best gamma
    gamma_grid = {'gamma': 10.0 ** np.arange(-5, 5)}
    svm_search = GridSearchCV(SVC(), gamma_grid, cv=cv)
    svm_search.fit(X_train, y_train)
    SVMScore = svm_search.best_score_

    print("best LR :", LRScore)
    print("best DTC:", DTCScore)
    print("best max depth: ", bestDTCDepth)
    print("best RFC: ", RFCScore)
    print("best max depth: ", bestRFCDepth)
    print("best SVM: ", SVMScore)

    # pick the winner; dicts keep insertion order and max() returns the first
    # maximal key, so ties resolve LR > DTC > RFC > SVM as before
    scores = {"LR": LRScore, "DTC": DTCScore, "RFC": RFCScore, "SVM": SVMScore}
    max_model = max(scores, key=scores.get)
    max_score = scores[max_model]

    print("best score overall is: ", max_score, " with model: ", max_model)
    return max_model, max_score

# run model finding function on our pregnancy data
best_model_name, best_cv_score = train_model(X_train, y_train)

best LR : 0.9538094714060378
best DTC: 0.9632185172957705
best max depth:  {'max_depth': 1}
best RFC:  0.9632185172957705
best max depth:  {'max_depth': 1}
best SVM:  0.9632185172957705
best score overall is:  0.9632185172957705  with model:  DTC


In [49]:
# Refit the selected model family on the whole training split.
# max_depth=1 is copied by hand from the recorded grid-search output, so it
# must be revisited if the model search is re-run on different data.
# random_state pins tie-breaking between equally good splits so that re-running
# the notebook yields the same stump.
# NOTE(review): a depth-1 tree emits at most two distinct probability values,
# so per-group averages computed from pred_prob mostly reflect how each group
# falls on either side of that single split.
DTC = DecisionTreeClassifier(max_depth=1, random_state=0)
DTC.fit(X_train, y_train)
# per-row class probabilities for the held-out rows; find_risk reads column 1
pred_prob = DTC.predict_proba(X_test)

In [50]:
def find_risk(code, col, probs, X=None):
    """Average column-1 probability over the rows where ``X[col] == code``.

    Parameters
    ----------
    code
        Encoded category value to select.
    col : str
        Column of ``X`` to compare against ``code``.
    probs : array-like, 2-D
        Per-row class probabilities, row-aligned with ``X``; column 1 is
        averaged.
    X : pandas.DataFrame, optional
        Feature frame the probabilities were computed for. Defaults to the
        notebook-level ``X_test`` so existing three-argument calls keep
        working.

    Returns
    -------
    float
        Mean of ``probs[:, 1]`` over the selected rows, or NaN when no row
        matches ``code`` (instead of a RuntimeWarning from averaging an empty
        slice).
    """
    if X is None:
        X = X_test  # fall back to the notebook's global test split

    # plain boolean array -> purely positional selection of probability rows
    mask = np.asarray(X[col] == code)
    if not mask.any():
        return np.nan

    # finds the average of the matching subset
    return np.mean(np.asarray(probs)[mask][:, 1])

In [51]:
# mean column-1 predicted probability per race label, highest first
race_risk_rows = [
    {'race': race, 'risk': find_risk(code, 'race', pred_prob)}
    for code, race in race_code.items()
]
pregRaceRisk = pd.DataFrame(race_risk_rows).sort_values(by='risk', ascending=False)
pregRaceRisk

Unnamed: 0,race,risk
1,black,0.051395
2,hispanic,0.038217
0,asian,0.037262
3,white,0.03426


In [52]:
# mean column-1 predicted probability per gender label, highest first
gender_risk_rows = [
    {'gender': gender, 'risk': find_risk(code, 'gender', pred_prob)}
    for code, gender in gen_code.items()
]
pregGenderRisk = pd.DataFrame(gender_risk_rows).sort_values(by='risk', ascending=False)
pregGenderRisk

Unnamed: 0,gender,risk
0,F,0.074523
1,M,0.0
