In [259]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
import seaborn as sns
from itertools import combinations
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import numpy as np

## Prep Data for training

In [260]:
conditions_diabetes = pd.read_csv('conditions_diabetes.csv')
# conditions_cancer = pd.read_csv('conditions_cancer.csv')
observations = pd.read_csv('observations_pivot.csv')
patients = pd.read_csv('patients.csv')

In [261]:
le = LabelEncoder()

def prep_data(patients, conditions, illness_descriptions, observations):
    patients.rename(columns={'patient':'PATIENT'}, inplace=True)
    patients = patients.drop(columns=['birthdate', 'marital','deathdate', 'address','ssn', 'drivers', 'passport', 'prefix', 'first', 'last', 'suffix', 'maiden'])
    
    patients = patients.dropna()
    conditions = conditions.dropna()

    # MERGE DATASETS
    merged_df = pd.merge(patients, conditions, on='PATIENT', how='left')
    merged_df = pd.merge(merged_df, observations, on='PATIENT', how='left')

    merged_df["y"] = (merged_df[illness_descriptions] == 1).any(axis=1).astype(int)
    
    merged_df = merged_df.drop(columns=illness_descriptions)
    merged_df["race"] = le.fit_transform(merged_df["race"]) 
    race_code = {code: race for code, race in enumerate(le.classes_)}


    merged_df["ethnicity"] = le.fit_transform(merged_df["ethnicity"])
    eth_code = {code: ethnicity for code, ethnicity in enumerate(le.classes_)}

    merged_df["gender"] = le.fit_transform(merged_df["gender"])  
    gen_code = {code: gender for code, gender in enumerate(le.classes_)}

    merged_df["birthplace"] = le.fit_transform(merged_df["birthplace"]) 
    bp_code = {code: bp for code, bp in enumerate(le.classes_)}


    # split into test and train
    train, test = train_test_split(merged_df, test_size=0.2, random_state=42)
    
    # Y column to predict is diabetes
    X_train = train.drop(columns=['y'])
    y_train = train['y']
    
    X_test = test.drop(columns=['y'])
    y_test = test['y']
    
    return X_train, y_train, X_test, y_test, race_code, eth_code, gen_code, bp_code

illness_descriptions = ['PATIENT','Diabetes_CONDITIONS','Prediabetes_CONDITIONS','Diabetic retinopathy associated with type II diabetes mellitus (disorder)_CONDITIONS', 
                        'Nonproliferative diabetic retinopathy due to type 2 diabetes mellitus (disorder)_CONDITIONS', 'Macular edema and retinopathy due to type 2 diabetes mellitus (disorder)_CONDITIONS', 
                        'Microalbuminuria due to type 2 diabetes mellitus (disorder)_CONDITIONS', 'Diabetic renal disease (disorder)_CONDITIONS', 'Neuropathy due to type 2 diabetes mellitus (disorder)_CONDITIONS']
X_train, y_train, X_test, y_test, race_code, eth_code, gen_code, bp_code = prep_data(patients, conditions_diabetes, illness_descriptions, observations)

In [262]:
#LogisticRegression
LR = LogisticRegression(max_iter=10000000000000000000)
LRScore = cross_val_score(LR, X_train, y_train, cv=5).mean()

# keep track of best Logistic Regression Score

#DecisionTreeClassifier
param_grid = { 'max_depth': [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None ]}

tree = DecisionTreeClassifier()
grid_search = GridSearchCV(tree, param_grid, cv=5)
grid_search.fit(X_train, y_train)
DTCScore  = grid_search.best_score_
bestDTCDepth = grid_search.best_params_


# Random Forrest Classifier    
forrest = RandomForestClassifier(random_state=0)
grid_search = GridSearchCV(forrest, param_grid, cv=5)
grid_search.fit(X_train, y_train)

RFCScore  = grid_search.best_score_
bestRFCDepth = grid_search.best_params_

#SVC
SVM = SVC()

# use grid search to find best gamma for SVM
g = {'gamma': 10.0 ** np.arange(-5, 5) }
grid_search = GridSearchCV(SVM, g, cv=5)
grid_search.fit(X_train, y_train)

SVMScore  = grid_search.best_score_   


print("best LR :", LRScore)
print("best DTC:", DTCScore)
print("best max depth: ", bestDTCDepth)
print("best RFC: ", RFCScore)
print("best max depth: ", bestRFCDepth)
print("best SVM: ", SVMScore)

max_score = 0
max_model = ""
if LRScore > max_score:
    max_score = LRScore
    max_model = "LR"
if DTCScore > max_score:
    max_score = DTCScore
    max_model = "DTC"
if RFCScore > max_score:
    max_score = RFCScore
    max_model = "RFC"
if SVMScore > max_score:
    max_score = SVMScore
    max_model = "SVM"

print("best score overall is: ", max_score, " with model: ", max_model)

best LR : 0.9067532372253403
best DTC: 0.9178790213124979
best max depth:  {'max_depth': 3}
best RFC:  0.9195847547778879
best max depth:  {'max_depth': 5}
best SVM:  0.9144492131616596
best score overall is:  0.9195847547778879  with model:  RFC


Next would compute risk scores!

In [263]:
race_code

{0: 'asian', 1: 'black', 2: 'hispanic', 3: 'white'}

Predict probabilities for all our entries using the best model we found

In [264]:
forrest = RandomForestClassifier(max_depth=5)
forrest.fit(X_train, y_train)
pred_prob = forrest.predict_proba(X_test)

Define average risk score finding function

In [265]:
def find_risk(code, col, probs):
    indices = (X_test[col] == code)
    prob_subset = probs[indices]
    av_prob = np.mean(prob_subset[:, 1]) 
    return av_prob   

Compute av. risk score for Asian patients

In [266]:
find_risk(0, 'race', pred_prob)

0.480179786835909

Compute av. risk score for Black patients

In [267]:
find_risk(1, 'race', pred_prob)

0.2382530898722196

Compute av. risk score for Hispanic patients

In [268]:
find_risk(2, 'race', pred_prob)

0.3263173407187264

Compute av. risk score for white patients

In [269]:
find_risk(3, 'race', pred_prob)

0.3169193686398422

Compute av. risk for women

In [270]:
gen_code

{0: 'F', 1: 'M'}

In [271]:
find_risk(0, 'gender', pred_prob)

0.3705511053037141

In [272]:
find_risk(1, 'gender', pred_prob)

0.26757624516308265

ethnicity

In [286]:
av_risk_eth = []

for code, name in eth_code.items():
    av = find_risk(code, 'ethnicity', pred_prob)
    new_row = {'eth': name, 'risk': av}
    av_risk_eth.append(new_row)

av_risk_eth_df = pd.DataFrame(av_risk_eth)
av_risk_eth_df = av_risk_eth_df.sort_values(by='risk', ascending=False)


In [287]:
av_risk_eth_df

Unnamed: 0,eth,risk
2,asian_indian,0.714533
13,polish,0.578034
9,german,0.497687
12,mexican,0.433813
1,american,0.426106
14,portuguese,0.396401
6,english,0.373317
17,scottish,0.334757
11,italian,0.31745
5,dominican,0.309996


keep taking people from the top and bottom until its 20 on each side, then find av risk score

In [276]:
richTowns = ["Dover", "Weston", "Wellesley", "Lexington", "Sherborn", "Cohasset", "Lincoln", "Carlisle", "Hingham", "Winchester", 
                "Medfield", "Concord", "Needham", "Sudbury", "Hopkinton", "Boxford", "Brookline", "Andover",  
                  "Southborough", "Belmont", "Acton", "Marblehead", "Newton", "Nantucket", "Duxbury", "Boxborough", "Westwood","Natick", 
                  "Longmeadow", "Marion", "Groton", "Newbury", "North Andover", "Sharon", "Arlington", "Norwell", "Reading", 
                  "Lynnfield", "Marshfield", "Holliston", "Medway", "Canton", "Milton", "Ipswich", "Littleton", "Westford", "North Reading", "Chelmsford", "Dedham",
                  "Walpole", "Mansfield", "Shrewsbury", "Norwood", "Hanover", "Stow", "Newburyport", "Chatham", "Orleans", "Harwich",
                  "Swampscott","Fairhaven", "Salem"]

poorTowns = ["Springfield", "Lawrence", "Holyoke", "Amherst", "New Bedford", "Chelsea", "Fall River", "Athol", "Orange", "Lynn", "Fitchburg", "Gardner", "Brockton", "Malden", "Worcester", "Chicopee", "North Adams", "Everett",
    "Ware", "Dudley", "Greenfield Town", "Weymouth Town", "Montague", "Revere", "Taunton", "Adams", "Huntington", "Charlemont", "Leominster", "Florida", "Colrain", "Hardwick",
    "Palmer Town", "Peabody", "Somerville", "Lowell", "Westfield", "Billerica"]

Having trouble appending town names to dataframe with town codes and people counts

need that so that we can then get the top twenty of each 
to then average to get rich likelihood and poor likelihood of diabetes 
yayyyy 

Create a df with all the information for teh rich and poor towns

In [278]:
def find_town_info(town, bp_code_swapped, townCounts_df):
    town_full = f'{town} MA US'
    code = bp_code_swapped[town_full]
    
    if not townCounts_df[townCounts_df['birthplace'] == code].empty:
        count = townCounts_df[townCounts_df['birthplace'] == code]['count'].values[0]
    else:
        count = 0
    
    new_row = {'birthplace': town_full, 'code': code, 'count': count}
    
    new_row_df = pd.DataFrame([new_row])
    
    return new_row_df

In [279]:
birthplace_counts = X_test.groupby('birthplace').size().reset_index(name='count')

townCounts_df = pd.merge(X_test, birthplace_counts, on='birthplace')
town_info_rich = pd.DataFrame(columns=['birthplace', 'code', 'count'])
town_info_poor = pd.DataFrame(columns=['birthplace', 'code', 'count'])

bp_code_swapped = {value: key for key, value in bp_code.items()}

for town in richTowns:
    
    new_row_df = find_town_info(town, bp_code_swapped, townCounts_df)
    town_info_rich = pd.concat([town_info_rich, new_row_df], ignore_index=True)

for town in poorTowns:
    
    new_row_df = find_town_info(town, bp_code_swapped, townCounts_df)
    town_info_poor= pd.concat([town_info_poor, new_row_df], ignore_index=True)


## proceed with the following part to get top 65 people from each rich and poor 

In [281]:
def get_towns_by_sum_pop(town_info):
    
    townsUsed = set()
    peopleCount = 0

    for index, row in town_info.iterrows():
        
        if peopleCount > 65:
            break
        
        birthplace = row['birthplace']
        count = row['count']
        townsUsed.add(birthplace)
        peopleCount += count
    
    return townsUsed, peopleCount

richTownsUsed, richPeopleCount = get_towns_by_sum_pop(town_info_rich)
poorTownsUsed, poorPeopleCount = get_towns_by_sum_pop(town_info_poor)

In [282]:
rich_town_codes = []

for town_full in richTownsUsed:
    rich_town_codes.append(bp_code_swapped[town_full])

In [283]:
indices = X_test['birthplace'].isin(rich_town_codes)
prob_subset = pred_prob[indices]
av_rich_prob = np.mean(prob_subset[:, 1]) 

av_rich_prob

0.3328582558648457

In [284]:
poor_town_codes = []

for town_full in poorTownsUsed:
    poor_town_codes.append(bp_code_swapped[town_full])

In [285]:
indices = X_test['birthplace'].isin(poor_town_codes)
prob_subset = pred_prob[indices]
av_poor_prob = np.mean(prob_subset[:, 1]) 

av_poor_prob

0.3212567199706269

This is a really annoying result...