In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
import seaborn as sns
from itertools import combinations
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import numpy as np

## Prep Data for training

In [3]:
conditions_diabetes = pd.read_csv('conditions_diabetes.csv')
conditions_pregnancy = pd.read_csv('conditions_pregnancy.csv')

# Haven't done cancer yet
conditions_cancer = pd.read_csv('conditions_cancer.csv')

observations = pd.read_csv('observations_pivot.csv')
patients = pd.read_csv('patient_clean.csv')

# Diabetes Analysis

## Prep data and train models

In [4]:
le = LabelEncoder()

def prep_data(patients, conditions, illness_descriptions, observations):
    patients.rename(columns={'patient':'PATIENT'}, inplace=True)
    patients = patients.drop(columns=['birthdate', 'marital','deathdate','ssn', 'address', 'drivers', 'passport', 'prefix', 'first', 'last', 'suffix', 'maiden'])
    
    patients = patients.dropna()
    conditions = conditions.dropna()

    # MERGE DATASETS
    merged_df = pd.merge(patients, conditions, on='PATIENT', how='left')
    merged_df = pd.merge(merged_df, observations, on='PATIENT', how='left')

    merged_df["y"] = (merged_df[illness_descriptions] == 1).any(axis=1).astype(int)
    
    merged_df = merged_df.drop(columns=illness_descriptions)
    merged_df["race"] = le.fit_transform(merged_df["race"]) 
    race_code = {code: race for code, race in enumerate(le.classes_)}

    merged_df["ethnicity"] = le.fit_transform(merged_df["ethnicity"])
    eth_code = {code: ethnicity for code, ethnicity in enumerate(le.classes_)}

    merged_df["gender"] = le.fit_transform(merged_df["gender"])  
    gen_code = {code: gender for code, gender in enumerate(le.classes_)}

    merged_df["birthplace"] = le.fit_transform(merged_df["birthplace"]) 
    bp_code = {code: bp for code, bp in enumerate(le.classes_)}


    merged_df["curr_town"] = le.fit_transform(merged_df["curr_town"]) 
    curr_code = {code: bp for code, bp in enumerate(le.classes_)}

    # split into test and train
    train, test = train_test_split(merged_df, test_size=0.2, random_state=42)
    
    # Y column to predict is diabetes
    X_train = train.drop(columns=['y'])
    y_train = train['y']
    
    X_test = test.drop(columns=['y'])
    y_test = test['y']
    
    return X_train, y_train, X_test, y_test, race_code, eth_code, gen_code, bp_code, curr_code

In [5]:
illness_descriptions = ['PATIENT','Diabetes_CONDITIONS','Prediabetes_CONDITIONS','Diabetic retinopathy associated with type II diabetes mellitus (disorder)_CONDITIONS', 
                        'Nonproliferative diabetic retinopathy due to type 2 diabetes mellitus (disorder)_CONDITIONS', 'Macular edema and retinopathy due to type 2 diabetes mellitus (disorder)_CONDITIONS', 
                        'Microalbuminuria due to type 2 diabetes mellitus (disorder)_CONDITIONS', 'Diabetic renal disease (disorder)_CONDITIONS', 'Neuropathy due to type 2 diabetes mellitus (disorder)_CONDITIONS']
X_train, y_train, X_test, y_test, race_code, eth_code, gen_code, bp_code, curr_code = prep_data(patients, conditions_diabetes, illness_descriptions, observations)

In [36]:
def train_model(X_train, y_train):

    #LogisticRegression
    LR = LogisticRegression(max_iter=10000000000000000000)
    LRScore = cross_val_score(LR, X_train, y_train, cv=5).mean()

    # keep track of best Logistic Regression Score

    #DecisionTreeClassifier
    param_grid = { 'max_depth': [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None ]}

    tree = DecisionTreeClassifier()
    grid_search = GridSearchCV(tree, param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    DTCScore  = grid_search.best_score_
    bestDTCDepth = grid_search.best_params_


    # Random Forrest Classifier    
    forrest = RandomForestClassifier(random_state=0)
    grid_search = GridSearchCV(forrest, param_grid, cv=5)
    grid_search.fit(X_train, y_train)

    RFCScore  = grid_search.best_score_
    bestRFCDepth = grid_search.best_params_

    #SVC
    SVM = SVC()

    # use grid search to find best gamma for SVM
    g = {'gamma': 10.0 ** np.arange(-5, 5) }
    grid_search = GridSearchCV(SVM, g, cv=5)
    grid_search.fit(X_train, y_train)

    SVMScore  = grid_search.best_score_   


    print("best LR :", LRScore)
    print("best DTC:", DTCScore)
    print("best max depth: ", bestDTCDepth)
    print("best RFC: ", RFCScore)
    print("best max depth: ", bestRFCDepth)
    print("best SVM: ", SVMScore)

    max_score = 0
    max_model = ""
    if LRScore > max_score:
        max_score = LRScore
        max_model = "LR"
    if DTCScore > max_score:
        max_score = DTCScore
        max_model = "DTC"
    if RFCScore > max_score:
        max_score = RFCScore
        max_model = "RFC"
    if SVMScore > max_score:
        max_score = SVMScore
        max_model = "SVM"

    print("best score overall is: ", max_score, " with model: ", max_model)
    
train_model(X_train, y_train)

best LR : 0.9538094714060378
best DTC: 0.9632185172957705
best max depth:  {'max_depth': 1}
best RFC:  0.9632185172957705
best max depth:  {'max_depth': 1}
best SVM:  0.9632185172957705
best score overall is:  0.9632185172957705  with model:  DTC


## Next compute risk scores!

In [7]:
race_code

{0: 'asian', 1: 'black', 2: 'hispanic', 3: 'white'}

Predict probabilities for all our entries using the best model we found

In [8]:
forrest = RandomForestClassifier(max_depth=5)
forrest.fit(X_train, y_train)
pred_prob = forrest.predict_proba(X_test)

Define average risk score finding function

In [9]:
def find_risk(code, col, probs):
    indices = (X_test[col] == code)
    prob_subset = probs[indices]
    av_prob = np.mean(prob_subset[:, 1]) 
    return av_prob   

Compute av. risk score for Asian patients

In [10]:
find_risk(0, 'race', pred_prob)

0.48057648183079466

Compute av. risk score for Black patients

In [11]:
find_risk(1, 'race', pred_prob)

0.2373632702484378

Compute av. risk score for Hispanic patients

In [12]:
find_risk(2, 'race', pred_prob)

0.3289644469861832

Compute av. risk score for white patients

In [13]:
find_risk(3, 'race', pred_prob)

0.3149784576151381

Compute av. risk for women

In [14]:
gen_code

{0: 'F', 1: 'M'}

In [15]:
find_risk(0, 'gender', pred_prob)

0.3684271695336779

In [16]:
find_risk(1, 'gender', pred_prob)

0.2674373232163859

ethnicity

In [17]:
av_risk_eth = []

for code, name in eth_code.items():
    av = find_risk(code, 'ethnicity', pred_prob)
    new_row = {'eth': name, 'risk': av}
    av_risk_eth.append(new_row)

av_risk_eth_df = pd.DataFrame(av_risk_eth)
av_risk_eth_df = av_risk_eth_df.sort_values(by='risk', ascending=False)


In [18]:
av_risk_eth_df

Unnamed: 0,eth,risk
2,asian_indian,0.71502
13,polish,0.581707
9,german,0.494888
12,mexican,0.429846
1,american,0.4218
14,portuguese,0.39669
6,english,0.371051
17,scottish,0.334208
15,puerto_rican,0.31611
11,italian,0.314707


keep taking people from the top and bottom until its 20 on each side, then find av risk score

In [19]:
richTowns = ["Dover", "Weston", "Wellesley", "Lexington", "Sherborn", "Cohasset", "Lincoln", "Carlisle", "Hingham", "Winchester", 
                "Medfield", "Concord", "Needham", "Sudbury", "Hopkinton", "Boxford", "Brookline", "Andover",  
                  "Southborough", "Belmont", "Acton", "Marblehead", "Newton", "Nantucket", "Duxbury", "Boxborough", "Westwood","Natick", 
                  "Longmeadow", "Marion", "Groton", "Newbury", "North Andover", "Sharon", "Arlington", "Norwell", "Reading", 
                  "Lynnfield", "Marshfield", "Holliston", "Medway", "Canton", "Milton", "Ipswich", "Littleton", "Westford", "North Reading", "Chelmsford", "Dedham",
                  "Walpole", "Mansfield", "Shrewsbury", "Norwood", "Hanover", "Stow", "Newburyport", "Chatham", "Orleans", "Harwich",
                  "Swampscott","Fairhaven", "Salem"]

poorTowns = ["Springfield", "Lawrence", "Holyoke", "Amherst", "New Bedford", "Chelsea", "Fall River", "Athol", "Orange", "Lynn", "Fitchburg", "Gardner", "Brockton", "Malden", "Worcester", "Chicopee", "North Adams", "Everett",
    "Ware", "Dudley", "Greenfield Town", "Weymouth Town", "Montague", "Revere", "Taunton", "Adams", "Huntington", "Charlemont", "Leominster", "Florida", "Colrain", "Hardwick",
    "Palmer Town", "Peabody", "Somerville", "Lowell", "Westfield", "Billerica"]

Having trouble appending town names to dataframe with town codes and people counts

need that so that we can then get the top twenty of each 
to then average to get rich likelihood and poor likelihood of diabetes 
yayyyy 

Create a df with all the information for teh rich and poor towns

In [20]:
def find_town_info_row(town, bp_code_swapped, townCounts_df, code_name):
    code = bp_code_swapped[town]
    
    if not townCounts_df[townCounts_df[code_name] == code].empty:
        count = townCounts_df[townCounts_df[code_name] == code]['count'].values[0]
    else:
        count = 0
    
    new_row = {code_name: town, 'code': code, 'count': count}
    
    new_row_df = pd.DataFrame([new_row])
    
    return new_row_df

In [21]:
def find_town_info_all(counts, code_name):
    
    townCounts_df = pd.merge(X_test, counts, on=code_name)
    town_info_rich = pd.DataFrame(columns=[code_name, 'code', 'count'])
    town_info_poor = pd.DataFrame(columns=[code_name, 'code', 'count'])

    bp_code_swapped = {value: key for key, value in bp_code.items()}

    for town in richTowns:
        
        new_row_df = find_town_info_row(town, bp_code_swapped, townCounts_df, code_name)
        town_info_rich = pd.concat([town_info_rich, new_row_df], ignore_index=True)

    for town in poorTowns:
        
        new_row_df = find_town_info_row(town, bp_code_swapped, townCounts_df, code_name)
        town_info_poor= pd.concat([town_info_poor, new_row_df], ignore_index=True)
        
    return town_info_rich, town_info_poor

birthplace_counts = X_test.groupby('birthplace').size().reset_index(name='count')

town_info_rich, town_info_poor = find_town_info_all(birthplace_counts, 'birthplace')

In [22]:
birthplace_counts

Unnamed: 0,birthplace,count
0,1,2
1,3,1
2,4,5
3,7,1
4,8,1
...,...,...
139,259,1
140,260,2
141,261,3
142,262,9


## proceed with the following part to get top 65 people from each rich and poor 

In [23]:
def get_towns_by_sum_pop(town_info, code_name):
    
    townsUsed = set()
    peopleCount = 0

    for index, row in town_info.iterrows():
        
        if peopleCount > 65:
            break
        
        name = row[code_name]
        count = row['count']
        townsUsed.add(name)
        peopleCount += count
    
    return townsUsed, peopleCount

richTownsUsed, richPeopleCount = get_towns_by_sum_pop(town_info_rich, 'birthplace')
poorTownsUsed, poorPeopleCount = get_towns_by_sum_pop(town_info_poor, 'birthplace')

In [24]:
def get_av_prob_bp(townsUsed, code_name, bp_code):
    
    town_codes = []
    bp_code_swapped = {value: key for key, value in bp_code.items()}


    for town_full in townsUsed:
        town_codes.append(bp_code_swapped[town_full])
        
    indices = X_test[code_name].isin(town_codes)
    prob_subset = pred_prob[indices]
    av_prob = np.mean(prob_subset[:, 1]) 

    return av_prob

av_rich_prob = get_av_prob_bp(richTownsUsed, 'birthplace', bp_code)
av_rich_prob

0.33114880799819624

In [25]:
av_poor_prob = get_av_prob_bp(poorTownsUsed, 'birthplace', bp_code)
av_poor_prob

0.3211917984326292

This is a really annoying result...

## Repeating Process for Current Town of Residence

Not sure what this code does lol... 

In [26]:
# def find_town_info_sep_df(town, curr_code_swapped, townCounts_df):
#     code = curr_code_swapped[town]
    
#     if not townCounts_df[townCounts_df['curr_town'] == code].empty:
#         count = townCounts_df[townCounts_df['curr_town'] == code]['count'].values[0]
#     else:
#         count = 0
    
#     new_row = {'curr_town': town, 'code': code, 'count': count}
    
#     new_row_df = pd.DataFrame([new_row])
    
#     return new_row_df

Create a dataframe with the information for rich and poor towns

In [27]:
curr_counts = X_test.groupby('curr_town').size().reset_index(name='count')

town_info_rich, town_info_poor = find_town_info_all(curr_counts, 'curr_town')

Again get the towns with total for 65 people

In [28]:
richTownsUsed, richPeopleCount = get_towns_by_sum_pop(town_info_rich, 'curr_town')
poorTownsUsed, poorPeopleCount = get_towns_by_sum_pop(town_info_poor, 'curr_town')

In [29]:
av_rich_prob = get_av_prob_bp(richTownsUsed, 'curr_town', bp_code)
av_rich_prob

0.24299260615778653

In [30]:
av_rich_prob = get_av_prob_bp(richTownsUsed, 'curr_town', bp_code)

av_poor_prob

0.3211917984326292

People from rich towns have lower rates of diabetes

# Pregnancy Analysis

## First, clean data and get test and train datasets

In [37]:
illness_descriptions = ['PATIENT', 'Miscarriage in first trimester_CONDITIONS',
                        'Miscarriage in second trimester_CONDITIONS',
                        'Complication occuring during pregnancy_CONDITIONS',
                        'Preeclampsia_CONDITIONS', 'Antepartum eclampsia_CONDITIONS',
                        'Tubal pregnancy_CONDITIONS', 'Congenital uterine anomaly_CONDITIONS',
                        'Blighted ovum_CONDITIONS']
X_train, y_train, X_test, y_test, race_code, eth_code, gen_code, bp_code, curr_code = prep_data(patients, conditions_pregnancy, illness_descriptions, observations)

In [38]:
train_model(X_train, y_train)

best LR : 0.9538094714060378
best DTC: 0.9632185172957705
best max depth:  {'max_depth': 1}
best RFC:  0.9632185172957705
best max depth:  {'max_depth': 1}
best SVM:  0.9632185172957705
best score overall is:  0.9632185172957705  with model:  DTC


## Next, compute Risk scores

Predict probabilities for all our entries using the best model we found

In [39]:
DTC = DecisionTreeClassifier(max_depth=5)
DTC.fit(X_train, y_train)
pred_prob = DTC.predict_proba(X_test)

### Race

In [None]:
race_code

{0: 'asian', 1: 'black', 2: 'hispanic', 3: 'white'}

Compute av. risk score for Asian patients

In [40]:
find_risk(0, 'race', pred_prob)

0.0

Compute av. risk score for Black patients

In [41]:
find_risk(1, 'race', pred_prob)

0.07568857080893517

Compute av. risk score for Hispanic patients

In [42]:
find_risk(2, 'race', pred_prob)

0.0612804386389292

Compute av. risk score for white patients

In [43]:
find_risk(3, 'race', pred_prob)

0.036454141703180426

### Gender

In [None]:
gen_code

{0: 'F', 1: 'M'}

In [44]:
find_risk(0, 'gender', pred_prob)

0.08525506638714186

In [45]:
find_risk(1, 'gender', pred_prob)

0.0

This makes sense as generally men do not get pregnant.

### Ethnicity

In [46]:
av_risk_eth = []

for code, name in eth_code.items():
    av = find_risk(code, 'ethnicity', pred_prob)
    new_row = {'eth': name, 'risk': av}
    av_risk_eth.append(new_row)

av_risk_eth_df = pd.DataFrame(av_risk_eth)
av_risk_eth_df = av_risk_eth_df.sort_values(by='risk', ascending=False)


In [47]:
av_risk_eth_df

Unnamed: 0,eth,risk
17,scottish,0.159329
5,dominican,0.147799
19,west_indian,0.119497
1,american,0.112636
15,puerto_rican,0.073537
7,french,0.047799
8,french_canadian,0.039832
12,mexican,0.039832
11,italian,0.038756
6,english,0.038547


### Birthplace

Since we already found the top poorest and richest cities to sum to 65 people, we do not have to do this again.

In [48]:
av_rich_prob = get_av_prob_bp(richTownsUsed, 'birthplace', bp_code)
av_poor_prob = get_av_prob_bp(poorTownsUsed, 'birthplace', bp_code)

print("av_rich_prob: ", av_rich_prob, "av_poor_prob: ", av_poor_prob)

av_rich_prob:  0.02573778422835027 av_poor_prob:  0.04847605224963716


### Current Address

In [49]:
av_rich_prob = get_av_prob_bp(richTownsUsed, 'curr_town', bp_code)
av_poor_prob = get_av_prob_bp(poorTownsUsed, 'curr_town', bp_code)

print("av_rich_prob: ", av_rich_prob, "av_poor_prob: ", av_poor_prob)

av_rich_prob:  0.04847605224963716 av_poor_prob:  0.018384131591678763
