In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
import seaborn as sns
from itertools import combinations
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Prep Data for training

In [5]:
conditions_diabetes = pd.read_csv('conditions_diabetes.csv')
# conditions_cancer = pd.read_csv('conditions_cancer.csv')
observations = pd.read_csv('observations_pivot.csv')
patients = pd.read_csv('patients.csv')

In [6]:
le = LabelEncoder()

def prep_data(patients, conditions, illness_descriptions, observations):
    patients.rename(columns={'patient':'PATIENT'}, inplace=True)
    patients = patients.drop(columns=['birthdate', 'marital','deathdate', 'address','ssn', 'drivers', 'passport', 'prefix', 'first', 'last', 'suffix', 'maiden'])
    
    patients = patients.dropna()
    conditions = conditions.dropna()

    # MERGE DATASETS
    merged_df = pd.merge(patients, conditions, on='PATIENT', how='left')
    merged_df = pd.merge(merged_df, observations, on='PATIENT', how='left')

    merged_df["y"] = (merged_df[illness_descriptions] == 1).any(axis=1).astype(int)
    
    merged_df = merged_df.drop(columns=illness_descriptions)
    merged_df["race"] = le.fit_transform(merged_df["race"]) 
    race_code = {code: race for code, race in enumerate(le.classes_)}


    merged_df["ethnicity"] = le.fit_transform(merged_df["ethnicity"])
    eth_code = {code: ethnicity for code, ethnicity in enumerate(le.classes_)}

    merged_df["gender"] = le.fit_transform(merged_df["gender"])  
    gen_code = {code: gender for code, gender in enumerate(le.classes_)}

    merged_df["birthplace"] = le.fit_transform(merged_df["birthplace"]) 
    bp_code = {code: bp for code, bp in enumerate(le.classes_)}


    # split into test and train
    train, test = train_test_split(merged_df, test_size=0.2, random_state=42)
    
    # Y column to predict is diabetes
    X_train = train.drop(columns=['y'])
    y_train = train['y']
    
    X_test = test.drop(columns=['y'])
    y_test = test['y']
    
    return X_train, y_train, X_test, y_test, race_code, eth_code, gen_code, bp_code

illness_descriptions = ['PATIENT','Diabetes_CONDITIONS','Prediabetes_CONDITIONS','Diabetic retinopathy associated with type II diabetes mellitus (disorder)_CONDITIONS', 
                        'Nonproliferative diabetic retinopathy due to type 2 diabetes mellitus (disorder)_CONDITIONS', 'Macular edema and retinopathy due to type 2 diabetes mellitus (disorder)_CONDITIONS', 
                        'Microalbuminuria due to type 2 diabetes mellitus (disorder)_CONDITIONS', 'Diabetic renal disease (disorder)_CONDITIONS', 'Neuropathy due to type 2 diabetes mellitus (disorder)_CONDITIONS']
X_train, y_train, X_test, y_test, race_code, eth_code, gen_code, bp_code = prep_data(patients, conditions_diabetes, illness_descriptions, observations)

In [7]:
#LogisticRegression
LR = LogisticRegression(max_iter=10000000000000000000)
LRScore = cross_val_score(LR, X_train, y_train, cv=5).mean()

# keep track of best Logistic Regression Score

#DecisionTreeClassifier
param_grid = { 'max_depth': [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, None ]}

tree = DecisionTreeClassifier()
grid_search = GridSearchCV(tree, param_grid, cv=5)
grid_search.fit(X_train, y_train)
DTCScore  = grid_search.best_score_
bestDTCDepth = grid_search.best_params_


# Random Forrest Classifier    
forrest = RandomForestClassifier(random_state=0)
grid_search = GridSearchCV(forrest, param_grid, cv=5)
grid_search.fit(X_train, y_train)

RFCScore  = grid_search.best_score_
bestRFCDepth = grid_search.best_params_

#SVC
SVM = SVC()

# use grid search to find best gamma for SVM
g = {'gamma': 10.0 ** np.arange(-5, 5) }
grid_search = GridSearchCV(SVM, g, cv=5)
grid_search.fit(X_train, y_train)

SVMScore  = grid_search.best_score_   


print("best LR :", LRScore)
print("best DTC:", DTCScore)
print("best max depth: ", bestDTCDepth)
print("best RFC: ", RFCScore)
print("best max depth: ", bestRFCDepth)
print("best SVM: ", SVMScore)

max_score = 0
max_model = ""
if LRScore > max_score:
    max_score = LRScore
    max_model = "LR"
if DTCScore > max_score:
    max_score = DTCScore
    max_model = "DTC"
if RFCScore > max_score:
    max_score = RFCScore
    max_model = "RFC"
if SVMScore > max_score:
    max_score = SVMScore
    max_model = "SVM"

print("best score overall is: ", max_score, " with model: ", max_model)

best LR : 0.9067532372253403
best DTC: 0.9178790213124979
best max depth:  {'max_depth': 3}
best RFC:  0.9195847547778879
best max depth:  {'max_depth': 5}
best SVM:  0.9144492131616596
best score overall is:  0.9195847547778879  with model:  RFC


Next would compute risk scores!

In [8]:
race_code

{0: 'asian', 1: 'black', 2: 'hispanic', 3: 'white'}

Predict probabilities for all our entries using the best model we found

In [9]:
forrest = RandomForestClassifier(max_depth=5)
forrest.fit(X_train, y_train)
pred_prob = forrest.predict_proba(X_test)

Define average risk score finding function

In [10]:
def find_risk(code, col, probs):
    indices = (X_test[col] == code)
    prob_subset = probs[indices]
    av_prob = np.mean(prob_subset[:, 1]) 
    return av_prob   

Compute av. risk score for Asian patients

In [11]:
find_risk(0, 'race', pred_prob)

0.48188497404065167

Compute av. risk score for Black patients

In [12]:
find_risk(1, 'race', pred_prob)

0.24022361621907298

Compute av. risk score for Hispanic patients

In [13]:
find_risk(2, 'race', pred_prob)

0.32889915821708876

Compute av. risk score for white patients

In [14]:
find_risk(3, 'race', pred_prob)

0.3159515925650529

Compute av. risk for women

In [15]:
gen_code

{0: 'F', 1: 'M'}

In [16]:
find_risk(0, 'gender', pred_prob)

0.37010866159600764

In [17]:
find_risk(1, 'gender', pred_prob)

0.26785288885093805

ethnicity

In [18]:
eth_code

{0: 'african',
 1: 'american',
 2: 'asian_indian',
 3: 'central_american',
 4: 'chinese',
 5: 'dominican',
 6: 'english',
 7: 'french',
 8: 'french_canadian',
 9: 'german',
 10: 'irish',
 11: 'italian',
 12: 'mexican',
 13: 'polish',
 14: 'portuguese',
 15: 'puerto_rican',
 16: 'russian',
 17: 'scottish',
 18: 'swedish',
 19: 'west_indian'}

In [19]:
av_risk_eth = []

for code, name in eth_code.items():
    av = find_risk(code, 'ethnicity', pred_prob)
    new_row = {'eth': name, 'risk': av}
    av_risk_eth.append(new_row)

av_risk_eth_df = pd.DataFrame(av_risk_eth)
av_risk_eth_df = av_risk_eth_df.sort_values(by='risk', ascending=False)

av_risk_eth_df


Unnamed: 0,eth,risk
2,asian_indian,0.714286
13,polish,0.581169
9,german,0.49595
12,mexican,0.435955
1,american,0.42088
14,portuguese,0.395776
6,english,0.373404
17,scottish,0.334445
11,italian,0.319817
5,dominican,0.314767


In [20]:
bp_code

{0: 'Abington MA US',
 1: 'Acton MA US',
 2: 'Acushnet MA US',
 3: 'Adams MA US',
 4: 'Agawam Town MA US',
 5: 'Alford MA US',
 6: 'Amesbury Town MA US',
 7: 'Amherst MA US',
 8: 'Andover MA US',
 9: 'Arlington MA US',
 10: 'Ashburnham MA US',
 11: 'Ashby MA US',
 12: 'Ashfield MA US',
 13: 'Ashland MA US',
 14: 'Athol MA US',
 15: 'Attleboro MA US',
 16: 'Auburn MA US',
 17: 'Avon MA US',
 18: 'Barnstable Town MA US',
 19: 'Barre MA US',
 20: 'Becket MA US',
 21: 'Bedford MA US',
 22: 'Bellingham MA US',
 23: 'Belmont MA US',
 24: 'Berkley MA US',
 25: 'Beverly MA US',
 26: 'Billerica MA US',
 27: 'Boston MA US',
 28: 'Bourne MA US',
 29: 'Boxborough MA US',
 30: 'Boxford MA US',
 31: 'Boylston MA US',
 32: 'Braintree Town MA US',
 33: 'Brewster MA US',
 34: 'Bridgewater MA US',
 35: 'Brockton MA US',
 36: 'Brookfield MA US',
 37: 'Brookline MA US',
 38: 'Burlington MA US',
 39: 'Cambridge MA US',
 40: 'Canton MA US',
 41: 'Carlisle MA US',
 42: 'Carver MA US',
 43: 'Charlemont MA US'

Bottom 3:
Springfield: 213 - 5
Lawrence: 113 - 3
Holyoke: 104 - 2

Top 3:
Dover: 16 - 2
Lexington: 119 - 0
Wellesley: 238 - 2

keep taking people from the top and bottom until its 20 on each side, then find av risk score

In [68]:
richTowns = ["Dover", "Weston", "Wellesley", "Lexington", "Sherborn", "Cohasset", "Lincoln", "Carlisle", "Hingham", "Winchester", 
                "Medfield", "Concord", "Needham", "Sudbury", "Hopkinton", "Boxford", "Brookline", "Andover",  
                  "Southborough", "Belmont", "Acton", "Marblehead", "Newton", "Nantucket", "Duxbury", "Boxborough", "Westwood","Natick", 
                  "Longmeadow", "Marion", "Groton", "Newbury", "North Andover", "Sharon", "Arlington", "Norwell", "Reading", 
                  "Lynnfield", "Marshfield", "Holliston", "Medway", "Canton", "Milton", "Ipswich", "Littleton", "Westford", "North Reading"]

poorTowns = ["Springfield", "Lawrence", "Holyoke", "Amherst", "New Bedford", "Chelsea", "Fall River", "Athol", "Orange", "Lynn", "Fitchburg", "Gardner", "Brockton", "Malden", "Worcester", "Chicopee", "North Adams", "Everett",
    "Ware", "Dudley", "Greenfield Town", "Weymouth Town", "Montague", "Revere", "Taunton", "Adams", "Huntington", "Charlemont", "Leominster", "Florida", "Colrain", "Hardwick",
    "Palmer Town", "Peabody", "Somerville", "Lowell", "Westfield", "Billerica"]

Having trouble appending town names to dataframe with town codes and people counts

need that so that we can then get the top twenty of each 
to then average to get rich likelihood and poor likelihood of diabetes 
yayyyy 

In [101]:
townCounts = X_test.groupby('birthplace').size()

townCounts_df = pd.DataFrame(townCounts, columns=['birthplace', 'Count'])

townName = pd.DataFrame(columns=['name', 'birthplace', 'pop'])

for currBP in richTowns: 
    bp_code_swapped = {value: key for key, value in bp_code.items()}
    town = currBP + ' MA US'
    code = bp_code_swapped[town]

    count = townCounts_df.loc[townCounts_df['birthplace'] == code, 'Count'].iloc[0]
    new_row = {'name': town, 'birthplace': code, 'pop': count}
    new_row_df = pd.DataFrame([new_row])
    print(new_row_df)
    townName = pd.concat([townName, new_row_df], ignore_index=True)


ValueError: Shape of passed values is (144, 1), indices imply (144, 2)

In [100]:
townCounts_df.columns

Index(['Count'], dtype='object')

## proceed with the following part to get top 20 people from each rich and poor 

In [23]:
# richTownsUsed = set()
# peopleCount = 0
# for personRow, town in poorTowns:
#     while peopleCount <= 20: 
#         richTownsUsed.add(town)
#         peopleCount += 1
#         poorTownsUsed = set()
# peopleCount = 0
# for personRow, town in poorTowns:
#     while peopleCount <= 20: 
#         poorTownsUsed.add(town)
#         peopleCount += 1

ValueError: too many values to unpack (expected 2)

In [None]:
springfield = find_risk(213, 'birthplace', pred_prob)
lawrence = find_risk(113, 'birthplace', pred_prob)
holyoke = find_risk(104, 'birthplace', pred_prob)

dover = find_risk(16, 'birthplace', pred_prob)
lexington = find_risk(119, 'birthplace', pred_prob)
wellesley = find_risk(238, 'birthplace', pred_prob)

In [None]:
indices = (X_test['birthplace'] == 238)
prob_subset = pred_prob[indices]

len(prob_subset)

2

In [None]:
print(springfield, "\n", lawrence, "\n", holyoke, "\n", dover, "\n", lexington, "\n", wellesley, "\n")

0.0984442595290341 
 0.3341135562139574 
 0.001060267857142857 
 0.5650708866146601 
 nan 
 0.7997913691073069 

