# Food Derserts and Birth Outcomes | ML for cities final project

- controlling for zip code specific factors (controlling for everything else, does being in a food desert matter to the outcome of low birth weight?) 
- pick one definition of a food desert OR take some of the variables that make up whether it is a fd and put that in the predictor variables 
- number for feature (sklearn variable importance) 
- need to use a different ml technique to really get interpretability / variable importance 
- compare accuracy of different models
- bayes net vs random forests 
- equal frequency bins (quartiles/quintiles) 

## Merging data before analysis

In [2]:
import pandas as pd
import numpy as np
import geopandas as gpd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pylab as plt
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

## Read in the cleaned ACS data set

In [3]:
dfACS = pd.read_csv('dataACS.csv')
# dfACS = dfACS.set_index('zipcode')
# dfACS = dfACS.drop('Unnamed: 0', axis=1)
dfACS.zipcode = dfACS.zipcode.astype(str)
dfACS.head()

Unnamed: 0,zipcode,prctForeign,%Uninsured,%vehicle,%foodStamp,%poverty,urban/rural
0,10001,16.230927,6.313218,77.51007,51.290074,17.5,1
1,10002,16.301731,7.873643,76.87685,42.674401,27.8,1
2,10003,12.463227,5.066324,77.210118,48.017334,9.9,1
3,10004,15.177398,6.241787,76.513672,52.299606,4.8,1
4,10005,16.211251,5.774971,85.056784,49.311137,12.1,1


In [4]:
dfACS.index.shape

(5212,)

In [5]:
dfACS.dtypes

zipcode         object
prctForeign    float64
%Uninsured     float64
%vehicle       float64
%foodStamp     float64
%poverty       float64
urban/rural      int64
dtype: object

## Read in Birth outcomes data set

In [6]:
birth = pd.read_csv('birthNYS.csv')
birth = birth.drop(['Unnamed: 0'], axis = 1)
# birth = birth.set_index('ZipCode')
birth.head(3)

Unnamed: 0,TeenBirthRate,ZipCode,TotalBirths2012-2014,PrematureBirth,LowBirthWeight
0,5.6,14008,34.0,2.9,0.0
1,13.6,14012,67.0,10.4,6.0
2,5.1,14028,47.0,8.5,10.6


In [7]:
df = dfACS.merge(birth, left_on = 'zipcode', right_on = 'ZipCode')
df.head()

Unnamed: 0,zipcode,prctForeign,%Uninsured,%vehicle,%foodStamp,%poverty,urban/rural,TeenBirthRate,ZipCode,TotalBirths2012-2014,PrematureBirth,LowBirthWeight
0,10001,16.230927,6.313218,77.51007,51.290074,17.5,1,5.7,10001,527.0,10.5,11.0
1,10002,16.301731,7.873643,76.87685,42.674401,27.8,1,17.7,10002,3044.0,8.9,6.6
2,10003,12.463227,5.066324,77.210118,48.017334,9.9,1,0.6,10003,1268.0,9.7,8.7
3,10004,15.177398,6.241787,76.513672,52.299606,4.8,1,0.0,10004,195.0,8.7,6.7
4,10005,16.211251,5.774971,85.056784,49.311137,12.1,1,0.0,10005,364.0,10.7,6.0


In [8]:
print(df.shape)
df.columns

(2059, 12)


Index(['zipcode', 'prctForeign', '%Uninsured', '%vehicle', '%foodStamp',
       '%poverty', 'urban/rural', 'TeenBirthRate', 'ZipCode',
       'TotalBirths2012-2014', 'PrematureBirth', 'LowBirthWeight'],
      dtype='object')

In [9]:
df = df.drop(['ZipCode', 'TotalBirths2012-2014'], axis=1)

print(df.shape)
df.columns

(2059, 10)


Index(['zipcode', 'prctForeign', '%Uninsured', '%vehicle', '%foodStamp',
       '%poverty', 'urban/rural', 'TeenBirthRate', 'PrematureBirth',
       'LowBirthWeight'],
      dtype='object')

In [10]:
print(len(df.zipcode))
print(len(df.zipcode.unique()))

2059
1454


In [11]:
df = df.drop_duplicates(subset=['zipcode'])
print(df.shape)

(1454, 10)


In [12]:
# df = df.set_index('zipcode')
df.columns = ['zipcode', 'Foreign', 'Uninsured', 'vehicle', 'foodStamp', 'poverty', 'urban', 
              'TeenBirthRate', 'PrematureBirth', 'LowBirthWeight']
df.head()

Unnamed: 0,zipcode,Foreign,Uninsured,vehicle,foodStamp,poverty,urban,TeenBirthRate,PrematureBirth,LowBirthWeight
0,10001,16.230927,6.313218,77.51007,51.290074,17.5,1,5.7,10.5,11.0
1,10002,16.301731,7.873643,76.87685,42.674401,27.8,1,17.7,8.9,6.6
2,10003,12.463227,5.066324,77.210118,48.017334,9.9,1,0.6,9.7,8.7
3,10004,15.177398,6.241787,76.513672,52.299606,4.8,1,0.0,8.7,6.7
4,10005,16.211251,5.774971,85.056784,49.311137,12.1,1,0.0,10.7,6.0


In [13]:
df.to_csv('dataAll.csv')

In [26]:
df.to_json(path_or_buf = 'dataAll.json', orient='records')#[1:-1].replace('},{', '} {')
# dataAll

In [51]:
# dataBirth
dataBirth = df[['zipcode', 'PrematureBirth', 'LowBirthWeight']]

dataBirth = dataBirth.set_index('zipcode')
dataBirth = dataBirth.T

dataBirth.head()

zipcode,10001,10002,10003,10004,10005,10006,10007,10009,10010,10011,...,14891,14892,14894,14895,14897,14898,14901,14903,14904,14905
PrematureBirth,10.5,8.9,9.7,8.7,10.7,10.6,10.4,9.8,9.8,10.6,...,15.2,7.7,17.4,9.0,17.1,5.9,12.2,10.6,9.4,9.4
LowBirthWeight,11.0,6.6,8.7,6.7,6.0,14.2,7.0,8.0,8.6,8.7,...,8.5,9.0,10.6,6.4,11.4,7.4,9.6,5.5,5.3,6.5


In [56]:
dataBirth.to_json(path_or_buf = 'dataBirth1.json', orient='index')

In [62]:
dataBirth.T.mean()

PrematureBirth    10.316437
LowBirthWeight     7.149931
dtype: float64

## Read in the Zipcodes shp

In [141]:
zipshp = gpd.read_file('cb_2017_us_zcta510_500k/cb_2017_us_zcta510_500k.shp')
zipshp.shape

(33144, 6)

In [142]:
len(zipshp.ZCTA5CE10.unique())

33144

In [143]:
zipshp.head(3)

Unnamed: 0,ZCTA5CE10,AFFGEOID10,GEOID10,ALAND10,AWATER10,geometry
0,35442,8600000US35442,35442,610213891,10838694,"(POLYGON ((-88.252618 32.92675, -88.249724 32...."
1,85365,8600000US85365,85365,3545016067,9766486,"(POLYGON ((-114.684663 32.687389, -114.676063 ..."
2,71973,8600000US71973,71973,204670474,1264709,"POLYGON ((-94.46643176650841 34.330735, -94.46..."


## Merging data to shp

In [144]:
dfshp = zipshp.merge(df, left_on = 'ZCTA5CE10', right_on = 'zipcode')
dfshp = dfshp.set_index('zipcode')
dfshp = dfshp.drop(['ZCTA5CE10', 'GEOID10', 'AFFGEOID10', 'ALAND10', 'AWATER10'], axis=1)
print(dfshp.shape)
print(dfshp.columns)

dfshp.head(3)

(1454, 10)
Index(['geometry', 'Foreign', 'Uninsured', 'vehicle', 'foodStamp', 'poverty',
       'urban', 'TeenBirthRate', 'PrematureBirth', 'LowBirthWeight'],
      dtype='object')


Unnamed: 0_level_0,geometry,Foreign,Uninsured,vehicle,foodStamp,poverty,urban,TeenBirthRate,PrematureBirth,LowBirthWeight
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
12832,"POLYGON ((-73.414208 43.388595, -73.4105869999...",0.647249,8.223007,1.463581,40.232421,14.9,1,32.9,7.5,3.7
13421,"POLYGON ((-75.709535 43.088988, -75.704686 43....",1.448316,5.763858,4.992306,39.111895,16.2,1,30.1,8.8,6.7
10456,"POLYGON ((-73.91902899999999 40.83309, -73.914...",20.066442,11.60044,60.622253,32.111269,40.8,1,41.3,12.4,10.2


In [17]:
len(dfshp.index.unique())

1454

In [18]:
dfshp.to_csv('dfshp.csv')

----

# Analysis

# 1. Random Forest

### Split into training and test data

In [58]:
df = df.set_index('zipcode')
df.head()

Unnamed: 0_level_0,%Foreign,%Uninsured,%vehicle,%foodStamp,%poverty,urban,%TeenBirthRate,%PrematureBirth,%LowBirthWeight
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10001,16.230927,6.313218,77.51007,51.290074,17.5,1,5.7,10.5,11.0
10002,16.301731,7.873643,76.87685,42.674401,27.8,1,17.7,8.9,6.6
10003,12.463227,5.066324,77.210118,48.017334,9.9,1,0.6,9.7,8.7
10004,15.177398,6.241787,76.513672,52.299606,4.8,1,0.0,8.7,6.7
10005,16.211251,5.774971,85.056784,49.311137,12.1,1,0.0,10.7,6.0


In [59]:
df.shape

(1454, 9)

# 1.1 dependent: _Low birth weight_

In [60]:
df['%LowBirthWeight'].describe()

count    1454.000000
mean        7.149931
std         3.696764
min         0.000000
25%         5.100000
50%         7.100000
75%         8.900000
max        50.000000
Name: %LowBirthWeight, dtype: float64

In [22]:
df.loc[df['%LowBirthWeight'] <= 5.1] = 1
df.loc[(df['%LowBirthWeight'] > 5.1) & (df['%LowBirthWeight'] <= 7.1), '%LowBirthWeight'] = 2
df.loc[(df['%LowBirthWeight'] > 7.1) & (df['%LowBirthWeight'] <= 8.9), '%LowBirthWeight'] = 3
df.loc[df['%LowBirthWeight'] > 8.9, '%LowBirthWeight'] = 4

df['%LowBirthWeight'].value_counts()

2.0    382
1.0    370
4.0    354
3.0    348
Name: %LowBirthWeight, dtype: int64

In [23]:
df['%LowBirthWeight'][::100]

zipcode
10001    4.0
10532    1.0
10998    4.0
11434    4.0
11778    2.0
12090    4.0
12456    2.0
12760    2.0
12981    1.0
13214    3.0
13607    4.0
13830    1.0
14201    4.0
14559    2.0
14818    4.0
Name: %LowBirthWeight, dtype: float64

In [24]:
df.dropna(0, inplace=True)
df.head(3)

Unnamed: 0_level_0,%Foreign,%Uninsured,%vehicle,%foodStamp,%poverty,urban,%TeenBirthRate,%PrematureBirth,%LowBirthWeight
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10001,16.230927,6.313218,77.51007,51.290074,17.5,1,5.7,10.5,4.0
10002,16.301731,7.873643,76.87685,42.674401,27.8,1,17.7,8.9,2.0
10003,12.463227,5.066324,77.210118,48.017334,9.9,1,0.6,9.7,3.0


In [25]:
df.dtypes

%Foreign           float64
%Uninsured         float64
%vehicle           float64
%foodStamp         float64
%poverty           float64
urban                int64
%TeenBirthRate     float64
%PrematureBirth    float64
%LowBirthWeight    float64
dtype: object

In [40]:
X = df.iloc[:,:7]
y = df['%LowBirthWeight']

# X = X.astype(int)
# Y = Y.astype(int)

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                   random_state=123)

In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid ={'max_depth':range(1,11)}
rf = RandomForestClassifier(n_estimators=100)
gr=GridSearchCV(rf,param_grid=param_grid)
rs=gr.fit(X_train,y_train)
print('Best parameter value:',rs.best_params_)
y_predict = rs.predict(X_test)
print('Accuracy = ',((y_predict == y_test).value_counts(normalize=True)[True]))

Best parameter value: {'max_depth': 8}
Accuracy =  0.629268292683


In [43]:
rf1 = RandomForestClassifier(max_depth=8)
rf1 = rf1.fit(X_train, y_train)
Feature_importance=pd.DataFrame([list(X_train.columns),list(rf1.feature_importances_)]).T
Feature_importance.columns=["variables","importance"]

# list the top 5 most important features in order
Feature_importance.sort_values(by="importance",ascending=False)

Unnamed: 0,variables,importance
1,%Uninsured,0.255918
3,%foodStamp,0.246174
4,%poverty,0.236059
6,%TeenBirthRate,0.105216
0,%Foreign,0.076929
2,%vehicle,0.0706497
5,urban,0.00905481


----
## 1.2 dependent: _Preterm Birth_

In [61]:
df['%PrematureBirth'].describe()

count    1454.000000
mean       10.316437
std         4.492350
min         0.000000
25%         8.000000
50%        10.200000
75%        12.700000
max        50.000000
Name: %PrematureBirth, dtype: float64

In [62]:
df.loc[df['%PrematureBirth'] <= 8] = 1
df.loc[(df['%PrematureBirth'] > 8) & (df['%PrematureBirth'] <= 10.2), '%PrematureBirth'] = 2
df.loc[(df['%PrematureBirth'] > 10.2) & (df['%PrematureBirth'] <= 12.7), '%PrematureBirth'] = 3
df.loc[df['%PrematureBirth'] > 12.7, '%PrematureBirth'] = 4

df['%PrematureBirth'].value_counts()

2.0    369
1.0    368
3.0    362
4.0    355
Name: %PrematureBirth, dtype: int64

In [63]:
X = df.iloc[:,:7]
Y = df['%PrematureBirth']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3,
                                                   random_state=123)

In [69]:
X_train = (X_train).fillna(X_train.mean())
X_test = (X_test).fillna(X_test.mean())

In [70]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid ={'max_depth':range(1,11)}
rf = RandomForestClassifier(n_estimators=100)
gr=GridSearchCV(rf,param_grid=param_grid)
rs=gr.fit(X_train,y_train)
print('Best parameter value:',rs.best_params_)
y_predict = rs.predict(X_test)
print('Accuracy = ',((y_predict == y_test).value_counts(normalize=True)[True]))

Best parameter value: {'max_depth': 9}
Accuracy =  0.604118993135


In [71]:
rf2 = RandomForestClassifier(max_depth=9)
rf2 = rf1.fit(X_train, y_train)
Feature_importance=pd.DataFrame([list(X_train.columns),list(rf2.feature_importances_)]).T
Feature_importance.columns=["variables","importance"]

# list the top 5 most important features in order
Feature_importance.sort_values(by="importance",ascending=False)

Unnamed: 0,variables,importance
1,%Uninsured,0.240602
3,%foodStamp,0.239732
4,%poverty,0.237192
6,%TeenBirthRate,0.114532
2,%vehicle,0.0733406
0,%Foreign,0.058583
5,urban,0.0360183
