In [15]:
import numpy as np
import pandas as pd 

from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split

import seaborn as sns
import matplotlib.pyplot as plt

# Turn off pandas' display truncation so wide/long frames are shown in full.
for _display_opt in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_opt, None)

In [16]:
# Explicit column dtypes for the county-level CSV. FIPS is read as a string so it
# is treated as an identifier rather than a number.
# NOTE(review): the FIPS values displayed further down (e.g. '1001', '6037') are not
# zero-padded, so the cleaned files appear to store them without leading zeros —
# reading as a string preserves whatever is in the file, it does not restore padding.
_text_columns = [
    'FIPS',
    'State',
    'County',
    'Primary Care Physicians Ratio',
    'Dentist Ratio',
    'Mental Health Provider Ratio',
]

_integer_columns = [
    '# Completed High School',
    'Population',
    '# Some College',
    'Population.1',
    '# Associations',
    '# Workers who Drive Alone',
    'Water_Violation',
]

_float_columns = [
    'Deaths',
    'Years of Potential Life Lost Rate',
    '% Fair or Poor Health',
    'Average Number of Physically Unhealthy Days',
    'Average Number of Mentally Unhealthy Days',
    '% Low birthweight',
    '% Smokers',
    '% Adults with Obesity',
    'Food Environment Index',
    '% Physically Inactive',
    '% With Access to Exercise Opportunities',
    '% Excessive Drinking',
    '# Alcohol-Impaired Driving Deaths',
    '# Driving Deaths',
    '% Driving Deaths with Alcohol Involvement',
    '# Chlamydia Cases',
    'Chlamydia Rate',
    'Teen Birth Rate',
    '# Uninsured',
    '% Uninsured',
    '# Primary Care Physicians',
    'Primary Care Physicians Rate',
    '# Dentists',
    'Dentist Rate',
    '# Mental Health Providers',
    'Mental Health Provider Rate',
    'Preventable Hospitalization Rate',
    '% With Annual Mammogram',
    '% Vaccinated',
    '% Completed High School',
    '% Some College',
    '# Unemployed',
    'Labor Force',
    '% Unemployed',
    '% Children in Poverty',
    'Income Ratio',
    '# Children in Single-Parent Households',
    '# Children in Households',
    '% Children in Single-Parent Households',
    'Social Association Rate',
    'Annual Average Violent Crimes',
    'Violent Crime Rate',
    '# Injury Deaths',
    'Injury Death Rate',
    'Average Daily PM2.5',
    '% Severe Housing Problems',
    'Severe Housing Cost Burden',
    'Overcrowding',
    'Inadequate Facilities',
    '% Drive Alone to Work',
    '% Long Commute - Drives Alone',
]

# Column name -> dtype string, in the form accepted by pd.read_csv(dtype=...).
dtypes_dict = {
    **dict.fromkeys(_text_columns, 'object'),
    **dict.fromkeys(_integer_columns, 'int64'),
    **dict.fromkeys(_float_columns, 'float64'),
}

In [24]:
# Load the three cleaned inputs: MEI scores, county-level indicators, and the
# police scorecard. Identifier columns in the police file (including FIPS) are
# read as strings.
_police_dtypes = {
    'location_name': object,
    'state': object,
    'calc_overall_score': np.int64,
    'FIPS': object,
}

mei = pd.read_csv('../data/mei_clean.csv')
chrankings = pd.read_csv('../data/demographic_clean.csv', dtype=dtypes_dict)
police = pd.read_csv('../data/police_scorecard_clean.csv', dtype=_police_dtypes)

In [18]:
# Sanity check: (rows, columns) of the MEI table as loaded.
mei.shape

(506, 5)

In [29]:
# Preview the first rows of the MEI scores.
mei.head()

Unnamed: 0,state,city,standard_score,flex_score,total_mei
0,AL,auburn,21,2,23
1,AL,birmingham,94,8,100
2,AL,florence,0,0,0
3,AL,hoover,12,0,12
4,AL,huntsville,20,0,20


In [19]:
# Count missing values per MEI column.
mei.isna().sum()

state             0
city              0
standard_score    0
flex_score        0
total_mei         0
dtype: int64

In [25]:
# The police scorecard is needed here as a lookup that maps the cities in MEI
# to county FIPS codes. Check its (rows, columns).
police.shape

(13147, 4)

In [26]:
# Preview the first rows of the police scorecard.
police.head()

Unnamed: 0,location_name,state,calc_overall_score,FIPS
0,new york,NY,32,36061
1,los angeles,CA,33,6037
2,chicago,IL,24,17031
3,houston,TX,48,48201
4,phoenix,AZ,39,4013


In [31]:
# Attach a county FIPS code to each MEI city by matching (city, state) against the
# police scorecard's (location_name, state). A right join keeps every MEI row;
# cities with no match get a missing FIPS.
_police_lookup = police[['FIPS', 'location_name', 'state']]
df = _police_lookup.merge(
    mei,
    how='right',
    left_on=['location_name', 'state'],
    right_on=['city', 'state'],
)

In [34]:
# Drop the police-side join key; it duplicates `city` wherever the join matched.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell safe
# to re-run: a second execution no longer raises KeyError for the missing column.
df = df.drop(columns=['location_name'], errors='ignore')

In [38]:
# MEI cities that did not get a FIPS code from the police lookup.
# Possible follow-up: impute these; for now they are dropped so exploration can proceed.
_fips_missing = df['FIPS'].isna()
df[_fips_missing]

Unnamed: 0,FIPS,state,city,standard_score,flex_score,total_mei
11,,AK,juneau,84,4,88
13,,AK,sitka,36,0,36
50,,CA,guerneville (sonoma county),81,13,94
54,,CA,lancaster,73,4,77
58,,CA,moreno valley,56,4,60
64,,CA,palm desert,83,11,94
66,,CA,palmdale,71,5,76
69,,CA,rancho cucamonga,62,6,68
70,,CA,rancho mirage,87,16,100
80,,CA,santa clarita,70,4,74


In [40]:
# Remove every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [41]:
# Preview the MEI rows after rows with missing values were removed.
df.head()

Unnamed: 0,FIPS,state,city,standard_score,flex_score,total_mei
0,1081,AL,auburn,21,2,23
1,1073,AL,birmingham,94,8,100
2,1077,AL,florence,0,0,0
3,1073,AL,hoover,12,0,12
4,1089,AL,huntsville,20,0,20


In [23]:
# Preview the county-level indicator table.
chrankings.head()

Unnamed: 0,index,FIPS,State,County,% Fair or Poor Health,Average Number of Physically Unhealthy Days,Average Number of Mentally Unhealthy Days,% Low birthweight,% Smokers,% Adults with Obesity,% Physically Inactive,% With Access to Exercise Opportunities,% Excessive Drinking,# Alcohol-Impaired Driving Deaths,# Driving Deaths,% Driving Deaths with Alcohol Involvement,# Uninsured,% Uninsured,Preventable Hospitalization Rate,% With Annual Mammogram,% Vaccinated,# Completed High School,Population,% Completed High School,# Some College,Population.1,% Some College,# Unemployed,Labor Force,% Unemployed,% Children in Poverty,Income Ratio,# Children in Single-Parent Households,# Children in Households,% Children in Single-Parent Households,# Associations,Social Association Rate,Violent Crime Rate,# Injury Deaths,Injury Death Rate,Average Daily PM2.5,% Severe Housing Problems,Severe Housing Cost Burden,Overcrowding,Inadequate Facilities,% Drive Alone to Work,# Workers who Drive Alone,% Long Commute - Drives Alone,Water_Violation
0,0,1001,Alabama,Autauga,19.839179,4.501499,4.850214,9.201774,19.810766,33.0,30.6,69.130124,14.473886,19.0,57.0,33.333333,4693.0,10.047314,6650.0,39.0,42.0,33076,37367,88.516606,8689,14362,60.49993,714.0,26172.0,2.728106,15.9,5.090408,3267.0,13205.0,24.740629,68,12.229996,272.28222,190.0,68.410043,10.4,13.630967,12.032598,1.116124,1.543576,85.788512,24635,38.3,0
1,1,1003,Alabama,Baldwin,16.460675,3.647978,4.768294,8.292009,18.513318,30.0,24.7,73.713549,18.736958,53.0,179.0,29.608939,22660.0,13.168983,3471.0,43.0,46.0,137219,151112,90.806157,33404,49776,67.108647,2653.0,97328.0,2.725834,13.5,4.385135,10321.0,46133.0,22.372271,229,10.503527,203.660396,777.0,72.878641,7.2,12.996651,11.987507,1.017929,0.334931,83.760106,93141,40.4,0
2,2,1005,Alabama,Barbour,29.8415,5.569267,5.590494,11.355311,25.530768,41.2,28.0,53.16677,12.466974,13.0,33.0,39.393939,2310.0,13.504034,5314.0,44.0,39.0,13152,17964,73.213093,2379,6680,35.613772,324.0,8537.0,3.795244,41.0,5.981993,2707.0,5307.0,51.008103,19,7.636349,414.277861,98.0,76.988947,9.4,13.710555,12.59426,1.958651,0.652884,82.116389,8231,30.9,0
3,3,1007,Alabama,Bibb,23.853284,4.894377,5.271114,10.220994,23.084813,37.4,33.4,16.251364,15.420603,8.0,27.0,29.62963,1764.0,10.599051,6690.0,33.0,40.0,12782,16168,79.057397,2483,6186,40.139024,266.0,8685.0,3.062752,25.9,5.002608,1404.0,4586.0,30.614915,19,8.482143,89.349126,119.0,105.601306,10.0,9.255242,8.263736,0.563991,0.563991,87.057671,8167,52.0,0
4,4,1009,Alabama,Blount,21.98561,4.986622,5.357809,7.430541,22.954959,33.0,33.3,15.634486,15.833284,15.0,82.0,18.292683,6642.0,14.115997,4440.0,37.0,40.0,32028,39791,80.490563,7862,14102,55.750957,676.0,25331.0,2.668667,21.0,4.427838,3003.0,13372.0,22.457374,45,7.780083,482.690611,311.0,107.591609,10.6,10.125665,7.468859,1.81247,1.159981,87.519594,21690,57.9,0


In [42]:
# Attach the county-level indicators to each city via its FIPS code. A left join
# keeps every city; a FIPS absent from chrankings yields missing county columns.
df = df.merge(chrankings, how='left', on='FIPS')

In [44]:
# Check (rows, columns) of the combined table.
df.shape

(435, 54)

In [47]:
# Rows with no `State` value, i.e. cities whose county columns are missing.
_state_missing = df['State'].isna()
df[_state_missing]

Unnamed: 0,FIPS,state,city,standard_score,flex_score,total_mei,index,State,County,% Fair or Poor Health,Average Number of Physically Unhealthy Days,Average Number of Mentally Unhealthy Days,% Low birthweight,% Smokers,% Adults with Obesity,% Physically Inactive,% With Access to Exercise Opportunities,% Excessive Drinking,# Alcohol-Impaired Driving Deaths,# Driving Deaths,% Driving Deaths with Alcohol Involvement,# Uninsured,% Uninsured,Preventable Hospitalization Rate,% With Annual Mammogram,% Vaccinated,# Completed High School,Population,% Completed High School,# Some College,Population.1,% Some College,# Unemployed,Labor Force,% Unemployed,% Children in Poverty,Income Ratio,# Children in Single-Parent Households,# Children in Households,% Children in Single-Parent Households,# Associations,Social Association Rate,Violent Crime Rate,# Injury Deaths,Injury Death Rate,Average Daily PM2.5,% Severe Housing Problems,Severe Housing Cost Burden,Overcrowding,Inadequate Facilities,% Drive Alone to Work,# Workers who Drive Alone,% Long Commute - Drives Alone,Water_Violation
10,2122,AK,homer,5,2,7,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [48]:
# Remove every row that has a missing value in any column of the combined table.
df = df.dropna(axis=0, how='any')

In [50]:
# Correlation of each numeric column with the MEI standard score, sorted ascending.
# df still contains text columns (FIPS, state, city, State, County). pandas >= 2.0
# no longer drops non-numeric columns silently in DataFrame.corr() and raises
# instead, so the numeric columns are selected explicitly; this works on every
# pandas version.
# NOTE(review): `index` appears to be a row-number column carried over from
# chrankings rather than a real indicator — consider excluding it.
df.select_dtypes(include='number').corr()['standard_score'].sort_values()

% Drive Alone to Work                         -0.347112
% Uninsured                                   -0.328484
% Adults with Obesity                         -0.320691
% Physically Inactive                         -0.301323
% Smokers                                     -0.242205
% Fair or Poor Health                         -0.184155
index                                         -0.161620
Average Number of Physically Unhealthy Days   -0.154094
% Driving Deaths with Alcohol Involvement     -0.117975
% Children in Poverty                         -0.116476
Social Association Rate                       -0.101350
Average Number of Mentally Unhealthy Days     -0.095583
% Low birthweight                             -0.093281
Injury Death Rate                             -0.063925
% Unemployed                                  -0.058019
Inadequate Facilities                         -0.031989
Preventable Hospitalization Rate              -0.024102
% With Annual Mammogram                        0

In [None]:
# Identifier columns that are set aside rather than used.
cols_not_using = [
    'FIPS',
    'state',
    'city',
]

