In [1]:
import numpy as np
import pandas as pd 

from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans

import seaborn as sns
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# defining dtypes upon reading in data to keep FIPS as 5 digit code
dtypes_dict = {
 'FIPS': 'object',
 'State': 'object',
 'County': 'object',
 'Deaths': 'float64',
 'Years of Potential Life Lost Rate': 'float64',
 '% Fair or Poor Health': 'float64',
 'Average Number of Physically Unhealthy Days': 'float64',
 'Average Number of Mentally Unhealthy Days': 'float64',
 '% Low birthweight': 'float64',
 '% Smokers': 'float64',
 '% Adults with Obesity': 'float64',
 'Food Environment Index': 'float64',
 '% Physically Inactive': 'float64',
 '% With Access to Exercise Opportunities': 'float64',
 '% Excessive Drinking': 'float64',
 '# Alcohol-Impaired Driving Deaths': 'float64',
 '# Driving Deaths': 'float64',
 '% Driving Deaths with Alcohol Involvement': 'float64',
 '# Chlamydia Cases': 'float64',
 'Chlamydia Rate': 'float64',
 'Teen Birth Rate': 'float64',
 '# Uninsured': 'float64',
 '% Uninsured': 'float64',
 '# Primary Care Physicians': 'float64',
 'Primary Care Physicians Rate': 'float64',
 'Primary Care Physicians Ratio': 'object',
 '# Dentists': 'float64',
 'Dentist Rate': 'float64',
 'Dentist Ratio': 'object',
 '# Mental Health Providers': 'float64',
 'Mental Health Provider Rate': 'float64',
 'Mental Health Provider Ratio': 'object',
 'Preventable Hospitalization Rate': 'float64',
 '% With Annual Mammogram': 'float64',
 '% Vaccinated': 'float64',
 '# Completed High School': 'int64',
 'Population': 'int64',
 '% Completed High School': 'float64',
 '# Some College': 'int64',
 'Population.1': 'int64',
 '% Some College': 'float64',
 '# Unemployed': 'float64',
 'Labor Force': 'float64',
 '% Unemployed': 'float64',
 '% Children in Poverty': 'float64',
 'Income Ratio': 'float64',
 '# Children in Single-Parent Households': 'float64',
 '# Children in Households': 'float64',
 '% Children in Single-Parent Households': 'float64',
 '# Associations': 'int64',
 'Social Association Rate': 'float64',
 'Annual Average Violent Crimes': 'float64',
 'Violent Crime Rate': 'float64',
 '# Injury Deaths': 'float64',
 'Injury Death Rate': 'float64',
 'Average Daily PM2.5': 'float64',
 '% Severe Housing Problems': 'float64',
 'Severe Housing Cost Burden': 'float64',
 'Overcrowding': 'float64',
 'Inadequate Facilities': 'float64',
 '% Drive Alone to Work': 'float64',
 '# Workers who Drive Alone': 'int64',
 '% Long Commute - Drives Alone': 'float64',
 'Water_Violation': 'int64'}

In [3]:
police = pd.read_csv('../data/police_scorecard_clean.csv', dtype={'location_name': object, 
                                                                  'state': object,
                                                                  'calc_overall_score': np.int64, 
                                                                  'FIPS': object})
mei = pd.read_csv('../data/mei_clean.csv')
ch_rankings = pd.read_csv('../data/chr_clean.csv', dtype=dtypes_dict)

In [4]:
ch_rankings.head()

Unnamed: 0,FIPS,State,County,Deaths,Years of Potential Life Lost Rate,% Fair or Poor Health,Average Number of Physically Unhealthy Days,Average Number of Mentally Unhealthy Days,% Low birthweight,% Smokers,% Adults with Obesity,Food Environment Index,% Physically Inactive,% With Access to Exercise Opportunities,% Excessive Drinking,# Alcohol-Impaired Driving Deaths,# Driving Deaths,% Driving Deaths with Alcohol Involvement,# Chlamydia Cases,Chlamydia Rate,Teen Birth Rate,# Uninsured,% Uninsured,# Primary Care Physicians,Primary Care Physicians Rate,Primary Care Physicians Ratio,# Dentists,Dentist Rate,Dentist Ratio,# Mental Health Providers,Mental Health Provider Rate,Mental Health Provider Ratio,Preventable Hospitalization Rate,% With Annual Mammogram,% Vaccinated,# Completed High School,Population,% Completed High School,# Some College,Population.1,% Some College,# Unemployed,Labor Force,% Unemployed,% Children in Poverty,Income Ratio,# Children in Single-Parent Households,# Children in Households,% Children in Single-Parent Households,# Associations,Social Association Rate,Annual Average Violent Crimes,Violent Crime Rate,# Injury Deaths,Injury Death Rate,Average Daily PM2.5,% Severe Housing Problems,Severe Housing Cost Burden,Overcrowding,Inadequate Facilities,% Drive Alone to Work,# Workers who Drive Alone,% Long Commute - Drives Alone,Water_Violation
0,1001,Alabama,Autauga,787.0,7830.053484,19.839179,4.501499,4.850214,9.201774,19.810766,33.0,6.7,30.6,69.130124,14.473886,19.0,57.0,33.333333,280.0,504.5,23.764041,4693.0,10.047314,26.0,46.76175,2139:1,19.0,34.00813,2940:1,16.0,28.63842,3492:1,6650.0,39.0,42.0,33076,37367,88.516606,8689,14362,60.49993,714.0,26172.0,2.728106,15.9,5.090408,3267.0,13205.0,24.740629,68,12.229996,148.5,272.28222,190.0,68.410043,10.4,13.630967,12.032598,1.116124,1.543576,85.788512,24635,38.3,0
1,1003,Alabama,Baldwin,3147.0,7680.47727,16.460675,3.647978,4.768294,8.292009,18.513318,30.0,7.8,24.7,73.713549,18.736958,53.0,179.0,29.608939,632.0,297.2,26.006904,22660.0,13.168983,153.0,70.1764,1425:1,110.0,49.27565,2029:1,220.0,98.5513,1015:1,3471.0,43.0,46.0,137219,151112,90.806157,33404,49776,67.108647,2653.0,97328.0,2.725834,13.5,4.385135,10321.0,46133.0,22.372271,229,10.503527,408.0,203.660396,777.0,72.878641,7.2,12.996651,11.987507,1.017929,0.334931,83.760106,93141,40.4,0
2,1005,Alabama,Barbour,515.0,11476.629416,29.8415,5.569267,5.590494,11.355311,25.530768,41.2,5.5,28.0,53.16677,12.466974,13.0,33.0,39.393939,191.0,755.8,37.114262,2310.0,13.504034,8.0,32.15305,3110:1,9.0,36.45791,2743:1,3.0,12.15264,8229:1,5314.0,44.0,39.0,13152,17964,73.213093,2379,6680,35.613772,324.0,8537.0,3.795244,41.0,5.981993,2707.0,5307.0,51.008103,19,7.636349,105.5,414.277861,98.0,76.988947,9.4,13.710555,12.59426,1.958651,0.652884,82.116389,8231,30.9,0
3,1007,Alabama,Bibb,476.0,12172.562382,23.853284,4.894377,5.271114,10.220994,23.084813,37.4,7.6,33.4,16.251364,15.420603,8.0,27.0,29.62963,139.0,613.2,37.786446,1764.0,10.599051,12.0,53.57143,1867:1,5.0,22.32741,4479:1,6.0,26.79289,3732:1,6690.0,33.0,40.0,12782,16168,79.057397,2483,6186,40.139024,266.0,8685.0,3.062752,25.9,5.002608,1404.0,4586.0,30.614915,19,8.482143,19.5,89.349126,119.0,105.601306,10.0,9.255242,8.263736,0.563991,0.563991,87.057671,8167,52.0,0
4,1009,Alabama,Blount,1100.0,11096.050649,21.98561,4.986622,5.357809,7.430541,22.954959,33.0,7.9,33.3,15.634486,15.833284,15.0,82.0,18.292683,176.0,303.4,31.239778,6642.0,14.115997,12.0,20.74689,4820:1,11.0,19.02258,5257:1,10.0,17.29326,5783:1,4440.0,37.0,40.0,32028,39791,80.490563,7862,14102,55.750957,676.0,25331.0,2.668667,21.0,4.427838,3003.0,13372.0,22.457374,45,7.780083,279.0,482.690611,311.0,107.591609,10.6,10.125665,7.468859,1.81247,1.159981,87.519594,21690,57.9,0


In [5]:
police.head()

Unnamed: 0,location_name,state,calc_overall_score,FIPS
0,new york,NY,32,36061
1,los angeles,CA,33,6037
2,chicago,IL,24,17031
3,houston,TX,48,48201
4,phoenix,AZ,39,4013


In [6]:
police.dtypes

location_name         object
state                 object
calc_overall_score     int64
FIPS                  object
dtype: object

In [7]:
police['location_name'].nunique()

10368

In [8]:
mei.head()

Unnamed: 0,state,city,standard_score,flex_score,total_mei
0,AL,auburn,21,2,23
1,AL,birmingham,94,8,100
2,AL,florence,0,0,0
3,AL,hoover,12,0,12
4,AL,huntsville,20,0,20


In [9]:
mei.shape

(506, 5)

In [10]:
mei.dtypes

state             object
city              object
standard_score     int64
flex_score         int64
total_mei          int64
dtype: object

In [11]:
ch_rankings.head()

Unnamed: 0,FIPS,State,County,Deaths,Years of Potential Life Lost Rate,% Fair or Poor Health,Average Number of Physically Unhealthy Days,Average Number of Mentally Unhealthy Days,% Low birthweight,% Smokers,% Adults with Obesity,Food Environment Index,% Physically Inactive,% With Access to Exercise Opportunities,% Excessive Drinking,# Alcohol-Impaired Driving Deaths,# Driving Deaths,% Driving Deaths with Alcohol Involvement,# Chlamydia Cases,Chlamydia Rate,Teen Birth Rate,# Uninsured,% Uninsured,# Primary Care Physicians,Primary Care Physicians Rate,Primary Care Physicians Ratio,# Dentists,Dentist Rate,Dentist Ratio,# Mental Health Providers,Mental Health Provider Rate,Mental Health Provider Ratio,Preventable Hospitalization Rate,% With Annual Mammogram,% Vaccinated,# Completed High School,Population,% Completed High School,# Some College,Population.1,% Some College,# Unemployed,Labor Force,% Unemployed,% Children in Poverty,Income Ratio,# Children in Single-Parent Households,# Children in Households,% Children in Single-Parent Households,# Associations,Social Association Rate,Annual Average Violent Crimes,Violent Crime Rate,# Injury Deaths,Injury Death Rate,Average Daily PM2.5,% Severe Housing Problems,Severe Housing Cost Burden,Overcrowding,Inadequate Facilities,% Drive Alone to Work,# Workers who Drive Alone,% Long Commute - Drives Alone,Water_Violation
0,1001,Alabama,Autauga,787.0,7830.053484,19.839179,4.501499,4.850214,9.201774,19.810766,33.0,6.7,30.6,69.130124,14.473886,19.0,57.0,33.333333,280.0,504.5,23.764041,4693.0,10.047314,26.0,46.76175,2139:1,19.0,34.00813,2940:1,16.0,28.63842,3492:1,6650.0,39.0,42.0,33076,37367,88.516606,8689,14362,60.49993,714.0,26172.0,2.728106,15.9,5.090408,3267.0,13205.0,24.740629,68,12.229996,148.5,272.28222,190.0,68.410043,10.4,13.630967,12.032598,1.116124,1.543576,85.788512,24635,38.3,0
1,1003,Alabama,Baldwin,3147.0,7680.47727,16.460675,3.647978,4.768294,8.292009,18.513318,30.0,7.8,24.7,73.713549,18.736958,53.0,179.0,29.608939,632.0,297.2,26.006904,22660.0,13.168983,153.0,70.1764,1425:1,110.0,49.27565,2029:1,220.0,98.5513,1015:1,3471.0,43.0,46.0,137219,151112,90.806157,33404,49776,67.108647,2653.0,97328.0,2.725834,13.5,4.385135,10321.0,46133.0,22.372271,229,10.503527,408.0,203.660396,777.0,72.878641,7.2,12.996651,11.987507,1.017929,0.334931,83.760106,93141,40.4,0
2,1005,Alabama,Barbour,515.0,11476.629416,29.8415,5.569267,5.590494,11.355311,25.530768,41.2,5.5,28.0,53.16677,12.466974,13.0,33.0,39.393939,191.0,755.8,37.114262,2310.0,13.504034,8.0,32.15305,3110:1,9.0,36.45791,2743:1,3.0,12.15264,8229:1,5314.0,44.0,39.0,13152,17964,73.213093,2379,6680,35.613772,324.0,8537.0,3.795244,41.0,5.981993,2707.0,5307.0,51.008103,19,7.636349,105.5,414.277861,98.0,76.988947,9.4,13.710555,12.59426,1.958651,0.652884,82.116389,8231,30.9,0
3,1007,Alabama,Bibb,476.0,12172.562382,23.853284,4.894377,5.271114,10.220994,23.084813,37.4,7.6,33.4,16.251364,15.420603,8.0,27.0,29.62963,139.0,613.2,37.786446,1764.0,10.599051,12.0,53.57143,1867:1,5.0,22.32741,4479:1,6.0,26.79289,3732:1,6690.0,33.0,40.0,12782,16168,79.057397,2483,6186,40.139024,266.0,8685.0,3.062752,25.9,5.002608,1404.0,4586.0,30.614915,19,8.482143,19.5,89.349126,119.0,105.601306,10.0,9.255242,8.263736,0.563991,0.563991,87.057671,8167,52.0,0
4,1009,Alabama,Blount,1100.0,11096.050649,21.98561,4.986622,5.357809,7.430541,22.954959,33.0,7.9,33.3,15.634486,15.833284,15.0,82.0,18.292683,176.0,303.4,31.239778,6642.0,14.115997,12.0,20.74689,4820:1,11.0,19.02258,5257:1,10.0,17.29326,5783:1,4440.0,37.0,40.0,32028,39791,80.490563,7862,14102,55.750957,676.0,25331.0,2.668667,21.0,4.427838,3003.0,13372.0,22.457374,45,7.780083,279.0,482.690611,311.0,107.591609,10.6,10.125665,7.468859,1.81247,1.159981,87.519594,21690,57.9,0


In [12]:
ch_rankings.shape

(3141, 64)

In [13]:
df = pd.merge(police, mei, how='right', 
              left_on=['location_name', 'state'], right_on=['city', 'state'])

In [14]:
df.shape

(510, 8)

In [15]:
df.isnull().sum()

location_name         75
state                  0
calc_overall_score    75
FIPS                  75
city                   0
standard_score         0
flex_score             0
total_mei              0
dtype: int64

In [16]:
df.dropna(inplace=True)

In [17]:
df.head()

Unnamed: 0,location_name,state,calc_overall_score,FIPS,city,standard_score,flex_score,total_mei
0,auburn,AL,42.0,1081,auburn,21,2,23
1,birmingham,AL,43.0,1073,birmingham,94,8,100
2,florence,AL,40.0,1077,florence,0,0,0
3,hoover,AL,44.0,1073,hoover,12,0,12
4,huntsville,AL,46.0,1089,huntsville,20,0,20


In [18]:
# df['fips_county_code'] = df['fips_county_code'].astype(int)
df['calc_overall_score'] = df['calc_overall_score'].astype(int)

In [19]:
df = pd.merge(df, ch_rankings, how='left', 
         left_on='FIPS', right_on='FIPS')

In [20]:
df.shape

(435, 71)

In [21]:
df.columns[df.isna().any()].tolist()

['Food Environment Index',
 'Preventable Hospitalization Rate',
 '% With Annual Mammogram',
 '% Vaccinated',
 'Annual Average Violent Crimes',
 'Violent Crime Rate',
 'Average Daily PM2.5']

In [22]:
df.describe()

Unnamed: 0,calc_overall_score,standard_score,flex_score,total_mei,Deaths,Years of Potential Life Lost Rate,% Fair or Poor Health,Average Number of Physically Unhealthy Days,Average Number of Mentally Unhealthy Days,% Low birthweight,% Smokers,% Adults with Obesity,Food Environment Index,% Physically Inactive,% With Access to Exercise Opportunities,% Excessive Drinking,# Alcohol-Impaired Driving Deaths,# Driving Deaths,% Driving Deaths with Alcohol Involvement,# Chlamydia Cases,Chlamydia Rate,Teen Birth Rate,# Uninsured,% Uninsured,# Primary Care Physicians,Primary Care Physicians Rate,# Dentists,Dentist Rate,# Mental Health Providers,Mental Health Provider Rate,Preventable Hospitalization Rate,% With Annual Mammogram,% Vaccinated,# Completed High School,Population,% Completed High School,# Some College,Population.1,% Some College,# Unemployed,Labor Force,% Unemployed,% Children in Poverty,Income Ratio,# Children in Single-Parent Households,# Children in Households,% Children in Single-Parent Households,# Associations,Social Association Rate,Annual Average Violent Crimes,Violent Crime Rate,# Injury Deaths,Injury Death Rate,Average Daily PM2.5,% Severe Housing Problems,Severe Housing Cost Burden,Overcrowding,Inadequate Facilities,% Drive Alone to Work,# Workers who Drive Alone,% Long Commute - Drives Alone,Water_Violation
count,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,434.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,434.0,434.0,434.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,431.0,431.0,435.0,435.0,433.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0,435.0
mean,44.786207,60.650575,4.422989,63.829885,9115.855172,6861.056238,17.621227,4.001297,4.361272,8.082824,17.1418,29.284138,7.935023,21.281839,87.904528,19.414151,105.218391,396.710345,28.569387,5695.154023,584.038391,19.735131,87112.406897,10.006514,770.94023,87.700819,784.404598,78.368561,2868.76092,320.656537,3997.986175,42.771889,49.451613,538193.4,626692.3,89.3657,178189.4,264595.7,68.713584,17467.558621,478742.9,3.523391,15.931494,4.748562,54142.094253,210960.6,25.773798,662.535632,9.765989,4150.726218,403.376498,2818.243678,72.612729,8.218245,17.806902,14.741836,3.216277,1.051555,77.450207,447742.2,30.585287,0.004598
std,6.889034,28.470086,4.52137,29.908671,13906.428754,1924.977647,3.975897,0.556506,0.526717,1.480801,3.50233,5.239754,0.788819,4.48921,10.339473,3.025919,160.756408,634.468978,8.08751,10386.729614,221.952119,9.448062,163748.984509,4.406453,1203.133199,31.708065,1403.524929,25.751695,5343.051066,168.922715,1096.811836,6.493631,6.764171,874279.1,1075828.0,4.776734,303212.5,467173.4,8.154469,33949.400428,809155.3,0.981177,6.169047,0.725179,90976.244167,357172.1,7.272161,916.143752,3.698258,7822.522791,235.069489,3958.648914,21.800056,1.997954,4.799517,3.666992,2.479282,0.741545,7.966473,756471.4,12.154005,0.067728
min,24.0,0.0,0.0,0.0,98.0,2872.67189,9.847407,2.667134,2.974246,4.929721,7.07565,14.3,2.8,8.9,43.281766,6.453001,1.0,3.0,8.024691,46.0,130.4,2.098695,1019.0,2.764273,7.0,31.16958,8.0,24.33557,7.0,25.50695,455.0,27.0,26.0,6603.0,6953.0,65.72132,2241.0,2878.0,43.647279,235.0,6884.0,1.791881,3.6,3.52352,562.0,2352.0,9.157569,16.0,1.816098,11.0,42.28684,29.0,30.83415,3.9,8.424818,6.04782,0.358491,0.0,5.990142,7104.0,5.1,0.0
25%,40.0,41.5,0.0,44.0,1704.5,5479.707892,14.482033,3.560381,4.026301,6.976638,14.874051,25.35,7.5,17.8,83.023146,17.631394,20.0,70.0,24.095715,727.0,437.9,12.416092,10050.0,6.919239,122.0,67.8137,98.0,63.06763,382.0,195.408275,3162.25,39.0,45.25,86817.0,97517.0,87.446157,25018.0,36354.0,63.613493,2383.5,75919.5,2.83306,11.4,4.276928,7268.5,29421.0,20.567968,149.0,7.005754,397.5,236.501945,550.5,56.584489,7.0,14.508992,12.131729,1.604802,0.729415,75.374711,70523.5,20.9,0.0
50%,45.0,61.0,4.0,64.0,4377.0,6681.863975,17.304336,4.034772,4.352698,7.970693,17.182413,29.8,8.0,21.8,90.343894,19.046271,50.0,173.0,28.286853,2421.0,563.6,18.507527,29160.0,9.254661,352.0,84.66001,309.0,75.32491,1179.0,298.80375,4008.0,43.0,50.0,227198.0,258258.0,90.064548,71627.0,103804.0,68.963853,6369.0,196991.0,3.343981,15.0,4.655328,22719.0,86807.0,24.973469,368.0,9.527044,1456.0,361.235828,1352.0,70.807222,8.2,16.762452,14.078525,2.286609,0.931382,78.993448,179225.0,30.3,0.0
75%,49.0,87.5,6.5,93.5,10226.0,7954.911928,20.248989,4.380608,4.756732,8.789596,19.493794,33.0,8.4,24.1,96.311757,21.262426,108.0,372.0,32.547397,5820.0,689.05,25.205033,71739.5,12.432301,884.5,103.03261,797.5,90.18351,3371.0,389.999815,4695.75,47.0,54.0,579090.0,638328.0,92.730411,177566.0,255221.5,73.819614,18224.0,483842.5,4.018283,19.6,5.064777,56476.5,214210.0,30.494012,822.0,11.778234,4522.25,489.965382,3380.0,85.225997,9.4,19.964195,17.075198,4.116118,1.161805,82.096025,463235.0,39.75,0.0
max,65.0,100.0,19.0,100.0,84748.0,16913.389014,34.857096,6.469784,6.178428,14.531523,27.555392,42.6,9.6,33.7,100.0,31.013699,866.0,3594.0,75.0,67878.0,1421.8,57.262283,935213.0,32.0725,7412.0,290.95746,8999.0,230.2501,36404.0,991.90784,8826.0,64.0,67.0,5449836.0,6886895.0,97.156184,1919317.0,3003060.0,87.569544,227288.0,5121584.0,9.622022,39.0,9.146811,556280.0,2208226.0,52.71363,5843.0,28.328612,49548.5,1819.514406,20745.0,196.634796,16.0,32.922433,25.47869,14.043935,11.054895,88.392654,4811408.0,69.6,1.0


In [23]:
# Imputing Alexandria, VA Food Environment Index (from 2017) 8.7
# source: http://www.healthmattersalexandria.org/indicators/index/view?indicatorId=2362&localeId=2967

df.loc[416, 'Food Environment Index'] = 8.7

In [24]:
# When investigating other null values, Homer, AK was missing many values
# so I removed it instead of imputing multiple values for the same city
df.drop(index=10, inplace=True)

In [25]:
# imputing number of violent crimes per 
# source: https://www.neighborhoodscout.com/

df.loc[9, 'Annual Average Violent Crimes'] = 251
df.loc[12, 'Annual Average Violent Crimes'] = 48
df.loc[331, 'Annual Average Violent Crimes'] = 29

df.drop(columns='Violent Crime Rate', inplace=True)

In [26]:
df.loc[11, 'Average Daily PM2.5'] = 2.1

In [27]:
df.columns[df.isna().any()].tolist()

[]

In [28]:
# need to fix 3 ratio columns
cols_to_fix = ['Primary Care Physicians Ratio', 'Dentist Ratio', 'Mental Health Provider Ratio']

for col in cols_to_fix:
    df[col] = df[col].str.replace(':1', '')
    df[col] = df[col].astype(int)

### Ready to Look at Correlation/Models

In [30]:
df.corr()['standard_score'].sort_values()

% Drive Alone to Work                         -0.347112
Teen Birth Rate                               -0.334926
% Uninsured                                   -0.328484
% Adults with Obesity                         -0.320691
% Physically Inactive                         -0.301323
Primary Care Physicians Ratio                 -0.299463
Dentist Ratio                                 -0.288834
% Smokers                                     -0.242205
Mental Health Provider Ratio                  -0.230495
Years of Potential Life Lost Rate             -0.208716
calc_overall_score                            -0.186174
% Fair or Poor Health                         -0.184155
Average Number of Physically Unhealthy Days   -0.154094
% Driving Deaths with Alcohol Involvement     -0.117975
% Children in Poverty                         -0.116476
Social Association Rate                       -0.101350
Average Number of Mentally Unhealthy Days     -0.095583
% Low birthweight                             -0

In [29]:
df.corr()['calc_overall_score']

calc_overall_score                             1.000000
standard_score                                -0.186174
flex_score                                    -0.167786
total_mei                                     -0.177405
Deaths                                        -0.063295
Years of Potential Life Lost Rate             -0.203956
% Fair or Poor Health                         -0.083703
Average Number of Physically Unhealthy Days   -0.098194
Average Number of Mentally Unhealthy Days     -0.160483
% Low birthweight                             -0.175929
% Smokers                                     -0.147456
% Adults with Obesity                         -0.014285
Food Environment Index                         0.060355
% Physically Inactive                         -0.155367
% With Access to Exercise Opportunities       -0.003536
% Excessive Drinking                          -0.029535
# Alcohol-Impaired Driving Deaths              0.004855
# Driving Deaths                              -0

In [30]:
cols_not_using = ['location_name', 'state', 'FIPS', 'city', 'State', 
                  'County', 'total_mei', 'standard_score', 'flex_score', 'calc_overall_score']

features = [col for col in df.columns if col not in cols_not_using]
X = df[features]
y = df['total_mei']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=73)

lr = LinearRegression()

lr.fit(X_train, y_train)

lr.score(X_train, y_train), lr.score(X_test, y_test)

(0.5489244302403777, 0.2579849738148734)

In [31]:
lasso = LassoCV()

ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

lasso.fit(X_train_sc, y_train)
lasso.score(X_train_sc, y_train), lasso.score(X_test_sc, y_test)

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


(0.47628095240292134, 0.3700780542871931)

In [32]:
lasso.coef_

array([ 0.        ,  0.        ,  8.95909039, -5.45588094,  3.21055296,
        0.        ,  0.        ,  1.30933903,  2.99609655, -7.26531352,
        7.6184727 ,  0.93532741,  5.28569791,  0.        , -2.23834405,
       -0.        ,  2.34989667, -2.55927128,  0.        , -8.75751843,
       -0.        ,  0.        , -3.59979908, -4.16552678,  0.97681361,
        1.15079305, -1.35219961, -2.12411433, -0.        ,  1.61731733,
       -0.        ,  0.24654513, -0.        , -0.        , -0.        ,
       -0.        , -0.        , -0.        , -0.        , -0.        ,
        0.51189738,  2.48108256, -3.55494061, -0.        , -0.        ,
        0.        , -0.        , -1.01564844, -0.51072044,  2.85842326,
        3.40864757, -0.        ,  0.        ,  5.32476589, -2.07530847,
        0.        , -5.66604458, -0.        , -0.        ,  0.59700912])

## Exploring Clustering

In [33]:
df.columns.values[0]

'location_name'

In [34]:

cols_not_using = ['location_name', 'state', 'FIPS', 'city', 'State', 
                  'County']

features = [col for col in df.columns if col not in cols_not_using]
X = df[features]
ss = StandardScaler()
X_sc = ss.fit_transform(X)

model = KMeans(random_state=42)
model.fit(X_sc)
X_sc = pd.DataFrame(X_sc, columns=features)

In [35]:
labels = model.labels_
centroids = model.cluster_centers_

print("Predicted clusters to points: ", labels)
print("Location of centroids: ")
print(centroids)

Predicted clusters to points:  [6 0 6 0 6 7 7 6 3 3 6 6 1 1 0 1 1 1 1 1 1 1 0 6 6 6 7 7 0 0 6 4 7 2 2 4 4
 2 4 4 4 4 2 7 4 4 5 2 4 4 5 5 7 2 4 4 4 4 4 5 5 2 4 4 4 4 4 2 2 4 5 2 5 4
 2 5 3 7 3 3 2 3 2 3 3 3 2 2 2 2 0 2 2 0 6 6 0 6 0 6 4 6 4 0 4 4 4 4 4 4 0
 0 4 4 0 0 7 0 0 0 0 3 6 3 3 6 6 3 0 1 0 0 3 0 6 7 0 0 6 7 6 3 3 3 3 3 2 6
 3 3 6 6 6 6 2 2 3 0 0 4 4 3 6 2 2 2 2 3 2 0 3 2 0 0 2 3 0 3 3 3 3 3 6 6 3
 3 6 3 6 4 4 4 4 0 6 3 3 3 3 3 3 3 3 2 4 3 3 0 3 3 3 3 6 3 3 0 0 0 0 0 0 0
 0 6 0 7 6 6 0 6 0 0 3 2 0 2 3 2 6 0 6 3 6 0 7 6 0 7 4 2 7 6 7 1 3 7 4 1 1
 1 1 6 7 6 7 3 1 1 3 3 4 7 3 3 6 0 0 7 0 6 0 3 3 6 2 0 3 6 6 6 6 3 6 3 6 6
 7 3 6 6 6 6 6 0 0 7 6 7 0 6 7 7 0 7 0 2 6 3 6 2 3 2 2 2 3 7 7 7 7 7 6 6 7
 6 3 0 6 0 6 3 0 0 3 3 6 6 4 4 3 3 0 4 0 0 7 7 7 6 7 0 3 0 3 2 0 0 2 2 3 2
 3 0 0 0 0 0 0 6 0 6 3 0 0 3 0 0 3 0 6 0 6 3 6 3 6 3 3 3 3 3 3 3 3 3 3 3 3
 3 2 3 3 3 3 2 2 2 2 3 0 0 0 0 0 3 3 2 3 2 3 3 2 3 0 3]
Location of centroids: 
[[-3.41201244e-01  4.21260816e-01  3.12868518e-01  4.37040746e-0

In [36]:
X_sc['labels'] = labels
X_sc.head()

Unnamed: 0,calc_overall_score,standard_score,flex_score,total_mei,Deaths,Years of Potential Life Lost Rate,% Fair or Poor Health,Average Number of Physically Unhealthy Days,Average Number of Mentally Unhealthy Days,% Low birthweight,% Smokers,% Adults with Obesity,Food Environment Index,% Physically Inactive,% With Access to Exercise Opportunities,% Excessive Drinking,# Alcohol-Impaired Driving Deaths,# Driving Deaths,% Driving Deaths with Alcohol Involvement,# Chlamydia Cases,Chlamydia Rate,Teen Birth Rate,# Uninsured,% Uninsured,# Primary Care Physicians,Primary Care Physicians Rate,Primary Care Physicians Ratio,# Dentists,Dentist Rate,Dentist Ratio,# Mental Health Providers,Mental Health Provider Rate,Mental Health Provider Ratio,Preventable Hospitalization Rate,% With Annual Mammogram,% Vaccinated,# Completed High School,Population,% Completed High School,# Some College,Population.1,% Some College,# Unemployed,Labor Force,% Unemployed,% Children in Poverty,Income Ratio,# Children in Single-Parent Households,# Children in Households,% Children in Single-Parent Households,# Associations,Social Association Rate,Annual Average Violent Crimes,# Injury Deaths,Injury Death Rate,Average Daily PM2.5,% Severe Housing Problems,Severe Housing Cost Burden,Overcrowding,Inadequate Facilities,% Drive Alone to Work,# Workers who Drive Alone,% Long Commute - Drives Alone,Water_Violation,labels
0,-0.403206,-1.40342,-0.53731,-1.375276,-0.526597,0.198582,0.586866,1.033394,1.423416,0.630865,0.616273,0.291103,-1.569317,-0.307856,-1.949761,-0.814387,-0.488146,-0.492787,0.394165,-0.471908,-0.383374,-0.436297,-0.443399,0.122985,-0.572593,-1.151074,1.458395,-0.521797,-1.769811,3.318341,-0.509419,-1.345486,2.022156,0.473748,-1.044054,-0.658876,-0.51621,-0.493731,0.319311,-0.485097,-0.476753,0.670115,-0.454427,-0.498485,-0.825758,-0.282057,2.113888,-0.492388,-0.495934,0.250425,-0.580802,-0.461311,-0.445339,-0.587632,-0.489548,1.288383,0.268524,0.584274,-0.144474,-0.516683,0.702032,-0.495696,-0.133731,-0.068041,6
1,-0.257985,1.172064,0.790162,1.210029,0.159125,2.194468,0.663303,0.243459,0.677362,2.212774,0.379709,1.01675,-1.696124,1.741503,-0.895941,-0.719641,-0.257889,0.049601,-1.689305,-0.056529,0.867797,0.740723,-0.165777,0.257334,-0.046245,0.663106,-0.794679,-0.127661,0.535208,-0.660119,-0.303108,-0.770408,0.320956,0.511173,-0.119006,-0.362859,-0.153656,-0.165789,0.147982,-0.183622,-0.184362,-0.010923,-0.247863,-0.199083,-0.685415,1.063851,0.840721,0.028799,-0.169659,1.640639,0.32012,1.290845,0.192429,0.138974,1.36686,1.288383,-0.313612,0.021019,-0.768,-0.144657,0.72296,-0.196382,0.253747,-0.068041,0
2,-0.693648,-2.144312,-0.979801,-2.14751,-0.548683,0.917274,0.716607,0.826856,1.400644,1.148326,0.820908,0.577543,-0.681664,1.273715,-2.168398,-1.28133,-0.544154,-0.500671,-0.75165,-0.510143,-0.635958,0.223071,-0.481615,0.388943,-0.604191,-1.19673,1.580655,-0.523222,-0.866493,0.755255,-0.510917,-0.96739,0.668076,1.043327,-0.427355,0.229174,-0.553542,-0.524833,-0.41398,-0.548788,-0.522293,-1.367328,-0.474815,-0.540558,-0.261905,0.72332,-0.275729,-0.548148,-0.541007,-0.223161,-0.577526,1.317153,-0.497176,-0.621751,0.281175,0.692795,-1.293701,-1.327093,-0.811057,-0.440489,1.36637,-0.540903,0.055886,-0.068041,6
3,-0.112764,-1.720945,-0.979801,-1.744605,0.159125,2.194468,0.663303,0.243459,0.677362,2.212774,0.379709,1.01675,-1.696124,1.741503,-0.895941,-0.719641,-0.257889,0.049601,-1.689305,-0.056529,0.867797,0.740723,-0.165777,0.257334,-0.046245,0.663106,-0.794679,-0.127661,0.535208,-0.660119,-0.303108,-0.770408,0.320956,0.511173,-0.119006,-0.362859,-0.153656,-0.165789,0.147982,-0.183622,-0.184362,-0.010923,-0.247863,-0.199083,-0.685415,1.063851,0.840721,0.028799,-0.169659,1.640639,0.32012,1.290845,0.192429,0.138974,1.36686,1.288383,-0.313612,0.021019,-0.768,-0.144657,0.72296,-0.196382,0.253747,-0.068041,0
4,0.177678,-1.4387,-0.979801,-1.476002,-0.312219,0.638987,0.058165,-0.413849,0.158223,1.175607,-0.011047,0.863982,-0.554856,0.672273,-1.798218,-1.3073,-0.276558,-0.316196,0.295195,-0.320704,0.333259,-0.090308,-0.356737,-0.11404,-0.371367,0.038332,-0.349304,-0.409899,-0.846888,0.722811,-0.434533,-1.020007,0.78691,0.558637,0.651868,-0.362859,-0.357171,-0.3536,0.490726,-0.359954,-0.365738,0.600619,-0.376822,-0.364174,-1.002341,0.139553,0.613231,-0.370333,-0.369301,0.011217,-0.296875,0.255288,-0.252552,-0.395555,-0.139106,-0.349485,-1.278389,-1.210457,-0.725171,-0.640196,1.312862,-0.361274,-0.290371,-0.068041,6


In [37]:
score = silhouette_score(X_sc, labels, metric='euclidean')
score

0.1866657773708327