# ML4VA: Health Opportunity Index & Life Expectancy of Virginians

### CS 4774 Machine Learning - Department of Computer Science - University of Virginia



# Write-Up


---


In order to predict the life expectancy of Virginian residents, we plan to use regression.In this task, the learning algorithm of *Linear Regression* is to be used in order to estimate a house price given other features and variables. 


---



In [None]:
# General imports
import sklearn # general ml package
import numpy as np # fundamental package for scientific computing
import os # to run file I/O operation 

# Set the seed (consistent throughout code)
np.random.seed(55)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
  """Save the selected figure into disc under an image extention and resolution
  """

  path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
  print("Saving figure", fig_id)
  if tight_layout:
    plt.tight_layout()
  plt.savefig(path, format=fig_extension, dpi=resolution)

In [None]:
import pandas as pd

HOI = pd.read_csv('Health_Opportunity_Index.csv') # Reads CSV, converts into a PANDA object
Life_Tract = pd.read_csv('Life_Expectancy_Census.csv')
CountyCodes = pd.read_csv("Virginia_County_Codes.csv")

#data = pd.merge(HOI, Life_Tract, on='Census Tract', how='inner')
CountyCodes

Unnamed: 0,Code,County
0,1,Accomack
1,3,Albemarle
2,5,Alleghany
3,7,Amelia
4,9,Amherst
...,...,...
123,800,Suffolk City
124,810,Virginia Beach City
125,820,Waynesboro City
126,830,Williamsburg City


In [None]:
codes = {}
for index, row in CountyCodes.iterrows():
    #print(row['Code'], row['County'])
    code = row['Code']
    county = row['County']
    codes[county] = code
print(codes)

{'Accomack': 1, 'Albemarle': 3, 'Alleghany': 5, 'Amelia': 7, 'Amherst': 9, 'Appomattox': 11, 'Arlington': 13, 'Augusta': 15, 'Bath': 17, 'Bedford': 19, 'Bland': 21, 'Botetourt': 23, 'Brunswick': 25, 'Buchanan': 27, 'Buckingham': 29, 'Campbell': 31, 'Caroline': 33, 'Carroll': 35, 'Charles City': 36, 'Charlotte': 37, 'Chesterfield': 41, 'Clarke': 43, 'Craig': 45, 'Culpeper': 47, 'Cumberland': 49, 'Dickenson': 51, 'Dinwiddie': 53, 'Essex': 57, 'Fairfax': 59, 'Fauquier': 61, 'Floyd': 63, 'Fluvanna': 65, 'Franklin': 67, 'Frederick': 69, 'Giles': 71, 'Gloucester': 73, 'Goochland': 75, 'Grayson': 77, 'Greene': 79, 'Greensville': 81, 'Halifax': 83, 'Hanover': 85, 'Henrico': 87, 'Henry': 89, 'Highland': 91, 'Isle of Wight': 93, 'James City': 95, 'King and Queen': 97, 'King George': 99, 'King William': 101, 'Lancaster': 103, 'Lee': 105, 'Loudoun': 107, 'Louisa': 109, 'Lunenburg': 111, 'Madison': 113, 'Mathews': 115, 'Mecklenburg': 117, 'Middlesex': 119, 'Montgomery': 121, 'Nelson': 125, 'New Ken

In [None]:
# Sort by Census Tract to clean data easier
HOI = HOI.sort_values(by=['Census Tract'])
Life_Tract = Life_Tract.sort_values(by=['Census Tract'])

In [None]:
# Clean Life_Tract data to make it equal to county codes
Life_Tract["County"] = Life_Tract["County"].str.replace(", VA","")
Life_Tract["County"] = Life_Tract["County"].str.replace(" County","")
Life_Tract["County"] = Life_Tract["County"].str.replace(" city"," City")

In [None]:
Life_Tract.head(20)

Unnamed: 0,State,County,Census Tract,Life Expectancy,Life Expectancy Range,Life Expectancy Standard Error
1732,Virginia,Roanoke City,1.0,74.8,56.9-75.1,1.6514
1535,Virginia,Norfolk City,1.0,,,
1397,Virginia,Fredericksburg City,1.0,80.1,79.6-81.6,1.5574
1902,Virginia,Winchester City,1.0,74.5,56.9-75.1,1.369
1760,Virginia,Staunton City,1.0,78.4,77.6-79.5,1.7956
909,Virginia,Nottoway,1.0,74.5,56.9-75.1,1.1742
1369,Virginia,Danville City,1.0,78.7,77.6-79.5,1.5963
1458,Virginia,Lynchburg City,1.0,81.8,81.7-97.5,1.0626
1486,Virginia,Martinsville City,1.0,,,
1439,Virginia,Harrisonburg City,1.01,75.3,75.2-77.5,1.8965


In [None]:
TractCodes = []
for entry in Life_Tract['Census Tract']:
  entry = format(entry, '.2f')
  entry = str(entry).zfill(7)
  entry = entry.replace(".","")
  TractCodes.append(entry)
  #if entry < 1000:
  #print(entry)
print(TractCodes)
print(len(TractCodes))

['000100', '000100', '000100', '000100', '000100', '000100', '000100', '000100', '000100', '000101', '000102', '000200', '000200', '000200', '000200', '000200', '000201', '000201', '000201', '000201', '000202', '000202', '000202', '000202', '000203', '000203', '000204', '000205', '000206', '000207', '000300', '000300', '000300', '000300', '000300', '000300', '000300', '000301', '000301', '000301', '000302', '000302', '000302', '000302', '000400', '000400', '000400', '000400', '000400', '000400', '000400', '000401', '000401', '000402', '000402', '000500', '000500', '000500', '000500', '000500', '000500', '000500', '000501', '000502', '000600', '000600', '000600', '000600', '000600', '000601', '000602', '000700', '000700', '000700', '000700', '000800', '000800', '000800', '000801', '000802', '000900', '000900', '000900', '000900', '000901', '000902', '001000', '001000', '001000', '001000', '001100', '001100', '001100', '001100', '001200', '001200', '001200', '001300', '001301', '001302',

In [None]:
TractCounties = []
for county in Life_Tract['County']:
  try:
    codeVal = codes[county]
    # Make same length, 3 digits for county codes
    codeVal = str(codeVal).zfill(3)

    #print("County: ", county, "Code: ", codeVal)
    TractCounties.append(codeVal)
  except KeyError:
    # If it's a bad data point (e.g. a City not a county, we put 000 as the code, which doesn't exist)
    TractCounties.append("000")
    #print("ERROR")
print(TractCounties)
print(len(TractCounties))

['770', '710', '630', '840', '790', '135', '590', '680', '690', '660', '660', '590', '790', '135', '690', '630', '710', '680', '540', '840', '680', '540', '710', '840', '660', '680', '660', '660', '660', '660', '690', '680', '135', '770', '590', '790', '710', '630', '660', '840', '630', '660', '540', '840', '710', '680', '590', '770', '690', '630', '790', '660', '540', '540', '660', '630', '690', '790', '770', '680', '710', '590', '540', '540', '710', '540', '680', '590', '790', '770', '770', '540', '590', '710', '680', '710', '540', '590', '680', '680', '540', '770', '590', '680', '710', '710', '680', '770', '590', '540', '680', '770', '590', '710', '710', '770', '590', '710', '590', '590', '710', '680', '590', '710', '680', '710', '680', '710', '680', '770', '770', '680', '710', '770', '710', '770', '710', '770', '710', '710', '770', '710', '770', '770', '710', '710', '770', '710', '770', '710', '770', '710', '770', '710', '820', '770', '710', '820', '820', '710', '710', '820', '820'

In [None]:
TractCountiesClean = []
TractCodesClean = []
TractIndexes = []
badIndexes = []
for i, num in enumerate(TractCounties):
  if num != "000":
    TractCountiesClean.append(num)
    TractCodesClean.append(TractCodes[i])
    TractIndexes.append(i)
  else:
    badIndexes.append(i)
print("Len of Counties: ", len(TractCountiesClean))
print("Len of Codes: ", len(TractCodesClean))

Len of Counties:  1895
Len of Codes:  1895


In [None]:
geoids = []
IndexMap = {}
for i, code in enumerate(TractCodesClean):
  countyNum = TractCountiesClean[i]
  combID = countyNum + code
  geoids.append(combID)
  IndexMap[combID] = TractIndexes[i]
print("9-digit geo IDS: ", geoids)
print("Map of indexes: ", IndexMap)

9-digit geo IDS:  ['770000100', '710000100', '630000100', '840000100', '790000100', '135000100', '590000100', '680000100', '690000100', '660000101', '660000102', '590000200', '790000200', '135000200', '690000200', '630000200', '710000201', '680000201', '540000201', '840000201', '680000202', '540000202', '710000202', '840000202', '660000203', '680000203', '660000204', '660000205', '660000206', '660000207', '690000300', '680000300', '135000300', '770000300', '590000300', '790000300', '710000300', '630000301', '660000301', '840000301', '630000302', '660000302', '540000302', '840000302', '710000400', '680000400', '590000400', '770000400', '690000400', '630000400', '790000400', '660000401', '540000401', '540000402', '660000402', '630000500', '690000500', '790000500', '770000500', '680000500', '710000500', '590000500', '540000501', '540000502', '710000600', '540000600', '680000600', '590000600', '790000600', '770000601', '770000602', '540000700', '590000700', '710000700', '680000700', '71000

In [None]:
# Cleaning Checking Shape:
Life_Tract.shape

(1907, 6)

In [None]:
# Drop the Bad Data Points!
Life_Tract = Life_Tract.drop(badIndexes)
print(Life_Tract.shape)
Life_Tract

(1895, 6)


Unnamed: 0,State,County,Census Tract,Life Expectancy,Life Expectancy Range,Life Expectancy Standard Error
1732,Virginia,Roanoke City,1.0,74.8,56.9-75.1,1.6514
1535,Virginia,Norfolk City,1.0,,,
1397,Virginia,Fredericksburg City,1.0,80.1,79.6-81.6,1.5574
1902,Virginia,Winchester City,1.0,74.5,56.9-75.1,1.3690
1760,Virginia,Staunton City,1.0,78.4,77.6-79.5,1.7956
...,...,...,...,...,...,...
908,Virginia,Northumberland,9901.0,,,
878,Virginia,Middlesex,9901.0,,,
1893,Virginia,Virginia Beach City,9901.0,,,
864,Virginia,Mathews,9901.0,,,


In [None]:
# Add GEOID column!
Life_Tract['GEOID'] = geoids
# Drop Census Tract here, we don't need it anymore
Life_Tract = Life_Tract.drop(['Census Tract'], axis=1)

In [None]:
# Check
Life_Tract.head(5)

Unnamed: 0,State,County,Life Expectancy,Life Expectancy Range,Life Expectancy Standard Error,GEOID
1732,Virginia,Roanoke City,74.8,56.9-75.1,1.6514,770000100
1535,Virginia,Norfolk City,,,,710000100
1397,Virginia,Fredericksburg City,80.1,79.6-81.6,1.5574,630000100
1902,Virginia,Winchester City,74.5,56.9-75.1,1.369,840000100
1760,Virginia,Staunton City,78.4,77.6-79.5,1.7956,790000100


In [None]:
# Clean Census Tract of HOI!
HOI.head(5)

Unnamed: 0,Census Tract,Rural~Urban,Access to Care,Employment Accessibility,Affordability,Air Quality,Population Churning,Education,Food Accessibility,Income Inequality,Job Participation,Population Density,Segregation,Material Deprivation,Walkability,Community Environment Profile,Consumer Opportunity Profile,Economic Opportunity Profile,Wellness Disparity Profile,Health Opportunity Index
263,51001090100,Rural,0.273908,0.011102,0.465465,0.887563,0.912214,0.718747,0.942139,0.347638,0.588,0.006054,0.749077,0.413495,0.162616,0.321956,0.639827,0.185053,0.27018,0.330824
483,51001090200,Rural,0.273908,0.008525,0.526527,0.896792,0.949109,0.684452,0.942139,0.391093,0.581,0.001021,0.810144,0.516062,0.087799,0.271668,0.658822,0.200148,0.239055,0.370909
554,51001090300,Rural,0.277446,0.01411,0.381381,0.907006,0.949109,0.628343,0.942139,0.452362,0.614,0.00054,0.702573,0.410243,0.05117,0.254516,0.51674,0.235381,0.296392,0.378442
485,51001090400,Rural,0.555905,0.021915,0.505506,0.902609,0.895674,0.616631,0.942139,0.398111,0.58,0.001038,0.813625,0.292771,0.10635,0.302841,0.556501,0.222238,0.437237,0.371092
741,51001090500,Rural,0.555905,0.014512,0.421421,0.891768,0.989822,0.681361,0.942139,0.435358,0.607,0.002097,0.748832,0.308089,0.089221,0.282644,0.567528,0.229408,0.470261,0.403519


In [None]:
# Convert Census Tract to string to remove the "51" on the end, also create new GEOID column to merge on in HOI dataframe
HOI['Census Tract'] = HOI['Census Tract'].astype(str)
HOI['GEOID'] = HOI['Census Tract'].map(lambda x: x.lstrip('51'))
HOI.drop('Census Tract', axis=1)

Unnamed: 0,Rural~Urban,Access to Care,Employment Accessibility,Affordability,Air Quality,Population Churning,Education,Food Accessibility,Income Inequality,Job Participation,Population Density,Segregation,Material Deprivation,Walkability,Community Environment Profile,Consumer Opportunity Profile,Economic Opportunity Profile,Wellness Disparity Profile,Health Opportunity Index,GEOID
263,Rural,0.273908,0.011102,0.465465,0.887563,0.912214,0.718747,0.942139,0.347638,0.588,0.006054,0.749077,0.413495,0.162616,0.321956,0.639827,0.185053,0.270180,0.330824,001090100
483,Rural,0.273908,0.008525,0.526527,0.896792,0.949109,0.684452,0.942139,0.391093,0.581,0.001021,0.810144,0.516062,0.087799,0.271668,0.658822,0.200148,0.239055,0.370909,001090200
554,Rural,0.277446,0.014110,0.381381,0.907006,0.949109,0.628343,0.942139,0.452362,0.614,0.000540,0.702573,0.410243,0.051170,0.254516,0.516740,0.235381,0.296392,0.378442,001090300
485,Rural,0.555905,0.021915,0.505506,0.902609,0.895674,0.616631,0.942139,0.398111,0.580,0.001038,0.813625,0.292771,0.106350,0.302841,0.556501,0.222238,0.437237,0.371092,001090400
741,Rural,0.555905,0.014512,0.421421,0.891768,0.989822,0.681361,0.942139,0.435358,0.607,0.002097,0.748832,0.308089,0.089221,0.282644,0.567528,0.229408,0.470261,0.403519,001090500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Urban,0.435096,0.086138,0.274274,0.653627,0.737913,0.602679,0.979340,0.435628,0.671,0.064798,0.898252,0.201041,0.246344,0.459465,0.424871,0.274488,0.308441,0.269371,840000100
723,Urban,0.435096,0.157033,0.564565,0.770042,0.708651,0.774457,0.766586,0.348988,0.680,0.036447,0.735834,0.473983,0.181110,0.394476,0.547955,0.312852,0.391224,0.400053,840000201
693,Urban,0.435096,0.109394,0.613614,0.758740,0.782443,0.803688,0.891328,0.371795,0.595,0.049587,0.735834,0.522296,0.236572,0.393562,0.702260,0.254909,0.391224,0.395440,840000202
365,Urban,0.435096,0.069851,0.504505,0.752720,0.732824,0.648909,0.936152,0.493117,0.636,0.050741,0.735834,0.303825,0.251226,0.404216,0.530829,0.282053,0.391224,0.349738,840000301


In [None]:
# Verify
HOI.head(5)

Unnamed: 0,Census Tract,Rural~Urban,Access to Care,Employment Accessibility,Affordability,Air Quality,Population Churning,Education,Food Accessibility,Income Inequality,...,Population Density,Segregation,Material Deprivation,Walkability,Community Environment Profile,Consumer Opportunity Profile,Economic Opportunity Profile,Wellness Disparity Profile,Health Opportunity Index,GEOID
263,51001090100,Rural,0.273908,0.011102,0.465465,0.887563,0.912214,0.718747,0.942139,0.347638,...,0.006054,0.749077,0.413495,0.162616,0.321956,0.639827,0.185053,0.27018,0.330824,1090100
483,51001090200,Rural,0.273908,0.008525,0.526527,0.896792,0.949109,0.684452,0.942139,0.391093,...,0.001021,0.810144,0.516062,0.087799,0.271668,0.658822,0.200148,0.239055,0.370909,1090200
554,51001090300,Rural,0.277446,0.01411,0.381381,0.907006,0.949109,0.628343,0.942139,0.452362,...,0.00054,0.702573,0.410243,0.05117,0.254516,0.51674,0.235381,0.296392,0.378442,1090300
485,51001090400,Rural,0.555905,0.021915,0.505506,0.902609,0.895674,0.616631,0.942139,0.398111,...,0.001038,0.813625,0.292771,0.10635,0.302841,0.556501,0.222238,0.437237,0.371092,1090400
741,51001090500,Rural,0.555905,0.014512,0.421421,0.891768,0.989822,0.681361,0.942139,0.435358,...,0.002097,0.748832,0.308089,0.089221,0.282644,0.567528,0.229408,0.470261,0.403519,1090500


In [None]:
data = pd.merge(HOI, Life_Tract, on='GEOID', how='inner')

In [None]:
print("Final Combined Data Shape: ", data.shape)
# Look at the data
data.head(5)

Final Combined Data Shape:  (1262, 26)


Unnamed: 0,Census Tract,Rural~Urban,Access to Care,Employment Accessibility,Affordability,Air Quality,Population Churning,Education,Food Accessibility,Income Inequality,...,Consumer Opportunity Profile,Economic Opportunity Profile,Wellness Disparity Profile,Health Opportunity Index,GEOID,State,County,Life Expectancy,Life Expectancy Range,Life Expectancy Standard Error
0,51001090100,Rural,0.273908,0.011102,0.465465,0.887563,0.912214,0.718747,0.942139,0.347638,...,0.639827,0.185053,0.27018,0.330824,1090100,Virginia,Accomack,77.5,75.2-77.5,1.1577
1,51001090200,Rural,0.273908,0.008525,0.526527,0.896792,0.949109,0.684452,0.942139,0.391093,...,0.658822,0.200148,0.239055,0.370909,1090200,Virginia,Franklin City,71.5,56.9-75.1,0.9309
2,51001090300,Rural,0.277446,0.01411,0.381381,0.907006,0.949109,0.628343,0.942139,0.452362,...,0.51674,0.235381,0.296392,0.378442,1090300,Virginia,Accomack,77.7,77.6-79.5,1.201
3,51001090400,Rural,0.555905,0.021915,0.505506,0.902609,0.895674,0.616631,0.942139,0.398111,...,0.556501,0.222238,0.437237,0.371092,1090400,Virginia,Accomack,73.1,56.9-75.1,2.2543
4,51001090500,Rural,0.555905,0.014512,0.421421,0.891768,0.989822,0.681361,0.942139,0.435358,...,0.567528,0.229408,0.470261,0.403519,1090500,Virginia,Accomack,77.0,75.2-77.5,1.6637


In [None]:
# Drop Census Tract since we only needed it for GEOID calculation!
data.drop('Census Tract', axis=1)

Unnamed: 0,Rural~Urban,Access to Care,Employment Accessibility,Affordability,Air Quality,Population Churning,Education,Food Accessibility,Income Inequality,Job Participation,...,Consumer Opportunity Profile,Economic Opportunity Profile,Wellness Disparity Profile,Health Opportunity Index,GEOID,State,County,Life Expectancy,Life Expectancy Range,Life Expectancy Standard Error
0,Rural,0.273908,0.011102,0.465465,0.887563,0.912214,0.718747,0.942139,0.347638,0.588,...,0.639827,0.185053,0.270180,0.330824,001090100,Virginia,Accomack,77.5,75.2-77.5,1.1577
1,Rural,0.273908,0.008525,0.526527,0.896792,0.949109,0.684452,0.942139,0.391093,0.581,...,0.658822,0.200148,0.239055,0.370909,001090200,Virginia,Franklin City,71.5,56.9-75.1,0.9309
2,Rural,0.277446,0.014110,0.381381,0.907006,0.949109,0.628343,0.942139,0.452362,0.614,...,0.516740,0.235381,0.296392,0.378442,001090300,Virginia,Accomack,77.7,77.6-79.5,1.2010
3,Rural,0.555905,0.021915,0.505506,0.902609,0.895674,0.616631,0.942139,0.398111,0.580,...,0.556501,0.222238,0.437237,0.371092,001090400,Virginia,Accomack,73.1,56.9-75.1,2.2543
4,Rural,0.555905,0.014512,0.421421,0.891768,0.989822,0.681361,0.942139,0.435358,0.607,...,0.567528,0.229408,0.470261,0.403519,001090500,Virginia,Accomack,77.0,75.2-77.5,1.6637
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1257,Urban,0.435096,0.086138,0.274274,0.653627,0.737913,0.602679,0.979340,0.435628,0.671,...,0.424871,0.274488,0.308441,0.269371,840000100,Virginia,Winchester City,74.5,56.9-75.1,1.3690
1258,Urban,0.435096,0.157033,0.564565,0.770042,0.708651,0.774457,0.766586,0.348988,0.680,...,0.547955,0.312852,0.391224,0.400053,840000201,Virginia,Winchester City,82.2,81.7-97.5,2.0388
1259,Urban,0.435096,0.109394,0.613614,0.758740,0.782443,0.803688,0.891328,0.371795,0.595,...,0.702260,0.254909,0.391224,0.395440,840000202,Virginia,Winchester City,82.8,81.7-97.5,1.5941
1260,Urban,0.435096,0.069851,0.504505,0.752720,0.732824,0.648909,0.936152,0.493117,0.636,...,0.530829,0.282053,0.391224,0.349738,840000301,Virginia,Winchester City,72.9,56.9-75.1,1.3280


In [None]:
# Data info summary, look at all the features and whether they are numerical or categorical
HOI.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1875 entries, 263 to 431
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Census Tract                   1875 non-null   object 
 1   Rural~Urban                    1875 non-null   object 
 2   Access to Care                 1875 non-null   float64
 3   Employment Accessibility       1875 non-null   float64
 4   Affordability                  1875 non-null   float64
 5   Air Quality                    1875 non-null   float64
 6   Population Churning            1875 non-null   float64
 7   Education                      1875 non-null   float64
 8   Food Accessibility             1875 non-null   float64
 9   Income Inequality              1875 non-null   float64
 10  Job Participation              1875 non-null   float64
 11  Population Density             1875 non-null   float64
 12  Segregation                    1875 non-null   

In [None]:
# Statistics about numerical features of the dataset
HOI.describe()

Unnamed: 0,Access to Care,Employment Accessibility,Affordability,Air Quality,Population Churning,Education,Food Accessibility,Income Inequality,Job Participation,Population Density,Segregation,Material Deprivation,Walkability,Community Environment Profile,Consumer Opportunity Profile,Economic Opportunity Profile,Wellness Disparity Profile,Health Opportunity Index
count,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0,1875.0
mean,0.430293,0.100788,0.541067,0.826108,0.821509,0.738404,0.932584,0.467536,0.659006,0.043494,0.735817,0.456662,0.183523,0.351329,0.620942,0.296378,0.387827,0.422507
std,0.122966,0.040566,0.130588,0.109615,0.125996,0.079122,0.074521,0.088873,0.113889,0.066599,0.172272,0.142174,0.106976,0.089182,0.131028,0.060072,0.124507,0.089592
min,0.0,0.0,0.0,0.0,0.022901,0.0,0.313483,0.0,0.0,1e-06,0.0,0.0,0.000678,0.0,0.0,0.006034,0.0,0.0
25%,0.363984,0.084459,0.473473,0.764383,0.768448,0.681429,0.933302,0.415655,0.599,0.003532,0.668347,0.366538,0.105048,0.284778,0.546642,0.260002,0.312137,0.36818
50%,0.458326,0.105232,0.558559,0.850917,0.85369,0.730412,0.942139,0.475169,0.667,0.027831,0.76079,0.457951,0.166149,0.334323,0.62299,0.303343,0.383706,0.427082
75%,0.502987,0.121596,0.636637,0.904748,0.908397,0.792474,0.976872,0.52915,0.735,0.055554,0.847383,0.551325,0.252531,0.402707,0.709174,0.336892,0.456334,0.483719
max,1.0,0.696429,0.827828,1.0,1.0,1.0,1.0,0.907557,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.867737,1.0,0.728815


In [None]:
HOI.isnull()

Unnamed: 0,Census Tract,Rural~Urban,Access to Care,Employment Accessibility,Affordability,Air Quality,Population Churning,Education,Food Accessibility,Income Inequality,...,Population Density,Segregation,Material Deprivation,Walkability,Community Environment Profile,Consumer Opportunity Profile,Economic Opportunity Profile,Wellness Disparity Profile,Health Opportunity Index,GEOID
263,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
483,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
554,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
485,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
741,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
723,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
693,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
365,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
HOI.isnull().any()
data = data.drop(['State', 'Life Expectancy Standard Error', 'Life Expectancy Range', 'GEOID', "County", "Census Tract"], axis=1)

In [None]:
from sklearn.model_selection import train_test_split


cat_vars = ["Rural~Urban"]
print(data.columns.values)
num_vars = list(set(data.columns.values) - set(cat_vars) - set(["Life Expectancy"]))
print(num_vars)

print(data.shape)
data = data.dropna(subset=['Life Expectancy'])
print(data.shape)

data_x = data.copy()
data_x = data_x.drop("Life Expectancy", axis=1)

data_y = data.copy()
data_y = data["Life Expectancy"]

print(data_x.shape)



# Split the data into train set (80%) and validation set (20%)
X_train, X_temp, Y_train, Y_temp = train_test_split(data_x, data_y, test_size=0.2, random_state=55)
# Split validation set into testing set (10%) and validation set(10%)
X_test, X_valid, Y_test, Y_valid = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=55)

['Rural~Urban' 'Access to Care' 'Employment Accessibility' 'Affordability'
 'Air Quality' 'Population Churning' 'Education' 'Food Accessibility'
 'Income Inequality' 'Job Participation' 'Population Density'
 'Segregation' 'Material Deprivation' 'Walkability'
 'Community Environment Profile' 'Consumer Opportunity Profile'
 'Economic Opportunity Profile' 'Wellness Disparity Profile'
 'Health Opportunity Index' 'Life Expectancy']
['Food Accessibility', 'Health Opportunity Index', 'Walkability', 'Air Quality', 'Access to Care', 'Wellness Disparity Profile', 'Income Inequality', 'Employment Accessibility', 'Population Churning', 'Community Environment Profile', 'Material Deprivation', 'Population Density', 'Consumer Opportunity Profile', 'Affordability', 'Job Participation', 'Segregation', 'Education', 'Economic Opportunity Profile']
(1262, 20)
(1117, 20)
(1117, 19)


In [None]:
# See the shapes of each stage
print(X_train.head())
print(X_train.shape)
print(X_valid.shape)
print(X_test.shape)

    Rural~Urban  Access to Care  Employment Accessibility  Affordability  \
233       Urban        0.501111                  0.091191       0.651652   
612       Rural        0.284416                  0.022983       0.302302   
422       Urban        0.501637                  0.100682       0.725726   
214       Urban        0.487993                  0.109434       0.619620   
775       Urban        0.522603                  0.112742       0.577578   

     Air Quality  Population Churning  Education  Food Accessibility  \
233     0.887514             0.853690   0.783465            0.986243   
612     0.959823             0.935115   0.609833            0.987444   
422     0.764272             0.922392   0.890377            0.964756   
214     0.901588             0.893130   0.741117            0.919030   
775     0.824784             0.754453   0.705989            0.889342   

     Income Inequality  Job Participation  Population Density  Segregation  \
233           0.508772          

In [None]:
from sklearn.linear_model import SGDRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    #('std_scaler', StandardScaler()),
])

full_pipeline = ColumnTransformer([
    ("num", numerical_pipeline, num_vars),
    ("cat", OneHotEncoder(sparse=False), cat_vars),
])

X_train_prepared = full_pipeline.fit_transform(X_train)
X_val_prepared = full_pipeline.fit_transform(X_valid)
X_test_prepared = full_pipeline.fit_transform(X_test)

mySGDModel = SGDRegressor(max_iter=2000)

print(full_pipeline.get_feature_names_out)

param_grid = {
    'alpha': [1, .1, .01, .001, .0001, .00001],
    'loss': ['squared_error', 'huber', 'epsilon_insensitive'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'learning_rate': ['constant', 'optimal', 'invscaling'],
}





<bound method ColumnTransformer.get_feature_names_out of ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['Food Accessibility',
                                  'Health Opportunity Index', 'Walkability',
                                  'Air Quality', 'Access to Care',
                                  'Wellness Disparity Profile',
                                  'Income Inequality',
                                  'Employment Accessibility',
                                  'Population Churning',
                                  'Community Environment Profile',
                                  'Material Deprivation', 'Population Density',
                                  'Consumer Opportunity Profile',
                                  'Affordability', 'Job Participation',
                                

In [None]:
gridSearch = GridSearchCV(mySGDModel, param_grid)
print(X_train_prepared.shape)
print(X_test_prepared.shape)
gridSearch.fit(X_train_prepared, Y_train)
print(gridSearch.best_params_)

(893, 20)
(112, 20)




{'alpha': 1e-05, 'learning_rate': 'constant', 'loss': 'squared_error', 'penalty': 'l2'}


AttributeError: ignored

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score


y_predict = gridSearch.predict(X_test_prepared)

mse = np.sqrt(mean_squared_error(Y_test, y_predict))
print(mse)

scores = cross_val_score(gridSearch, X_val_prepared, Y_valid,
                         scoring="neg_mean_squared_error", cv=5)
print(np.sqrt(-scores))

3.8152296571247697




[3.9801606  3.77125372 4.45479983 5.68836681 3.59230684]




In [None]:
from sklearn.svm import SVR


mySVRModel = SVR(max_iter=2000)

param_grid_SVR = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [3,4,5],
    'C': [.1, 1, 5, 10],
    'gamma': ['scale', 'auto'],
}


gridSearchSVR = GridSearchCV(mySVRModel, param_grid_SVR)
gridSearchSVR.fit(X_train_prepared, Y_train)



GridSearchCV(estimator=SVR(max_iter=2000),
             param_grid={'C': [0.1, 1, 5, 10], 'degree': [3, 4, 5],
                         'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})

In [None]:
print(gridSearchSVR.best_params_)

y_predict_SVR = gridSearchSVR.predict(X_test_prepared)

mse_SVR = np.sqrt(mean_squared_error(Y_test, y_predict_SVR))
print(mse_SVR)

scores_SVR = cross_val_score(gridSearchSVR, X_val_prepared, Y_valid,
                         scoring="neg_mean_squared_error", cv=5)
print(np.sqrt(-scores_SVR))

{'C': 1, 'degree': 3, 'gamma': 'auto', 'kernel': 'rbf'}
3.7630639434549766




[3.81018597 4.10425674 4.14651759 5.05995893 3.83497548]




In [None]:
column_names = ['Food Accessibility',
                                  'Health Opportunity Index', 'Walkability',
                                  'Air Quality', 'Access to Care',
                                  'Wellness Disparity Profile',
                                  'Income Inequality',
                                  'Employment Accessibility',
                                  'Population Churning',
                                  'Community Environment Profile',
                                  'Material Deprivation', 'Population Density',
                                  'Consumer Opportunity Profile',
                                  'Affordability', 'Job Participation',
                                  'Segregation', 'Education',
                                  'Economic Opportunity Profile', 'Rural', 'Urban']

best_lin = gridSearch.best_estimator_
print(len(best_lin.coef_))
print(len(column_names))

for x in range(len(best_lin.coef_)):
  print(column_names[x])
  print(best_lin.coef_[x])
    

20
20
Food Accessibility
6.608007245004938
Health Opportunity Index
-0.8326660634481756
Walkability
-5.019198522003579
Air Quality
5.397637955155033
Access to Care
-4.679634473470238
Wellness Disparity Profile
6.6533562317308315
Income Inequality
5.497058921663974
Employment Accessibility
-1.139382682024937
Population Churning
3.21224003434055
Community Environment Profile
12.15779820684674
Material Deprivation
1.6160821167662065
Population Density
-1.0688242976560947
Consumer Opportunity Profile
-1.6944769578222074
Affordability
2.8619760000119374
Job Participation
2.6034595469643147
Segregation
4.246222364193954
Education
17.6990867067704
Economic Opportunity Profile
1.2139272766531013
Rural
13.352058018414002
Urban
13.159298424254256
