## Resources
[TX County Obesity Data](https://gis.cdc.gov/grasp/diabetes/DiabetesAtlas.html)  
 Total, Adults Aged 20+ Years, Age-Adjusted Percentage, Texas, 2017  
 US Diabetes Surveillance System  
 www.cdc.gov/diabetes/data  
  
[County Obesity Data](https://www.countyhealthrankings.org/app/texas/2020/measure/factors/11/data)

# Imports

In [6]:
# Import the required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

In [7]:
# Import the data.
tx_df = pd.read_csv('../data/texas_combined_cleaned_WITH_Census_FA.csv')

tx_ob = pd.read_csv('../data/preprocessing/cleaned_tx_obesity.csv')

In [8]:
# Display the first few rows of the dataframe.
tx_df.head(3)

Unnamed: 0,county_name,total_cases,total_fatalities,death_rate,race_pop,race_pop_hispanic_or_latino_of_any_race,race_pop_white_alone,race_pop_black_or_african_american_alone,race_pop_american_indian_and_alaska_native_alone,race_pop_asian_alone,...,inc_hhlds_100_000_to_149_999,inc_hhlds_150_000_to_199_999,inc_hhlds_200_000_or_more,inc_med_hhld_inc_dol,inc_mean_hhld_inc_dol,inc_per_capita_inc_dol,inc_med_earn_workers_dol,inc_med_earn_male_full_yr_workers_dol,inc_med_earn_female_full_yr_workers_dol,pop_density
0,anderson,2922.0,39,0.013347,57863,10142,34312,12127,158,319,...,1432,422,240,43355,55852,16868,26621,36865,26890,54.452632
1,andrews,567.0,10,0.017637,17818,9979,7215,321,0,63,...,1214,398,352,74233,94962,31190,40478,72970,35194,11.87296
2,angelina,2277.0,90,0.039526,87607,19174,53216,12834,107,987,...,3257,985,639,47714,61152,22322,27411,41488,32622,109.813207


In [9]:
# Merge the remainder dataframes on Geographic Area Name.
tx_df = tx_ob.merge(tx_df,on='county_name')
tx_df.head()

Unnamed: 0,county_name,obes_percent,total_cases,total_fatalities,death_rate,race_pop,race_pop_hispanic_or_latino_of_any_race,race_pop_white_alone,race_pop_black_or_african_american_alone,race_pop_american_indian_and_alaska_native_alone,...,inc_hhlds_100_000_to_149_999,inc_hhlds_150_000_to_199_999,inc_hhlds_200_000_or_more,inc_med_hhld_inc_dol,inc_mean_hhld_inc_dol,inc_per_capita_inc_dol,inc_med_earn_workers_dol,inc_med_earn_male_full_yr_workers_dol,inc_med_earn_female_full_yr_workers_dol,pop_density
0,anderson,0.373,2922.0,39,0.013347,57863,10142,34312,12127,158,...,1432,422,240,43355,55852,16868,26621,36865,26890,54.452632
1,andrews,0.313,567.0,10,0.017637,17818,9979,7215,321,0,...,1214,398,352,74233,94962,31190,40478,72970,35194,11.87296
2,angelina,0.396,2277.0,90,0.039526,87607,19174,53216,12834,107,...,3257,985,639,47714,61152,22322,27411,41488,32622,109.813207
3,aransas,0.377,340.0,18,0.052941,24763,6756,16691,365,14,...,1047,476,496,44865,75471,30939,21567,43182,32005,98.246769
4,archer,0.283,133.0,1,0.007519,8789,727,7772,76,27,...,626,198,145,64476,80073,31806,32469,49105,37377,9.729998


# Regression Modeling Obsesity + Pop Density

In [60]:
# Create a subset of columns for modeling.
# Extract columns for obesity-based model.
ob_mod_cols = ['death_rate', 'obes_percent', 'pop_density']

In [61]:
# Create a dataframe for age groups
tx_obes = tx_df[ob_mod_cols]

In [62]:
tx_obes

Unnamed: 0,death_rate,obes_percent,pop_density
0,0.013347,37.3,54.452632
1,0.017637,31.3,11.872960
2,0.039526,39.6,109.813207
3,0.052941,37.7,98.246769
4,0.007519,28.3,9.729998
...,...,...,...
249,0.057971,33.1,67.904439
250,0.017301,29.5,10.717568
251,0.012072,35.9,19.807588
252,0.026316,30.2,14.391854


In [63]:
X = tx_obes[[
    'obes_percent',
    'pop_density'
]]

y = tx_obes['death_rate']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [64]:
ss = StandardScaler()
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.transform(X_test)

In [65]:
lr = LinearRegression()
lr.fit(X_train_ss, y_train)

LinearRegression()

In [66]:
X_train.shape, y_test.shape

((190, 2), (64,))

In [67]:
cross_val_score(lr, X_test_ss, y_test, cv=5).mean()

-0.22716049609446412