In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import from sklearn.
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

In [2]:
california = pd.read_csv('../data/cleaned_engineered_ca.csv')

In [3]:
list(california.columns)

['county_state',
 'total_cases',
 'total_fatalities',
 'death_rate',
 'total_tests',
 'race_pop',
 'race_pop_hispanic_or_latino_of_any_race',
 'race_pop_white_alone',
 'race_pop_black_or_african_american_alone',
 'race_pop_american_indian_and_alaska_native_alone',
 'race_pop_asian_alone',
 'race_pop_native_hawaiian_and_other_pacific_islander_alone',
 'race_pop_some_other_race_alone',
 'race_pop_two_or_more_races',
 'sex_age_pop',
 'sex_age_pop_male',
 'sex_age_pop_female',
 'sex_age_pop_under_5',
 'sex_age_pop_5_to_9',
 'sex_age_pop_10_to_14',
 'sex_age_pop_15_to_19',
 'sex_age_pop_20_to_24',
 'sex_age_pop_25_to_34',
 'sex_age_pop_35_to_44',
 'sex_age_pop_45_to_54',
 'sex_age_pop_55_to_59',
 'sex_age_pop_60_to_64',
 'sex_age_pop_65_to_74',
 'sex_age_pop_75_to_84',
 'sex_age_pop_85_and_over',
 'sex_age_median_age_in_years',
 'sq_mi',
 'obes_percent',
 'health_ins_noninst_pop',
 'health_ins_noninst_pop_cov_yes',
 'health_ins_noninst_pop_private',
 'health_ins_noninst_pop_public',
 'healt

In [20]:
X = california[[
 'inc_per_capita_inc_dol',
    'percent_sex_age_pop_male',
    'obes_percent',
    'percent_sex_age_pop_0_to_44',
    'percent_sex_age_pop_45_to_74',
    'percent_race_pop_asian_alone',
    'percent_inc_hhlds_less_than_99_999',
    'percent_inc_hhlds_100_000_or_more',
    'percent_health_ins_noninst_pop_cov_yes',
    'pop_density',
    'percent_race_pop_white_alone',
    'percent_race_pop_native_hawaiian_and_other_pacific_islander_alone',
    'percent_race_pop_black_or_african_american_alone',
    'tests_per_100_people']]

y = california['cases_per_100_people']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22)

In [22]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

In [23]:
pipe = make_pipeline(PCA(random_state=22), LinearRegression())
params = {'pca__n_components': [5, 10, 20, 50, 100, 200]}

In [24]:
grid = GridSearchCV(pipe, params, cv=10)

In [25]:
grid.fit (X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('pca', PCA(random_state=22)),
                                       ('linearregression',
                                        LinearRegression())]),
             param_grid={'pca__n_components': [5, 10, 20, 50, 100, 200]})

In [26]:
grid.score(X_train, y_train)

0.7707729053307792

In [27]:
grid.score(X_test, y_test)

0.7094123252912556

In [32]:
cross_val_score(grid, X_test, y_test) 

array([ 0.96477792,  0.71189353,  0.11634675,  0.1903197 , -1.68660195])