In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import from sklearn.
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

In [2]:
california = pd.read_csv('../data/cleaned_engineered_ca.csv')

In [3]:
list(california.columns)

['county_state',
 'total_cases',
 'total_fatalities',
 'death_rate',
 'total_tests',
 'race_pop',
 'race_pop_hispanic_or_latino_of_any_race',
 'race_pop_white_alone',
 'race_pop_black_or_african_american_alone',
 'race_pop_american_indian_and_alaska_native_alone',
 'race_pop_asian_alone',
 'race_pop_native_hawaiian_and_other_pacific_islander_alone',
 'race_pop_some_other_race_alone',
 'race_pop_two_or_more_races',
 'sex_age_pop',
 'sex_age_pop_male',
 'sex_age_pop_female',
 'sex_age_pop_under_5',
 'sex_age_pop_5_to_9',
 'sex_age_pop_10_to_14',
 'sex_age_pop_15_to_19',
 'sex_age_pop_20_to_24',
 'sex_age_pop_25_to_34',
 'sex_age_pop_35_to_44',
 'sex_age_pop_45_to_54',
 'sex_age_pop_55_to_59',
 'sex_age_pop_60_to_64',
 'sex_age_pop_65_to_74',
 'sex_age_pop_75_to_84',
 'sex_age_pop_85_and_over',
 'sex_age_median_age_in_years',
 'sq_mi',
 'obes_percent',
 'health_ins_noninst_pop',
 'health_ins_noninst_pop_cov_yes',
 'health_ins_noninst_pop_private',
 'health_ins_noninst_pop_public',
 'healt

In [50]:
X = california[[
 'percent_race_pop_hispanic_or_latino_of_any_race',
    'percent_sex_age_pop_under_5',
    'percent_sex_age_pop_5_to_9',
    'percent_sex_age_pop_10_to_14',
    'percent_sex_age_pop_15_to_19',
    'obes_percent',
    'percent_sex_age_pop_25_to_34',
    'percent_inc_hhlds_35_000_to_49_999',
    'sex_age_pop_20_to_24',
    'percent_sex_age_pop_male',
    'percent_health_ins_noninst_pop_cov_no',
    'tests_per_100_people',
    'percent_sex_age_pop_35_to_44',
    'percent_race_pop_white_alone',    
    'percent_sex_age_pop_45_to_74',
    'percent_sex_age_pop_75_and_over',
    'percent_race_pop_two_or_more_races',
    'inc_per_capita_inc_dol',
    'pop_density']]

y = california['cases_per_100_people']

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [36]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

In [37]:
pipe = make_pipeline(PCA(random_state=22), LinearRegression())
params = {'pca__n_components': [5, 10, 20, 50, 100, 200]}

In [38]:
grid = GridSearchCV(pipe, params, cv=10)

In [39]:
grid.fit (X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('pca', PCA(random_state=22)),
                                       ('linearregression',
                                        LinearRegression())]),
             param_grid={'pca__n_components': [5, 10, 20, 50, 100, 200]})

In [40]:
grid.score(X_train, y_train)

0.7977426055872998

In [41]:
grid.score(X_test, y_test)

0.7278296719332813

### Quantile transformer

In [87]:
from sklearn.preprocessing import QuantileTransformer, PowerTransformer
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SelectKBest, f_regression

In [107]:
pipe = make_pipeline(PCA(random_state=22), QuantileTransformer(), SelectKBest(f_regression), Ridge())
params = {
    'pca__n_components': [5, 10, 20, 50, 100, 200],
    'selectkbest__k': range(1,20),
    'ridge__alpha': [.1, 1, 10, 20] 
}

In [108]:
grid = GridSearchCV(pipe, params, cv=10, n_jobs=-1)

In [109]:
grid.fit (X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('pca', PCA(random_state=22)),
                                       ('quantiletransformer',
                                        QuantileTransformer()),
                                       ('selectkbest',
                                        SelectKBest(score_func=<function f_regression at 0x7fc5551b49d0>)),
                                       ('ridge', Ridge())]),
             n_jobs=-1,
             param_grid={'pca__n_components': [5, 10, 20, 50, 100, 200],
                         'ridge__alpha': [0.1, 1, 10, 20],
                         'selectkbest__k': range(1, 20)})

In [110]:
grid.score(X_train, y_train)

0.7868124208556944

In [111]:
grid.score(X_test, y_test)

0.7200286161131164