<a href="https://colab.research.google.com/github/clyde2020/ML_Portfolio/blob/main/US%20Cancer%20Death%20Rates/Capstone_2_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install featuretools==0.4.0 dask==0.19.4 distributed==1.23.3 tornado==5.0.0

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from scipy.stats import t
from numpy.random import seed

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
import datetime
from pathlib import Path

import featuretools as ft

# Useful for showing multiple outputs
from IPython.core.interactiveshell import InteractiveShell

Get the data with imputations

In [None]:
# Load the raw CSV, keep rows with MedianAge < 70 and drop the columns that
# are not used as features. The chain returns a new frame instead of calling
# drop(inplace=True) on a filtered slice.
cancer = '/content/cancer_reg.csv'
df = (
    pd.read_csv(cancer, encoding='latin-1')
    .loc[lambda frame: frame.MedianAge < 70]
    .drop(columns=['Geography', 'binnedInc', 'PctSomeCol18_24'])
)

X = df.drop('TARGET_deathRate', axis=1)
y = df['TARGET_deathRate']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Mean-impute AFTER the split, using statistics from the training rows only,
# so no information from the test rows leaks into the fill values.
# `df` and `X` keep their missing values; only the split frames are imputed.
impute_cols = ['PctPrivateCoverageAlone', 'PctEmployed16_Over']
train_fill_values = X_train[impute_cols].mean()
X_train = X_train.fillna(train_fill_values)
X_test = X_test.fillna(train_fill_values)

Let's get the dummy regressor metrics as a baseline

In [None]:
# Baseline model: always predict the mean of the training target.
train_mean = y_train.mean()

dumb_reg = DummyRegressor(strategy='mean')
dumb_reg.fit(X_train, y_train)

y_tr_pred = dumb_reg.predict(X_train)
# Take the test predictions from the fitted baseline itself rather than
# rebuilding the constant vector by hand, so the metrics describe the model.
y_te_pred = dumb_reg.predict(X_test)

# Kept as the last expression so the learned constant is displayed even with
# the default (last-expression-only) display mode.
dumb_reg.constant_

DummyRegressor()

array([[178.5901782]])

In [None]:
# Baseline test-set metrics, printed in this order: R^2, MSE, MAE.
for metric in (r2_score, mean_squared_error, mean_absolute_error):
    print(metric(y_test, y_te_pred))

-0.0001049627237619255
800.153559600647
21.765571764970662


Get scaled data

In [None]:
# Standardize the features: the scaler is fit on the training split only and
# then applied unchanged to the test split.
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

# Plain linear regression on the scaled features. A later cell passes `lm` to
# cross_validate, so it has to be defined for a top-to-bottom run to succeed.
lm = LinearRegression().fit(X_train_scaled, y_train)

Get data with no imputations for feeding into pipelines

In [None]:
# Reload the raw CSV WITHOUT imputing, so missing values are handled inside
# the pipelines. The chain returns a new frame instead of calling
# drop(inplace=True) on a filtered slice.
cancer = '/content/cancer_reg.csv'
df = (
    pd.read_csv(cancer, encoding='latin-1')
    .loc[lambda frame: frame.MedianAge < 70]
    .drop(columns=['Geography', 'binnedInc', 'PctSomeCol18_24'])
)

X = df.drop(columns='TARGET_deathRate')
y = df['TARGET_deathRate']

# Same test_size and random_state as the split used for the imputed data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Pipeline model with Linear regression.

In [None]:
# Impute -> scale -> univariate feature selection (SelectKBest's default k)
# -> ordinary least squares.
pipe = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    SelectKBest(f_regression),
    LinearRegression(),
)
pipe.fit(X_train, y_train)

y_train_pred, y_test_pred = pipe.predict(X_train), pipe.predict(X_test)

# Blank line, then R^2 (train, test) followed by MAE (train, test).
print()
for score_fn in (r2_score, mean_absolute_error):
    print(score_fn(y_train, y_train_pred))
    print(score_fn(y_test, y_test_pred))


0.4888589635269407
0.4606108525099447
14.74409748123871
15.4203024862343


Grid search for the best Linear Regression model

In [None]:
# Candidate values for SelectKBest's k: 1 up to the number of feature columns.
k = list(range(1, len(X_train.columns) + 1))
grid_params = {'selectkbest__k': k}

# 5-fold grid search over k, using all available cores.
lr_grid_cv = GridSearchCV(pipe, param_grid=grid_params, cv=5, n_jobs=-1)
lr_grid_cv.fit(X_train, y_train)

# Blank line, then the winning parameters and the refit pipeline.
print('', lr_grid_cv.best_params_, lr_grid_cv.best_estimator_, sep='\n')


{'selectkbest__k': 27}
Pipeline(steps=[('simpleimputer', SimpleImputer()),
                ('standardscaler', StandardScaler()),
                ('selectkbest',
                 SelectKBest(k=27,
                             score_func=<function f_regression at 0x7f0fb72b3b00>)),
                ('linearregression', LinearRegression())])


Grid search for the best Random Forest model

In [None]:
# Impute -> (optionally) scale -> random forest with a fixed seed.
RF_pipe = make_pipeline(SimpleImputer(), StandardScaler(), RandomForestRegressor(random_state=47))

# 20 log-spaced tree counts between 10 and 1000, truncated to integers.
n_est = list(map(int, np.logspace(start=1, stop=3, num=20)))

# The grid also toggles the scaler on/off and tries both imputation strategies.
grid_params = dict(
    randomforestregressor__n_estimators=n_est,
    standardscaler=[StandardScaler(), None],
    simpleimputer__strategy=['mean', 'median'],
)

rf_grid_cv = GridSearchCV(RF_pipe, param_grid=grid_params, cv=5, n_jobs=-1)
rf_grid_cv.fit(X_train, y_train)
print(rf_grid_cv.best_params_)

{'randomforestregressor__n_estimators': 784, 'simpleimputer__strategy': 'median', 'standardscaler': None}


Compare the cross-validated scores of the three models below: Random Forest has the best scores

In [None]:
# 5-fold cross-validation of the best random forest pipeline (default scorer).
rf_best_cv_results = cross_validate(
    rf_grid_cv.best_estimator_,
    X_train,
    y_train,
    cv=5,
)
rf_best_scores = rf_best_cv_results['test_score']
# Mean and standard deviation across the folds.
rf_best_scores.mean(), rf_best_scores.std()

(0.5426079420594067, 0.04875258831740505)

In [None]:
# 5-fold cross-validation of the best linear-regression pipeline (default scorer).
cv_results = cross_validate(
    lr_grid_cv.best_estimator_,
    X_train,
    y_train,
    cv=5,
)
cv_best_scores = cv_results['test_score']
# Mean and standard deviation across the folds.
cv_best_scores.mean(), cv_best_scores.std()

(0.509730766019896, 0.051952379722592125)

In [None]:
# Cross-validate the plain linear model on the pre-scaled training features,
# using cross_validate's default fold count.
# NOTE(review): `lm` must already exist in the kernel when this cell runs -
# confirm the cell that creates it is active and has been executed.
cv_results_2 = cross_validate(
    lm,
    X_train_scaled,
    y_train,
)
cv_2_scores = cv_results_2['test_score']
# Mean and standard deviation across the folds.
cv_2_scores.mean(), cv_2_scores.std()

(0.5083389799932486, 0.050078582536126064)

Linear regression model performance<a id='4.11.1_Linear_regression_model_performance'></a>

In [None]:
# 'neg_mean_absolute_error' uses the (negative of) the mean absolute error
# Score with 'neg_mean_absolute_error': scikit-learn reports the negated MAE
# so that larger is better.
lr_neg_mae = cross_validate(
    lr_grid_cv.best_estimator_,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
)

In [None]:
# Flip the sign back to a positive MAE, then summarise across the folds.
lr_mae_scores = -lr_neg_mae['test_score']
lr_mae_mean = lr_mae_scores.mean()
lr_mae_std = lr_mae_scores.std()
lr_mae_mean, lr_mae_std

(14.30536945859473, 0.39971850684976495)

In [None]:
# Held-out MAE of the best linear-regression pipeline.
lr_test_pred = lr_grid_cv.best_estimator_.predict(X_test)
mean_absolute_error(y_test, lr_test_pred)

14.75654072549843

Random forest regression model performance<a id='4.11.2_Random_forest_regression_model_performance'></a>

In [None]:
# Same negated-MAE cross-validation as for the linear model, applied to the
# best random forest pipeline.
rf_neg_mae = cross_validate(
    rf_grid_cv.best_estimator_,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
)

In [None]:
# Flip the sign back to a positive MAE, then summarise across the folds.
rf_mae_scores = -rf_neg_mae['test_score']
rf_mae_mean = rf_mae_scores.mean()
rf_mae_std = rf_mae_scores.std()
rf_mae_mean, rf_mae_std

(13.66200531565039, 0.2470278210794711)

In [None]:
# Held-out MAE of the best random forest pipeline.
rf_test_pred = rf_grid_cv.best_estimator_.predict(X_test)
mean_absolute_error(y_test, rf_test_pred)

14.112610868022688

Automated feature engineering

In [None]:
# Display the value of every top-level expression in a cell, not only the last.
InteractiveShell.ast_node_interactivity = "all"

# Create the local data folder if it is missing (no error if it already exists).
data_dir = Path("./data")
data_dir.mkdir(parents=True, exist_ok=True)

In [None]:
import featuretools.variable_types as vtypes

# Start from an empty EntitySet; entities are added to it in the cells below.
es = ft.EntitySet(id='cancer_patients')

In [None]:
# Register the feature matrix as the 'patients' entity.
# X has no column named 'index', so ask featuretools explicitly to create an
# integer index column (make_index=True) instead of relying on implicit
# fallback behaviour. A copy is passed so that X itself is never modified and
# the cell can be re-run safely.
es.entity_from_dataframe(
    entity_id='patients',
    dataframe=X.copy(),
    index='index',
    make_index=True,
)



Entityset: cancer_patients
  Entities:
    patients [Rows: 3017, Columns: 31]
  Relationships:
    No relationships

In [None]:
# Deep feature synthesis settings. features_only=True asks for the feature
# definitions only.
dfs_options = dict(
    entityset=es,
    target_entity='patients',
    cutoff_time=None,
    where_primitives=['sum', 'mean'],
    max_depth=2,
    features_only=True,
)
feature_defs = ft.dfs(**dfs_options)

In [None]:
# Report how many feature definitions were produced.
n_feature_defs = len(feature_defs)
print(f'This will generate {n_feature_defs} features.')

This will generate 30 features.


In [None]:
# Table of the primitives featuretools offers; show how many rows it has.
all_p = ft.list_primitives()
all_p.shape[0]

62

In [None]:
# Split the primitive table by its 'type' column into independent copies.
is_transform = all_p['type'].eq('transform')
is_aggregation = all_p['type'].eq('aggregation')
trans_p = all_p[is_transform].copy()
agg_p = all_p[is_aggregation].copy()

In [None]:
# Widen text columns so long cell contents are not truncated, then show the
# transform primitives.
pd.set_option('display.max_colwidth', 100)
trans_p

In [None]:
# Same column-width setting as for the transform table, then show the
# aggregation primitives.
pd.set_option('display.max_colwidth', 100)
agg_p