# Modeling

In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import csv

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error 
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor

np.random.seed(42)

### Load the Data

In [69]:
X_train_sc = pd.read_csv('../data/X_train_sc.csv', header=None)
X_test_sc = pd.read_csv('../data/X_test_sc.csv', header=None)

In [89]:
y_train = pd.read_csv('../data/y_train.csv', header=None)
y_test = pd.read_csv('../data/y_test.csv', header=None)

In [81]:
ss = pd.read_pickle('../pickles/standard_scaler.pkl')

In [82]:
y_train.mean()

0    103.213262
dtype: float64

In [83]:
y_test.mean()

0    103.297193
dtype: float64

In [84]:
X_train_sc.shape, X_test_sc.shape

((12291, 829), (4097, 829))

In [90]:
y_train.shape, y_test.shape

((12291, 1), (4097, 1))

In [96]:
y_train = y_train[0].values

In [97]:
y_test = y_test[0].values

In [98]:
y_train.shape, y_test.shape

((12291,), (4097,))

### Pipeline

In [22]:
pipe = Pipeline([
    ('kbest', SelectKBest(f_regression)),
    ('rf', RandomForestRegressor())
])

In [27]:
params = {
    'rf__n_estimators':[80, 90, 100],
    'rf__max_depth':[40, 50, 60]
}

### Cross Validation Score

In [49]:
rf = RandomForestRegressor()

In [38]:
cross_val_score(rf, X_train_sc, y_train).mean()

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


0.542817267501669

### Instantiate the GridSearch with RandomForestRegressor

In [28]:
gs = GridSearchCV(pipe, param_grid=params)

In [29]:
gs.fit(X_train_sc, y_train)

  y = column_or_1d(y, warn=True)
  n_samples * X_means ** 2)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  self._final_estimator.fit(Xt, y, **fit_params)
  y = column_or_1d(y, warn=True)
  n_samples * X_means ** 2)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  self._final_estimator.fit(Xt, y, **fit_params)
  y = column_or_1d(y, warn=True)
  n_samples * X_means ** 2)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  self._final_estimator.fit(Xt, y, **fit_params)
  y = column_or_1d(y, warn=True)
  n_samples * X_means ** 2)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  self._final_estimator.fit(Xt, y, **fit_params)
  y = column_or_1d(y, warn=True)
  n_samples * X_means ** 2)
  c

  y = column_or_1d(y, warn=True)
  n_samples * X_means ** 2)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  self._final_estimator.fit(Xt, y, **fit_params)
  y = column_or_1d(y, warn=True)
  n_samples * X_means ** 2)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  self._final_estimator.fit(Xt, y, **fit_params)
  y = column_or_1d(y, warn=True)
  n_samples * X_means ** 2)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  self._final_estimator.fit(Xt, y, **fit_params)
  y = column_or_1d(y, warn=True)
  n_samples * X_means ** 2)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  self._final_estimator.fit(Xt, y, **fit_params)
  y = column_or_1d(y, warn=True)
  n_samples * X_means ** 2)
  c

  y = column_or_1d(y, warn=True)
  n_samples * X_means ** 2)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  self._final_estimator.fit(Xt, y, **fit_params)
  y = column_or_1d(y, warn=True)
  n_samples * X_means ** 2)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  self._final_estimator.fit(Xt, y, **fit_params)
  y = column_or_1d(y, warn=True)
  n_samples * X_means ** 2)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  self._final_estimator.fit(Xt, y, **fit_params)
  y = column_or_1d(y, warn=True)
  n_samples * X_means ** 2)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  self._final_estimator.fit(Xt, y, **fit_params)
  y = column_or_1d(y, warn=True)
  n_samples * X_means ** 2)
  c

  y = column_or_1d(y, warn=True)
  n_samples * X_means ** 2)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  self._final_estimator.fit(Xt, y, **fit_params)
  y = column_or_1d(y, warn=True)
  n_samples * X_means ** 2)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  self._final_estimator.fit(Xt, y, **fit_params)
  y = column_or_1d(y, warn=True)
  n_samples * X_means ** 2)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  self._final_estimator.fit(Xt, y, **fit_params)
  y = column_or_1d(y, warn=True)
  n_samples * X_means ** 2)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  self._final_estimator.fit(Xt, y, **fit_params)
  y = column_or_1d(y, warn=True)
  n_samples * X_means ** 2)
  c

  y = column_or_1d(y, warn=True)
  n_samples * X_means ** 2)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  self._final_estimator.fit(Xt, y, **fit_params)
  y = column_or_1d(y, warn=True)
  n_samples * X_means ** 2)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  self._final_estimator.fit(Xt, y, **fit_params)
  y = column_or_1d(y, warn=True)
  n_samples * X_means ** 2)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  self._final_estimator.fit(Xt, y, **fit_params)
  y = column_or_1d(y, warn=True)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  self._final_estimator.fit(Xt, y, **fit_params)


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('kbest', SelectKBest(k=10, score_func=<function f_regression at 0x1a1f0532f0>)), ('rf', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_i...imators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'rf__n_estimators': [80, 90, 100], 'rf__max_depth': [40, 50, 60]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [39]:
gs.best_params_

{'svr__C': 10, 'svr__epsilon': 1}

In [32]:
gs.best_estimator_

Pipeline(memory=None,
     steps=[('kbest', SelectKBest(k=10, score_func=<function f_regression at 0x1a1f0532f0>)), ('rf', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=40,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))])

In [33]:
gs.score(X_train_sc, y_train)

0.9410958655707631

In [34]:
gs.score(X_test_sc, y_test)

0.5865314169089416

### Instantiate the GridSearch with SVR

In [50]:
pipe = Pipeline([
    ('svr', SVR())
])

In [51]:
params = {
    'svr__C':[10],
    'svr__epsilon':[.5, 1, 2]
}

In [52]:
gs = GridSearchCV(pipe, param_grid=params)

In [53]:
gs.fit(X_train_sc, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('svr', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'svr__C': [10], 'svr__epsilon': [0.5, 1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [54]:
gs.best_params_

{'svr__C': 10, 'svr__epsilon': 2}

In [55]:
gs.best_estimator_

Pipeline(memory=None,
     steps=[('svr', SVR(C=10, cache_size=200, coef0=0.0, degree=3, epsilon=2, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))])

In [56]:
gs.score(X_train_sc, y_train)

0.6165649100759626

In [57]:
gs.score(X_test_sc, y_test)

0.5066035617326069

### Instantiate GridSearch with PCA & KNN

In [60]:
pipe = Pipeline([
    ('pca', PCA()),
    ('knn', KNeighborsRegressor())
])

In [61]:
params = {
    'knn__n_neighbors':np.arange(3, 11, 2)
}

In [62]:
gs = GridSearchCV(pipe, param_grid=params)

In [63]:
gs.fit(X_train_sc, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('knn', KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform'))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'knn__n_neighbors': array([3, 5, 7, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [65]:
gs.best_params_

{'knn__n_neighbors': 9}

In [66]:
gs.best_estimator_

Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('knn', KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=9, p=2,
          weights='uniform'))])

In [64]:
gs.score(X_train_sc, y_train)

0.33136216313069256

In [67]:
gs.score(X_test_sc, y_test)

0.19514480279758506

### Instantiate GridSeach with GradientBoostRegressor

In [110]:
pipe = Pipeline([
    ('gbr', GradientBoostingRegressor())
])

In [111]:
params = {
    'gbr__n_estimators':[90, 100, 110],
    'gbr__max_depth':[1,3,5]
}

In [112]:
gs = GridSearchCV(pipe, param_grid=params)

In [113]:
gs.fit(X_train_sc, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('gbr', GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, ...=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'gbr__n_estimators': [90, 100, 110], 'gbr__max_depth': [1, 3, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [114]:
gs.best_params_

{'gbr__max_depth': 5, 'gbr__n_estimators': 110}

In [115]:
gs.score(X_train_sc, y_train)

0.7207023317924169

In [116]:
gs.score(X_test_sc, y_test)

0.6251774158455016