In [1]:
import pandas as pd
import numpy as np
import sklearn
import sklearn.preprocessing
import sklearn.linear_model
import sklearn.model_selection
import sklearn.metrics
import sklearn.ensemble
from pygam import LinearGAM
import warnings
from matplotlib import pyplot as plt
warnings.filterwarnings('ignore')

In [2]:
# read in imputed data
sensor_census_imp = pd.read_csv('../data/sensor_census_imputed_rf.csv')

In [3]:
np.random.seed(1)

# get sites for test data
test_sites = np.random.choice(np.unique(sensor_census_imp['site'].values), round(len(np.unique(sensor_census_imp['site'].values))/6), replace = False)

# train sites/rows and x/y split
sensor_census_imp_train = sensor_census_imp[~sensor_census_imp['site'].isin(test_sites)]
sensor_census_imp_train_x = sensor_census_imp_train.iloc[:, 2:]
sensor_census_imp_train_y = sensor_census_imp_train.iloc[:, 1]

# test sites/rows and x/y split
sensor_census_imp_test = sensor_census_imp[sensor_census_imp['site'].isin(test_sites)]
sensor_census_imp_test_x = sensor_census_imp_test.iloc[:, 2:]
sensor_census_imp_test_y = sensor_census_imp_test.iloc[:, 1]

# standardize train and test data
standardizer = sklearn.preprocessing.StandardScaler(with_mean = True, with_std = True)
sensor_census_imp_train_x_stand = standardizer.fit_transform(sensor_census_imp_train_x)
sensor_census_imp_test_x_stand = standardizer.transform(sensor_census_imp_test_x)

In [22]:
# ridge tuning
ridge = sklearn.linear_model.Ridge(random_state = 1)

parameter_grid_ridge = {'alpha' : [1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001]}

grid_ridge = sklearn.model_selection.GridSearchCV(ridge, parameter_grid_ridge, cv = 10, refit = 'r2')
grid_ridge.fit(sensor_census_imp_train_x_stand, sensor_census_imp_train_y.values)
test_pred_ridge = grid_ridge.predict(sensor_census_imp_test_x_stand)
test_r2_ridge = sklearn.metrics.r2_score(sensor_census_imp_test_y.values, test_pred_ridge)
print(grid_ridge.best_params_)
test_r2_ridge

{'alpha': 10}


0.76663525561902179

In [25]:
# get ridge coefficients
ridge = sklearn.linear_model.Ridge(alpha = 10, random_state = 1)
ridge.fit(sensor_census_imp_train_x_stand, sensor_census_imp_train_y.values)
ridge_coefs = pd.DataFrame([sensor_census_imp_train.columns, ridge.coef_]).transpose()
ridge_coefs.columns = ['Predictor', 'Coef']
ridge_coefs.sort_values(['Coef'], axis = 0, ascending = False)

Unnamed: 0,Predictor,Coef
64,MOD13A2_Nearest4,5.7806
83,OMUVBd_UVindex_Mean,1.37762
95,Family_Poverty,0.622291
104,Hispanic_p,0.451681
32,MAIACUS_cosVZA_Aqua_Nearest,0.397618
102,Other_p,0.373274
67,Nearby_Peak2_Ozone,0.362759
108,Age_30_p,0.362705
110,Age_50_p,0.335252
105,Age_0_p,0.324553


In [35]:
# lasso tuning
lasso = sklearn.linear_model.Lasso(random_state = 1, tol = 1e-3, max_iter = 5000)

parameter_grid_lasso = {'alpha' : [1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001]}

grid_lasso = sklearn.model_selection.GridSearchCV(lasso, parameter_grid_lasso, cv = 10, refit = 'r2')
grid_lasso.fit(sensor_census_imp_train_x_stand, sensor_census_imp_train_y.values)
test_pred_lasso = grid_lasso.predict(sensor_census_imp_test_x_stand)
test_r2_lasso = sklearn.metrics.r2_score(sensor_census_imp_test_y.values, test_pred_lasso)
print(grid_lasso.best_params_)
test_r2_lasso

{'alpha': 0.001}


0.76687017097542165

In [84]:
# rf tuning
rf = sklearn.ensemble.RandomForestRegressor(n_estimators = 100, random_state = 1, n_jobs = 2, verbose = 1)

parameter_grid_rf = {'max_features' : [40, 50, 60, 70, 80]}

grid_rf = sklearn.model_selection.GridSearchCV(rf, parameter_grid_rf, cv = 4, refit = 'r2')
grid_rf.fit(sensor_census_imp_train_x_stand, sensor_census_imp_train_y.values)
test_pred_rf = grid_rf.predict(sensor_census_imp_test_x_stand)
test_r2_rf = sklearn.metrics.r2_score(sensor_census_imp_test_y.values, test_pred_rf)
print(grid_rf.best_params_)
test_r2_rf

[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    6.3s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   13.7s finished
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    6.7s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   13.6s finished
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    6.1s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   13.4s finished
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: 

{'max_features': 70}


[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   36.9s finished
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished


0.77682075932760575

In [108]:
# tuning gam
fit_splines = [True for i in range(sensor_census_imp_train_x_stand.shape[1])]
fit_splines[118] = False

fit_linear = [False for i in range(sensor_census_imp_train_x_stand.shape[1])]
fit_linear[118] = True

gam = LinearGAM(max_iter = 1000, fit_splines = fit_splines)

parameter_grid_gam = {'lam': np.logspace(-3, 3, 11), 'n_splines' : [4, 7, 10, 13], 'fit_linear' : [True, fit_linear]}

grid_gam = sklearn.model_selection.GridSearchCV(gam, parameter_grid_gam, scoring = sklearn.metrics.make_scorer(sklearn.metrics.r2_score), 
                                                cv = 4, refit = 'r2')
grid_gam.fit(sensor_census_imp_train_x_stand, sensor_census_imp_train_y.values)
test_pred_gam = grid_gam.predict(sensor_census_imp_test_x_stand)
test_r2_gam = sklearn.metrics.r2_score(sensor_census_imp_test_y.values, test_pred_gam)
print(grid_gam.best_params_)
test_r2_gam

  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity 

  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity 

  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity 

  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity 

  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity 

  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity 

  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity 

  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity 

  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity 

  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity 

  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity 

  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity 

  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity 

  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity 

  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity 

  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity 

  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity 

  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity 

  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity 

  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity 

  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity 

  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity 

  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.


{'fit_linear': [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False], 'lam': 251.18864315095772, 'n_splines': 13}


0.7724362288001575

In [129]:
# want to see how r2 varies with training data size
train_sizes = [2000*(i+1) for i in range(5)]
train_sizes.append(sensor_census_imp_train_x_stand.shape[0])

r2_scores = []
for size in train_sizes:
    
    # fitting with best hyperparams from cross-validation
    rf = sklearn.ensemble.RandomForestRegressor(n_estimators = 1000, random_state = 4, n_jobs = 2, max_features = 60, verbose = 1)
    rf.fit(sensor_census_imp_train_x_stand[:size, :], sensor_census_imp_train_y.values[:size])
    test_pred = rf.predict(sensor_census_imp_test_x_stand)
    test_r2 = sklearn.metrics.r2_score(sensor_census_imp_test_y.values, test_pred)
    r2_scores.append(test_r2)

[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    1.5s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    6.9s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   16.9s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:   19.5s finished
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.2s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.2s finished
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    5.1s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   18.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:   37.7s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:   42.0s finished
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 500 out of 500 | elapsed:    0.

[0.76605145008442499,
 0.76329433416927406,
 0.77941797490622866,
 0.78480099541398307,
 0.7874713617852841,
 0.78198204545720329]

In [136]:
# write true and predicted test vals to csv
test_true_pred_rfImp_best_rf = pd.DataFrame([sensor_census_imp_test_y.values, test_pred]).transpose()
test_true_pred_rfImp_best_rf.columns = ['true', 'pred']
test_true_pred_rfImp_best_rf.to_csv('../data/test_true_pred_rfImp_best_rf.csv', index = False)

In [207]:
# making figure to show improvement with data size
plt.figure(figsize = (9, 8))
plt.plot(train_sizes[:-1], r2_scores[:-1], 'r')
plt.ylim([0.75, 0.8])
plt.grid(color='gray', linestyle='-', linewidth=0.5)
plt.xlabel('Train Data Size', fontsize = 15)
plt.ylabel('Test R^2', fontsize = 15)
plt.title('Test R^2 vs. Train Data Size', fontsize = 15)
plt.rc('xtick',labelsize=13)
plt.rc('ytick',labelsize=13)
#plt.show()
plt.savefig('../plots/convergence.png', bbox_inches='tight')

In [5]:
# for getting variable importances
rf = sklearn.ensemble.RandomForestRegressor(n_estimators = 2000, random_state = 1, n_jobs = 2, max_features = 60, verbose = 1)
rf.fit(sensor_census_imp_train_x_stand, sensor_census_imp_train_y.values)

[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   14.5s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:  1.0min
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:  2.2min
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:  4.0min
[Parallel(n_jobs=2)]: Done 1246 tasks      | elapsed:  6.4min
[Parallel(n_jobs=2)]: Done 1796 tasks      | elapsed:  9.1min
[Parallel(n_jobs=2)]: Done 2000 out of 2000 | elapsed: 10.2min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=60, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=2000, n_jobs=2, oob_score=False, random_state=1,
           verbose=1, warm_start=False)

In [18]:
# variable importances
varImp = pd.DataFrame([sensor_census_imp_train.columns, rf.feature_importances_]).transpose()
varImp.columns = ['Predictor', 'Variable Importance']
varImp.sort_values(['Variable Importance'], axis = 0, ascending = False)

Unnamed: 0,Predictor,Variable Importance
64,MOD13A2_Nearest4,0.56713
67,Nearby_Peak2_Ozone,0.109235
29,NO2_Region,0.02415
70,Nearby_Peak2Lag1_Ozone,0.0164348
49,REANALYSIS_weasd_DailyMean,0.00897696
38,REANALYSIS_dswrf_DailyMean,0.00881196
65,Nearby_Peak2_NO2,0.00805522
23,RoadDensity_prisecroads1000,0.0075038
63,MOD11A1_Clear_night_cov_Nearest4,0.00691605
56,REANALYSIS_prate_1Day,0.00665184
