In [57]:
import pandas as pd
import numpy as np
import sklearn
import sklearn.preprocessing
import sklearn.linear_model
import sklearn.model_selection
import sklearn.metrics
import sklearn.ensemble
from pygam import LinearGAM
import warnings
warnings.filterwarnings('ignore')

In [32]:
# read in imputed data
sensor_census_imp = pd.read_csv('../data/sensor_census_imputed_rf.csv')

In [33]:
np.random.seed(1)

# get sites for test data
test_sites = np.random.choice(np.unique(sensor_census_imp['site'].values), round(len(np.unique(sensor_census_imp['site'].values))/6), replace = False)

# train sites/rows and x/y split
sensor_census_imp_train = sensor_census_imp[~sensor_census_imp['site'].isin(test_sites)]
sensor_census_imp_train_x = sensor_census_imp_train.iloc[:, 2:]
sensor_census_imp_train_y = sensor_census_imp_train.iloc[:, 1]

# test sites/rows and x/y split
sensor_census_imp_test = sensor_census_imp[sensor_census_imp['site'].isin(test_sites)]
sensor_census_imp_test_x = sensor_census_imp_test.iloc[:, 2:]
sensor_census_imp_test_y = sensor_census_imp_test.iloc[:, 1]

# standardize train and test data
standardizer = sklearn.preprocessing.StandardScaler(with_mean = True, with_std = True)
sensor_census_imp_train_x_stand = standardizer.fit_transform(sensor_census_imp_train_x)
sensor_census_imp_test_x_stand = standardizer.transform(sensor_census_imp_test_x)

In [34]:
# ridge tuning
ridge = sklearn.linear_model.Ridge(random_state = 1)

parameter_grid_ridge = {'alpha' : [1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001]}

grid_ridge = sklearn.model_selection.GridSearchCV(ridge, parameter_grid_ridge, cv = 10, refit = 'r2')
grid_ridge.fit(sensor_census_imp_train_x_stand, sensor_census_imp_train_y.values)
test_pred_ridge = grid_ridge.predict(sensor_census_imp_test_x_stand)
test_r2_ridge = sklearn.metrics.r2_score(sensor_census_imp_test_y.values, test_pred_ridge)
print(grid_ridge.best_params_)
test_r2_ridge

{'alpha': 10}


0.76663525561902179

In [35]:
# lasso tuning
lasso = sklearn.linear_model.Lasso(random_state = 1, tol = 1e-3, max_iter = 5000)

parameter_grid_lasso = {'alpha' : [1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001]}

grid_lasso = sklearn.model_selection.GridSearchCV(lasso, parameter_grid_lasso, cv = 10, refit = 'r2')
grid_lasso.fit(sensor_census_imp_train_x_stand, sensor_census_imp_train_y.values)
test_pred_lasso = grid_lasso.predict(sensor_census_imp_test_x_stand)
test_r2_lasso = sklearn.metrics.r2_score(sensor_census_imp_test_y.values, test_pred_lasso)
print(grid_lasso.best_params_)
test_r2_lasso

{'alpha': 0.001}


0.76687017097542165

In [None]:
# rf tuning
rf = sklearn.ensemble.RandomForestRegressor(n_estimators = 100, random_state = 1, n_jobs = 2, verbose = 1)

parameter_grid_rf = {'max_features' : [40, 50, 60, 70, 80]}

grid_rf = sklearn.model_selection.GridSearchCV(rf, parameter_grid_rf, cv = 4, refit = 'r2')
grid_rf.fit(sensor_census_imp_train_x_stand, sensor_census_imp_train_y.values)
test_pred_rf = grid_rf.predict(sensor_census_imp_test_x_stand)
test_r2_rf = sklearn.metrics.r2_score(sensor_census_imp_test_y.values, test_pred_rf)
print(grid_rf.best_params_)
test_r2_rf

[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    6.3s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   13.7s finished
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    6.7s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   13.6s finished
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    6.1s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   13.4s finished
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: 

In [61]:
# tuning gam
param_grid_gam = {'lam': np.logspace(-3, 3, 11), 'n_splines' : [4, 7, 10, 13]}
gam = LinearGAM(max_iter = 1000).gridsearch(X = sensor_census_imp_train_x_stand, y = sensor_census_imp_train_y.values, return_scores = True, objective = 'GCV',
                                 keep_best = True, **param_grid_gam)

test_pred_gam = gam.predict(sensor_census_imp_test_x_stand)
test_r2_gam = sklearn.metrics.r2_score(sensor_census_imp_test_y.values, test_pred_gam)
print(gam.summary())
test_r2_gam

  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity penalties and many splines.
See installation instructions for installing Scikit-Sparse and Suite-Sparse via Conda.
  self._validate_data_dep_params(X)
This will slow down optimization for models with monotonicity/convexity 

AttributeError: 'collections.OrderedDict' object has no attribute 'predict'

In [82]:
list(gam)[1].predict(sensor_census_imp_test_x_stand)
test_r2_gam = sklearn.metrics.r2_score(sensor_census_imp_test_y.values, list(gam)[1].predict(sensor_census_imp_test_x_stand))
test_r2_gam

-0.023292442369208688

In [74]:
gam

OrderedDict([(LinearGAM(callbacks=[Deviance(), Diffs()], constraints=None, 
                 dtype='auto', fit_intercept=True, fit_linear=True, 
                 fit_splines=True, lam=0.6, max_iter=1000, n_splines=5, 
                 penalties='auto', scale=None, spline_order=3, tol=0.0001),
              63790.360333775985),
             (LinearGAM(callbacks=[Deviance(), Diffs()], constraints=None, 
                 dtype='auto', fit_intercept=True, fit_linear=True, 
                 fit_splines=True, lam=0.6, max_iter=1000, n_splines=25, 
                 penalties='auto', scale=None, spline_order=3, tol=0.0001),
              63335.250810711412),
             (LinearGAM(callbacks=[Deviance(), Diffs()], constraints=None, 
                 dtype='auto', fit_intercept=True, fit_linear=True, 
                 fit_splines=True, lam=0.8, max_iter=1000, n_splines=5, 
                 penalties='auto', scale=None, spline_order=3, tol=0.0001),
              63808.467815804062),
            