In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Regressors imported for this notebook: linear regression, ridge, lasso, elastic net,
# decision tree, AdaBoost, random forest and XGBoost.
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from xgboost import XGBRegressor

# Hyperparameters are tuned with grid search under k-fold cross validation.
# Guideline: keep the number of candidates for each hyperparameter below 4.
from sklearn.model_selection import GridSearchCV, KFold

# Metrics: mean squared error and R^2 score.
from sklearn.metrics import mean_squared_error, r2_score

# joblib is used to save fitted models to disk.
import joblib

# Ignore all warnings for the rest of the session.
# NOTE(review): this blanket filter presumably also hides solver convergence
# warnings -- confirm that is intended.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the train/test feature arrays from their .npy files.
train_data, test_data = (
    np.load(path) for path in ('ptrain_pca.npy', 'ptest_pca.npy')
)
# Targets are the first column of the header-less CSV, as a NumPy array.
targets = (
    pd.read_csv('training_data_targets.csv', header=None)
    .to_numpy()[:, 0]
)

In [3]:
# Baseline: ordinary least squares; the only tuned option is whether to fit an intercept.
model = LinearRegression()
grid = {'fit_intercept': [True, False]}
# 10-fold cross-validated grid search, candidates ranked by R^2.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='r2',
    cv=10,
    n_jobs=-1,
)
grid_result = grid_search.fit(train_data, targets)
best_params = grid_result.best_params_
# Report the best cross-validated score together with the winning parameters.
print("Best: %f using %s" % (grid_result.best_score_, best_params))
# Store the winning hyperparameters as JSON and the best estimator as a joblib pickle.
import json
with open('linear_regression_best_params_pca.json', 'w') as f:
    json.dump(best_params, f)
joblib.dump(grid_result.best_estimator_, 'linear_regression_pca.pkl')
# Predict on the test features and save the predictions.
y_pred = grid_result.predict(test_data)
np.save('linear_regression_results_pca.npy', y_pred)

Best: 0.175019 using {'fit_intercept': False}


In [5]:
# ridge regression, candidates ranked by negative mean squared error
# define the model
# A fixed random_state makes the 'sag'/'saga' solvers (which shuffle the data) reproducible.
model = Ridge(random_state=42)
# define the grid search
grid = dict()
grid['alpha'] = [0.1, 1, 10]
grid['max_iter'] = [1000, 2000, 4000]
grid['solver'] = ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=10, scoring='neg_mean_squared_error')
# perform the search
grid_result = grid_search.fit(train_data, targets)
# summarize the results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
best_params = grid_result.best_params_
# The artifacts of this run carry an `_mse` suffix: the R^2-scored ridge cell writes
# 'ridge_regression_best_params_pca.json', 'ridge_regression_pca.pkl' and
# 'ridge_regression_results_pca.npy', and would otherwise silently overwrite them.
# save as a json file
with open('ridge_regression_best_params_pca_mse.json', 'w') as f:
    json.dump(best_params, f)
# save the model
joblib.dump(grid_result.best_estimator_, 'ridge_regression_pca_mse.pkl')
# evaluate the model
y_pred = grid_result.predict(test_data)
#save results
np.save('ridge_regression_results_pca_mse.npy', y_pred)

Best: -198761.567201 using {'alpha': 10, 'max_iter': 2000, 'solver': 'saga'}


In [6]:
# ridge regression, candidates ranked by R^2
# define the model
# A fixed random_state makes the 'sag'/'saga' solvers (which shuffle the data) reproducible.
model = Ridge(random_state=42)
# define the grid search
grid = dict()
grid['alpha'] = [0.1, 1, 10]
grid['max_iter'] = [1000, 2000, 4000]
grid['solver'] = ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=10, scoring='r2')
# perform the search
grid_result = grid_search.fit(train_data, targets)
# summarize the results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
best_params = grid_result.best_params_
# save as a json file
with open('ridge_regression_best_params_pca.json', 'w') as f:
    json.dump(best_params, f)
# save the model
joblib.dump(grid_result.best_estimator_, 'ridge_regression_pca.pkl')
# evaluate the model
y_pred = grid_result.predict(test_data)
#save results
np.save('ridge_regression_results_pca.npy', y_pred)

Best: 0.482418 using {'alpha': 10, 'max_iter': 4000, 'solver': 'saga'}


In [7]:
# decision tree regressor, candidates ranked by R^2
# define the model
# A fixed random_state makes tree construction (random feature permutation at each split) reproducible.
model = DecisionTreeRegressor(random_state=42)
# define the grid search
grid = dict()
grid['max_depth'] = [None, 10, 100, 500]
grid['min_samples_split'] = [2, 4, 8]
grid['min_samples_leaf'] = [1, 2, 4]

grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=10, scoring='r2')
# perform the search

grid_result = grid_search.fit(train_data, targets)
# summarize the results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
best_params = grid_result.best_params_
# The artifacts of this run carry an `_r2` suffix: the neg-MSE-scored decision tree cell
# writes 'decision_tree_regressor_best_params_pca.json', 'decision_tree_regressor_pca.pkl'
# and 'decision_tree_regressor_results_pca.npy', and would otherwise silently overwrite them.
# save as a json file
with open('decision_tree_regressor_best_params_pca_r2.json', 'w') as f:
    json.dump(best_params, f)
# save the model
joblib.dump(grid_result.best_estimator_, 'decision_tree_regressor_pca_r2.pkl')
# evaluate the model
y_pred = grid_result.predict(test_data)
#save results
np.save('decision_tree_regressor_results_pca_r2.npy', y_pred)

Best: 0.890967 using {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 8}


In [8]:
# decision tree regressor, candidates ranked by negative mean squared error
# define the model
# A fixed random_state makes tree construction (random feature permutation at each split) reproducible.
model = DecisionTreeRegressor(random_state=42)
# define the grid search
grid = dict()
grid['max_depth'] = [None, 10, 100, 500]
grid['min_samples_split'] = [2, 4, 8]
grid['min_samples_leaf'] = [1, 2, 4]

grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=10, scoring='neg_mean_squared_error')
# perform the search

grid_result = grid_search.fit(train_data, targets)
# summarize the results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
best_params = grid_result.best_params_
# save as a json file
with open('decision_tree_regressor_best_params_pca.json', 'w') as f:
    json.dump(best_params, f)
# save the model
joblib.dump(grid_result.best_estimator_, 'decision_tree_regressor_pca.pkl')
# evaluate the model
y_pred = grid_result.predict(test_data)
#save results
np.save('decision_tree_regressor_results_pca.npy', y_pred)

Best: -52053.822410 using {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 8}


In [9]:
# ada boost regressor, candidates ranked by R^2 (5-fold cross validation)
# define the model
# A fixed random_state makes the boosting run reproducible.
model = AdaBoostRegressor(random_state=42)
# define the grid search
grid = dict()
grid['n_estimators'] = [10, 100, 1000]
grid['learning_rate'] = [0.001, 0.01, 0.1, 1.0]
grid['loss'] = ['linear', 'square', 'exponential']
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=5, scoring='r2')
# perform the search

grid_result = grid_search.fit(train_data, targets)
# summarize the results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
best_params = grid_result.best_params_
# save as a json file
# (the `_pca` tag goes before the extension so the file really ends in `.json`)
with open('ada_boost_regressor_best_params_pca.json', 'w') as f:
    json.dump(best_params, f)
# save the model
joblib.dump(grid_result.best_estimator_, 'ada_boost_regressor_pca.pkl')
# evaluate the model
y_pred = grid_result.predict(test_data)
#save results
np.save('ada_boost_regressor_results_pca.npy', y_pred)

Best: 0.851712 using {'learning_rate': 0.001, 'loss': 'linear', 'n_estimators': 100}


In [14]:
# random forest regressor, candidates ranked by R^2
# define the model
# A fixed random_state makes the bootstrap sampling and feature selection reproducible.
model = RandomForestRegressor(random_state=42)
# define the grid search
grid = dict()
grid['n_estimators'] = [10, 100, 200]
grid['max_depth'] = [None, 10, 60, 100]
grid['min_samples_split'] = [2, 4, 8]
grid['min_samples_leaf'] = [1, 2, 4]

grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=10, scoring='r2')
# perform the search

grid_result = grid_search.fit(train_data, targets)
# summarize the results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
best_params = grid_result.best_params_
# save as a json file
with open('random_forest_regressor_best_params_pca.json', 'w') as f:
    json.dump(best_params, f)
# save the model
joblib.dump(grid_result.best_estimator_, 'random_forest_regressor_pca.pkl')
# evaluate the model
y_pred = grid_result.predict(test_data)
#save results
np.save('random_forest_regressor_results_pca.npy', y_pred)

Best: 0.890539 using {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}


In [None]:
# xgboost regressor, candidates ranked by R^2
# define the model
model = XGBRegressor()
# define the grid search
grid = dict()
grid['n_estimators'] = [10, 100, 500, 1000]
grid['max_depth'] = [5, 10, 100]
grid['learning_rate'] = [0.001, 0.01, 0.1, 1.0]
grid['booster'] = ['gbtree', 'gblinear', 'dart']

grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=10, scoring='r2')
# perform the search

grid_result = grid_search.fit(train_data, targets)
# summarize the results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
best_params = grid_result.best_params_
# save as a json file
# (the `_pca` tag goes before the extension so the file really ends in `.json`)
import json
with open('xgboost_regression_best_params_pca.json', 'w') as f:
    json.dump(best_params, f)
# save the model
joblib.dump(grid_result.best_estimator_, 'xgboost_regressor_pca.pkl')
# evaluate the model
y_pred = grid_result.predict(test_data)
#save results
# np.save appends '.npy' to any name that does not already end with it, so the tag
# must precede the extension; otherwise the file lands at '...results.npy_pca.npy'.
np.save('xgboost_regressor_results_pca.npy', y_pred)

## Using Negative Mean Squared Error

In [3]:
# Switch the regression target to log scale for the cells that follow.
# The raw targets are re-read from the CSV instead of transforming `targets` in place,
# so running this cell more than once does not apply the logarithm repeatedly.
raw_targets = pd.read_csv('training_data_targets.csv', header=None).to_numpy()[:,0]
# The logarithm is only finite for strictly positive values.
assert np.all(raw_targets > 0), "log transform requires strictly positive targets"
targets = np.log(raw_targets)
# NOTE(review): models fitted on `targets` after this cell predict log(target); the
# saved predictions are presumably meant to be mapped back with np.exp -- confirm.

In [16]:
# Baseline: ordinary least squares; the only tuned option is whether to fit an intercept.
model = LinearRegression()
grid = {'fit_intercept': [True, False]}
# 10-fold cross-validated grid search, candidates ranked by negative mean squared error.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)
grid_result = grid_search.fit(train_data, targets)
best_params = grid_result.best_params_
# Report the best cross-validated score together with the winning parameters.
print("Best: %f using %s" % (grid_result.best_score_, best_params))
# Store the winning hyperparameters as JSON and the best estimator as a joblib pickle.
import json
with open('linear_regression_best_params_pca_nmse.json', 'w') as f:
    json.dump(best_params, f)
joblib.dump(grid_result.best_estimator_, 'linear_regression_pca_nmse.pkl')
# Predict on the test features and save the predictions.
y_pred = grid_result.predict(test_data)
np.save('linear_regression_results_pca_nmse.npy', y_pred)

Best: -17.599714 using {'fit_intercept': False}


In [17]:
# ridge regression, candidates ranked by negative mean squared error
# define the model
# A fixed random_state makes the 'sag'/'saga' solvers (which shuffle the data) reproducible.
model = Ridge(random_state=42)
# define the grid search
grid = dict()
grid['alpha'] = [0.1, 1, 10]
grid['max_iter'] = [1000, 2000, 4000]
grid['solver'] = ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=10, scoring='neg_mean_squared_error')
# perform the search
grid_result = grid_search.fit(train_data, targets)
# summarize the results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
best_params = grid_result.best_params_
# save as a json file
with open('ridge_regression_best_params_pca_nmse.json', 'w') as f:
    json.dump(best_params, f)
# save the model
joblib.dump(grid_result.best_estimator_, 'ridge_regression_pca_nmse.pkl')
# evaluate the model
y_pred = grid_result.predict(test_data)
#save results
# np.save appends '.npy' to any name that does not already end with it, so the tag
# must precede the extension; otherwise the file lands at '...pca.npy_nmse.npy'.
np.save('ridge_regression_results_pca_nmse.npy', y_pred)

Best: -3.491327 using {'alpha': 10, 'max_iter': 2000, 'solver': 'saga'}


In [18]:
# decision tree regressor, candidates ranked by negative mean squared error
# define the model
# A fixed random_state makes tree construction (random feature permutation at each split) reproducible.
model = DecisionTreeRegressor(random_state=42)
# define the grid search
grid = dict()
grid['max_depth'] = [None, 10, 100, 500]
grid['min_samples_split'] = [2, 4, 8]
grid['min_samples_leaf'] = [1, 2, 4]

grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=10, scoring='neg_mean_squared_error')
# perform the search

grid_result = grid_search.fit(train_data, targets)
# summarize the results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
best_params = grid_result.best_params_
# save as a json file
with open('decision_tree_regressor_best_params_pca_nmse.json', 'w') as f:
    json.dump(best_params, f)
# save the model
joblib.dump(grid_result.best_estimator_, 'decision_tree_regressor_pca_nmse.pkl')
# evaluate the model
y_pred = grid_result.predict(test_data)
#save results
np.save('decision_tree_regressor_results_pca_nmse.npy', y_pred)

Best: -0.164042 using {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 8}


In [19]:
# ada boost regressor, candidates ranked by negative mean squared error (5-fold cross validation)
# define the model
# A fixed random_state makes the boosting run reproducible.
model = AdaBoostRegressor(random_state=42)
# define the grid search
grid = dict()
grid['n_estimators'] = [10, 100, 1000]
grid['learning_rate'] = [0.001, 0.01, 0.1, 1.0]
grid['loss'] = ['linear', 'square', 'exponential']
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=5, scoring='neg_mean_squared_error')
# perform the search

grid_result = grid_search.fit(train_data, targets)
# summarize the results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
best_params = grid_result.best_params_
# save as a json file
# (the `_pca_nmse` tag goes before the extension so the file really ends in `.json`)
with open('ada_boost_regressor_best_params_pca_nmse.json', 'w') as f:
    json.dump(best_params, f)
# save the model
joblib.dump(grid_result.best_estimator_, 'ada_boost_regressor_pca_nmse.pkl')
# evaluate the model
y_pred = grid_result.predict(test_data)
#save results
np.save('ada_boost_regressor_results_pca_nmse.npy', y_pred)

Best: -0.335058 using {'learning_rate': 0.01, 'loss': 'linear', 'n_estimators': 100}


In [20]:
# random forest regressor, candidates ranked by negative mean squared error
# define the model
# A fixed random_state makes the bootstrap sampling and feature selection reproducible.
model = RandomForestRegressor(random_state=42)
# define the grid search
grid = dict()
grid['n_estimators'] = [10, 100, 200]
grid['max_depth'] = [10, 60, 100, 500]
grid['min_samples_split'] = [2, 4, 8]
grid['min_samples_leaf'] = [1, 2, 4]

grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=10, scoring='neg_mean_squared_error')
# perform the search

grid_result = grid_search.fit(train_data, targets)
# summarize the results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
best_params = grid_result.best_params_
# save as a json file
with open('random_forest_regressor_best_params_pca_nmse.json', 'w') as f:
    json.dump(best_params, f)
# save the model
joblib.dump(grid_result.best_estimator_, 'random_forest_regressor_pca_nmse.pkl')
# evaluate the model
y_pred = grid_result.predict(test_data)
#save results
np.save('random_forest_regressor_results_pca_nmse.npy', y_pred)

Best: -0.136311 using {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}


In [4]:
# xgboost regressor, candidates ranked by negative mean squared error
# define the model
model = XGBRegressor()
# define the grid search
grid = dict()
grid['n_estimators'] = [10, 100, 500, 1000]
grid['max_depth'] = [5, 10, 100]
grid['learning_rate'] = [0.001, 0.01, 0.1, 1.0]
grid['booster'] = ['gbtree', 'gblinear', 'dart']

grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=10, scoring='neg_mean_squared_error')
# perform the search

grid_result = grid_search.fit(train_data, targets)
# summarize the results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
best_params = grid_result.best_params_
# save as a json file
# (the `_pca_nmse` tag goes before the extension so the file really ends in `.json`)
import json
with open('xgboost_regression_best_params_pca_nmse.json', 'w') as f:
    json.dump(best_params, f)
# save the model
joblib.dump(grid_result.best_estimator_, 'xgboost_regressor_pca_nmse.pkl')
# evaluate the model
y_pred = grid_result.predict(test_data)
#save results
# np.save appends '.npy' to any name that does not already end with it, so the tag
# must precede the extension; otherwise the file lands at '...results.npy_pca_nmse.npy'.
np.save('xgboost_regressor_results_pca_nmse.npy', y_pred)

Best: -0.110677 using {'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 1000}
