In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# import linear regression, decision tree regressor and ada boost regressor, random forest regressor, xgboost regressor, ridge regressor, lasso regressor, elastic net regressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from xgboost import XGBRegressor

# run grid search on the above regressors, and use k fold cross validation
# make sure the number of canidates for each hyperparameter is less than 4
from sklearn.model_selection import GridSearchCV, KFold

# import mean squared error and r2 score
from sklearn.metrics import mean_squared_error, r2_score

# import joblib to save the model
import joblib
import json
# ignore all warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
pip install xgboost

Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/bc/43/242432efc3f60052a4a534dc4926b21e236ab4ec8d4920c593da3f65c65d/xgboost-2.0.2-py3-none-win_amd64.whl.metadata
  Downloading xgboost-2.0.2-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.2-py3-none-win_amd64.whl (99.8 MB)
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB 1.3 MB/s eta 0:01:18
   ---------------------------------------- 0.0/99.8 MB 388.9 kB/s eta 0:04:17
   ---------------------------------------- 0.1/99.8 MB 777.7 kB/s eta 0:02:09
   ---------------------------------------- 0.1/99.8 MB 655.4 kB/s eta 0:02:33
   ---------------------------------------- 0.1/99.8 MB 599.1 kB/s eta 0:02:47
   ---------------------------------------- 0.1/99.8 MB 532.5 kB/s eta 0:03:08
   ---------------------------------




   --------------------------------- ------ 83.1/99.8 MB 1.0 MB/s eta 0:00:17
   --------------------------------- ------ 83.1/99.8 MB 1.0 MB/s eta 0:00:17
   --------------------------------- ------ 83.1/99.8 MB 1.0 MB/s eta 0:00:17
   --------------------------------- ------ 83.2/99.8 MB 1.0 MB/s eta 0:00:17
   --------------------------------- ------ 83.2/99.8 MB 1.0 MB/s eta 0:00:16
   --------------------------------- ------ 83.3/99.8 MB 1.0 MB/s eta 0:00:16
   --------------------------------- ------ 83.3/99.8 MB 1.0 MB/s eta 0:00:16
   --------------------------------- ------ 83.3/99.8 MB 1.0 MB/s eta 0:00:16
   --------------------------------- ------ 83.3/99.8 MB 1.0 MB/s eta 0:00:16
   --------------------------------- ------ 83.5/99.8 MB 1.0 MB/s eta 0:00:16
   --------------------------------- ------ 83.6/99.8 MB 1.1 MB/s eta 0:00:16
   --------------------------------- ------ 83.7/99.8 MB 1.0 MB/s eta 0:00:16
   --------------------------------- ------ 83.7/99.8 MB 1.0 MB

In [2]:
# load data
train_data = np.load('ptrain.npy')
test_data = np.load('ptest.npy')
targets = pd.read_csv('training_data_targets.csv', header=None).to_numpy()[:,0]


In [12]:
# start with linear regression
# define the model
model = LinearRegression()
# define the grid search
grid = dict()
grid['fit_intercept'] = [True, False]
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=10, scoring='r2')
# perform the search
grid_result = grid_search.fit(train_data, targets)
# summarize the results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# save best parameters as a dictionary
best_params = grid_result.best_params_
# save as a json file
import json
with open('linear_regression_best_params.json', 'w') as f:
    json.dump(best_params, f)
# save the model
joblib.dump(grid_result.best_estimator_, 'linear_regression.pkl')
# evaluate the model
y_pred = grid_result.predict(test_data)
#save results
np.save('linear_regression_results.npy', y_pred)

Best: 0.080735 using {'fit_intercept': True}


In [14]:
# ridge regression
# define the model
model = Ridge()
# define the grid search
grid = dict()
grid['alpha'] = [0.1, 1, 10]
grid['max_iter'] = [1000, 2000, 4000]
grid['solver'] = ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=10, scoring='neg_mean_squared_error')
# perform the search
grid_result = grid_search.fit(train_data, targets)
# summarize the results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
best_params = grid_result.best_params_
# save as a json file
with open('ridge_regression_best_params.json', 'w') as f:
    json.dump(best_params, f)
# save the model
joblib.dump(grid_result.best_estimator_, 'ridge_regression.pkl')
# evaluate the model
y_pred = grid_result.predict(test_data)
#save results
np.save('ridge_regression_results.npy', y_pred)

Best: -232870.705847 using {'alpha': 10, 'max_iter': 4000, 'solver': 'saga'}


In [18]:
# decision tree regressor
# define the model
model = DecisionTreeRegressor()
# define the grid search
grid = dict()
grid['max_depth'] = [None, 10, 100, 500]
grid['min_samples_split'] = [2, 4, 8]
grid['min_samples_leaf'] = [1, 2, 4]

grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=10, scoring='r2')
# perform the search

grid_result = grid_search.fit(train_data, targets)
# summarize the results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
best_params = grid_result.best_params_
# save as a json file
with open('decision_tree_regressor_best_params.json', 'w') as f:
    json.dump(best_params, f)
# save the model
joblib.dump(grid_result.best_estimator_, 'decision_tree_regressor.pkl')
# evaluate the model
y_pred = grid_result.predict(test_data)
#save results
np.save('decision_tree_regressor_results.npy', y_pred)

Best: 0.882812 using {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [19]:
# ada boost regressor
# define the model
model = AdaBoostRegressor()
# define the grid search
grid = dict()
grid['n_estimators'] = [10, 100, 1000]
grid['learning_rate'] = [0.001, 0.01, 0.1, 1.0]
grid['loss'] = ['linear', 'square', 'exponential']
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=5, scoring='r2')
# perform the search

grid_result = grid_search.fit(train_data, targets)
# summarize the results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
best_params = grid_result.best_params_
# save as a json file
with open('ada_boost_regressor_best_params.json', 'w') as f:
    json.dump(best_params, f)
# save the model
joblib.dump(grid_result.best_estimator_, 'ada_boost_regressor.pkl')
# evaluate the model
y_pred = grid_result.predict(test_data)
#save results
np.save('ada_boost_regressor_results.npy', y_pred)

Best: 0.851353 using {'learning_rate': 0.001, 'loss': 'exponential', 'n_estimators': 10}


In [22]:
# random forest regressor
# define the model
model = RandomForestRegressor()
# define the grid search
grid = dict()
grid['n_estimators'] = [10, 100, 200]
grid['max_depth'] = [None, 10, 60, 100]
grid['min_samples_split'] = [2, 4, 8]
grid['min_samples_leaf'] = [1, 2, 4]

grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=10, scoring='r2')
# perform the search

grid_result = grid_search.fit(train_data, targets)
# summarize the results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
best_params = grid_result.best_params_
# save as a json file
with open('random_forest_regressor_best_params.json', 'w') as f:
    json.dump(best_params, f)
# save the model
joblib.dump(grid_result.best_estimator_, 'random_forest_regressor.pkl')
# evaluate the model
y_pred = grid_result.predict(test_data)
#save results
np.save('random_forest_regressor_results.npy', y_pred)

Best: 0.906636 using {'max_depth': 100, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [10]:
# define the model
model = XGBRegressor()
# define the grid search
grid = dict()
grid['n_estimators'] = [10, 100, 500, 1000]
grid['max_depth'] = [10, 100, 500, 1000]
grid['learning_rate'] = [0.001, 0.01, 0.1, 1.0]
grid['booster'] = ['gbtree', 'gblinear', 'dart']

grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=10, scoring='r2')
# perform the search

grid_result = grid_search.fit(train_data, targets)
# summarize the results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
best_params = grid_result.best_params_
# save as a json file
import json
with open('xgboost_regression_best_params.json', 'w') as f:
    json.dump(best_params, f)
# save the model
joblib.dump(grid_result.best_estimator_, 'xgboost_regressor.pkl')
# evaluate the model
y_pred = grid_result.predict(test_data)
#save results
np.save('xgboost_regressor_results.npy', y_pred)

Best: 0.889643 using {'booster': 'gbtree', 'learning_rate': 0.01, 'max_depth': 100, 'n_estimators': 1000}


In [18]:
random_forest_regressor_results = np.load('random_forest_regressor_results.npy')
len(random_forest_regressor_results)

2946

## Using Negative Mean Squared Error

In [6]:
targets = np.log(targets)

In [4]:
# start with linear regression
# define the model
model = LinearRegression()
# define the grid search
grid = dict()
grid['fit_intercept'] = [True, False]
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=10, scoring='neg_mean_squared_error')
# perform the search
grid_result = grid_search.fit(train_data, targets)
# summarize the results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# save best parameters as a dictionary
best_params = grid_result.best_params_
# save as a json file
import json
with open('linear_regression_best_params_nmse.json', 'w') as f:
    json.dump(best_params, f)
# save the model
joblib.dump(grid_result.best_estimator_, 'linear_regression_nmse.pkl')
# evaluate the model
y_pred = grid_result.predict(test_data)
#save results
np.save('linear_regression_results_nmse.npy', y_pred)

Best: -8.970014 using {'fit_intercept': False}


In [7]:
# ridge regression
# define the model
model = Ridge()
# define the grid search
grid = dict()
grid['alpha'] = [0.1, 1, 10]
grid['max_iter'] = [1000, 2000, 4000]
grid['solver'] = ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=10, scoring='neg_mean_squared_error')
# perform the search
grid_result = grid_search.fit(train_data, targets)
# summarize the results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
best_params = grid_result.best_params_
# save as a json file
with open('ridge_regression_best_params_nmse.json', 'w') as f:
    json.dump(best_params, f)
# save the model
joblib.dump(grid_result.best_estimator_, 'ridge_regression_nmse.pkl')
# evaluate the model
y_pred = grid_result.predict(test_data)
#save results
np.save('ridge_regression_results_nmse.npy', y_pred)

Best: -1.799904 using {'alpha': 10, 'max_iter': 2000, 'solver': 'saga'}


In [8]:
# decision tree regressor
# define the model
model = DecisionTreeRegressor()
# define the grid search
grid = dict()
grid['max_depth'] = [None, 10, 100, 500]
grid['min_samples_split'] = [2, 4, 8]
grid['min_samples_leaf'] = [1, 2, 4]

grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=10, scoring='neg_mean_squared_error')
# perform the search

grid_result = grid_search.fit(train_data, targets)
# summarize the results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
best_params = grid_result.best_params_
# save as a json file
with open('decision_tree_regressor_best_params_nmse.json', 'w') as f:
    json.dump(best_params, f)
# save the model
joblib.dump(grid_result.best_estimator_, 'decision_tree_regressor_nmse.pkl')
# evaluate the model
y_pred = grid_result.predict(test_data)
#save results
np.save('decision_tree_regressor_results_nmse.npy', y_pred)

Best: -0.164837 using {'max_depth': 100, 'min_samples_leaf': 4, 'min_samples_split': 8}


In [9]:
# ada boost regressor
# define the model
model = AdaBoostRegressor()
# define the grid search
grid = dict()
grid['n_estimators'] = [10, 100, 1000]
grid['learning_rate'] = [0.001, 0.01, 0.1, 1.0]
grid['loss'] = ['linear', 'square', 'exponential']
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=5, scoring='neg_mean_squared_error')
# perform the search

grid_result = grid_search.fit(train_data, targets)
# summarize the results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
best_params = grid_result.best_params_
# save as a json file
with open('ada_boost_regressor_best_params.json_nmse', 'w') as f:
    json.dump(best_params, f)
# save the model
joblib.dump(grid_result.best_estimator_, 'ada_boost_regressor_nmse.pkl')
# evaluate the model
y_pred = grid_result.predict(test_data)
#save results
np.save('ada_boost_regressor_results_nmse.npy', y_pred)

Best: -0.338826 using {'learning_rate': 0.001, 'loss': 'linear', 'n_estimators': 1000}


In [10]:
# random forest regressor
# define the model
model = RandomForestRegressor()
# define the grid search
grid = dict()
grid['n_estimators'] = [10, 100, 200]
grid['max_depth'] = [10, 60, 100, 500]
grid['min_samples_split'] = [2, 4, 8]
grid['min_samples_leaf'] = [1, 2, 4]

grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=10, scoring='neg_mean_squared_error')
# perform the search

grid_result = grid_search.fit(train_data, targets)
# summarize the results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
best_params = grid_result.best_params_
# save as a json file
with open('random_forest_regressor_best_params_nmse.json', 'w') as f:
    json.dump(best_params, f)
# save the model
joblib.dump(grid_result.best_estimator_, 'random_forest_regressor_nmse.pkl')
# evaluate the model
y_pred = grid_result.predict(test_data)
#save results
np.save('random_forest_regressor_results_nmse.npy', y_pred)

Best: -0.146748 using {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}


In [11]:
# define the model
model = XGBRegressor()
# define the grid search
grid = dict()
grid['n_estimators'] = [10, 100, 500, 1000]
grid['max_depth'] = [5, 10, 100]
grid['learning_rate'] = [0.001, 0.01, 0.1, 1.0]
grid['booster'] = ['gbtree', 'gblinear', 'dart']

grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=10, scoring='neg_mean_squared_error')
# perform the search

grid_result = grid_search.fit(train_data, targets)
# summarize the results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
best_params = grid_result.best_params_
# save as a json file
import json
with open('xgboost_regression_best_params.json_nmse', 'w') as f:
    json.dump(best_params, f)
# save the model
joblib.dump(grid_result.best_estimator_, 'xgboost_regressor_nmse.pkl')
# evaluate the model
y_pred = grid_result.predict(test_data)
#save results
np.save('xgboost_regressor_results.npy_nmse', y_pred)

Best: -0.110529 using {'booster': 'gbtree', 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 1000}


In [4]:
train_data[0]

array([ 1.        ,  0.        , -0.00664863,  0.        ,  1.        ,
       -0.44197478, -0.26815245,  0.07755049,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.  