In [1]:
import math
import matplotlib.pylab as plt
import numpy as np 
import pandas as pd 
import xgboost as xgb

from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

np.random.seed(171)

In [2]:
X_train = pd.read_csv(r"C:\Users\patri\Documents\GitHub\ECS171-Project-Group-10\Data\X_train.csv", delimiter = ',', header = None)
X_test = pd.read_csv(r"C:\Users\patri\Documents\GitHub\ECS171-Project-Group-10\Data\X_test.csv", delimiter = ',', header = None)
Y_train = pd.read_csv(r"C:\Users\patri\Documents\GitHub\ECS171-Project-Group-10\Data\Y_train.csv", header = None)
Y_test = pd.read_csv(r"C:\Users\patri\Documents\GitHub\ECS171-Project-Group-10\Data\Y_test.csv", header = None)

In [3]:
rf = RandomForestRegressor(n_jobs = -1)

rf.fit(X_train, Y_train)
Y_pred_train_rf = rf.predict(X_train)
Y_pred_rf = rf.predict(X_test)

print("Random Forest Training Score:", rf.score(X_train, Y_train))
print("Random Forest Testing Score:", rf.score(X_test, Y_test))
print("Random Forest Training RMSE", mean_squared_error(Y_train, Y_pred_train_rf))
print("Random Forest Testing RMSE:", mean_squared_error(Y_test, Y_pred_rf, squared = False))

Random Forest Training Score: 0.9789047035558702
Random Forest Testing Score: 0.9202344847137612
Random Forest Training RMSE 24.57722324398197
Random Forest Testing RMSE: 9.742821488829312


In [10]:
rf_param = {'n_estimators': [100, 150, 200, 250],
            'max_depth': [30, 40, 50, 60, 70],
            'max_features': ['auto', 'sqrt', 'log2', 15, 20, 25, 30, 35, 40]}

rf_param_ccpa = {'ccp_alpha': [0.0025, 0.005, 0.0075, 0.010, 0.0125, 0.015, 0.0175, 0.020, 0.0225, 0.025, 0.0275, 0.030]}

print("Now running RandomizedSearchCV on Tree Hyperparameters...\n")

rf_rands = RandomizedSearchCV(rf, rf_param, cv = 10, n_jobs = -1, verbose = True)
rf_rands.fit(X_train, Y_train)

print("\n========================= Hyperparameter Tuning =========================")
print("\nThe score for the best model is:", rf_rands.best_score_)
print("\nThe best parameters are:\n", rf_rands.best_params_)
print("=========================================================================")
print("Now running RandomizedSearchCV on CCP Alpha values...\n")
rf_rands_ccpa = RandomizedSearchCV(rf, rf_param_ccpa, cv = 10, n_jobs = -1, verbose = True)
rf_rands_ccpa.fit(X_train, Y_train)

print("\n========================= CCP Alpha Pruning =========================")
print("\nThe score for the best model is:", rf_rands_ccpa.best_score_)
print("\nThe best CCP Alpha is:\n", rf_rands_ccpa.best_params_)
print("=====================================================================")

Now running RandomizedSearchCV on Tree Hyperparameters...

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  4.0min finished


The score for the best model is: 0.9205818641608783

The best parameters are:
 {'n_estimators': 250, 'max_features': 20, 'max_depth': 60}
Now running RandomizedSearchCV on CCPA Alpha Values...

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  8.7min finished


The score for the best model is: 0.917302958389261

The best parameters are:
 {'ccp_alpha': 0.0075}


In [11]:
rf_opt = RandomForestRegressor(n_estimators = 250, max_features = 25, max_depth = 30, n_jobs = -1)
#rf_opt = RandomForestRegressor(ccp_alpha = 0.0175, n_jobs = -1)
rf_opt.fit(X_train, Y_train)
Y_pred_train_rf_opt = rf_opt.predict(X_train)
Y_pred_rf_opt = rf_opt.predict(X_test)

print("Tree Param Random Forest Training Score:", rf_opt.score(X_train, Y_train))
print("Tree Param Random Forest Testing Score:", rf_opt.score(X_test, Y_test))
print("Tree Param Random Forest Training RMSE:", mean_squared_error(Y_train, Y_pred_train_rf_opt))
print("Tree Param Random Forest Testing RMSE:", mean_squared_error(Y_test, Y_pred_rf_opt, squared = False))

Tree Param Random Forest Training Score: 0.9794135856445266
Tree Param Random Forest Testing Score: 0.9218007222861753
Tree Param Random Forest Training RMSE: 23.9843465934502
Tree Param Random Forest Testing RMSE: 9.646694578477193


In [12]:
rf_opt = RandomForestRegressor(n_estimators = 250, max_features = 20, max_depth = 60, n_jobs = -1)
rf_opt.fit(X_train, Y_train)
Y_pred_train_rf_opt = rf_opt.predict(X_train)
Y_pred_rf_opt = rf_opt.predict(X_test)

print("Tree Param Random Forest Training Score:", rf_opt.score(X_train, Y_train))
print("Tree Param Random Forest Testing Score:", rf_opt.score(X_test, Y_test))
print("Tree Param Random Forest Training RMSE:", mean_squared_error(Y_train, Y_pred_train_rf_opt))
print("Tree Param Random Forest Testing RMSE:", mean_squared_error(Y_test, Y_pred_rf_opt, squared = False))

Tree Param Random Forest Training Score: 0.9793897766106074
Tree Param Random Forest Testing Score: 0.9220464189474245
Tree Param Random Forest Training RMSE: 24.012085475594063
Tree Param Random Forest Testing RMSE: 9.631528036217661


In [6]:
rf_ccpa = RandomForestRegressor(ccp_alpha = 0.0075, n_jobs = -1)
rf_ccpa.fit(X_train, Y_train)
Y_pred_train_ccpa = rf_ccpa.predict(X_train)
Y_pred_ccpa = rf_opt.predict(X_test)

print("Optimized Random Forest Training Score:", rf_opt.score(X_train, Y_train))
print("Optimized Random Forest Testing Score:", rf_opt.score(X_test, Y_test))
print("Optimized Random Forest Training RMSE:", mean_squared_error(Y_train, Y_pred_train_ccpa))
print("Optimized Random Forest Testing RMSE:", mean_squared_error(Y_test, Y_pred_ccpa, squared = False))

Optimized Random Forest Training Score: 0.9794383357141647
Optimized Random Forest Testing Score: 0.9222047035296672
Optimized Random Forest Training RMSE: 27.207440632241905
Optimized Random Forest Testing RMSE: 9.621744668715246
