<a href="https://colab.research.google.com/github/Base-R-Best-R/Auction/blob/main/Code/Models/Colab/RF_Train_Sk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RF in Sklearn

In [55]:
# Imports
import pandas as pd
import numpy as np
import sklearn
import pickle
from pprint import pprint
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error


# gdrive: mount Google Drive at /content/drive so the CSV input and the pickled
# model output used further down can be read from / written to Drive.
# NOTE(review): the data and model paths below are relative ("drive/MyDrive/..."),
# so they presumably rely on the working directory being /content -- confirm.
from google.colab import drive
drive.mount('/content/drive')

# helper for persisting Python objects (e.g. fitted models) to disk
def save_object(obj, filename):
    """Serialize ``obj`` to ``filename`` with pickle.

    Parameters
    ----------
    obj : object
        Any picklable Python object.
    filename : str or path-like
        Destination file; it is opened in binary write mode, so an existing
        file at that location is overwritten.

    Notes
    -----
    Uses ``pickle.HIGHEST_PROTOCOL``, i.e. the newest pickle protocol supported
    by the running interpreter.
    """
    with open(filename, mode = 'wb') as sink:
        pickle.dump(obj, sink, protocol = pickle.HIGHEST_PROTOCOL)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the feature-engineered training set (a CSV file below the Drive folder)
dat_train = pd.read_csv(
    filepath_or_buffer = "drive/MyDrive/Master_Thesis/Input_Data/Aucs_df_feateng_train.csv"
)

# glimpse at the first five rows
dat_train.head(n = 5)

In [None]:
## one-hot-encoding ##

# variables that we will not use in our model
unused_cols = ["EW_Diff", "MLOT", "Contract_ID"]

# Derive a separate modelling frame instead of editing dat_train in place.
# Mutating dat_train made this cell non-re-runnable: a second run stacked the
# prefixes ("MM1", "YY2015", ...) and raised a KeyError on the already-dropped
# columns. It also left the earlier dat_train.head() display out of date.
dat_model = dat_train.drop(columns = unused_cols).assign(
    # change year and month to prefixed strings so that get_dummies encodes
    # them as categorical levels rather than leaving them numeric
    Letting_Month = lambda df: "M" + df["Letting_Month"].astype("str"),
    Letting_Year = lambda df: "Y" + df["Letting_Year"].astype("str"),
)

# one-hot matrix
feat = pd.get_dummies(dat_model)

# glimpse
feat.head()

In [4]:
## prep model input ##

# NOTE: this cell removes Winning_Bid from `feat` and then rebinds `feat` from a
# DataFrame to a plain ndarray, so it cannot be re-run on its own -- re-run the
# one-hot-encoding cell first.

# dep. var.: pop() returns the Winning_Bid column and removes it from the features
W_bid = np.array(feat.pop("Winning_Bid"))

# save column names (the ndarray created below no longer carries them)
feat_list = feat.columns.tolist()

# to np array
feat = np.array(feat)

# Random Forest CV


In [37]:
## Model ##

# glimpse at amount of vars: (rows, columns) of the feature matrix.
# This must be printed explicitly -- a bare expression in the middle of a cell
# is evaluated and silently discarded, only the last expression is displayed.
print(np.shape(feat))

# set method (fixed seed so the forest is reproducible)
rf = RandomForestRegressor(random_state = 33)

# display possible inputs
pprint(rf.get_params())

# build tuning grid: candidate values the randomized search samples from
tgrid = {
 'criterion': ['squared_error'],
 # 10, 20, ..., 110 plus one much deeper setting
 'max_depth': np.append([int(x) for x in np.linspace(10, 110, num = 11)], 200),
 # 10, 20, ..., 110 plus a few large values; each value is a number of
 # features considered per split and must not exceed the column count printed above
 'max_features': np.append([int(x) for x in np.linspace(10, 110, num = 11)], [500, 800, 1000, 1500]),
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 # 200, 400, ..., 2000 trees
 'n_estimators': [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
}

# display tuning grid
pprint(tgrid)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 33,
 'verbose': 0,
 'warm_start': False}
{'criterion': ['squared_error'],
 'max_depth': array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 200]),
 'max_features': array([  10,   20,   30,   40,   50,   60,   70,   80,   90,  100,  110,
        500,  800, 1000, 1500]),
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [42]:
# CV settings: randomized search drawing 200 candidates from tgrid, each scored
# with 5-fold cross-validation. No `scoring` is passed, so the estimator's
# default score is used. n_jobs = -1 runs the fits in parallel.
rf_CV_rand = RandomizedSearchCV(
    rf,
    tgrid,
    n_iter = 200,
    cv = 5,
    random_state = 33,
    n_jobs = -1,
    verbose = 2,
)

# train (kept as the last expression so the fitted search object is echoed)
rf_CV_rand.fit(feat, W_bid)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(random_state=33),
                   n_iter=200, n_jobs=-1,
                   param_distributions={'criterion': ['squared_error'],
                                        'max_depth': array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 200]),
                                        'max_features': array([  10,   20,   30,   40,   50,   60,   70,   80,   90,  100,  110,
        500,  800, 1000, 1500]),
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=33, verbose=2)

In [56]:
# view the best hyper-parameter combination, followed by its mean
# cross-validated score (no `scoring` was passed to the search, so this is the
# estimator's default metric, not an error measure)
print(rf_CV_rand.best_params_, rf_CV_rand.best_score_, sep = "\n")

# save the complete fitted search object
save_object(obj = rf_CV_rand, filename = "drive/MyDrive/Master_Thesis/Models_MT/RF_CV_rand.pkl")

{'n_estimators': 1600, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 1500, 'max_depth': 20, 'criterion': 'squared_error'}
0.8990087017708138
