<a href="https://colab.research.google.com/github/Base-R-Best-R/Auction/blob/main/Code/Models/Colab/RF_Train_Sk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RF in Sklearn

In [1]:
# Imports
import pandas as pd
import numpy as np
import sklearn
import pickle
from pprint import pprint
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error


# gdrive
# Mount Google Drive at /content/drive. The data and model paths used further
# down are relative ("drive/MyDrive/..."), which presumably relies on the
# working directory being /content — TODO confirm.
from google.colab import drive
drive.mount("/content/drive")

## helper functions for reading and writing pickle files ##

# write
def save_object(obj, filename):
    """Pickle ``obj`` into the file at ``filename``.

    The target is opened in binary write mode, so an existing file at that
    path is replaced. The highest pickle protocol of the running interpreter
    is used, which means the file may not be readable by older Python versions.

    Parameters
    ----------
    obj : any picklable object
        The object to serialize.
    filename : str or path-like
        Destination path of the pickle file.
    """
    with open(filename, "wb") as handle:
        writer = pickle.Pickler(handle, pickle.HIGHEST_PROTOCOL)
        writer.dump(obj)

# read
def load_object(filename):
    """Read the pickle file at ``filename`` and return the stored object.

    Parameters
    ----------
    filename : str or path-like
        Path of a pickle file, e.g. one written by ``save_object``.

    Returns
    -------
    The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code, only load trusted files
    with open(filename, "rb") as handle:
        restored = pickle.Unpickler(handle).load()
    return restored

Mounted at /content/drive


In [6]:
# file names (index 0 = test set, index 1 = training set)
nom = ["Aucs_df_feateng_test.csv", "Aucs_df_feateng_train.csv"]

# read every file into its own data frame, keeping the order of nom
df_lst = [
  pd.read_csv("drive/MyDrive/Master_Thesis/Input_Data/" + file_name)
  for file_name in nom
]

In [7]:
## one-hot-encoding ##

# variables that we will not use in our model
drop_cols = ["EW_Diff", "MLOT", "Contract_ID"]

# storage (same order as df_lst: index 0 = test set, index 1 = training set)
oh_mats = [np.nan, np.nan]

for i, df in enumerate(df_lst):

  # Build a modified copy instead of changing df in place. The frames in
  # df_lst stay untouched, so re-running this cell neither stacks the "M"/"Y"
  # prefixes, nor rescales the monetary columns a second time, nor fails
  # because the dropped columns are already gone.
  prepped = df.drop(columns = drop_cols).assign(
      # year and month to str so that they are dummy encoded as categories
      Letting_Month = "M" + df.Letting_Month.astype("str"),
      Letting_Year = "Y" + df.Letting_Year.astype("str"),
      # rescale the monetary columns by a factor of 1e-3
      Winning_Bid = df.Winning_Bid / 1e3,
      Eng_Est = df.Eng_Est / 1e3,
  )

  # one-hot-matrix
  oh_mats[i] = pd.get_dummies(prepped)

# Both sets are encoded separately, hence a category level that only occurs in
# one of them yields a column the other set does not have. Align the test
# matrix to the training columns (missing dummies are filled with 0, levels
# unseen in training are dropped) so that a model fitted on the training
# features can be applied to the test features.
oh_mats[0] = oh_mats[0].reindex(columns = oh_mats[1].columns, fill_value = 0)

In [8]:
## prep model input ##

# storage, one slot per data set (same order as oh_mats)
dep = [np.nan, np.nan]
feat = [np.nan, np.nan]
# NOTE: nom held the input file names up to this point and is reused here
# for the feature column names
nom = [np.nan, np.nan]

for i, df in enumerate(oh_mats):

  # dep. var.
  dep[i] = np.array(df.Winning_Bid)

  # features rm W_bid; drop on a copy so that oh_mats keeps the column and
  # this cell can be re-run without failing on the missing Winning_Bid
  features = df.drop(columns = "Winning_Bid")

  # save column names
  nom[i] = list(features.columns)

  # to np array
  feat[i] = np.array(features)

# Random Forest CV


In [None]:
## Model ##

# base estimator with a fixed seed
rf = RandomForestRegressor(random_state = 33)

# display possible inputs
pprint(rf.get_params())

# 10, 20, ..., 110: base range shared by max_depth and max_features
base_steps = list(range(10, 111, 10))

# tuning grid the randomized search draws its candidates from
tgrid = dict(
  criterion = ['squared_error'],
  max_depth = np.array(base_steps + [200]),
  max_features = np.array(base_steps + [500, 800, 1000]),
  min_samples_leaf = [1, 2, 4],
  min_samples_split = [2, 5, 10],
  # 200, 400, ..., 2000 trees
  n_estimators = list(range(200, 2001, 200))
)

In [21]:
# CV settings: 200 random draws from tgrid, each evaluated with 5-fold CV,
# using all available cores
search_settings = dict(
  estimator = rf,
  param_distributions = tgrid,
  n_iter = 200,
  cv = 5,
  verbose = 2,
  random_state = 33,
  n_jobs = -1
)
rf_CV_rand = RandomizedSearchCV(**search_settings)

# train on the second data set (index 1)
X_train, y_train = feat[1], dep[1]
rf_CV_rand.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(random_state=33),
                   n_iter=200, n_jobs=-1,
                   param_distributions={'criterion': ['squared_error'],
                                        'max_depth': array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 200]),
                                        'max_features': array([  10,   20,   30,   40,   50,   60,   70,   80,   90,  100,  110,
        500,  800, 1000]),
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=33, verbose=2)

In [24]:
# view the best parameter combination, followed by its score on a new line
print(rf_CV_rand.best_params_, rf_CV_rand.best_score_, sep = "\n")

# save the complete search object
model_path = "drive/MyDrive/Master_Thesis/Models_MT/RF_CV_rand.pkl"
save_object(rf_CV_rand, model_path)

{'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 1000, 'max_depth': 40, 'criterion': 'squared_error'}
0.9028404945714119


In [25]:
# unmount gdrive
# This is the last cell and runs after the search object was pickled into the
# mounted drive — presumably it ensures pending writes reach Drive before the
# session ends; TODO confirm.
drive.flush_and_unmount()