In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import pickle

# Read the comma-separated modelling table and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='data_2_model2.csv', delimiter=',')
data.head(n=5)

Unnamed: 0,assumed_danger_states_new,Max_Success,Max_Best,Max_Exp,Home_Plate_Control,Woman_Adv,OD_MST_Ratio,All_OCR,O_Avg_Edge,D_Avg_Edge,...,ScoreProb_p3,ScoreProb_p4,ScoreProb_p5,ScoreProb_p6,Exp_p1,Exp_p2,Exp_p3,Exp_p4,Exp_p5,Exp_p6
0,0,0.274195,0.017804,0.029651,-0.585296,-2,1.956106,0.6,25.426174,12.998359,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027242
1,0,0.516279,0.025508,0.031988,-0.516093,1,1.719546,0.75,24.210186,14.079408,...,0.155717,0.089968,0.034349,0.0,0.0,0.0,0.030999,0.03072,0.004306,-0.010502
2,0,0.756784,0.019947,0.039369,-0.38673,1,2.005085,0.75,25.245765,12.59087,...,0.132135,0.045605,0.030886,0.0,0.0,0.021324,0.0,0.010992,0.038993,0.015839
3,0,0.778119,0.028621,0.037645,-0.407654,1,1.488252,0.875,25.988658,17.462536,...,0.095735,0.066797,0.021006,0.0,0.0,0.0,0.037433,0.028593,-0.014305,-0.007602
4,0,0.687612,0.049012,0.111732,-0.31442,0,1.18959,0.714286,22.851884,19.209889,...,0.237726,0.082992,0.0,0.0,0.0,0.0,0.0,-0.003617,0.094022,-0.019252


In [8]:
# Separate target and features.
# Features are picked by column name instead of by position: a positional slice
# such as iloc[:, 1:] keeps the target among the features whenever the target is
# not the very first column (for example when a saved index column like
# 'Unnamed: 0' sits in front of it), which would leak the label into the model.
target_col = 'assumed_danger_states_new'
y = data[target_col]
# errors='ignore' lets this work whether or not the stray index column exists;
# a missing target column has already raised a KeyError on the line above.
x = data.drop(columns=[target_col, 'Unnamed: 0'], errors='ignore')
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=123)
# Compare the mean target value in the test and train splits.
print(y_test.mean(), y_train.mean())


0.3148148148148148 0.32934131736526945


In [9]:
# Candidate hyperparameter values for the randomized search.
# Forest sizes: 200 to 1000 trees in steps of 100.
n_estimators = list(range(200, 1001, 100))
# Depth limits: 10 to 60 in steps of 10, plus None as an extra candidate.
max_depth = list(range(10, 61, 10)) + [None]
# Minimum number of samples a node must hold before it may be split.
min_samples_split = list(range(2, 9, 2))
# Minimum number of samples required at each leaf node.
min_samples_leaf = list(range(1, 6))
# Assemble the search space, keyed by estimator parameter name.
random_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)
print(random_grid)

{'n_estimators': [200, 300, 400, 500, 600, 700, 800, 900, 1000], 'max_depth': [10, 20, 30, 40, 50, 60, None], 'min_samples_split': [2, 4, 6, 8], 'min_samples_leaf': [1, 2, 3, 4, 5]}


In [10]:
# Use the random grid to search for the best hyperparameters.
# Base model to tune; it is seeded so that repeated runs of the search build
# the same forests and therefore select the same candidate.
rf = RandomForestRegressor(random_state=42)
# Randomized search with 3-fold cross-validation on all available cores.
# n_iter is spelled out: the search samples 10 combinations from the grid
# (scikit-learn's default), i.e. 30 fits in total. Raise n_iter for a broader
# but slower search.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=10,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
# Fit the random search model
rf_random.fit(x_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [11]:
# Score the fitted search object on the held-out test rows.
predictions = rf_random.predict(x_test)
# Threshold the regression output at 0.5 to obtain 0/1 labels, then report the
# share of test rows whose label disagrees with the true value.
print(((predictions > 0.5).astype(int) != y_test).mean())
# Reference point: the error rate of predicting 0 for every test row.
print((y_test != 0).mean())

# Values noted by the author (presumably error rates from earlier runs):
#0.19534883720930232 all vars
#0.19069767441860466 small vars

# Hyperparameters selected by the search.
print(rf_random.best_params_)

0.2962962962962963
0.3148148148148148
{'n_estimators': 600, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_depth': 10}


In [12]:
# Rebuild the forest with the hyperparameters chosen by the search; unpacking
# best_params_ passes every tuned parameter without copying each key by hand.
# The fixed seed makes the final fit repeatable.
rf_final = RandomForestRegressor(**rf_random.best_params_, random_state=42)
# Train the model on all data
rf_final.fit(x, y)
# Save the model to disk; the context manager closes the file handle even if
# pickling raises, instead of leaving it open until garbage collection.
with open('finalized_rf_model.pkl', 'wb') as model_file:
    pickle.dump(rf_final, model_file)

In [None]:
# Load the model from disk and re-check its error on the test split (left commented out)
# loaded_rf = pickle.load(open('finalized_rf_model.pkl', 'rb'))
# loaded_pred = loaded_rf.predict(x_test)
# print((1*(loaded_pred>0.5)!=y_test).mean())
# print((0!=y_test).mean()) 