In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import pickle

data = pd.read_csv('data_2_model.csv', sep = ',')
data.head()

Unnamed: 0,assumed_danger_states_new,Home_Plate_Control,Rink_Control,Max_Success,Max_Best,Max_Exp,Max_Player_Success,Max_Player_Best,Max_Player_Exp,Mean_Player_Success,...,O_Avg_Edges_per_Player,D_Avg_Edges per Player,OD_MST_Ratio,All_OCR,distance_to_net,angle_to_attacking_net,goal_diff,on_ice_woman_diff,woman_adv,split
0,0,-0.775281,-0.41741,0.17395,0.012174,0.004196,0.153757,0.008026,0.003794,0.153757,...,1.0,1.6,1.956106,0.6,35.724641,173.571252,0,-3,1,0
1,0,-0.137802,0.323503,0.695621,0.038747,0.06938,0.613144,0.038747,0.067885,0.312198,...,1.6,1.6,1.719546,0.75,48.417456,144.668669,0,0,1,0
2,1,-0.205564,0.177664,0.462435,0.031124,0.032817,0.258113,0.025985,0.028199,0.18367,...,1.6,1.6,2.005085,0.75,41.584252,161.782905,0,0,1,0
3,1,0.058052,0.321745,0.741035,0.030433,0.036346,0.683848,0.017807,0.03252,0.379489,...,1.6,1.6,1.488252,0.875,64.274801,113.374164,0,0,1,0
4,1,0.138706,0.090045,0.429536,0.047263,0.050201,0.199147,0.046098,0.017132,0.169649,...,1.5,1.6,1.18959,0.714286,51.53882,75.963757,0,-1,1,1


In [12]:
# Separate target and features
vars = ['distance_to_net', 'goal_diff', 'woman_adv', 'All_OCR', 'Max_Success', 'Home_Plate_Control', 'Mean_Player_Best', 
'Max_Player_Best', 'angle_to_attacking_net','Mean_Player_Exp']
x, y = data[vars], data['assumed_danger_states_new']

# Get train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=123)
print(y_test.mean(), y_train.mean())

0.22325581395348837 0.22355289421157684


In [21]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1000, num = 9)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 60, num = 6)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 4, 6, 8,10,12]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3, 4,5,6]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
print(random_grid)

{'n_estimators': [200, 300, 400, 500, 600, 700, 800, 900, 1000], 'max_depth': [10, 20, 30, 40, 50, 60, None], 'min_samples_split': [2, 4, 6, 8, 10, 12], 'min_samples_leaf': [1, 2, 3, 4, 5, 6]}


In [22]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'max_depth': [10, 20, 30, 40, 50, 60,
                                                      None],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 6],
                                        'min_samples_split': [2, 4, 6, 8, 10,
                                                              12],
                                        'n_estimators': [200, 300, 400, 500,
                                                         600, 700, 800, 900,
                                                         1000]},
                   random_state=42, verbose=2)

In [23]:
# Use the forest's predict method on the test data
predictions = rf_random.predict(x_test)
print((1*(predictions>0.5)!=y_test).mean())
print((0!=y_test).mean()) 

#0.18604651162790697 all vars
#0.17674418604651163 small vars

print(rf_random.best_params_)

0.18604651162790697
0.22325581395348837
{'n_estimators': 900, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 30}


In [24]:
rf_final = RandomForestRegressor(n_estimators = rf_random.best_params_['n_estimators'], min_samples_split=rf_random.best_params_['min_samples_split'], min_samples_leaf = rf_random.best_params_['min_samples_leaf'], max_depth = rf_random.best_params_['max_depth'],random_state = 42)
# Train the model on all data
rf_final.fit(x, y)
# save the model to disk
pickle.dump(rf_final, open('finalized_rf_model.pkl', 'wb'))

900


In [None]:
 # load the model from disk
# loaded_rf = pickle.load(open('finalized_rf_model.pkl', 'rb'))
# loaded_pred = loaded_rf.predict(x_test)
# print((1*(loaded_pred>0.5)!=y_test).mean())
# print((0!=y_test).mean()) 