In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import pickle

data = pd.read_csv('data_2_model.csv', sep = ',')
data.head()

Unnamed: 0,assumed_danger_states_new,Home_Plate_Control,Rink_Control,Max_Success,Max_Best,Max_Exp,All_Avg_Edge,O_Avg_Edge,D_Avg_Edge,All_Avg_Edges per Player,O_Avg_Edges_per_Player,D_Avg_Edges per Player,OD_MST_Ratio,All_OCR,distance_to_net,angle_to_attacking_net,woman_adv,split
0,0,-0.775281,-0.41741,0.17395,0.012174,0.004196,12.686693,25.426174,12.998359,1.666667,1.0,1.6,1.956106,0.6,35.724641,173.571252,1,0
1,0,-0.137802,0.323503,0.695621,0.038747,0.06938,14.134405,24.210186,14.079408,1.777778,1.6,1.6,1.719546,0.75,48.417456,144.668669,1,0
2,1,-0.205564,0.177664,0.462435,0.031124,0.032817,14.011834,25.245765,12.59087,1.777778,1.6,1.6,2.005085,0.75,41.584252,161.782905,1,0
3,1,0.058052,0.321745,0.741035,0.030433,0.036346,14.474768,25.988658,17.462536,1.777778,1.6,1.6,1.488252,0.875,64.274801,113.374164,1,0
4,1,0.138706,0.090045,0.429536,0.047263,0.050201,15.171425,22.851884,19.209889,1.75,1.5,1.6,1.18959,0.714286,51.53882,75.963757,1,1


In [21]:
# Separate target and features
vars=['distance_to_net', 'Rink_Control', 'All_OCR', 'Home_Plate_Control', 'angle_to_attacking_net', 'woman_adv']
x, y = data[vars], data['assumed_danger_states_new'] #data.iloc[:,1:],
# Get train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=123)
print(y_test.mean(), y_train.mean())

0.22325581395348837 0.22355289421157684


In [26]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 500, stop = 2000, num = 7)]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 60, num = 6)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 4, 6, 8]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3, 4,5]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
print(random_grid)

{'n_estimators': [500, 750, 1000, 1250, 1500, 1750, 2000], 'max_depth': [10, 20, 30, 40, 50, 60, None], 'min_samples_split': [2, 4, 6, 8], 'min_samples_leaf': [1, 2, 3, 4, 5]}


In [34]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(x_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
                   param_distributions={'max_depth': [10, 20, 30, 40, 50, 60,
                                                      None],
                                        'min_samples_leaf': [1, 2, 3, 4, 5],
                                        'min_samples_split': [2, 4, 6, 8],
                                        'n_estimators': [500, 750, 1000, 1250,
                                                         1500, 1750, 2000]},
                   random_state=42, verbose=2)

In [35]:
# Use the forest's predict method on the test data
predictions = rf_random.predict(x_test)
print((1*(predictions>0.5)!=y_test).mean())
print((0!=y_test).mean()) 

#0.19534883720930232 all vars
#0.19069767441860466 small vars

print(rf_random.best_params_)

0.19069767441860466
0.22325581395348837
{'n_estimators': 1000, 'min_samples_split': 8, 'min_samples_leaf': 5, 'max_depth': 50}


In [36]:
rf_final = RandomForestRegressor(n_estimators = rf_random.best_params_['n_estimators'], min_samples_split=rf_random.best_params_['min_samples_split'], min_samples_leaf = rf_random.best_params_['min_samples_leaf'], max_depth = rf_random.best_params_['max_depth'],random_state = 42)
# Train the model on all data
rf_final.fit(x, y)
# save the model to disk
pickle.dump(rf_final, open('finalized_rf_model.pkl', 'wb'))

In [None]:
 # load the model from disk
# loaded_rf = pickle.load(open('finalized_rf_model.pkl', 'rb'))
# loaded_pred = loaded_rf.predict(x_test)
# print((1*(loaded_pred>0.5)!=y_test).mean())
# print((0!=y_test).mean()) 