In [1]:
%load_ext jupyternotify

# import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
# import timeit
import main

from tqdm.auto import tqdm
# from scipy.stats import *

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

<IPython.core.display.Javascript object>

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [3]:
# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [4]:
from IPython.display import display, HTML
# Inject CSS that sets the page's `.container` element to 70% width.
display(HTML("<style>.container { width:70% !important; }</style>"))

In [5]:
def extract_proba(arr) -> np.ndarray:
    """Gather column 1 of every per-output probability array into one matrix.

    Parameters
    ----------
    arr : sequence of array-like
        One 2-D array per output, each with at least two columns and the
        same number of rows. In this notebook it is the list returned by
        ``predict_proba`` of a multi-output random forest (one entry per
        event), but any number of outputs is accepted.

    Returns
    -------
    np.ndarray
        Array of shape ``(n_samples, n_outputs)`` whose j-th column is
        ``arr[j][:, 1]``.

    Raises
    ------
    ValueError
        If ``arr`` contains no outputs.
    """
    if len(arr) == 0:
        raise ValueError("extract_proba needs at least one per-output probability array")

    # column_stack turns each 1-D slice into a column, replacing the eight
    # hand-written reshape + concatenate steps and working for any output count.
    return np.column_stack([np.asarray(output_proba)[:, 1] for output_proba in arr])

In [6]:
def clf_performance(classifier, title=None):
    """Print the best score and best parameters of a fitted search object.

    Parameters
    ----------
    classifier : object
        Any object exposing ``best_score_`` and ``best_params_`` attributes.
    title : optional
        Printed on its own line first, unless it is ``None``.
    """
    # Identity check: `!= None` would dispatch to the title's own __ne__,
    # which can give a wrong or non-boolean answer for some objects.
    if title is not None:
        print(title)
    print('Best Score: ' + str(classifier.best_score_))
    print('Best Parameters: ' + str(classifier.best_params_))

In [7]:
# Load the dataset; the first CSV column is used as the DataFrame index.
data_df = pd.read_csv('data/pa_df_19.csv', index_col=[0])

In [8]:
# Two-stage hold-out split: 20% of the rows become the test set, then 25% of
# the remaining rows become the validation set (roughly 60/20/20 overall).
train_val_df, test_df = train_test_split(
    data_df, test_size=0.2, random_state=0, shuffle=True
)
train_df, val_df = train_test_split(
    train_val_df, test_size=0.25, random_state=0, shuffle=True
)

# Report the shape of each split.
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    print(f"{split_name} size: {split_df.shape}")

train size: (50196, 26)
val size: (16732, 26)
test size: (16733, 26)


# Baseline Model, No RF

In [9]:
# Display the event column names defined in `main` (used below as the targets).
main.EVENT_LIST

['1B', '2B', '3B', 'HR', 'FO', 'BB', 'K', 'other']

In [10]:
# Display the probability column names defined in `main` (used below as the features).
main.PROBA_LIST

['1B_proba',
 '2B_proba',
 '3B_proba',
 'HR_proba',
 'FO_proba',
 'BB_proba',
 'K_proba',
 'other_proba']

In [11]:
# Feature matrices: the PROBA_LIST columns of each frame as plain NumPy arrays,
# in the order full data, train, validation, train+validation, test.
X_all, X_train, X_val, X_train_val, X_test = (
    frame[main.PROBA_LIST].values
    for frame in (data_df, train_df, val_df, train_val_df, test_df)
)

In [12]:
# Target matrices: the EVENT_LIST columns of each frame as plain NumPy arrays,
# in the order full data, train, validation, train+validation, test.
y_all, y_train, y_val, y_train_val, y_test = (
    frame[main.EVENT_LIST].values
    for frame in (data_df, train_df, val_df, train_val_df, test_df)
)

In [13]:
# Baseline without any model: score the input probability columns themselves
# against the observed event columns of the test split.
main.get_brier_score(y_test, X_test)

0.7025291959761956

# Baseline RF Default

## Classifier

In [14]:
# Record the wall-clock start time; elapsed time is computed after the fit.
start = time.time()

In [15]:
# Forest that considers every feature at each split (max_features=None),
# trained on the training split only.
clf = RandomForestClassifier(
    max_features=None,
    n_jobs=6,
    random_state=1,
)
clf.fit(X_train, y_train)
# fit() returns the estimator itself, so both names refer to the same fitted model.
rf_baseline_def = clf

In [16]:
# Seconds elapsed since `start` was recorded.
time.time() - start

12.276416063308716

In [17]:
# Per-output probability arrays from the default forest for the test split.
proba_arr = rf_baseline_def.predict_proba(X_test)
# Keep column 1 of each output and stack them side by side.
y_test_pred = extract_proba(proba_arr)
# NOTE(review): the recorded result (0.8756) is higher than the no-model
# baseline (0.7025); if get_brier_score is lower-is-better, the default forest
# is worse than the raw input probabilities — confirm.
main.get_brier_score(y_test, y_test_pred)

0.8755914800716054

In [None]:
# NOTE(review): repeats the predict_proba call already made in cell In [17].
proba_arr = rf_baseline_def.predict_proba(X_test)

In [None]:
# NOTE(review): repeats the extract_proba step already done in cell In [17].
y_test_pred = extract_proba(proba_arr)

In [None]:
# NOTE(review): repeats the scoring step already done in cell In [17].
main.get_brier_score(y_test, y_test_pred)

In [None]:
%%notify
# No-op body: the cell exists only to run the %%notify magic (presumably from
# the jupyternotify extension loaded in the first cell) at this point.
pass

# Baseline RF

In [None]:
# Reset the wall-clock start time before the hyperparameter search.
start = time.time()

In [None]:
# Randomized hyperparameter search for the random forest on the training split.
clf = RandomForestClassifier(random_state=0)
param_grid = {
    'n_estimators': [250],
    'max_depth': [7, 15, 25],
    'max_features': ['sqrt', None],
    'min_samples_leaf': [3, 8, 12],
    'min_samples_split': [2, 4, 7],
    'max_samples': [0.5, 0.8],
}

# The grid holds 1*3*2*3*3*2 = 108 combinations and n_iter=100 samples 100 of
# them. random_state fixes which ones are drawn, so a Restart & Run All
# evaluates the same candidates every time.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score rather than the Brier score used for evaluation
# elsewhere in this notebook — confirm this is intended.
rf_baseline_1 = RandomizedSearchCV(
    clf,
    param_distributions=param_grid,
    n_iter=100,
    cv=5,
    verbose=1,
    n_jobs=5,
    random_state=0,
)
# fit() returns the search object itself, so both names refer to the same
# fitted search (not to a separate "best" estimator).
best_rf_baseline_1 = rf_baseline_1.fit(X_train, y_train)

In [None]:
# Print the best cross-validation score and parameter set found by the search.
clf_performance(best_rf_baseline_1)

In [None]:
# Seconds elapsed since `start` was recorded before the search.
time.time() - start

In [None]:
# Test-split probabilities from the fitted search, stacked one column per output.
y_pred_proba_baseline_1 = extract_proba(best_rf_baseline_1.predict_proba(X_test))

In [None]:
# Score the tuned forest's test-split probabilities against the observed events.
main.get_brier_score(y_test, y_pred_proba_baseline_1)

In [None]:
# Spot check: observed event row for the first test sample.
y_test[0]

In [None]:
# Spot check: input probabilities for the first test sample.
X_test[0]

In [None]:
# Spot check: tuned forest's predicted probabilities for the first test sample.
y_pred_proba_baseline_1[0]

In [None]:
# Keep the raw per-output predict_proba result for inspection.
arr = best_rf_baseline_1.predict_proba(X_test)

In [None]:
# NOTE(review): same spot check of the first test sample as a few cells earlier.
X_test[0]

In [None]:
# Mean of column 1 over the test split for each of the eight outputs,
# printed one per line in output order.
for output_idx in range(8):
    print(arr[output_idx][:, 1].mean())