In [7]:
%load_ext jupyternotify

# import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import timeit
import joblib
import utils

from tqdm.auto import tqdm
# from scipy.stats import *

pd.set_option('display.max_columns', None)

The jupyternotify extension is already loaded. To reload it, use:
  %reload_ext jupyternotify


In [8]:
import importlib
# importlib.reload(utils)

In [9]:
# pd.set_option('display.max_colwidth', None)

pd.set_option('display.float_format', lambda x: '%.5f' % x)
np.set_printoptions(suppress=True)

In [10]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [11]:
# data_df_1522 = pd.read_csv('../data/PitchDataCleanedLean_15-22.csv', index_col=[0])
# data_train_df = data_df_1522[data_df_1522['game_year'].isin([2015, 2016, 2017, 2018])].copy()
# data_test_df = data_df_1522[data_df_1522['game_year']==2019].copy()

### Load T dicts

In [12]:
T_batters_dict_1518 = joblib.load('T_batters_dict_1518')
T_pitchers_dict_1518 = joblib.load('T_pitchers_dict_1518')

pitcher_quali_list = list(T_pitchers_dict_1518.keys())
batter_quali_list = list(T_batters_dict_1518.keys())

# Logit Regression

In [41]:
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from sklearn.calibration import CalibratedClassifierCV

In [14]:
pa_data_1519 = pd.read_csv('../../data/PA_data_merged_15-19_012323.csv', index_col=[0])
# remove 'other'
pa_data_1519 = pa_data_1519[pa_data_1519['event_type']!='other'].copy()

In [17]:
train_val_df = pa_data_1519[pa_data_1519['game_year'].isin([2015, 2016, 2017, 2018])].copy()
train_val_df = train_val_df[(train_val_df['pitcher'].isin(pitcher_quali_list))&(train_val_df['batter'].isin(batter_quali_list))].copy()
test_df = pa_data_1519[pa_data_1519['game_year']==2019].copy()
test_df = test_df[(test_df['pitcher'].isin(pitcher_quali_list))&(test_df['batter'].isin(batter_quali_list))].copy()

train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=0, shuffle=True)

train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print(f"train size: {train_df.shape}")
print(f"val size: {val_df.shape}")
print(f"test size: {test_df.shape}")

train size: (434677, 36)
val size: (108670, 36)
test size: (83464, 36)


In [18]:
# create matchup lists to loop through later
train_matchup_list = list(zip(train_df['pitcher'], train_df['batter']))
val_matchup_list = list(zip(val_df['pitcher'], val_df['batter']))
test_matchup_list = list(zip(test_df['pitcher'], test_df['batter']))

In [19]:
y_train = train_df['event_type'].values
y_val = val_df['event_type'].values
y_test = test_df['event_type'].values

y_train_factor = pd.factorize(train_df['event_type'].values)[0]
y_val_factor = pd.factorize(val_df['event_type'].values)[0]
y_test_factor = pd.factorize(test_df['event_type'].values)[0]

y_train_binary = train_df[utils.EVENT_LIST].values
y_val_binary = val_df[utils.EVENT_LIST].values
y_test_binary = test_df[utils.EVENT_LIST].values

# Random Forest T matrix as X

In [20]:
master_map = [[False, True, False, False, True, False, False, False, False, False, False, False, True, True, True, True, True, True, False],
 [False, False, True, False, False, True, False, False, False, False, False, False, True, True, True, True, True, True, False],
 [False, False, False, True, False, False, True, False, False, False, False, False, True, True, True, True, True, True, False],
 [False, False, False, False, False, False, False, True, False, False, False, False, True, True, True, True, True, True, False],
 [False, False, False, False, False, True, False, False, True, False, False, False, True, True, True, True, True, True, False],
 [False, False, False, False, False, False, True, False, False, True, False, False, True, True, True, True, True, True, False],
 [False, False, False, False, False, False, False, True, False, False, True, False, True, True, True, True, True, True, False],
 [False, False, False, False, False, False, False, False, False, False, False, True, True, True, True, True, True, True, False],
 [False, False, False, False, False, False, False, False, True, True, False, False, True, True, True, True, True, True, True],
 [False, False, False, False, False, False, False, False, False, True, True, False, True, True, True, True, True, True, True],
 [False, False, False, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, True],
 [False, False, False, False, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True]]

import itertools

# get feature names
feature_names = list(itertools.product(utils.COUNT_LIST_STR, utils.COUNT_LIST_STR + utils.EVENT_LIST))
feature_names = [f[0]+'-'+f[1] for f in feature_names]
feature_names = np.array(feature_names)[np.array(master_map).flatten()]
p_feature_names = ['pitcher-'+f for f in feature_names]
b_feature_names = ['batter-'+f for f in feature_names]
feature_names = np.array(p_feature_names + b_feature_names)


T_batters_RF_1518_trans = {}
for key, item in tqdm(T_batters_dict_1518.items()):
    T_batter_long = []
    for i in range(12):
        T_count = item[i][master_map[i]].tolist()
        T_batter_long+=T_count
    T_batters_RF_1518_trans[key] = T_batter_long

T_pitchers_RF_1518_trans = {}
for key, item in tqdm(T_pitchers_dict_1518.items()):
    T_pitcher_long = []
    for i in range(12):
        T_count = item[i][master_map[i]].tolist()
        T_pitcher_long+=T_count
    T_pitchers_RF_1518_trans[key] = T_pitcher_long

  0%|          | 0/491 [00:00<?, ?it/s]

  0%|          | 0/487 [00:00<?, ?it/s]

In [21]:
X_train = []
for train_matchup in tqdm(train_matchup_list):
    pitcher = train_matchup[0]
    batter = train_matchup[1]

    X_row = np.concatenate([T_pitchers_RF_1518_trans[pitcher], T_batters_RF_1518_trans[batter]]).tolist()
    X_train+=[X_row]
X_train = np.array(X_train)
    
X_val = []
for val_matchup in tqdm(val_matchup_list):
    pitcher = val_matchup[0]
    batter = val_matchup[1]

    X_row = np.concatenate([T_pitchers_RF_1518_trans[pitcher], T_batters_RF_1518_trans[batter]]).tolist()
    X_val+=[X_row]    
X_val = np.array(X_val)

X_test = []
for test_matchup in tqdm(test_matchup_list):
    pitcher = test_matchup[0]
    batter = test_matchup[1]

    X_row = np.concatenate([T_pitchers_RF_1518_trans[pitcher], T_batters_RF_1518_trans[batter]]).tolist()
    X_test+=[X_row]
X_test = np.array(X_test)

  0%|          | 0/434677 [00:00<?, ?it/s]

  0%|          | 0/108670 [00:00<?, ?it/s]

  0%|          | 0/83464 [00:00<?, ?it/s]

In [30]:
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_val_std = scaler.transform(X_val)
X_test_std = scaler.transform(X_test)

In [22]:
# create X df with column names to make easy for feature selection
X_train_df = pd.DataFrame(X_train, columns=feature_names)
X_val_df = pd.DataFrame(X_val, columns=feature_names)
X_test_df = pd.DataFrame(X_test, columns=feature_names)

train_df = pd.concat([train_df, X_train_df], axis=1)
val_df = pd.concat([val_df, X_val_df], axis=1)
test_df = pd.concat([test_df, X_test_df], axis=1)

In [32]:
reg = LogisticRegression(random_state=0, max_iter=500)
model_logit_default = reg.fit(X_train_std, y_train)

In [46]:
y_pred = model_logit_default.predict_proba(X_test_std)

In [36]:
utils.get_brier_score(y_test_binary, y_pred)

0.7093742343714293

In [49]:
model_logit_default_cal = CalibratedClassifierCV(model_logit_default, method='sigmoid', cv=5, n_jobs=5)
model_logit_default_cal.fit(X_val_std, y_val)

CalibratedClassifierCV(base_estimator=LogisticRegression(max_iter=500,
                                                         random_state=0),
                       cv=5, n_jobs=5)

In [50]:
y_pred_cal = model_logit_default_cal.predict_proba(X_test_std)

In [51]:
utils.get_brier_score(y_test_binary, y_pred_cal)

0.7095601988994151

In [47]:
utils.get_brier_score(y_test_binary, y_pred_cal)

0.7096750735004942