In [1]:
%load_ext jupyternotify
import pandas as pd
import numpy as np
import timeit
import import_ipynb

from sklearn.model_selection import KFold
from tqdm.auto import tqdm

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

<IPython.core.display.Javascript object>

# Predict
### Make expected run value (xRV) prediction based on batter vs pitcher matchup

data_X: training data - the "historic" run values on which the xRV calculation is based

data_y: test data - the target or true run values

cluster: pitcher ID dataframe

pitchers_new: an array of IDs of pitchers with fewer than 20 IP in the 19-21 training set

batters_new: an array of IDs of batters with fewer than 100 PA in the 19-21 training set

label: an array of pitcher clustering results

data_X and data_y should both come from the data_RV.csv dataset. This function requires the lists of new pitchers/batters to be passed in, and those lists may be derived from data outside data_X and data_y. This is a small issue when performing cross-validation on a smaller sample. I will fix this later.

In [2]:
def _as_id_set(ids):
    """Return ``ids`` as a set so each membership test in the scoring loop is O(1)."""
    if isinstance(ids, (set, frozenset)):
        return ids
    if hasattr(ids, 'tolist'):
        # numpy arrays / pandas objects: flatten and convert to native Python scalars.
        # (``x in Series`` would test the index, not the values, so never rely on it.)
        ids = np.asarray(ids).ravel().tolist()
    return set(ids)


def _mean_run_value_lookup(frame, keys):
    """Group ``frame`` by ``keys`` and return the mean run_value as a nested dict, one level per key."""
    means = frame.groupby(keys)['run_value'].mean()
    lookup = {}
    for key, value in means.items():
        parts = key if isinstance(key, tuple) else (key,)
        node = lookup
        for part in parts[:-1]:
            node = node.setdefault(part, {})
        node[parts[-1]] = float(value)
    return lookup


def predict(data_X, data_y, cluster, pitchers_new, batters_new, label):
    """Predict expected run value (xRV) for every plate appearance in ``data_y``.

    Parameters
    ----------
    data_X : DataFrame
        Training data; must contain ``batter``, ``pitcher``, ``pre_baseOutState``
        and ``run_value`` columns.
    data_y : DataFrame
        Test data with the same key columns plus the true ``run_value``.
    cluster : DataFrame
        Pitcher ID dataframe with a ``pitcher`` column. It is NOT modified.
    pitchers_new, batters_new : array-like
        IDs of pitchers / batters treated as "new" (too little history).
    label : array-like
        Cluster label for each row of ``cluster``.

    Prediction rule per row (first available value wins):
      * new batter and new pitcher   -> league average for the base/out state
      * new batter, known pitcher    -> pitcher-cluster average vs. all batters
      * known batter, new pitcher    -> batter average vs. all pitchers
      * known batter, known pitcher  -> batter vs. pitcher-cluster average
      * fallback 1: pitcher-cluster average, only if THIS row's pitcher has a cluster
      * fallback 2: league average

    Returns
    -------
    None. The results are stored as attributes on the function itself:
    ``RV_test_avg``, ``RV_pred_avg``, ``RV_league_avg``, ``error``, ``error_pct``,
    ``error_league`` and ``error_league_pct``.

    Raises
    ------
    KeyError
        If a ``pre_baseOutState`` in ``data_y`` never occurs in the training data.
    """
    # Work on a narrow copy so the caller's ``cluster`` frame is left untouched and
    # none of its other columns can collide with data_X columns during the merge.
    labeled = cluster[['pitcher']].assign(label=label)

    # pitcher ID -> cluster label, for fast lookup
    cluster_dict = {pitcher: int(pitcher_label)
                    for pitcher, pitcher_label in zip(labeled['pitcher'].tolist(),
                                                      labeled['label'].tolist())}

    # add labeled cluster to data_X (rows whose pitcher is not in ``cluster`` are dropped)
    merged = data_X.merge(labeled, on='pitcher', how='right')

    # individual batter vs grouped pitcher: {batter: {label: {state: mean RV}}}
    RV_dict = _mean_run_value_lookup(merged, ['batter', 'label', 'pre_baseOutState'])
    # avg batter vs avg pitcher: {state: mean RV}
    RV_leagueAVG_dict = _mean_run_value_lookup(merged, ['pre_baseOutState'])
    # grouped pitcher vs avg batter: {label: {state: mean RV}}
    RV_VSavgBatter_dict = _mean_run_value_lookup(merged, ['label', 'pre_baseOutState'])
    # individual batter vs avg pitcher: {batter: {state: mean RV}}
    RV_VSavgPitcher_dict = _mean_run_value_lookup(merged, ['batter', 'pre_baseOutState'])

    new_pitchers = _as_id_set(pitchers_new)
    new_batters = _as_id_set(batters_new)

    run_value_pred = []  # predicted run value per PA
    run_value_pred_avg = []  # predicted run value per PA based on league averages

    rows = zip(data_y['batter'].tolist(),
               data_y['pitcher'].tolist(),
               data_y['pre_baseOutState'].tolist())
    for batter, pitcher, state in rows:
        league_rv = RV_leagueAVG_dict[state]  # KeyError if the state is unseen in training

        batter_is_new = batter in new_batters
        pitcher_is_new = pitcher in new_pitchers
        # Resolve the label for THIS row only; None means "no usable cluster", so a
        # label left over from a previous row can never leak into the fallback.
        pitcher_label = None if pitcher_is_new else cluster_dict.get(pitcher)

        if batter_is_new and pitcher_is_new:
            run_value = league_rv
        elif batter_is_new:
            run_value = RV_VSavgBatter_dict.get(pitcher_label, {}).get(state)
        elif pitcher_is_new:
            run_value = RV_VSavgPitcher_dict.get(batter, {}).get(state)
        else:
            run_value = RV_dict.get(batter, {}).get(pitcher_label, {}).get(state)

        if run_value is None and pitcher_label is not None:
            run_value = RV_VSavgBatter_dict.get(pitcher_label, {}).get(state)
        if run_value is None:
            run_value = league_rv

        run_value_pred.append(run_value)
        run_value_pred_avg.append(league_rv)

    # calculate results on a copy so the caller's data_y is not modified
    data_y = data_y.copy()
    data_y['run_value_pred'] = run_value_pred
    data_y['run_value_pred_avg'] = run_value_pred_avg

    RV_test_avg = data_y['run_value'].mean()  # true average RV across test data
    RV_pred_avg = data_y['run_value_pred'].mean()  # xRV: avg predicted RV
    RV_league_avg = data_y['run_value_pred_avg'].mean()  # xRV(league): avg predicted league RV

    error = RV_pred_avg - RV_test_avg
    error_pct = abs(error / RV_test_avg)

    error_league = RV_league_avg - RV_test_avg
    error_league_pct = abs(error_league / RV_test_avg)

    # save prediction results as attributes
    predict.RV_test_avg = RV_test_avg
    predict.RV_pred_avg = RV_pred_avg
    predict.RV_league_avg = RV_league_avg

    predict.error = error
    predict.error_pct = error_pct
    predict.error_league = error_league
    predict.error_league_pct = error_league_pct

# Cross Validation
### Perform cross-validation on data

data: the data to perform cross-validation on

cluster/pitchers_new/batters_new/label: same as above

num_fold: number of cross-validation folds (default: 5)

In [3]:
def cross_val(data, cluster, pitchers_new, batters_new, label, num_fold=5, random_state=0):
    """Run shuffled K-fold cross-validation of ``predict`` on ``data``.

    Parameters
    ----------
    data : DataFrame
        The data to split into training and test folds.
    cluster, pitchers_new, batters_new, label
        Passed through unchanged to ``predict`` for every fold.
    num_fold : int, default 5
        Number of folds.
    random_state : int, default 0
        Seed for the shuffled split; the default reproduces the previously
        hard-coded seed.

    Returns
    -------
    None. The results are stored as attributes on the function itself:
    per-fold arrays (``RV_test_list``, ``RV_pred_list``, ``RV_league_list``,
    ``error_list``, ``error_pct_list``, ``errorLeague_list``,
    ``errorLeague_pct_list``) and their averages (``RV_test_avg``,
    ``RV_pred_avg``, ``RV_league_avg``, ``error_avg``, ``error_pct_avg``,
    ``errorLeague_avg``, ``errorLeague_pct_avg``).
    """
    # Names of the attributes ``predict`` sets after each call.
    metric_names = ('RV_test_avg', 'RV_pred_avg', 'RV_league_avg',
                    'error', 'error_pct', 'error_league', 'error_league_pct')
    # Collect per-fold values in plain lists and convert once at the end,
    # instead of re-copying a numpy array with np.append on every fold.
    per_fold = {name: [] for name in metric_names}

    kf = KFold(n_splits=num_fold, random_state=random_state, shuffle=True)
    for train_index, test_index in kf.split(data):
        data_X = data.iloc[train_index].copy()
        data_y = data.iloc[test_index].copy()

        predict(data_X, data_y, cluster, pitchers_new, batters_new, label)

        # Read the results immediately: the next call overwrites these attributes.
        for name in metric_names:
            per_fold[name].append(getattr(predict, name))

    RV_test_list = np.array(per_fold['RV_test_avg'], dtype=float)  # true RV from each fold
    RV_pred_list = np.array(per_fold['RV_pred_avg'], dtype=float)  # xRV from each fold
    RV_league_list = np.array(per_fold['RV_league_avg'], dtype=float)  # league xRV from each fold

    error_list = np.array(per_fold['error'], dtype=float)  # xRV errors from each fold
    error_pct_list = np.array(per_fold['error_pct'], dtype=float)  # xRV error pct from each fold
    errorLeague_list = np.array(per_fold['error_league'], dtype=float)  # league xRV errors
    errorLeague_pct_list = np.array(per_fold['error_league_pct'], dtype=float)  # league xRV error pct

    # save results as function attributes
    cross_val.error_list = error_list
    cross_val.error_pct_list = error_pct_list
    cross_val.errorLeague_list = errorLeague_list
    cross_val.errorLeague_pct_list = errorLeague_pct_list

    # all averaged errors and pct are absolute values
    cross_val.error_avg = np.abs(error_list).mean()
    cross_val.error_pct_avg = np.abs(error_pct_list).mean()
    cross_val.errorLeague_avg = np.abs(errorLeague_list).mean()
    cross_val.errorLeague_pct_avg = np.abs(errorLeague_pct_list).mean()

    cross_val.RV_test_list = RV_test_list
    cross_val.RV_pred_list = RV_pred_list
    cross_val.RV_league_list = RV_league_list

    cross_val.RV_test_avg = RV_test_list.mean()
    cross_val.RV_pred_avg = RV_pred_list.mean()
    cross_val.RV_league_avg = RV_league_list.mean()

# RV Search Example

In [4]:
# cluster['label'] = labels_GMM_PlayerContPitch[3]
# example_df = train_full_df.merge(cluster, on='pitcher', how='right')
# RV = example_df.groupby(['batter', 'label', 'pre_baseOutState']).agg({'run_value':'mean', 
#                                                                         'PA_count':sum}).reset_index()
# RV_leagueAVG = example_df.groupby('pre_baseOutState').agg({'run_value':'mean', 
#                                                              'PA_count':sum}).reset_index()
# RV_VSavgBatter = example_df.groupby(['label', 'pre_baseOutState']).agg({'run_value':'mean', 
#                                                                           'PA_count':sum}).reset_index()
# RV_VSavgPitcher = example_df.groupby(['batter', 'pre_baseOutState']).agg({'run_value':'mean', 
#                                                                             'PA_count':sum}).reset_index()

In [5]:
# start = timeit.default_timer()

# for index, row in val_df.iterrows():
#     batter = row.batter
#     pitcher = row.pitcher
#     state = row.pre_baseOutState

#     try:
#         if batter in batters_new and pitcher in pitchers_new:
#             val_df.at[index, 'run_value_pred'] = RV_leagueAVG[RV_leagueAVG['pre_baseOutState']==state]['run_value']
#         elif batter in batters_new and pitcher not in pitchers_new:
#             pitcher_label = int(cluster[cluster['pitcher']==pitcher]['label'])
#             val_df.at[index, 'run_value_pred'] = RV_VSavgBatter[(RV_VSavgBatter['label']==pitcher_label)&(RV_VSavgBatter['pre_baseOutState']==state)]['run_value']
#         elif batter not in batters_new and pitcher in pitchers_new:
#             val_df.at[index, 'run_value_pred'] = RV_VSavgPitcher[(RV_VSavgPitcher['batter']==batter)&(RV_VSavgPitcher['pre_baseOutState']==state)]['run_value']
#         else:
#             pitcher_label = int(cluster[cluster['pitcher']==pitcher]['label'])
#             val_df.at[index, 'run_value_pred'] = RV[(RV['batter']==batter)&(RV['label']==pitcher_label)&(RV['pre_baseOutState']==state)]['run_value']
#     except:
#         val_df.at[index, 'run_value_pred'] = RV_VSavgBatter[(RV_VSavgBatter['label']==pitcher_label)&(RV_VSavgBatter['pre_baseOutState']==state)]['run_value']
    
#     val_df.at[index, 'run_value_pred_avg'] = float(RV_leagueAVG[(RV_leagueAVG['pre_baseOutState']==state)]['run_value'])

# stop = timeit.default_timer()
# stop - start