In [1]:
%load_ext jupyternotify
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import timeit
import import_ipynb
from sklearn.model_selection import KFold
from tqdm.auto import tqdm


import functions as func

pd.set_option('display.max_columns', None)

<IPython.core.display.Javascript object>

importing Jupyter notebook from functions.ipynb
The jupyternotify extension is already loaded. To reload it, use:
  %reload_ext jupyternotify


# Predict

In [2]:
def predict(data_X, data_y, cluster, pitchers_new, batters_new, label):
    """Predict the run value of every plate appearance in ``data_y``.

    Mean run values are computed from ``data_X`` at four levels of detail and
    each test row uses the most specific level available to it:

    1. batter vs. pitcher cluster, per base/out state (both players known);
    2. batter vs. average pitcher, per state (pitcher is in ``pitchers_new``);
    3. pitcher cluster vs. average batter, per state (batter is in
       ``batters_new``, and fallback whenever the row's pitcher has a cluster);
    4. league average per state (final fallback, also used as the baseline).

    Parameters
    ----------
    data_X : pandas.DataFrame
        Training rows. Must contain ``pitcher``, ``batter``,
        ``pre_baseOutState`` and ``run_value``.
    data_y : pandas.DataFrame
        Rows to predict. Must contain ``batter``, ``pitcher``,
        ``pre_baseOutState`` and ``run_value``. Not modified.
    cluster : pandas.DataFrame
        One row per clustered pitcher with a ``pitcher`` column. Not modified:
        the labels are attached to a copy.
    pitchers_new, batters_new : container
        Players to treat as unknown; tested with the ``in`` operator.
    label : array-like
        Cluster label for each row of ``cluster``.

    Returns
    -------
    dict
        ``RV_test_avg``, ``RV_pred_avg``, ``RV_league_avg``, ``error``,
        ``error_pct``, ``error_league`` and ``error_league_pct``. The same
        values are also stored as attributes on ``predict`` itself, which is
        how existing callers read them.

    Raises
    ------
    KeyError
        If a row of ``data_y`` has a ``pre_baseOutState`` that never occurs in
        the training data (there is no league average to fall back on).
    """
    # Attach the labels on a copy so the caller's frame is left untouched.
    cluster = cluster.assign(label=label)
    cluster_dict = {pitcher: int(pitcher_label)
                    for pitcher, pitcher_label in zip(cluster['pitcher'], cluster['label'])}

    # Right join: training rows whose pitcher has no cluster row are dropped.
    # Cluster pitchers without training rows yield NaN keys, which groupby skips.
    train_df = data_X.merge(cluster, on='pitcher', how='right')

    def _mean_run_value(keys):
        """Mean training run value per group, as a dict keyed by the group key."""
        return train_df.groupby(keys)['run_value'].mean().to_dict()

    # individual batter vs grouped pitcher based on base out state
    rv_matchup = _mean_run_value(['batter', 'label', 'pre_baseOutState'])
    # avg batter vs avg pitcher
    rv_league = _mean_run_value('pre_baseOutState')
    # grouped pitcher vs avg batter
    rv_vs_avg_batter = _mean_run_value(['label', 'pre_baseOutState'])
    # individual batter vs avg pitcher
    rv_vs_avg_pitcher = _mean_run_value(['batter', 'pre_baseOutState'])

    run_value_pred = []
    run_value_pred_avg = []

    rows = zip(data_y['batter'], data_y['pitcher'], data_y['pre_baseOutState'])
    for batter, pitcher, state in tqdm(rows, total=len(data_y)):
        league_value = rv_league[state]

        batter_is_new = batter in batters_new
        pitcher_is_new = pitcher in pitchers_new
        # The label is resolved for THIS row only. A new or unclustered pitcher
        # has no label, so the cluster-level fallback can never reuse a label
        # left over from a previous row.
        pitcher_label = None if pitcher_is_new else cluster_dict.get(pitcher)

        if pitcher_is_new:
            # Unknown pitcher: use the batter's own average unless he is new too.
            run_value = None if batter_is_new else rv_vs_avg_pitcher.get((batter, state))
        elif batter_is_new:
            run_value = None  # resolved by the cluster-level lookup below
        else:
            run_value = rv_matchup.get((batter, pitcher_label, state))

        if run_value is None and pitcher_label is not None:
            run_value = rv_vs_avg_batter.get((pitcher_label, state))
        if run_value is None:
            run_value = league_value

        run_value_pred.append(run_value)
        run_value_pred_avg.append(league_value)

    # calculate results
    RV_test_avg = data_y['run_value'].mean()
    RV_pred_avg = pd.Series(run_value_pred, dtype=float).mean()
    RV_league_avg = pd.Series(run_value_pred_avg, dtype=float).mean()

    error = RV_pred_avg - RV_test_avg
    error_pct = abs(error / RV_test_avg)

    error_league = RV_league_avg - RV_test_avg
    error_league_pct = abs(error_league / RV_test_avg)

    results = {
        'RV_test_avg': RV_test_avg,
        'RV_pred_avg': RV_pred_avg,
        'RV_league_avg': RV_league_avg,
        'error': error,
        'error_pct': error_pct,
        'error_league': error_league,
        'error_league_pct': error_league_pct,
    }

    # Existing callers read the results from attributes on the function.
    for name, value in results.items():
        setattr(predict, name, value)

    return results

# Cross Validation

In [3]:
def cross_val(data, cluster, pitchers_new, batters_new, label, num_fold=5, random_state=0):
    """Run shuffled K-fold cross validation of ``predict`` over ``data``.

    For every fold the rows are split into a training and a test part by
    position, ``predict`` is called on them, and the metrics it publishes as
    attributes on itself are collected.

    Parameters
    ----------
    data : pandas.DataFrame
        All rows; split positionally with ``iloc``.
    cluster, pitchers_new, batters_new, label
        Passed through unchanged to ``predict`` for every fold.
    num_fold : int, default 5
        Number of folds (``n_splits`` of ``KFold``).
    random_state : int or None, default 0
        Seed for the fold shuffling. The default reproduces the splits that
        were previously hard-coded.

    Returns
    -------
    dict
        Per-fold arrays (``*_list``) and their averages. The same values are
        also stored as attributes on ``cross_val`` itself, which is how
        existing callers read them.
    """
    # Names of the per-fold metrics that predict() exposes as attributes.
    metric_names = ('RV_test_avg', 'RV_pred_avg', 'RV_league_avg',
                    'error', 'error_pct', 'error_league', 'error_league_pct')
    per_fold = {name: [] for name in metric_names}

    kf = KFold(n_splits=num_fold, random_state=random_state, shuffle=True)
    for train_index, test_index in kf.split(data):
        data_X = data.iloc[train_index].copy()
        data_y = data.iloc[test_index].copy()

        predict(data_X, data_y, cluster, pitchers_new, batters_new, label)

        # Read the attributes right after the call, before the next fold
        # overwrites them.
        for name in metric_names:
            per_fold[name].append(getattr(predict, name))

    RV_test_list = np.array(per_fold['RV_test_avg'], dtype=float)
    RV_pred_list = np.array(per_fold['RV_pred_avg'], dtype=float)
    RV_league_list = np.array(per_fold['RV_league_avg'], dtype=float)

    error_list = np.array(per_fold['error'], dtype=float)
    error_pct_list = np.array(per_fold['error_pct'], dtype=float)
    errorLeague_list = np.array(per_fold['error_league'], dtype=float)
    errorLeague_pct_list = np.array(per_fold['error_league_pct'], dtype=float)

    results = {
        'error_list': error_list,
        'error_pct_list': error_pct_list,
        'errorLeague_list': errorLeague_list,
        'errorLeague_pct_list': errorLeague_pct_list,
        # all errors and pct are absolute
        'error_avg': abs(error_list).mean(),
        'error_pct_avg': abs(error_pct_list).mean(),
        'errorLeague_avg': abs(errorLeague_list).mean(),
        'errorLeague_pct_avg': abs(errorLeague_pct_list).mean(),
        'RV_test_list': RV_test_list,
        'RV_pred_list': RV_pred_list,
        'RV_league_list': RV_league_list,
        'RV_test_avg': RV_test_list.mean(),
        'RV_pred_avg': RV_pred_list.mean(),
        'RV_league_avg': RV_league_list.mean(),
    }

    # Existing callers read the results from attributes on the function.
    for name, value in results.items():
        setattr(cross_val, name, value)

    return results

In [4]:
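# Legacy row-by-row version of the lookup loop in predict(), kept commented out for reference.
# It filters the aggregate DataFrames on every row instead of using dict lookups.
# start = timeit.default_timer()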

# for index, row in val_df.iterrows():
#     batter = row.batter
#     pitcher = row.pitcher
#     state = row.pre_baseOutState

#     try:
#         if batter in batters_new and pitcher in pitchers_new:
#             val_df.at[index, 'run_value_pred'] = RV_leagueAVG[RV_leagueAVG['pre_baseOutState']==state]['run_value']
#         elif batter in batters_new and pitcher not in pitchers_new:
#             pitcher_label = int(cluster[cluster['pitcher']==pitcher]['label'])
#             val_df.at[index, 'run_value_pred'] = RV_VSavgBatter[(RV_VSavgBatter['label']==pitcher_label)&(RV_VSavgBatter['pre_baseOutState']==state)]['run_value']
#         elif batter not in batters_new and pitcher in pitchers_new:
#             val_df.at[index, 'run_value_pred'] = RV_VSavgPitcher[(RV_VSavgPitcher['batter']==batter)&(RV_VSavgPitcher['pre_baseOutState']==state)]['run_value']
#         else:
#             pitcher_label = int(cluster[cluster['pitcher']==pitcher]['label'])
#             val_df.at[index, 'run_value_pred'] = RV[(RV['batter']==batter)&(RV['label']==pitcher_label)&(RV['pre_baseOutState']==state)]['run_value']
#     except:
#         val_df.at[index, 'run_value_pred'] = RV_VSavgBatter[(RV_VSavgBatter['label']==pitcher_label)&(RV_VSavgBatter['pre_baseOutState']==state)]['run_value']
    
#     val_df.at[index, 'run_value_pred_avg'] = float(RV_leagueAVG[(RV_leagueAVG['pre_baseOutState']==state)]['run_value'])

# stop = timeit.default_timer()
# stop - start  # elapsed seconds (start - stop would be negative)