In [1]:
import pandas as pd
import numpy as np
import warnings
import json
import requests
import os
import io
import sys
from datetime import date
from datetime import timedelta
warnings.filterwarnings("ignore")

# Reference dates for this run: `today` stays a date object, while `yesterday`
# is the previous calendar day rendered as a "YYYY-MM-DD" string.
today = date.today()
yesterday = (today - timedelta(days=1)).isoformat()

In [2]:
# Previously saved pitch-level data; newly downloaded games are stacked on top of it later.
plays_2022_csv = "/Users/nickdimmitt/Desktop/dfs_local/mlb/data/single_play_2022.csv"
df = pd.read_csv(plays_2022_csv)

In [3]:
## function that extracts nested values from a JSON tree
def json_extract(tree, key):
    """
    Collect, depth first and in document order, every scalar stored under `key`.

    `tree` may be any mix of nested dicts and lists. A dict value that is itself
    a dict or list is always descended into and never collected, even when its
    own key equals `key`. Anything that is neither a dict nor a list contributes
    nothing.

    Returns a list of the matching values (empty when there are none).
    """
    def walk(node):
        if isinstance(node, dict):
            for name, value in node.items():
                if isinstance(value, (dict, list)):
                    # Containers are searched, not returned.
                    yield from walk(value)
                elif name == key:
                    yield value
        elif isinstance(node, list):
            for element in node:
                yield from walk(element)

    return list(walk(tree))

In [4]:
def get_gameID(date1, date2):
    """
    Return every gamePk the MLB Stats API schedule lists between two dates.

    Parameters
    ----------
    date1 : str
        Value sent as the `startDate` query parameter.
    date2 : str
        Value sent as the `endDate` query parameter.

    Returns
    -------
    list
        All values stored under a 'gamePk' key anywhere in the schedule
        response, in the order they appear.

    Raises
    ------
    requests.RequestException
        If the request times out, cannot connect, or the server answers with
        an HTTP error status.
    """
    schedule_url = "https://statsapi.mlb.com/api/v1/schedule"
    query = {"sportId": 1, "startDate": date1, "endDate": date2}
    # A timeout keeps the notebook from hanging forever on a stalled connection,
    # and raise_for_status surfaces HTTP errors instead of parsing an error page.
    response = requests.get(schedule_url, params=query, timeout=30)
    response.raise_for_status()
    return json_extract(response.json(), 'gamePk')

In [5]:
def play_grabber(gamePk):
    """
    Download the Baseball Savant game feed for one game and return its rows.

    The feed's 'team_home' and 'team_away' entries are each turned into a
    DataFrame and stacked (home rows first). No extra columns are added here.

    Parameters
    ----------
    gamePk
        Game identifier inserted into the `game_pk` query parameter.

    Returns
    -------
    pandas.DataFrame or the original gamePk
        The stacked rows on success. When the download or parsing fails the
        gamePk itself is returned as a failure marker, so callers must filter
        out non-DataFrame results before concatenating.
    """
    game_url = 'https://baseballsavant.mlb.com/gf?game_pk={}'.format(gamePk)
    try:
        # The timeout stops one stalled game from blocking the whole download loop.
        response = requests.get(game_url, timeout=30)
        response.raise_for_status()
        data = response.json()
        home_plays = pd.DataFrame(data['team_home'])
        away_plays = pd.DataFrame(data['team_away'])
        return pd.concat([home_plays, away_plays])
    except Exception:
        # Best effort per game. `Exception` (rather than a bare except) lets
        # KeyboardInterrupt / SystemExit still stop a long-running download.
        return gamePk

In [6]:
game_pks = get_gameID(date1="2023-03-30", date2=yesterday)

# play_grabber hands back the bare gamePk (not a DataFrame) when a game could
# not be downloaded; pd.concat rejects such entries, so separate them first.
results = [play_grabber(x) for x in game_pks]
dfs = [result for result in results if isinstance(result, pd.DataFrame)]
failed_pks = [result for result in results if not isinstance(result, pd.DataFrame)]
if failed_pks:
    print(f"Could not download {len(failed_pks)} game(s): {failed_pks}")

# pd.concat raises on an empty list, so only stack when something was downloaded.
if dfs:
    dfs = pd.concat(dfs)
    df = pd.concat([dfs, df])
del dfs

In [7]:
# Keep only the first column for every repeated column name, as an independent copy.
repeated_name = df.columns.duplicated()
df = df.loc[:, ~repeated_name].copy()

In [11]:
# Columns removed by exact name.
unused_columns = [
    'player_name', 'hc_x', 'hc_x_ft', 'hc_y', 'hc_y_ft', 'des',
    'runnerOn1B', 'events', 'runnerOn2B', 'runnerOn3B',
    'game_total_pitches', 'rowId', 'call', 'px', 'pz', 'ax', 'ay',
    'player_total_pitches',
]
df = df.drop(columns=unused_columns)

# Then remove every remaining column whose name contains one of these fragments,
# applied one after another in this order.
for fragment in ("pfx", "calc", "back", "Unnamed", "0", 'sz', 'az'):
    df = df.loc[:, ~df.columns.str.contains(fragment)]

In [12]:
# The stacked per-game frames repeat index labels; a fresh unique index is
# needed so later index-aligned column assignments land on the right rows.
# drop=True discards the old labels instead of keeping them as an 'index'
# column, which also makes this cell safe to run more than once.
df = df.reset_index(drop=True)

## Pitch Velocities and Spin Rates

In [13]:
pitches = ['FF', 'SL', 'SI', 'CH', 'CU', 'FC', 'KC', 'FS']

# Per pitcher and pitch type, average the source measurement over that pitch
# type's rows. All velocity columns are created before any spin column.
for suffix, measurement in (('velo', 'start_speed'), ('spin', 'spin_rate')):
    for pitch in pitches:
        rows_of_type = df[df['pitch_type'] == pitch]
        # Rows of other pitch types are absent from the right-hand side and
        # therefore receive NaN for now.
        df[f'{pitch}_{suffix}'] = rows_of_type.groupby('pitcher')[measurement].transform(lambda s: s.mean())

cols = [f'{pitch}_velo' for pitch in pitches] + [f'{pitch}_spin' for pitch in pitches]

# Spread each pitcher's value onto all of that pitcher's rows.
for col in cols:
    df[col] = df.groupby('pitcher')[col].transform(lambda s: s.fillna(s.mean()))

In [15]:
# Convert the measurement columns to numeric dtype (raises on unparseable values).
for numeric_col in ('hit_angle', 'extension', 'xba'):
    df[numeric_col] = pd.to_numeric(df[numeric_col])

# 1 on rows whose description is exactly 'Swinging Strike', 0 elsewhere.
is_swinging_strike = df['description'] == 'Swinging Strike'
df['whiff'] = np.where(is_swinging_strike, 1, 0)

## Averages

In [17]:
cols = ['extension', 'hit_angle', 'xba']

# Attach each pitcher's mean of the column to every one of that pitcher's rows.
for col in cols:
    per_pitcher = df.groupby('pitcher')[col]
    df[f'{col}_avg'] = per_pitcher.transform(lambda values: values.mean())

## Pitch and Zone Frequencies

In [18]:
# Share of each pitcher's rows that belong to a given pitch type / zone.
#
# The share is the per-pitcher mean of a 0/1 indicator, i.e. matching rows
# divided by all of the pitcher's rows. Computing it this way yields 0.0 for a
# pitch type or zone the pitcher never used; deriving it only from the matching
# rows left those cases as NaN, which then turned the summed zone shares
# computed from these columns into NaN as well.
pitcher_ids = df['pitcher']

for pitch in pitches:
    is_pitch = (df['pitch_type'] == pitch).astype(float)
    df[f'{pitch}_freq'] = is_pitch.groupby(pitcher_ids).transform('mean')

zones = [1,2,3,4,5,6,7,8,9,11,12,13,14]

for zone in zones:
    in_zone = (df['zone'] == zone).astype(float)
    df[f'zone_{str(zone)}_freq'] = in_zone.groupby(pitcher_ids).transform('mean')

# Names of every frequency column created above, pitch types first, then zones.
freq = [f'{pitch}_freq' for pitch in pitches] + [f'zone_{str(zone)}_freq' for zone in zones]

## Zones

In [19]:
def _zone_share(zone_numbers):
    """Add the listed zone frequency columns left to right (NaN propagates)."""
    total = df[f'zone_{zone_numbers[0]}_freq']
    for number in zone_numbers[1:]:
        total = total + df[f'zone_{number}_freq']
    return total


df['upper_z'] = _zone_share([1, 2, 3])
df['middle_z'] = _zone_share([4, 5, 6])
df['lower_z'] = _zone_share([7, 8, 9])
df['strike_freq'] = _zone_share([1, 2, 3, 4, 5, 6, 7, 8, 9])
df['ball_freq'] = _zone_share([11, 12, 13, 14])

# The per-zone columns are no longer needed once the grouped shares exist.
keep_column = ~df.columns.str.contains("zone_")
df = df.loc[:, keep_column]

## Dummies

In [20]:
# Bucket rows by where hit_angle_avg falls relative to its quartiles:
# top quartile, strictly between the quartiles, bottom quartile.
hit_angle_stats = df['hit_angle_avg'].describe()
la_q25, la_q75 = hit_angle_stats['25%'], hit_angle_stats['75%']

df['flyball'] = np.where(df['hit_angle_avg'] >= la_q75, 1, 0)
df['avg_fly_ground'] = np.where((df['hit_angle_avg'] < la_q75) & (df['hit_angle_avg'] > la_q25), 1, 0)
df['groundball'] = np.where(df['hit_angle_avg'] <= la_q25, 1, 0)

In [21]:
# Same quartile bucketing, applied to extension_avg.
extension_stats = df['extension_avg'].describe()
ext_q25, ext_q75 = extension_stats['25%'], extension_stats['75%']

df['tall'] = np.where(df['extension_avg'] >= ext_q75, 1, 0)
df['medium'] = np.where((df['extension_avg'] < ext_q75) & (df['extension_avg'] > ext_q25), 1, 0)
df['short'] = np.where(df['extension_avg'] <= ext_q25, 1, 0)

In [22]:
# Same quartile bucketing, applied to xba_avg.
xba_stats = df['xba_avg'].describe()
xba_q25, xba_q75 = xba_stats['25%'], xba_stats['75%']

df['barrel_finder'] = np.where(df['xba_avg'] >= xba_q75, 1, 0)
df['normal_barrels'] = np.where((df['xba_avg'] < xba_q75) & (df['xba_avg'] > xba_q25), 1, 0)
df['barrel_misser'] = np.where(df['xba_avg'] <= xba_q25, 1, 0)

In [23]:
# Each flag is 1 where its source column is at or above that column's 75% quartile.
top_quartile_flags = (
    ('high_cheeser', 'upper_z'),
    ('middle_middle', 'middle_z'),
    ('go_fishing', 'lower_z'),
    ('wild', 'ball_freq'),
    ('strike_thrower', 'strike_freq'),
)

for flag, source in top_quartile_flags:
    cutoff = df[source].describe()['75%']
    df[flag] = np.where(df[source] >= cutoff, 1, 0)

In [24]:
# Quartile buckets for every pitch type's velocity and spin columns.
for pitch in pitches:
    velo = df[f'{pitch}_velo']
    velo_stats = velo.describe()
    velo_q25, velo_q75 = velo_stats['25%'], velo_stats['75%']

    df[f'fast_{pitch}'] = np.where(velo >= velo_q75, 1, 0)
    df[f'normal_{pitch}'] = np.where((velo < velo_q75) & (velo > velo_q25), 1, 0)
    df[f'slow_{pitch}'] = np.where(velo <= velo_q25, 1, 0)

    spin = df[f'{pitch}_spin']
    spin_stats = spin.describe()
    spin_q25, spin_q75 = spin_stats['25%'], spin_stats['75%']

    df[f'tight_{pitch}'] = np.where(spin >= spin_q75, 1, 0)
    # The middle spin bucket must be bounded by the spin quartiles on both
    # sides; the lower bound previously compared velocity to the velocity quartile.
    df[f'normal_spin_{pitch}'] = np.where((spin < spin_q75) & (spin > spin_q25), 1, 0)
    df[f'cement_mixer_{pitch}'] = np.where(spin <= spin_q25, 1, 0)

## Hitters

Determine the zones in which each hitter has the highest chase rate, and which pitchers would dice them up. Also look at strike-three pitches by zone, home runs by zone, etc.

In [30]:
# Every split that can be built from the pitcher columns created so far. Kept
# for reference; only `splits` below is actually used.
pitcher_traits = ['flyball',
                  'avg_fly_ground',
                  'groundball',
                  'tall',
                  'medium',
                  'short',
                  'barrel_finder',
                  'normal_barrels',
                  'barrel_misser',
                  'high_cheeser',
                  'middle_middle',
                  'go_fishing',
                  'wild',
                  'strike_thrower']

all_splits = (['pitcher', 'p_throws']
              + pitcher_traits
              + [f'{kind}_{pitch_code}'
                 for pitch_code in ('FF', 'SL', 'SI', 'CH', 'CU', 'FC', 'KC', 'FS')
                 for kind in ('fast', 'normal', 'slow', 'tight', 'normal_spin', 'cement_mixer')])

# The subset of splits used for the hitter features.
splits = ['pitcher',
          'p_throws',
          'flyball',
          'avg_fly_ground',
          'groundball',
          'tall',
          'medium',
          'short',
          'high_cheeser',
          'middle_middle',
          'go_fishing',
          'wild',
          'strike_thrower',
          'fast_FF',
          'normal_FF',
          'slow_FF',
          'tight_FF',
          'normal_spin_FF',
          'cement_mixer_FF']

outcome_cols = ['hit_angle', 'hit_speed', 'xba', 'whiff']

# p_throws is deliberately left out of the numeric coercion: if it holds text
# labels (presumably 'L'/'R' - confirm), errors='coerce' would turn every value
# into NaN and groupby would then drop all rows from the p_throws split.
numeric_cols = [name for name in splits if name != 'p_throws'] + outcome_cols
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

for split in splits:
    # One pass per split: group means of all four outcomes, broadcast back onto
    # every row of the (batter, split value) group.
    group_means = df.groupby(['batter', split])[outcome_cols].transform('mean')
    df[f'la_vs_{split}'] = group_means['hit_angle']
    df[f'ev_vs_{split}'] = group_means['hit_speed']
    df[f'xba_vs_{split}'] = group_means['xba']
    # whiff is a 0/1 flag, so its group mean is whiffs / rows in the group.
    # Taking the mean fills the rate on every row of the group (and gives 0.0
    # for groups without a whiff) instead of only on the whiff rows.
    df[f'whiff%_vs_{split}'] = group_means['whiff']

In [27]:
## Drop by frequency
min_rows_exclusive = 500


def _has_enough_rows(batter_rows):
    """True when a batter has more rows than the cut-off."""
    return len(batter_rows) > min_rows_exclusive


df = df.groupby('batter_name').filter(_has_enough_rows)

## Daily Portion

In [33]:
# Today's starting pitchers mapped to the team each one is facing; the team
# code is matched against the 'team_batting' column when the daily table is built.
# NOTE(review): the codes mix two- and three-letter forms ('SD', 'KC' vs 'SFG',
# 'TBR') - confirm they all match the values present in 'team_batting'.
pitch_dict = {
    'Austin Gomber': 'SD',
    'Seth Lugo': 'COL',
    'Tyler Anderson': 'OAK',
    'Ken Waldichuk': 'LAA',
    'Chris Bassitt': 'STL',
    'Jordan Montgomery': 'TOR',
    'Ross Stripling': 'NYY',
    'Jhony Brito': 'SFG',
    'Jared Shuster': 'WAS',
    'MacKenzie Gore': 'ATL',
    'Cole Irvin': 'BOS',
    'Tanner Houck': 'BAL',
    'Eric Lauer': 'CHC',
    'Jameson Taillon': 'MIL',
    'Joey Wentz': 'TBR',
    'Jeffrey Springs': 'DET',
    'Bailey Falter': 'TEX',
    'Martin Perez': 'PHI',
    'Vince Velasquez': 'CIN',
    'Graham Ashcraft': 'PIT',
    'Joe Ryan': 'KC',
    'Brad Keller': 'MIN',
    'Kodai Senga': 'MIA',
    'Trevor Rogers': 'NYM',
    'Mike Clevinger': 'HOU',
    'Luis Garcia': 'CHW',
    'Zach Davies': 'LAD',
    'Noah Syndergaard': 'ARI',
    'Cal Quantrill': 'SEA',
    'Marco Gonzales': 'CLE',
}

def daily_gen(splits, pitch_dict, df):
    """
    Build the daily batter-vs-starter table.

    For every pitcher in `pitch_dict` the batters with rows for the opposing
    team are collected, and for each split the batter's `la_vs_`, `ev_vs_`,
    `xba_vs_` and `whiff%_vs_` values are looked up on rows whose split value
    equals the value on the pitcher's last row. A split whose pitcher value is
    0 contributes NaN. The per-split values are then averaged across splits
    (NaN ignored) and rounded to 3 decimals.

    Pitchers that never appear in df['pitcher_name'] are printed and skipped.

    Parameters
    ----------
    splits : list of str
        Split column names; df must also hold the four `*_vs_{split}` columns
        for each of them.
    pitch_dict : dict
        Pitcher name -> code of the team he faces (matched on 'team_batting').
    df : pandas.DataFrame
        Row-level data with 'pitcher_name', 'team_batting', 'batter_name', the
        split columns and the `*_vs_{split}` columns.

    Returns
    -------
    pandas.DataFrame
        Columns batter_name, batter_team, pitcher_name, avg_ev, avg_la,
        avg_xba, avg_whiff%; rows with any missing value are dropped and the
        rest sorted by avg_xba, then avg_ev, descending. Empty (with those
        columns) when no listed pitcher is found.
    """
    prefixes = ('la', 'ev', 'xba', 'whiff%')
    stat_cols = {prefix: [f'{prefix}_vs_{split}' for split in splits] for prefix in prefixes}
    output_cols = ['batter_name', 'batter_team', 'pitcher_name',
                   'avg_ev', 'avg_la', 'avg_xba', 'avg_whiff%']

    # Built once; rebuilding it for every pitcher rescans the whole frame.
    known_pitchers = set(df['pitcher_name'])

    matchups = []
    for pitcher, opponent in pitch_dict.items():
        if pitcher not in known_pitchers:
            print(pitcher)
            continue

        # The pitcher's split values are read from his last row.
        pitcher_row = df.loc[df['pitcher_name'] == pitcher].iloc[-1]
        lineup = df.loc[df['team_batting'] == opponent]
        matchup = lineup[['batter_name']].drop_duplicates(subset='batter_name', keep='last').copy()
        matchup['pitcher_name'] = pitcher

        for split in splits:
            value = pitcher_row[split]
            vs_cols = [f'{prefix}_vs_{split}' for prefix in prefixes]
            if value == 0:
                # The pitcher does not have this trait, so the split is ignored.
                for col in vs_cols:
                    matchup[col] = np.nan
                continue
            # Use the batter's numbers against THIS split value. Reading them
            # from the batter's last row would give whatever split value that
            # row happens to have. GroupBy.last() skips NaN, so a value that
            # is only filled on some rows of the group is still found.
            facing = lineup.loc[lineup[split] == value].groupby('batter_name')[vs_cols].last()
            for col in vs_cols:
                matchup[col] = matchup['batter_name'].map(facing[col]).astype(float)

        matchups.append(matchup)

    if not matchups:
        return pd.DataFrame(columns=output_cols)

    daily_df = pd.concat(matchups)
    daily_df['avg_ev'] = daily_df[stat_cols['ev']].mean(axis=1).round(3)
    daily_df['avg_la'] = daily_df[stat_cols['la']].mean(axis=1).round(3)
    daily_df['avg_xba'] = daily_df[stat_cols['xba']].mean(axis=1).round(3)
    daily_df['avg_whiff%'] = daily_df[stat_cols['whiff%']].mean(axis=1).round(3)
    daily_df['batter_team'] = daily_df['pitcher_name'].map(pitch_dict)

    return daily_df[output_cols].dropna(axis=0).sort_values(['avg_xba', 'avg_ev'], ascending=False)

# Build today's table; names of listed pitchers missing from the data are printed.
df1 = daily_gen(splits=splits, pitch_dict=pitch_dict, df=df)

Jared Shuster
Kodai Senga


In [36]:
# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing today's file.
os.makedirs("predictions", exist_ok=True)
df1.to_csv(f"predictions/{today}.csv")