In [1]:
import pandas as pd
import numpy as np
import warnings
import json
import requests
import os
import io
import sys
from lxml import etree, html
from datetime import date
from datetime import timedelta
# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Reference dates: `yesterday` (ISO string) bounds the schedule query,
# `today` names the predictions file written at the end of the notebook.
today = date.today()
yesterday = (today - timedelta(days=1)).isoformat()

In [2]:
# Abbreviated pitcher names mapped to the full names they are swapped for
# when the scraped pitcher list is normalised later in the notebook.
replace = dict([
    ('S. Fujinami', 'Shintaro Fujinami'),
    ('A. DeSclafani', 'Anthony DeSclafani'),
    ('G. Rodriguez', 'Grayson Rodriguez'),
])

In [3]:
# Previously saved play data (the file name suggests the 2022 season — confirm).
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
SAVED_PLAYS_CSV = "/Users/nickdimmitt/Desktop/dfs_local/mlb/data/single_play_2022.csv"
df = pd.read_csv(SAVED_PLAYS_CSV)

In [4]:
## function that extracts nested values from a JSON tree
def json_extract(tree, key):
    """
    Collect, in document order, every value stored under `key` anywhere in a
    decoded JSON structure of nested dicts and lists.

    Only scalar values are collected: when the value under a dict key is itself
    a dict or list it is descended into rather than returned, even if its key
    matches. Anything that is neither a dict nor a list yields nothing.
    """
    def _walk(node):
        if isinstance(node, dict):
            for name, value in node.items():
                if isinstance(value, (dict, list)):
                    # Containers are searched, never reported themselves.
                    yield from _walk(value)
                elif name == key:
                    yield value
        elif isinstance(node, list):
            for element in node:
                yield from _walk(element)

    return list(_walk(tree))

In [5]:
def get_gameID(date1, date2):
    """
    Return every gamePk listed by the MLB Stats API schedule between two dates.

    Parameters
    ----------
    date1, date2 : str
        Start and end date of the range, sent as the `startDate` and `endDate`
        query parameters (the notebook passes 'YYYY-MM-DD' strings).

    Returns
    -------
    list
        All values found under a 'gamePk' key anywhere in the response, in
        document order.

    Raises
    ------
    requests.RequestException
        If the request times out, fails, or returns an HTTP error status.
    """
    response = requests.get(
        "https://statsapi.mlb.com/api/v1/schedule",
        # Let requests build and encode the query string.
        params={"sportId": 1, "startDate": date1, "endDate": date2},
        # Without a timeout a stalled connection would hang the notebook indefinitely.
        timeout=30,
    )
    # Surface HTTP errors here instead of failing later on an unexpected body.
    response.raise_for_status()
    return json_extract(response.json(), 'gamePk')

In [6]:
def play_grabber(gamePk):
    """
    Download the Baseball Savant game feed for one game and return its rows.

    The feed's `team_home` and `team_away` entries are each turned into a
    dataframe and stacked, home rows first. This function adds no columns of
    its own; the `game_pk` column used later in the notebook presumably comes
    from the feed rows themselves — TODO confirm.

    Parameters
    ----------
    gamePk : int or str
        Game identifier substituted into the feed URL.

    Returns
    -------
    pandas.DataFrame or None
        The stacked rows, or None when the feed could not be fetched or parsed
        (best effort: one bad game must not abort the whole scrape).
    """
    game_url = 'https://baseballsavant.mlb.com/gf?game_pk={}'.format(gamePk)
    try:
        response = requests.get(game_url, timeout=30)
        response.raise_for_status()
        data = response.json()
        home_plays = pd.DataFrame(data['team_home'])
        away_plays = pd.DataFrame(data['team_away'])
        return pd.concat([home_plays, away_plays])
    except Exception as exc:
        # `Exception` rather than a bare `except`, so KeyboardInterrupt/SystemExit
        # still stop a long scrape; the skipped game is reported instead of vanishing.
        print(f"play_grabber: skipping game {gamePk}: {exc!r}", file=sys.stderr)
        return None

In [7]:
# Pull every game from the fixed start date through yesterday and stack the per-game frames.
START_DATE = "2023-03-30"
game_pks = get_gameID(date1=START_DATE, date2=yesterday)
dfs = pd.concat([play_grabber(game_pk) for game_pk in game_pks])

# batter name -> batting team; for a name that repeats, the last row seen wins.
hitter_dict = dict(zip(dfs['batter_name'], dfs['team_batting']))

# Newly fetched rows go in front of the rows loaded from disk.
df = pd.concat([dfs, df])
del dfs

In [8]:
# Keep only the first occurrence of each column name.
first_occurrence = ~df.columns.duplicated()
df = df.loc[:, first_occurrence].copy()

In [9]:
# Columns removed by exact name.
unused_columns = [
    'player_name', 'hc_x', 'hc_x_ft', 'hc_y', 'hc_y_ft', 'des',
    'runnerOn1B', 'events', 'runnerOn2B', 'runnerOn3B',
    'game_total_pitches', 'rowId', 'call', 'px', 'pz', 'ax', 'ay',
    'player_total_pitches',
]
df = df.drop(columns=unused_columns)

# Columns removed when their name contains one of these fragments.
# Note that "0" removes every column with a zero anywhere in its name.
for fragment in ("pfx", "calc", "back", "Unnamed", "0", 'sz', 'az'):
    df = df.loc[:, ~df.columns.str.contains(fragment)]

In [10]:
# Give every row a unique index (the old one is kept as a column), make
# game_pk numeric, and order the rows by game.
df = (
    df.reset_index()
      .assign(game_pk=lambda frame: pd.to_numeric(frame['game_pk']))
      .sort_values('game_pk')
)

## Pitch Velocities and Spin Rates

In [11]:
pitches = ['FF', 'SL', 'SI', 'CH', 'CU', 'FC', 'KC', 'FS']

# Per-pitcher mean of each measurement, computed on the rows of one pitch type
# at a time. All *_velo columns are created before any *_spin column.
for suffix, measurement in (('velo', 'start_speed'), ('spin', 'spin_rate')):
    for pitch in pitches:
        of_this_type = df[df['pitch_type'] == pitch]
        df[f'{pitch}_{suffix}'] = of_this_type.groupby('pitcher')[measurement].transform(lambda x: x.mean())

cols = [f'{pitch}_velo' for pitch in pitches] + [f'{pitch}_spin' for pitch in pitches]

# Rows of other pitch types are NaN at this point; fill them with the
# pitcher's own mean so every row of a pitcher carries the value.
for col in cols:
    df[col] = df.groupby('pitcher')[col].transform(lambda x: x.fillna(x.mean()))

In [12]:
# Make the measurement columns numeric.
for numeric_col in ('hit_angle', 'extension', 'xba'):
    df[numeric_col] = pd.to_numeric(df[numeric_col])

# 1 when the pitch description is exactly 'Swinging Strike', otherwise 0.
is_swinging_strike = df['description'] == 'Swinging Strike'
df['whiff'] = np.where(is_swinging_strike, 1, 0)

## Averages

In [13]:
cols = ['extension', 'hit_angle', 'xba']

# Broadcast each pitcher's mean of the measurement onto all of that pitcher's rows.
for col in cols:
    pitcher_mean = df.groupby('pitcher')[col].transform(lambda x: x.mean())
    df[f'{col}_avg'] = pitcher_mean

## Pitch and Zone Frequencies

In [14]:
# Number of rows per pitcher; the denominator for every frequency below.
df['pitch_sum'] = df.groupby('pitcher')['pitcher'].transform('size')


def _share_of_pitcher_total(column, value):
    """Rows of a pitcher where `column == value`, divided by that pitcher's row count."""
    matching = df[df[column] == value]
    return matching.groupby(['pitcher', column])['pitcher'].transform('size') / df['pitch_sum']


for pitch in pitches:
    df[f'{pitch}_freq'] = _share_of_pitcher_total('pitch_type', pitch)

zones = [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14]

for zone in zones:
    df[f'zone_{zone}_freq'] = _share_of_pitcher_total('zone', zone)

freq = [f'{pitch}_freq' for pitch in pitches] + [f'zone_{zone}_freq' for zone in zones]

# Non-matching rows are NaN so far; spread each pitcher's value to all of their rows.
for col in freq:
    df[col] = df.groupby('pitcher')[col].transform(lambda x: x.fillna(x.mean()))

df = df.drop(columns='pitch_sum')

## Zones

In [15]:
def _sum_zone_freqs(zone_numbers):
    """Add the listed zone_<n>_freq columns, left to right."""
    total = df[f'zone_{zone_numbers[0]}_freq']
    for number in zone_numbers[1:]:
        total = total + df[f'zone_{number}_freq']
    return total


df['upper_z'] = _sum_zone_freqs([1, 2, 3])
df['middle_z'] = _sum_zone_freqs([4, 5, 6])
df['lower_z'] = _sum_zone_freqs([7, 8, 9])
df['strike_freq'] = _sum_zone_freqs([1, 2, 3, 4, 5, 6, 7, 8, 9])
df['ball_freq'] = _sum_zone_freqs([11, 12, 13, 14])

# The individual zone columns are no longer needed once the aggregates exist.
is_zone_column = df.columns.str.contains("zone_")
df = df.loc[:, ~is_zone_column]

## Dummies

In [16]:
# 0/1 flags for the pitcher's throwing hand.
for flag, hand in (('righty', 'R'), ('lefty', 'L')):
    df[flag] = np.where(df['p_throws'] == hand, 1, 0)

In [17]:
# Bucket rows by the quartiles of the pitcher's average launch angle:
# top quartile, strictly between the quartiles, bottom quartile.
hit_angle_summary = df['hit_angle_avg'].describe()
la_q1, la_q3 = hit_angle_summary['25%'], hit_angle_summary['75%']

df['flyball'] = np.where(df['hit_angle_avg'] >= la_q3, 1, 0)
df['avg_fly_ground'] = np.where((df['hit_angle_avg'] < la_q3) & (df['hit_angle_avg'] > la_q1), 1, 0)
df['groundball'] = np.where(df['hit_angle_avg'] <= la_q1, 1, 0)

In [18]:
# Bucket rows by the quartiles of the pitcher's average extension.
extension_summary = df['extension_avg'].describe()
ext_q1, ext_q3 = extension_summary['25%'], extension_summary['75%']

df['tall'] = np.where(df['extension_avg'] >= ext_q3, 1, 0)
df['medium'] = np.where((df['extension_avg'] < ext_q3) & (df['extension_avg'] > ext_q1), 1, 0)
df['short'] = np.where(df['extension_avg'] <= ext_q1, 1, 0)

In [19]:
# Bucket rows by the quartiles of the pitcher's average xBA.
xba_summary = df['xba_avg'].describe()
xba_q1, xba_q3 = xba_summary['25%'], xba_summary['75%']

df['barrel_finder'] = np.where(df['xba_avg'] >= xba_q3, 1, 0)
df['normal_barrels'] = np.where((df['xba_avg'] < xba_q3) & (df['xba_avg'] > xba_q1), 1, 0)
df['barrel_misser'] = np.where(df['xba_avg'] <= xba_q1, 1, 0)

In [20]:
# Flag rows whose location share is at or above the 75th percentile of that share.
top_quartile_flags = (
    ('high_cheeser', 'upper_z'),
    ('middle_middle', 'middle_z'),
    ('go_fishing', 'lower_z'),
    ('wild', 'ball_freq'),
    ('strike_thrower', 'strike_freq'),
)
for flag, share_col in top_quartile_flags:
    cutoff = df[share_col].describe()['75%']
    df[flag] = np.where(df[share_col] >= cutoff, 1, 0)

In [21]:
# For every pitch type, flag each row by where its velocity and spin columns
# fall relative to the quartiles of those columns:
# top quartile / strictly between the quartiles / bottom quartile.
for pitch in pitches:
    velo = df[f'{pitch}_velo']
    spin = df[f'{pitch}_spin']
    # Compute each quartile once instead of calling describe() for every flag.
    velo_q1, velo_q3 = velo.quantile(0.25), velo.quantile(0.75)
    spin_q1, spin_q3 = spin.quantile(0.25), spin.quantile(0.75)

    df[f'fast_{pitch}'] = np.where(velo >= velo_q3, 1, 0)
    df[f'normal_{pitch}'] = np.where((velo < velo_q3) & (velo > velo_q1), 1, 0)
    df[f'slow_{pitch}'] = np.where(velo <= velo_q1, 1, 0)

    df[f'tight_{pitch}'] = np.where(spin >= spin_q3, 1, 0)
    # Both bounds use spin here. The lower bound previously compared velocity
    # against the velocity quartile, so the "normal spin" band was not a spin band.
    df[f'normal_spin_{pitch}'] = np.where((spin < spin_q3) & (spin > spin_q1), 1, 0)
    df[f'cement_mixer_{pitch}'] = np.where(spin <= spin_q1, 1, 0)

## Hitters

Determine the zones in which each hitter has the highest chase rate, and which pitchers would exploit those zones: strike-three pitches by zone, home runs by zone, etc.

In [22]:
# Pitcher trait flags that hitter performance is split by.
splits = [
    'righty', 'lefty',
    'flyball', 'avg_fly_ground', 'groundball',
    'tall', 'medium', 'short',
    'barrel_finder', 'normal_barrels', 'barrel_misser',
    'high_cheeser', 'middle_middle', 'go_fishing',
    'wild', 'strike_thrower',
]

# Coerce the flags and batted-ball measurements to numbers (unparseable values become NaN).
numeric_cols = splits + ['hit_angle', 'hit_speed', 'xba', 'whiff']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Per-batter means over all rows.
for out_col, metric in (('avg_ev', 'hit_speed'), ('avg_la', 'hit_angle'), ('avg_xba', 'xba')):
    df[out_col] = df.groupby('batter')[metric].transform(lambda x: x.mean())


df = df.sort_values(by='batter_name')

# Per-batter means within each value of each pitcher flag, plus whiff share
# (whiffs divided by the number of rows in the batter/flag group).
for split in splits:
    group_keys = ['batter', split]
    df['split_size'] = df.groupby(group_keys)['batter'].transform('size')
    for prefix, metric in (('la', 'hit_angle'), ('ev', 'hit_speed'), ('xba', 'xba')):
        df[f'{prefix}_vs_{split}'] = df.groupby(group_keys)[metric].transform(lambda x: x.mean())
    df[f'whiff%_vs_{split}'] = df.groupby(group_keys)['whiff'].transform('sum') / df['split_size']

# The same statistics for each individual batter/pitcher pairing.
matchup_keys = ['batter', 'pitcher']
df['split_size'] = df.groupby(matchup_keys)['batter'].transform('size')
for prefix, metric in (('ev', 'hit_speed'), ('la', 'hit_angle'), ('xba', 'xba')):
    df[f'{prefix}_vs_pitcher'] = df.groupby(matchup_keys)[metric].transform(lambda x: x.mean())
df['whiff%_vs_pitcher'] = df.groupby(matchup_keys)['whiff'].transform('sum') / df['split_size']

In [23]:
# NOTE(review): this replaces the `pitcher` column (the grouping key used by the
# earlier cells) with the mean `hit_speed` of each batter/pitcher pair — the same
# expression already stored in `ev_vs_pitcher`. It looks unintentional: once it
# has run, the cells that group by `pitcher` cannot be re-run on this frame. Confirm intent.
df['pitcher'] = df.groupby(['batter', 'pitcher'])['hit_speed'].transform(lambda x:x.mean())

In [24]:
# Order by game and keep only rows whose game_pk is above the median game_pk.
df = df.sort_values(by='game_pk', ascending=True)
median_game_pk = df['game_pk'].describe()['50%']
df = df[df['game_pk'] > median_game_pk]

In [25]:
# Scrape pitcher names and team abbreviations from RotoWire's daily lineups page.
url = 'https://www.rotowire.com/baseball/daily-lineups.php'
xpath = '//body//div[@class="lineup__player-highlight-name"]//a//text()'
xpath_teams = '//body//div[@class="lineup__teams"]//div[@class="lineup__abbr"]//text()'
# A timeout keeps a stalled connection from hanging the notebook.
results = requests.get(url, timeout=30)
# Stop on an HTTP error: parsing an error page would only yield empty lists
# and the failure would surface much later, far from its cause.
results.raise_for_status()
results_tree = html.fromstring(results.content)
pitchers = results_tree.xpath(xpath)
teams = results_tree.xpath(xpath_teams)

In [26]:
# The scraped abbreviations alternate; even positions are treated as away
# teams and odd positions as home teams.
away_teams = teams[::2]
home_teams = teams[1::2]

# Re-interleave with each pair swapped (home first, then away) so that position
# k holds the opponent of the team at position k in the scraped order.
teams = []
for i, away_abbr in enumerate(away_teams):
    teams.extend((home_teams[i], away_abbr))

# Expand abbreviated pitcher names, then pair each pitcher with a team by position.
pitchers = [replace.get(name, name) for name in pitchers]
pitch_dict = dict(zip(pitchers, teams))

In [27]:
# Pitcher trait flags used as matchup splits: the same sixteen names, in the
# same order, as the list defined in the Hitters section.
splits = [
    'righty', 'lefty',
    'flyball', 'avg_fly_ground', 'groundball',
    'tall', 'medium', 'short',
    'barrel_finder', 'normal_barrels', 'barrel_misser',
    'high_cheeser', 'middle_middle', 'go_fishing',
    'wild', 'strike_thrower',
]

## Daily Portion

In [33]:

def daily_gen(splits, pitch_dict, df, hitter_dict):
    """
    Rank today's hitters against the pitcher they are scheduled to face.

    Parameters
    ----------
    splits : list of str
        Names of the 0/1 pitcher-flag columns in `df`. For every name, `df`
        must also hold `la_vs_<name>`, `ev_vs_<name>`, `xba_vs_<name>` and
        `whiff%_vs_<name>`.
    pitch_dict : dict
        pitcher name -> value of `team_batting` for the lineup facing him.
    df : pandas.DataFrame
        Rows with `pitcher_name`, `batter_name`, `team_batting`, the columns
        named above, and `ev_vs_pitcher`, `la_vs_pitcher`, `xba_vs_pitcher`,
        `whiff%_vs_pitcher`.
    hitter_dict : dict
        batter name -> team; a batter is kept only when this team equals the
        team `pitch_dict` assigns to the pitcher.

    Returns
    -------
    pandas.DataFrame
        One row per batter with columns `batter_name`, `batter_team`,
        `pitcher_name`, `ev_rank`, `la_rank`, `xba_rank`, `whiff_rank`, `stat`,
        sorted by `stat` ascending. An empty frame with these columns is
        returned when none of the pitchers in `pitch_dict` appears in `df`.

    Notes
    -----
    The name of every pitcher in `pitch_dict` that has no rows in `df` is printed.
    """
    la_split = [f'la_vs_{split}' for split in splits]
    ev_split = [f'ev_vs_{split}' for split in splits]
    xba_split = [f'xba_vs_{split}' for split in splits]
    whiff_split = [f'whiff%_vs_{split}' for split in splits]
    columns = la_split + ev_split + xba_split + whiff_split

    output_columns = ['batter_name', 'batter_team', 'pitcher_name',
                      'ev_rank', 'la_rank', 'xba_rank', 'whiff_rank', 'stat']

    # Collect one frame per pitcher and concatenate once, instead of growing a
    # frame inside the loop.
    matchup_frames = []
    for pitcher, batting_team in pitch_dict.items():
        pitcher_rows = df.loc[df['pitcher_name'] == pitcher, splits]
        if pitcher_rows.empty:
            # Unknown pitcher: report and skip him.
            print(pitcher)
            continue

        # Every row of the opposing lineup, tagged with today's pitcher.
        # .copy() so the assignments below never write into a view of `df`.
        matchup = df.loc[df['team_batting'] == batting_team, ['batter_name'] + columns].copy()
        matchup['pitcher_name'] = pitcher
        for split in splits:
            # The pitcher's flags are taken from his last row in `df`.
            matchup[split] = pitcher_rows[split].iloc[-1]
        matchup_frames.append(matchup)

    if not matchup_frames:
        # Nothing to rank; return an empty table rather than failing on missing columns.
        return pd.DataFrame(columns=output_columns)

    daily_df = pd.concat(matchup_frames)
    daily_df = daily_df[['batter_name', 'pitcher_name'] + splits + columns]

    # Blank out the statistics of every flag today's pitcher does not carry, so
    # they do not contribute to the weighted sums below.
    for split in splits:
        flag_off = daily_df[split] == 0
        for prefix in ('ev', 'la', 'xba', 'whiff%'):
            stat_col = f'{prefix}_vs_{split}'
            daily_df[stat_col] = np.where(flag_off, np.nan, daily_df[stat_col])

    # NOTE(review): the right-hand side is not de-duplicated, so each row on the
    # left is repeated once per matching row of `df`; the duplicates are only
    # collapsed by the drop_duplicates call further down.
    daily_df = daily_df.merge(
        df[['batter_name', 'pitcher_name', 'ev_vs_pitcher', 'la_vs_pitcher',
            'xba_vs_pitcher', 'whiff%_vs_pitcher']],
        how='left', on=['batter_name', 'pitcher_name'])

    # Weight 10 for the first two splits, 5 for each remaining split and 2 for
    # the head-to-head column. Built from len(splits) so the weights always
    # line up with the columns; for sixteen splits this is the original list.
    split_weights = ([10, 10] + [5] * max(len(splits) - 2, 0))[:len(splits)]
    weights = split_weights + [2]
    # These are weighted sums (NaNs skipped), despite the avg_ prefix.
    daily_df['avg_ev'] = round(daily_df[ev_split + ['ev_vs_pitcher']].mul(weights).sum(axis=1), 3)
    daily_df['avg_la'] = round(daily_df[la_split + ['la_vs_pitcher']].mul(weights).sum(axis=1), 3)
    daily_df['avg_xba'] = round(daily_df[xba_split + ['xba_vs_pitcher']].mul(weights).sum(axis=1), 3)
    daily_df['avg_whiff%'] = round(daily_df[whiff_split + ['whiff%_vs_pitcher']].mul(weights).sum(axis=1), 3)

    # Keep batters whose team in hitter_dict is the team facing this pitcher,
    # one row per batter (the last one).
    daily_df['batter_team'] = daily_df['pitcher_name'].map(pitch_dict)
    daily_df['drop_check'] = daily_df['batter_name'].map(hitter_dict)
    on_expected_team = daily_df['batter_team'] == daily_df['drop_check']
    daily_df = daily_df[on_expected_team].drop_duplicates(subset='batter_name', keep='last')

    # Rank 1 is the highest ev / la / xba and the lowest whiff value.
    daily_df['ev_rank'] = daily_df['avg_ev'].rank(ascending=False)
    daily_df['la_rank'] = daily_df['avg_la'].rank(ascending=False)
    daily_df['xba_rank'] = daily_df['avg_xba'].rank(ascending=False)
    daily_df['whiff_rank'] = daily_df['avg_whiff%'].rank(ascending=True)

    # NOTE(review): these weights sum to 1.1, so `stat` is a weighted sum of
    # ranks rather than a weighted average — confirm that is intended.
    rank_weights = [0.3, 0.3, 0.3, 0.2]
    daily_df['stat'] = round(
        daily_df[['ev_rank', 'la_rank', 'xba_rank', 'whiff_rank']].mul(rank_weights).sum(axis=1), 3)

    return daily_df[output_columns].sort_values('stat', ascending=True)

# Build today's ranked hitter table. daily_gen prints the name of any pitcher
# in pitch_dict that is not present in df['pitcher_name'].
df1 = daily_gen(splits, pitch_dict, df, hitter_dict)

Jose Butto


In [34]:
# Display the table returned by daily_gen.
df1

Unnamed: 0,batter_name,batter_team,pitcher_name,ev_rank,la_rank,xba_rank,whiff_rank,stat
468458,Mike Trout,LAA,Garrett Whitlock,1.0,2.0,1.0,262.0,53.60
468513,Brandon Drury,LAA,Garrett Whitlock,9.0,44.0,11.0,236.0,66.40
467641,Taylor Ward,LAA,Garrett Whitlock,51.0,79.0,31.0,108.0,69.90
1320814,Rowdy Tellez,MIL,Yu Darvish,23.0,41.0,44.0,210.0,74.40
1237268,Will Smith,LAD,Drew Smyly,92.0,45.0,73.0,57.0,74.40
...,...,...,...,...,...,...,...,...
492369,Matt Wallner,MIN,Gerrit Cole,368.0,317.0,366.0,254.0,366.10
606638,Avisail Garcia,MIA,Zac Gallen,352.0,372.0,345.5,314.0,383.65
527297,Gabriel Arias,CLE,Patrick Corbin,375.0,375.0,365.0,248.0,384.10
491830,Edouard Julien,MIN,Gerrit Cole,356.0,383.0,384.0,258.5,388.60


In [35]:
# Save today's table as predictions/<today>.csv.
# Create the folder first: to_csv does not create missing directories.
os.makedirs("predictions", exist_ok=True)
df1.to_csv(f"predictions/{today}.csv")