In [24]:
import calendar
import functools
import itertools
import os
import re
import urllib
import urllib.error
import urllib.request
from multiprocessing.dummy import Pool as ThreadPool

import requests
from bs4 import BeautifulSoup

In [25]:
# Root URL of the MLB game data tree; the per-day path segments
# (year_YYYY/month_MM/day_DD) are appended to it when the game list is built.
base_url = "http://gd2.mlb.com/components/game/mlb/"

In [26]:
# Dates to download pitchfx data for, as (month, day, year) tuples.
# Covers every real calendar day from April through September; the day range is
# taken from the calendar so impossible dates such as April 31 are never requested.
SEASON_YEAR = 2019
date_tuples = [
    (month, day, SEASON_YEAR)
    for month in range(4, 10)
    for day in range(1, calendar.monthrange(SEASON_YEAR, month)[1] + 1)
]

In [27]:
# Add the late-March dates that fall outside the April-September range.
# Only dates not already present are appended, so re-running this cell cannot
# duplicate entries (duplicates would produce duplicate games and pitch rows).
for extra_date in [(3, 28, 2019), (3, 31, 2019), (3, 30, 2019)]:
    if extra_date not in date_tuples:
        date_tuples.append(extra_date)

In [None]:
# Display the complete list of (month, day, year) tuples that will be queried.
date_tuples

In [None]:
# Build the list of game directory URLs to download, one entry per matching game.
# Only links whose href contains TEAM_CODE are kept.
# NOTE(review): "nya" is presumably the Yankees team code (the output file is
# yankees_2019.csv) - confirm before reusing for another team.
TEAM_CODE = "nya"
games = []
for month, day, year in date_tuples:
    day_url = f"{base_url}year_{year}/month_{month:02d}/day_{day:02d}"
    try:
        response = requests.get(day_url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        # Best effort: a missing or unreachable day is reported and skipped,
        # but programming errors are no longer swallowed.
        print(f"Skipping {day_url}: {err}")
        continue
    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")
    # href=True skips anchors without an href, which previously raised a
    # TypeError that discarded every game of that day.
    for node in soup.find_all('a', href=True):
        href = node['href']
        if TEAM_CODE not in href:
            continue
        parts = href.split('/')
        # The game directory name is the second path segment of the link.
        if len(parts) > 1 and parts[1]:
            games.append(f"{day_url}/{parts[1]}")

In [None]:
#games # just a check to see if there's data here

In [None]:
# Download the pitchfx XML (inning/inning_all.xml) for every discovered game.
# The work is network-bound, so a small thread pool makes it much faster.
def _download_game(game_url):
    """Download one game's inning_all.xml to ``<game id>.xml`` in the working directory.

    Returns what ``urlretrieve`` returns for a fresh download, or None when the
    file already exists locally or the download failed.
    """
    # Segment 9 of the URL is the game directory name; the loader cell derives
    # the same file name the same way.
    target = f"{game_url.split('/')[9]}.xml"
    if os.path.isfile(target):
        return None
    try:
        return urllib.request.urlretrieve(f"{game_url}/inning/inning_all.xml", target)
    except urllib.error.URLError as err:
        # A truncated transfer can leave a partial file behind; remove it so the
        # isfile() check above retries the game on the next run.
        if os.path.isfile(target):
            os.remove(target)
        print(f"Could not download {game_url}: {err}")
        return None


# The context manager releases the worker threads even if mapping fails.
with ThreadPool(4) as pool:
    results = pool.map(_download_game, games)

In [None]:
!pip install scipy

## The code below parses the XML files downloaded above into a pandas DataFrame and looks up the home-plate umpire for each game

In [29]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import os
from scipy.spatial import distance
from scipy.spatial.distance import cdist
# Allow up to 500 columns when a DataFrame is displayed, so wide frames are not truncated.
pd.set_option('display.max_columns', 500)

In [30]:
# Used later to find the center of the strike zone.
# Feet, then an apostrophe or hyphen, an optional single space, then one or two
# inch digits - e.g. "5' 8", "6'1" or "6-2".
_HEIGHT_PATTERN = re.compile(r"(\d+)['-] ?(\d\d?)")


def to_inches(height_string):
    """Convert a feet-and-inches height string to a total number of inches.

    Parameters
    ----------
    height_string : str
        Text containing a height such as ``"5' 8"`` or ``"6-2"``. The first
        match found in the string is used.

    Returns
    -------
    int
        ``feet * 12 + inches``.

    Raises
    ------
    ValueError
        If no feet/inches pair can be found in ``height_string``.
    """
    match = _HEIGHT_PATTERN.search(height_string)
    if match is None:
        raise ValueError(f"cannot parse a height from {height_string!r}")
    feet, inches = match.groups()
    return int(feet) * 12 + int(inches)

In [31]:
# Used later so that the highlight links are easier to open.
pd.set_option("display.max_rows", 300)


def make_clickable(val):
    """Wrap ``val`` in an HTML anchor that uses it as both the link target and the label.

    The anchor carries ``target="_blank"`` so the link opens in a new window.
    """
    anchor_template = '<a target="_blank" href="{}">{}</a>'
    anchor = anchor_template.format(val, val)
    return anchor


In [32]:
def get_ump(game):
    """Return the name of the home umpire for a game, or None if none is listed.

    Parameters
    ----------
    game : str
        URL of the game directory (no trailing slash); ``players.xml`` is
        fetched from directly beneath it.

    Returns
    -------
    str or None
        The ``name`` attribute of the first ``umpire`` element whose
        ``position`` attribute is ``"home"``; None when there is no such element.
    """
    # Parse straight from the response instead of going through a shared
    # "players.xml" scratch file in the working directory. Handing the parser
    # the raw bytes also lets it honour the encoding declared in the XML.
    with urllib.request.urlopen(f"{game}/players.xml") as response:
        root = ET.parse(response).getroot()
    for ump in root.iter(tag="umpire"):
        # .get() tolerates umpire elements that lack a position/name attribute.
        if ump.attrib.get("position") == "home":
            return ump.attrib.get("name")
    return None

In [33]:
# Smoke test: look up the home umpire for the second game in the list.
get_ump(games[1])

'Brian Knight'

In [34]:
# Build one flat record per pitch across all downloaded games. Each record holds
# the pitch's own XML attributes plus fields copied from its at-bat and game.
at_bats = []
for game in games:
    # Segment 9 of the game URL is the game directory name, which is also the
    # base name of the XML file saved by the download step.
    game_id = game.split('/')[9]
    xml_path = f"{game_id}.xml"
    if not os.path.isfile(xml_path):
        # A game whose download failed is skipped rather than aborting the run.
        print(f"Skipping {game_id}: {xml_path} has not been downloaded")
        continue
    # Parsing by path lets the XML parser decode the file from its own encoding
    # declaration instead of the platform's default text encoding.
    root = ET.parse(xml_path).getroot()
    home_ump = get_ump(game)
    for atbat in root.iter(tag='atbat'):
        pitches = list(atbat.iter(tag='pitch'))
        if not pitches:
            # Nothing to record, and the at-bat attributes are never read.
            continue
        # Identical for every pitch of the at-bat, so compute it once.
        atbat_fields = {
            "home_ump": home_ump,
            "batter": atbat.attrib["batter"],
            "pitcher": atbat.attrib["pitcher"],
            "atbat_result": atbat.attrib["event"],
            "atbat_des": atbat.attrib["des"],
            "atbat_num": atbat.attrib["num"],
            "atbat_b_height_inches": to_inches(atbat.attrib["b_height"]),
            "game_id": game_id,
        }
        for pitch in pitches:
            # Merge into a new dict so the parsed element is left unmodified.
            at_bats.append({**pitch.attrib, **atbat_fields})

## umpire closeness starts here

In [35]:
# One row per pitch record collected in at_bats. Values copied from XML
# attributes are still strings at this point; no dtype conversion is applied.
df_ab = pd.DataFrame(at_bats)

In [36]:
import pybaseball

In [38]:
# Peek at the first few batter ids.
df_ab.batter.head()

0    543281
1    543281
2    543281
3    543281
4    543281
Name: batter, dtype: object

In [44]:
# Try the reverse lookup on one id and join first and last name into a full name.
(
    pybaseball.playerid_reverse_lookup([543281])
    .pipe(lambda table: table["name_first"] + " " + table["name_last"])
)

Gathering player lookup table. This may take a moment.


0    josh harrison
dtype: object

In [45]:
# Look up every distinct batter once. The ids in df_ab are strings (object
# dtype), so they are converted to integers explicitly rather than relying on
# the lookup to coerce them.
batters = pybaseball.playerid_reverse_lookup(df_ab.batter.astype(int).unique().tolist())

Gathering player lookup table. This may take a moment.


In [47]:
# Look up every distinct pitcher once. The ids in df_ab are strings (object
# dtype), so they are converted to integers explicitly rather than relying on
# the lookup to coerce them.
pitchers = pybaseball.playerid_reverse_lookup(df_ab.pitcher.astype(int).unique().tolist())

Gathering player lookup table. This may take a moment.


In [55]:
def _player_full_name(lookup, player_id):
    """Return "first last" for the row of ``lookup`` whose key_mlbam equals ``player_id``."""
    matches = lookup.query(f"key_mlbam == {player_id}")
    if matches.empty:
        # Same exception type as the bare .iloc[0] used to raise, but naming the id.
        raise IndexError(f"no player with key_mlbam == {player_id} in the lookup table")
    player = matches.iloc[0]
    return f'{player["name_first"]} {player["name_last"]}'


def get_batter_name(batter_id):
    """Return the full name of the batter with the given MLBAM id.

    Reads the module-level ``batters`` lookup table. Raises IndexError when the
    id is not present in it.
    """
    return _player_full_name(batters, batter_id)


def get_pitcher_name(pitcher_id):
    """Return the full name of the pitcher with the given MLBAM id.

    Reads the module-level ``pitchers`` lookup table. Raises IndexError when the
    id is not present in it.
    """
    return _player_full_name(pitchers, pitcher_id)

In [56]:
# Spot-check the name lookup with a single id.
get_batter_name(642715)

'willy adames'

In [54]:
# Resolve each distinct batter id once and map the names back onto every pitch
# row, instead of running one lookup query per pitch.
batter_names = {batter_id: get_batter_name(batter_id) for batter_id in df_ab.batter.unique()}
df_ab["batter_name"] = df_ab.batter.map(batter_names)

In [57]:
# Resolve each distinct pitcher id once and map the names back onto every pitch
# row, instead of running one lookup query per pitch.
pitcher_names = {pitcher_id: get_pitcher_name(pitcher_id) for pitcher_id in df_ab.pitcher.unique()}
df_ab["pitcher_name"] = df_ab.pitcher.map(pitcher_names)

In [58]:
# Preview the first rows of the pitch-level frame.
df_ab.head()

Unnamed: 0,ax,ay,az,break_angle,break_length,break_y,cc,code,des,des_es,end_speed,event_num,id,mt,nasty,pfx_x,pfx_z,pitch_type,play_guid,px,pz,spin_dir,spin_rate,start_speed,sv_id,sz_bot,sz_top,tfs,tfs_zulu,type,type_confidence,vx0,vy0,vz0,x,x0,y,y0,z0,zone,home_ump,batter,pitcher,atbat_result,atbat_des,atbat_num,atbat_b_height_inches,game_id,batter_name,pitcher_name
0,-13.24,30.05,-15.24,34.8,4.8,24.0,,C,Called Strike,"In play, out(s)",86.4,2,2,,,-6.93,8.86,FF,0f37f9a8-a0b4-4fa9-8057-95614a4c5f67,0.66,2.32,placeholder,placeholder,94.6,190401_223912,1.6,3.46,223907,2019-04-01T22:39:07.000Z,S,placeholder,8.76,-137.3,-5.37,91.73,-1.67,176.15,50.0,5.33,placeholder,Gerry Davis,543281,593334,Pop Out,Josh Harrison pops out to second baseman Gleyb...,1,68,gid_2019_04_01_detmlb_nyamlb_1,josh harrison,domingo german
1,-13.61,30.67,-16.23,33.6,4.8,24.0,,S,Swinging Strike,Called Strike,86.4,3,3,,,-7.12,8.34,FF,7f6a7cf6-5250-4e41-952d-aa51dbb61b4a,1.19,2.31,placeholder,placeholder,94.8,190401_223925,1.48,3.26,223920,2019-04-01T22:39:20.000Z,S,placeholder,9.99,-137.47,-5.21,71.77,-1.57,176.31,50.0,5.34,placeholder,Gerry Davis,543281,593334,Pop Out,Josh Harrison pops out to second baseman Gleyb...,1,68,gid_2019_04_01_detmlb_nyamlb_1,josh harrison,domingo german
2,0.88,23.11,-37.31,2.4,10.8,24.0,,B,Ball,Swinging Strike,75.5,4,4,,,0.6,-3.54,CU,ab7edd0c-3ee5-4ff9-b817-5e8f4b1a340d,-0.73,3.75,placeholder,placeholder,82.4,190401_223942,1.61,3.44,223937,2019-04-01T22:39:37.000Z,B,placeholder,1.75,-119.77,2.64,144.96,-1.55,137.39,50.0,5.97,placeholder,Gerry Davis,543281,593334,Pop Out,Josh Harrison pops out to second baseman Gleyb...,1,68,gid_2019_04_01_detmlb_nyamlb_1,josh harrison,domingo german
3,-14.51,32.61,-15.76,37.2,4.8,24.0,,F,Foul,Ball,87.6,5,5,,,-7.36,8.32,FF,554ca6b9-83c6-4d3d-b2bb-3a58a4051839,0.65,2.53,placeholder,placeholder,96.3,190401_223956,1.48,3.26,223952,2019-04-01T22:39:52.000Z,S,placeholder,8.1,-139.81,-5.01,92.4,-1.34,170.51,50.0,5.38,placeholder,Gerry Davis,543281,593334,Pop Out,Josh Harrison pops out to second baseman Gleyb...,1,68,gid_2019_04_01_detmlb_nyamlb_1,josh harrison,domingo german
4,1.27,20.32,-38.45,2.4,10.8,24.0,,X,"In play, out(s)",Foul,76.5,6,6,,,0.86,-4.28,CU,c49afc96-76da-450b-8a97-46b588a80c05,-0.06,2.57,placeholder,placeholder,82.2,190401_224029,1.63,3.55,224016,2019-04-01T22:40:16.000Z,X,placeholder,2.66,-119.62,0.32,119.04,-1.29,169.44,50.0,5.84,placeholder,Gerry Davis,543281,593334,Pop Out,Josh Harrison pops out to second baseman Gleyb...,1,68,gid_2019_04_01_detmlb_nyamlb_1,josh harrison,domingo german


In [59]:
# Save the pitch-level data. index=False keeps the meaningless row index out of
# the file, so reloading it does not produce a stray "Unnamed: 0" column.
df_ab.to_csv("yankees_2019.csv", index=False)