In [3]:
%pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.7-py3-none-any.whl (235 kB)
                                              0.0/235.5 kB ? eta -:--:--
                                              0.0/235.5 kB ? eta -:--:--
     -                                        10.2/235.5 kB ? eta -:--:--
     -                                        10.2/235.5 kB ? eta -:--:--
     -                                        10.2/235.5 kB ? eta -:--:--
     ----                                  30.7/235.5 kB 262.6 kB/s eta 0:00:01
     ------                                41.0/235.5 kB 245.8 kB/s eta 0:00:01
     ------------                          81.9/235.5 kB 416.7 kB/s eta 0:00:01
     -----------------                    112.6/235.5 kB 504.4 kB/s eta 0:00:01
     -------------------------            163.8/235.5 kB 614.4 kB/s eta 0:00:01
     ----------------------------------   225.3/235.5 kB 724.0 kB/s eta 0:00:01
     ------------------------------------ 235.5/235.5 kB 758.4 kB/s eta 0:0

In [1]:
import warnings
warnings.filterwarnings('ignore')
from pybaseball import statcast
from pybaseball import playerid_reverse_lookup
from unidecode import unidecode 
import os
import json
import time
import pickle

## Scraping Data
Finding a good data source for training data is very hard.  You need massive amounts of data. One of the approaches is to use derived text to train a specific model.  You can turn statistical data into text by encoding the data with a complex template.  Below I take twenty years of baseball data and encode it into a text format.  This way we use language to represent statistics. 

We first download and cache all the the mlb statcast events from the start of the season to the end of the season from 2003 to 2023 (this will take a long time)

In [4]:

captures = [2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012]
years = [2018, 2019, 2020, 2021, 2022, 2023]
for year in years:
    events = statcast(start_dt=f"{year}-03-01", end_dt=f"{year}-11-15")
    with open(f"../data/seasons/season-{year}.pickle", "wb") as scores:
        pickle.dump(events, scores)
    time.sleep(5)

This is a large query, it may take a moment to complete
Skipping offseason dates
Skipping offseason dates


100%|██████████| 214/214 [11:16<00:00,  3.16s/it]


This is a large query, it may take a moment to complete
Skipping offseason dates
Skipping offseason dates


100%|██████████| 225/225 [11:46<00:00,  3.14s/it]


This is a large query, it may take a moment to complete
Skipping offseason dates
Skipping offseason dates


100%|██████████| 97/97 [05:10<00:00,  3.20s/it] 


This is a large query, it may take a moment to complete
Skipping offseason dates


100%|██████████| 246/246 [13:19<00:00,  3.25s/it]


This is a large query, it may take a moment to complete
Skipping offseason dates


100%|██████████| 246/246 [13:16<00:00,  3.24s/it]


This is a large query, it may take a moment to complete
Skipping offseason dates


100%|██████████| 246/246 [13:29<00:00,  3.29s/it]


Here comes the fun part!  We must examine this data to see what it contains and what we can use to encode the information we want to know.  We want to know the outcome of each pitch to the batter and use that to build a response that contains the at-bat of each player and the subsequent event or the outcome of the at-bat.  

But first we use the cache to create a dictionary of the player UUIDs to their actual names




In [15]:
players = {}

In [3]:
path = "../data/seasons/"
files = os.listdir(path)
files.sort()
for file in files:
    print(file)
    pickle_in = open(f"{path}/{file}","rb")
    every_pitch = pickle.load(pickle_in)
    every_pitch = every_pitch.iloc[::-1]
    for index, row in every_pitch.iterrows(): 
        batter = str(row.batter)
        pitcher = str(row.pitcher)
        if pitcher not in players:
            name_data = playerid_reverse_lookup([row.pitcher], key_type='mlbam')
            if len(name_data['name_first']) > 0 and len(name_data['name_last']) > 0:
                players[pitcher] = f"{unidecode(name_data['name_first'][0])} {unidecode(name_data['name_last'][0])}"
            else:
                players[pitcher] = pitcher
        if batter not in players:
            name_data = playerid_reverse_lookup([row.batter], key_type='mlbam')
            if len(name_data['name_first']) > 0 and len(name_data['name_last']) > 0:
                players[batter] = f"{unidecode(name_data['name_first'][0])} {unidecode(name_data['name_last'][0])}"
            else:
                players[batter] = batter
       
with open(f'../data/players.json', 'w', encoding='utf-8') as f:
    json.dump(players, f, ensure_ascii=True, indent=4, allow_nan=True)
    f.close()


season-2003.pickle


NameError: name 'players' is not defined

In [2]:
with open(f'../data/players.json', 'r', encoding='utf-8') as f:
    players = json.load(f)

In [4]:
path = "../data/seasons/"
files = os.listdir(path)
files.sort()
baseball = []
lines = []
test = []
season_count = 0
for file in files:
    print(file)
    pickle_in = open(f"{path}/{file}","rb")
    every_pitch = pickle.load(pickle_in)
    every_pitch = every_pitch.iloc[::-1]
    season = []
    at_bat = []
    pre_batter = ""
    for index, row in every_pitch.iterrows():
        current_batter = str(row.batter)
        if current_batter not in pre_batter:
            at_bat = []
        pre_batter = current_batter
        if str(row.pitcher) in players and str(row.batter) in players:
            instruction = f"what is the outcome of pitcher {players[str(row.pitcher)]} pitching to batter {players[str(row.batter)]}" 
            input_data = f"{row.inning_topbot} of the {row.inning} inning with {row.outs_when_up} outs "

            description = row.description 
            description = description.replace("_", " ")

            response = f"{description} "
            at_bat.append(response)
            
            if isinstance(row.events, str):
                # response = ""
                event = row.events 
                event = event.replace("_", " ")
                at_bat.append(event)
        
                output = ""
                for pitch in at_bat:
                    output =f"{output}{pitch}"
                score = "" if (row.post_bat_score - row.bat_score) < 1 else f" and {(row.post_bat_score - row.bat_score)} runs scores"
                output = f"{output}{score}"                         
                baseball.append(
                    {
                    'instruction':instruction,
                    'input':input_data,
                    "output":output
                    }
                )
                lines.append(f"<s>###instruction {instruction} ###input {input_data} ###output {output}</s>\n") 
            
           
    season_count+=1

    with open(f'twenty_years_of_baseball_2.json', 'w', encoding='utf-8') as f:
        json.dump(baseball, f, ensure_ascii=True, indent=4, allow_nan=True)
        f.close()

with open(f'twenty_years_of_baseball_2.txt', 'w', encoding='utf-8') as f:
    f.writelines(lines)
    f.close()

season-2003.pickle
season-2004.pickle
season-2005.pickle
season-2006.pickle
season-2007.pickle
season-2008.pickle
season-2009.pickle
season-2010.pickle
season-2011.pickle
season-2012.pickle
season-2014.pickle
season-2015.pickle
season-2016.pickle
season-2018.pickle
season-2019.pickle
season-2020.pickle
season-2021.pickle
season-2022.pickle
season-2023.pickle
