In [None]:
%pip install unidecode

In [2]:
import warnings
warnings.filterwarnings('ignore')
from pybaseball import statcast
from pybaseball import playerid_reverse_lookup
from unidecode import unidecode 
import os
import json
import time
import pickle

## Scraping Data
Finding a good source of training data is hard: you need massive amounts of it. One approach is to derive text from existing data and use it to train a specialized model. You can turn statistical data into structured data by encoding it with a complex transformation. Below I take twenty years of baseball data and encode it as structured records. This way we can use an LLM to generate structured data that a program can understand.

We first download and cache all the MLB Statcast events from the start of the season to the end of the season, for the seasons 2003 to 2023 (this will take a long time).

In [None]:

# Download one Statcast season per year and cache each as a pickle file.
#
# Every season from 2003 through 2023 is listed, matching the description
# above; the earlier hard-coded list skipped 2013-2017. Seasons whose cache
# file already exists are not downloaded again, so re-running this cell only
# fetches what is missing.
years = list(range(2003, 2024))
os.makedirs("../data/seasons", exist_ok=True)
for year in years:
    season_path = f"../data/seasons/season-{year}.pickle"
    if os.path.exists(season_path):
        # Already cached by a previous run.
        continue
    events = statcast(start_dt=f"{year}-03-01", end_dt=f"{year}-11-15")
    # Write next to (not inside) the seasons directory first, then move into
    # place, so an interrupted run never leaves a truncated season file where
    # later cells would try to unpickle it.
    partial_path = f"../data/season-{year}.pickle.tmp"
    with open(partial_path, "wb") as scores:
        pickle.dump(events, scores)
    os.replace(partial_path, season_path)
    # Pause between downloads, as the original loop did.
    time.sleep(5)

Here comes the fun part!  We must examine this data to see what it contains and what we can use to encode the information we want to know.  We want to know the outcome of each pitch to the batter and use that to build a response that contains the at-bat of each player and the subsequent event or the outcome of the at-bat.  

But first we use the cached seasons to build a dictionary that maps each player's MLBAM ID to their actual name.




In [15]:
# Start from an empty player-id -> display-name mapping
players = dict()

In [1]:
# Build the player-id -> display-name mapping from every cached season and
# save it to ../data/players.json.
path = "../data/seasons/"
# Only season caches are unpickled; stray directory entries are ignored.
files = [name for name in os.listdir(path) if name.endswith(".pickle")]
files.sort()


def _lookup_player_name(player_id):
    """Return an ASCII "First Last" name for an MLBAM player id.

    Falls back to the id itself (as a string) when the reverse lookup
    returns no first/last name for it.
    """
    name_data = playerid_reverse_lookup([player_id], key_type='mlbam')
    if len(name_data['name_first']) > 0 and len(name_data['name_last']) > 0:
        # Positional access: do not rely on the result's index labels.
        first_name = name_data['name_first'].iloc[0]
        last_name = name_data['name_last'].iloc[0]
        return f"{unidecode(first_name)} {unidecode(last_name)}"
    return str(player_id)


for file in files:
    print(file)
    # NOTE: pickle.load can execute arbitrary code; only load caches that this
    # notebook wrote itself.
    with open(os.path.join(path, file), "rb") as pickle_in:
        every_pitch = pickle.load(pickle_in)
    # Each player only needs one lookup, so walk the distinct ids per column
    # instead of every pitch row.
    for column in ("pitcher", "batter"):
        for player_id in every_pitch[column].unique():
            player_key = str(player_id)
            if player_key not in players:
                players[player_key] = _lookup_player_name(player_id)

with open(f'../data/players.json', 'w', encoding='utf-8') as f:
    json.dump(players, f, ensure_ascii=True, indent=4, allow_nan=True)


NameError: name 'os' is not defined

In [4]:
# Reload the player-id -> name mapping from its JSON cache
with open(f'../data/players.json', mode='r', encoding='utf-8') as players_file:
    players = json.loads(players_file.read())

In [6]:
from decimal import Decimal
from pandas._libs.missing import NAType

class MyEncoder(json.JSONEncoder):
    """JSON encoder that additionally accepts ``Decimal`` and pandas ``NA`` values."""

    def default(self, obj):
        """Return a JSON-serialisable stand-in for ``obj``.

        ``None`` and pandas ``NA`` become an empty string, a ``Decimal``
        becomes its string form, and anything else is handed to the base
        encoder (which raises ``TypeError``).
        """
        # json encodes None natively, so the None case is only reached when
        # this method is called directly.
        if obj is None or isinstance(obj, NAType):
            return ""
        if isinstance(obj, Decimal):
            return str(obj)
        # Anything else: defer to the stock behaviour
        return super().default(obj)

In [10]:
# Turn every cached season into instruction/output/answer records: one record
# per row whose `events` field is a string, carrying the pitches collected
# for the current batter so far. Records are streamed to eggs.csv and the
# accumulated list is re-written to
# twenty_years_of_baseball_structed.json after each season.
import csv

path = "../data/seasons/"
# Only season caches are unpickled; stray directory entries are ignored.
files = [name for name in os.listdir(path) if name.endswith(".pickle")]
files.sort()
baseball = []
# Not used by this cell any more; kept so the notebook's global names do not change.
input_data = []
results_data = []
lines = []
json_data = {}
json_data['data'] = []
test = []
season_count = 0


def _dumps(value):
    """Serialise ``value`` to JSON text using the notebook's ``MyEncoder``."""
    return json.dumps(value, ensure_ascii=True, allow_nan=True, cls=MyEncoder)


def _build_record(row, at_bat, pitch_type, release_speed, pitch_name):
    """Build one instruction/output/answer record from an event row.

    ``row`` is the pitch row carrying the event; the four lists hold the
    per-pitch values collected so far for the current batter. The lists are
    copied so that later appends cannot change a record already stored.
    """
    description = unidecode(row.des)
    instruction = {
        'pitcher': {
            'id': row.pitcher,
            'name': players[str(row.pitcher)]
        },
        'batter': {
            'id': row.batter,
            'name': players[str(row.batter)]
        },
        "p_throws": row.p_throws,
        "stand": row.stand,
        "inning_topbot": row.inning_topbot,
        "inning": row.inning,
        "outs_when_up": row.outs_when_up,
        # Base runners are stored as the raw column values (ids), as before.
        "on_1b": row.on_1b,
        "on_2b": row.on_2b,
        "on_3b": row.on_3b,
        "home_score": row.home_score,
        "away_score": row.away_score
    }
    # The original literal repeated "zone" and "pitch_name"; the value that
    # won (the per-pitch name list for "pitch_name") is kept, in the same
    # key position.
    output = {
        "event": row.events,
        "type": row.type,
        "zone": row.zone,
        "des": description,
        "at_bat_number": row.at_bat_number,
        "pitch_number": row.pitch_number,
        "pitch_name": list(pitch_name),
        "hit_location": row.hit_location,
        "launch_speed": row.launch_speed,
        "launch_speed_angle": row.launch_speed_angle,
        "runs_scored": row.post_bat_score - row.bat_score,
        "at_bat": list(at_bat),
        "pitch_type": list(pitch_type),
        "release_speed": list(release_speed),
    }
    return {
        "instruction": instruction,
        "output": output,
        "answer": description
    }


# NOTE(review): file name, space delimiter and '|' quote character are kept
# exactly as they were; confirm they are what the downstream consumer expects.
with open('eggs.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=' ',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerow(['instruction', 'output', 'answer'])
    for file in files:
        print(file)
        # Files whose name contains "2003" are skipped, as before
        # (the reason is not recorded in the notebook).
        if "2003" in file:
            continue
        # NOTE: pickle.load can execute arbitrary code; only load caches that
        # this notebook wrote itself.
        with open(f"{path}/{file}", "rb") as pickle_in:
            every_pitch = pickle.load(pickle_in)
        # The cached frame is walked in reverse row order.
        every_pitch = every_pitch.iloc[::-1]
        at_bat = []
        pitch_type = []
        release_speed = []
        pitch_name = []
        pre_batter = ""
        for index, row in every_pitch.iterrows():
            current_batter = str(row.batter)
            if current_batter not in players or str(row.pitcher) not in players:
                continue
            # A new batter starts a fresh pitch sequence. This is an equality
            # test; the previous substring test (`not in`) only worked by
            # accident of the ids having equal length.
            if current_batter != pre_batter:
                at_bat = []
                pitch_type = []
                release_speed = []
                pitch_name = []
            pre_batter = current_batter
            at_bat.append(row.description)
            pitch_type.append(row.pitch_type)
            release_speed.append(row.release_speed)
            pitch_name.append(row.pitch_name)

            # Only rows whose `events` value is a string produce a record.
            if not isinstance(row.events, str):
                continue
            record = _build_record(row, at_bat, pitch_type, release_speed, pitch_name)
            baseball.append(record)
            # writerow takes ONE iterable of fields, not one argument per field.
            csv_writer.writerow([
                _dumps(record["instruction"]),
                _dumps(record["output"]),
                _dumps(record["answer"]),
            ])

        season_count += 1

        # Checkpoint: rewrite the full JSON file after every processed season.
        with open(f'twenty_years_of_baseball_structed.json', 'w', encoding='utf-8') as f:
            json.dump(baseball, f, ensure_ascii=True, indent=4, allow_nan=True, cls=MyEncoder)

season-2011.pickle


KeyboardInterrupt: 

In [8]:
import csv
import json
def convert_csv_to_json(csv_file_path, json_file_path=None):
    """Convert a CSV file with a header row into a JSON array of objects.

    Args:
        csv_file_path: Path of the CSV file to read.
        json_file_path: Where to write the JSON text. Defaults to the CSV
            path with its extension replaced by ``.json``.

    Returns:
        The JSON text (a list with one object per CSV row, indented by 4).
    """
    import os  # local import keeps the function usable on its own

    if json_file_path is None:
        # Previously the output always went to one hard-coded file, whatever
        # the input was; derive it from the input path instead.
        json_file_path = os.path.splitext(csv_file_path)[0] + '.json'

    # newline='' is what the csv module requires so that line breaks inside
    # quoted fields are read back correctly.
    with open(csv_file_path, 'r', newline='', encoding='utf-8') as file:
        rows = list(csv.DictReader(file))

    # Convert CSV data to JSON
    json_data = json.dumps(rows, indent=4)

    # Save JSON data to a file
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json_file.write(json_data)

    return json_data

# Location of the CSV export to convert
csv_file_path = '../data/2018_Central_Park_Squirrel_Census_-_Stories_20240204.csv'

# Run the conversion, then report completion and echo the resulting JSON text
json_data = convert_csv_to_json(csv_file_path)
print("Conversion completed. JSON data:", json_data, sep="\n")

Conversion completed. JSON data:
[
    {
        "Hectare": "01A",
        "Shift": "PM",
        "Date": "10142018",
        "Note Squirrel & Park Stories": "Observed a squirrel with a cache of peanuts that he was eating. Strangely, none of the other squirrels were eating those peanuts.",
        "Story Topic: Squirrel Experience or Squirrel Story": "true",
        "Story Topic: Park Experience or Census Taker Story": "",
        "Story Topic: Dogs": "",
        "Story Topic: Other Animals": "",
        "Story Topic: Accidental Poems": "",
        "Story Topic: Squirrels Acting Odd": "",
        "Story Topic: Census Takers Recognized": "",
        "Story Topic: Other": ""
    },
    {
        "Hectare": "01A",
        "Shift": "AM",
        "Date": "10182018",
        "Note Squirrel & Park Stories": "Lots of pedestrians and vehicular traffic, food vendors, bike rental guys, city workers doing something with the fountain in Columbus Circle, but not a squirrel to be seen. \n\nA bike ren