Pull Statcast play-event (pitch-level) data for each game from the MLB Stats API using the `statsapi` package.

mlb_stats_api: https://github.com/toddrob99/MLB-StatsAPI

In [1]:
%load_ext jupyternotify
import pandas as pd
import numpy as np
import seaborn as sns
import time
import timeit
import statsapi

from matplotlib import pyplot as plt
from tqdm.auto import tqdm

# Show every column when a DataFrame is displayed (no truncation).
pd.options.display.max_columns = None

<IPython.core.display.Javascript object>

In [2]:
# Print floating-point values in fixed-point notation rather than
# scientific notation.
np.set_printoptions(suppress=True)

In [3]:
from IPython.display import display, HTML
# Inject a CSS override so elements with class "container" take the full
# page width in the rendered notebook.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [4]:
# Load the saved schedule; its game_id column supplies the game_pk values
# used for the pull below. The first CSV column is read as the index.
game_data = pd.read_csv('data/current_schedule.csv', index_col=[0])

In [6]:
# Latest game date present in the schedule.
# NOTE(review): the displayed result is a string, so this is a lexicographic
# max; it matches chronological order only for ISO "YYYY-MM-DD" dates.
game_data['game_date'].max()

'2022-09-24'

In [9]:
# Retrieve, as a list, the game_pk of every scheduled game dated before
# 2022-07-19 (the 2022 ASG cutoff).
game_pk_list = game_data.loc[game_data['game_date'] < '2022-07-19', 'game_id'].to_list()

In [37]:
# (output column, key path inside ``gameData``) for every game-level field
# copied onto each record, listed in output column order.
_GAME_FIELD_PATHS = (
    ('gameType', ('game', 'type')),
    ('gameNumber', ('game', 'gameNumber')),
    ('season', ('game', 'season')),
    ('officialDate', ('datetime', 'officialDate')),
    ('abstractGameState', ('status', 'abstractGameState')),
    ('awayTeamCode', ('teams', 'away', 'teamCode')),
    ('awayAbbreviation', ('teams', 'away', 'abbreviation')),
    ('homeTeamCode', ('teams', 'home', 'teamCode')),
    ('homeAbbreviation', ('teams', 'home', 'abbreviation')),
    ('venueId', ('venue', 'id')),
    ('venueName', ('venue', 'name')),
    ('weatherCond', ('weather', 'condition')),
    ('temp', ('weather', 'temp')),
    ('wind', ('weather', 'wind')),
)

# Play-level fields that json_normalize repeats on every ``playEvents`` row.
_PLAY_META_PATHS = [
    ['result', 'type'],
    ['result', 'event'],
    ['result', 'eventType'],
    ['result', 'description'],
    ['result', 'rbi'],
    ['result', 'awayScore'],
    ['result', 'homeScore'],
    ['about', 'atBatIndex'],
    ['about', 'halfInning'],
    ['about', 'isTopInning'],
    ['about', 'inning'],
    ['about', 'startTime'],
    ['about', 'endTime'],
    ['about', 'isComplete'],
    ['about', 'isScoringPlay'],
    ['about', 'hasReview'],
    ['about', 'hasOut'],
    ['about', 'captivatingIndex'],
    ['matchup', 'batter', 'id'],
    ['matchup', 'batter', 'fullName'],
    ['matchup', 'batter', 'link'],
    ['matchup', 'batSide', 'code'],
    ['matchup', 'batSide', 'description'],
    ['matchup', 'pitcher', 'id'],
    ['matchup', 'pitcher', 'fullName'],
    ['matchup', 'pitcher', 'link'],
    ['matchup', 'pitchHand', 'code'],
    ['matchup', 'pitchHand', 'description'],
    ['matchup', 'splits', 'batter'],
    ['matchup', 'splits', 'pitcher'],
    ['matchup', 'splits', 'menOnBase'],
]


def _dig(mapping, path):
    """Return the value at nested key *path* in *mapping*, or NaN if absent."""
    node = mapping
    try:
        for key in path:
            node = node[key]
    except (KeyError, IndexError, TypeError):
        # A missing key, or an intermediate value that cannot be indexed.
        return np.nan
    return node


def pull_pitch_data(game_pk):
    """Fetch one game from statsapi and flatten it to one record per play event.

    Each entry of every play's ``playEvents`` list becomes one dict. The
    play's ``result``/``about``/``matchup`` fields and a fixed set of
    game-level fields (teams, venue, weather, ...) are merged into that dict.
    Game-level fields missing from the response are filled with NaN.

    Args:
        game_pk: Game identifier, passed to statsapi as ``gamePk``.

    Returns:
        A list of dicts, one per play event, or ``None`` if the statsapi
        request raised an error.

    Raises:
        KeyError: If the response lacks ``gameData`` or
            ``liveData.plays.allPlays``.
    """
    import warnings

    try:
        data = statsapi.get('game', {'gamePk': game_pk}, force=False)
    except Exception as err:
        # Best-effort pull: report the failed game and let the caller skip it.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long-running pull can still be stopped.
        warnings.warn(
            f'statsapi request failed for gamePk {game_pk}: {err!r}',
            stacklevel=2,
        )
        return None

    gameData = data['gameData']

    game_data_dict = {'gamePk': game_pk}
    for column, path in _GAME_FIELD_PATHS:
        game_data_dict[column] = _dig(gameData, path)

    allPlays = data['liveData']['plays']['allPlays']

    # One row per playEvents entry; errors='ignore' tolerates plays that do
    # not carry every meta key instead of raising KeyError.
    allPlays_df = pd.json_normalize(
        allPlays,
        record_path=['playEvents'],
        meta=_PLAY_META_PATHS,
        errors='ignore',
    )

    allPlays_dict = allPlays_df.to_dict('records')

    # Game-level keys are applied last, so they win on any name collision.
    return [{**play_dict, **game_data_dict} for play_dict in allPlays_dict]

In [38]:
# Pull the play-event records for every game into a single object array.
# Records are gathered in a plain list and converted once at the end:
# calling np.concatenate inside the loop recopies the whole array on every
# iteration, which is quadratic over thousands of games.
pitch_records = []
for game_pk in tqdm(game_pk_list):
    data_arr = pull_pitch_data(game_pk)
    if data_arr is None:
        # pull_pitch_data returns None when the API request failed; skip the
        # game instead of letting np.concatenate raise on None.
        continue
    pitch_records.extend(data_arr)
data_arr_all = np.array(pitch_records, dtype=object)

  0%|          | 0/9878 [00:00<?, ?it/s]

In [42]:
# Convert the array of play-event records into a plain Python list so it can
# be sliced into chunks.
data_list = list(data_arr_all)

In [139]:
# Break the full record list into consecutive slices of at most 500000
# records (7 slices for this pull; the final slice may be shorter).
chunk_size = 500000
full_list = data_list
chunked_list = [
    full_list[start:start + chunk_size]
    for start in range(0, len(full_list), chunk_size)
]

In [146]:
# Turn each chunk into a DataFrame and write it to its own CSV file,
# numbered from 1.
for i, chunk_rows in enumerate(tqdm(chunked_list)):
    chunk_df = pd.DataFrame(chunk_rows)
    chunk_df.to_csv(f'new_statcast/df_{i+1}.csv')

  0%|          | 0/7 [00:00<?, ?it/s]

In [162]:
# df1 = pd.read_csv('new_statcast/df_1.csv', index_col=[0])
# df2 = pd.read_csv('new_statcast/df_2.csv', index_col=[0])
# df3 = pd.read_csv('new_statcast/df_3.csv', index_col=[0])
# df4 = pd.read_csv('new_statcast/df_4.csv', index_col=[0])
# df5 = pd.read_csv('new_statcast/df_5.csv', index_col=[0])
# df6 = pd.read_csv('new_statcast/df_6.csv', index_col=[0])
# df7 = pd.read_csv('new_statcast/df_7.csv', index_col=[0])

  exec(code_obj, self.user_global_ns, self.user_ns)


In [163]:
# df_all = pd.concat([df1, df2, df3, df4, df5, df6, df7])
# df_all.to_csv('new_statcast/df_all.csv')