In [None]:
import sqlite3
import pandas as pd
import json
from urllib.request import urlopen
import numpy as np

## Returns URL for Games Between Two Dates

In [None]:
# Function builds the schedule URL for all games between startDate and endDate
# (it returns the URL string only; the JSON is downloaded later, by whoever opens the URL)
# Dates must be formatted like "YYYY-MM-DD"
def getSchedule(startDate, endDate):
    """Return the statsapi.mlb.com schedule URL for a date range.

    Args:
        startDate: first date of the range, as a "YYYY-MM-DD" string.
        endDate: last date of the range, as a "YYYY-MM-DD" string.

    Returns:
        The schedule endpoint URL with the two dates filled into the
        startDate / endDate query parameters. No request is made here.
    """
    url_pieces = [
        "https://statsapi.mlb.com/api/v1/schedule?sportId=1",
        "&startDate=",
        startDate,
        "&endDate=",
        endDate,
    ]
    return "".join(url_pieces)

## Returns List of GamePks Associated to Games in Schedule

In [None]:
# Function returns a list of gamePks (Game IDs) for all games listed by a schedule URL
# The URL is the kind of string getSchedule builds; this function performs the download
def getGamePks(schedule):
    """Download a schedule and collect the gamePk of every game in it.

    Args:
        schedule: URL of a schedule endpoint (a string accepted by urlopen).

    Returns:
        A flat list of the 'gamePk' values, in response order: date by date,
        then game by game within each date. Empty when 'dates' is empty.

    Raises:
        KeyError: if the response has no 'dates' entry, a date has no 'games'
            entry, or a game has no 'gamePk' entry.
        Any exception urlopen or json.loads raises is propagated unchanged.
    """
    # The with-block closes the HTTP response once the body has been read,
    # so repeated calls do not leave connections open.
    with urlopen(schedule) as response:
        data_json = json.loads(response.read())

    return [
        game['gamePk']
        for games_on_date in data_json['dates']
        for game in games_on_date['games']
    ]

## Scrapes the JSON Data and Converts to DataFrame

In [None]:
# Column order of the per-pitch table built by getGame ('game_pack' becomes the index).
_PITCH_COLUMNS = [
    # Overall Game Data
    'game_pack',
    # At Bat Data
    'batter_id', 'batter_name', 'bats',
    'pitcher_id', 'pitcher_name', 'throws',
    'event', 'description', 'rbi', 'ab_index',
    # Batted Ball Data
    'exit_velo', 'launch_angle', 'total_distance',
    # Pitch Data
    'in_play', 'is_strike', 'is_ball',
    'pitch_type', 'pitch_velo', 'spin_rate',
    'pitch_location_x', 'pitch_location_y',
    # Inning Data
    'home_score', 'away_score', 'balls', 'strikes', 'outs',
    'inning', 'half_inning',
]


def _nested_or_nan(mapping, *keys):
    """Follow keys through nested dicts; return np.nan if any level is absent."""
    current = mapping
    for key in keys:
        if not isinstance(current, dict) or key not in current:
            return np.nan
        current = current[key]
    return current


def _pitch_record(gamePk, ab_number, ab, pitch_event):
    """Build the row (column name -> value) describing one pitch of one at bat."""
    matchup = ab['matchup']
    result = ab['result']
    about = ab['about']
    details = pitch_event['details']
    count = pitch_event['count']

    # Batted-ball measurements are only read for pitches put in play; any
    # measurement (or the whole 'hitData' section) that is absent becomes NaN.
    hit_data = pitch_event.get('hitData', {}) if details['isInPlay'] else {}

    return {
        # GamePk as a primary key
        'game_pack': gamePk,

        # At Bat Data (identical for every pitch of the same at bat)
        'batter_id': matchup['batter']['id'],
        'batter_name': matchup['batter']['fullName'],
        'bats': matchup['batSide']['code'],
        'pitcher_id': matchup['pitcher']['id'],
        'pitcher_name': matchup['pitcher']['fullName'],
        'throws': matchup['pitchHand']['code'],
        'event': result['event'],
        'description': result['description'],
        'rbi': result['rbi'],
        'ab_index': ab_number,

        # Batted Ball Data
        'exit_velo': _nested_or_nan(hit_data, 'launchSpeed'),
        'launch_angle': _nested_or_nan(hit_data, 'launchAngle'),
        'total_distance': _nested_or_nan(hit_data, 'totalDistance'),

        # Pitch Data (optional tracking fields fall back to NaN)
        'in_play': details['isInPlay'],
        'is_strike': details['isStrike'],
        'is_ball': details['isBall'],
        'pitch_type': _nested_or_nan(details, 'type', 'description'),
        'pitch_velo': _nested_or_nan(pitch_event, 'pitchData', 'startSpeed'),
        'spin_rate': _nested_or_nan(pitch_event, 'pitchData', 'breaks', 'spinRate'),
        'pitch_location_x': _nested_or_nan(pitch_event, 'pitchData', 'coordinates', 'pX'),
        # Note: the 'pitch_location_y' column is filled from the feed's 'pZ' coordinate.
        'pitch_location_y': _nested_or_nan(pitch_event, 'pitchData', 'coordinates', 'pZ'),

        # Inning Data
        'home_score': result['homeScore'],
        'away_score': result['awayScore'],
        'balls': count['balls'],
        'strikes': count['strikes'],
        'outs': count['outs'],
        'inning': about['inning'],
        'half_inning': about['halfInning'],
    }


def getGame(gamePk):
    """Download one game's live feed and return it as a per-pitch DataFrame.

    Every play event flagged 'isPitch' produces one row, except events whose
    description contains "Automatic Ball". At-bat level values (batter,
    pitcher, result, score, inning) are repeated on each pitch of the at bat.

    Args:
        gamePk: game id; converted with str() when building the feed URL.

    Returns:
        A DataFrame indexed by 'game_pack' (the gamePk passed in) with the
        remaining _PITCH_COLUMNS as columns. Optional tracking values that the
        feed does not provide (hit data, pitch type, velocity, spin rate,
        location) are NaN. A game with no qualifying pitches yields an empty
        frame that still has all columns.

    Raises:
        KeyError: if a required field (matchup, result, about, count, or the
            isInPlay/isStrike/isBall flags) is missing from the feed.
        Any exception urlopen or json.loads raises is propagated unchanged.
    """
    url = "https://statsapi.mlb.com/api/v1.1/game/" + str(gamePk) + "/feed/live"
    # Close the HTTP response as soon as the body is read; this function is
    # called once per game, so unclosed responses would pile up.
    with urlopen(url) as response:
        data_json = json.loads(response.read())

    at_bats = data_json['liveData']['plays']['allPlays']

    records = []
    # ab_index is 1-based: the first at bat of the game is number 1.
    for ab_number, ab in enumerate(at_bats, start=1):
        for pitch_event in ab['playEvents']:
            if not pitch_event['isPitch']:
                continue
            if "Automatic Ball" in pitch_event['details']['description']:
                continue
            records.append(_pitch_record(gamePk, ab_number, ab, pitch_event))

    # Passing the column list keeps the layout stable even when there are no rows.
    game = pd.DataFrame(records, columns=_PITCH_COLUMNS)
    return game.set_index('game_pack')

In [None]:
def getGameInfo(gamePk):
    """Download one game's live feed and return its game-level details.

    Args:
        gamePk: game id, as an int or a string. It is converted with str()
            when building the URL, so the integer ids returned by getGamePks
            can be passed directly.

    Returns:
        A one-row DataFrame indexed by 'game_pack' (the gamePk as passed in)
        with columns date, away_team, away_team_id, home_team, home_team_id,
        venue, temp, conditions and wind. The three weather columns are None
        when the feed has no matching weather entry.

    Raises:
        KeyError: if the feed lacks the date, team or venue fields.
        Any exception urlopen or json.loads raises is propagated unchanged.
    """
    url = "https://statsapi.mlb.com/api/v1.1/game/" + str(gamePk) + "/feed/live"
    # Close the HTTP response once the body has been read.
    with urlopen(url) as response:
        data_json = json.loads(response.read())

    game_data = data_json['gameData']
    teams = game_data['teams']
    # Treat weather as optional so a feed without it does not abort a scrape.
    weather = game_data.get('weather', {})

    row = {
        'game_pack': gamePk,
        'date': game_data['datetime']['officialDate'],
        'away_team': teams['away']['name'],
        'away_team_id': teams['away']['id'],
        'home_team': teams['home']['name'],
        'home_team_id': teams['home']['id'],
        'venue': game_data['venue']['name'],
        'temp': weather.get('temp'),
        'conditions': weather.get('condition'),
        'wind': weather.get('wind'),
    }

    game = pd.DataFrame([row], columns=list(row))

    return game.set_index('game_pack')

## Perform API Scrape on 2021 and 2022 Seasons

In [None]:
# Schedule URLs for the two date ranges being scraped (one per season).
schedule_2021 = getSchedule('2021-04-01', '2021-10-03')
schedule_2022 = getSchedule('2022-04-07', '2022-10-05')

# Every gamePk listed in each schedule.
gamePks_2021 = getGamePks(schedule_2021)
gamePks_2022 = getGamePks(schedule_2022)

# Download each game's pitch table, then concatenate ONCE per season.
# Calling pd.concat inside the loop re-copies the accumulated frame for every
# game, which gets slower and slower over a full season of games.
games_2021 = pd.concat([getGame(game) for game in gamePks_2021])
games_2022 = pd.concat([getGame(game) for game in gamePks_2022])

# Both seasons stacked into a single table.
database = pd.concat([games_2021, games_2022])

In [673]:
# Spot check: fetch a single day from each season and display one game's pitch table.
# NOTE: this rebinds schedule_2021 / schedule_2022 to one-day schedules.
schedule_2021 = getSchedule('2021-10-01', '2021-10-01')
schedule_2022 = getSchedule('2022-10-01', '2022-10-01')

# getGamePks is the helper defined earlier in this notebook; the previous call to
# "getGamePk" referred to a name that is never defined and fails on a fresh kernel.
gamePk_2021 = getGamePks(schedule_2021)
gamePk_2022 = getGamePks(schedule_2022)

# Display the pitch-by-pitch table for the fourth game listed on 2022-10-01.
getGame(gamePk_2022[3])


Unnamed: 0_level_0,batter_id,batter_name,bats,pitcher_id,pitcher_name,throws,event,description,rbi,ab_index,...,spin_rate,pitch_location_x,pitch_location_y,home_score,away_score,balls,strikes,outs,inning,half_inning
game_pack,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
663167,663697,Jonathan India,R,592767,Drew Smyly,L,Strikeout,Jonathan India called out on strikes.,0,1,...,2251,-0.42,2.94,0,0,0,1,0,1,top
663167,663697,Jonathan India,R,592767,Drew Smyly,L,Strikeout,Jonathan India called out on strikes.,0,1,...,2321,-0.61,0.39,0,0,1,1,0,1,top
663167,663697,Jonathan India,R,592767,Drew Smyly,L,Strikeout,Jonathan India called out on strikes.,0,1,...,2140,-0.36,2.15,0,0,1,2,0,1,top
663167,663697,Jonathan India,R,592767,Drew Smyly,L,Strikeout,Jonathan India called out on strikes.,0,1,...,2261,-1.20,-0.39,0,0,2,2,0,1,top
663167,663697,Jonathan India,R,592767,Drew Smyly,L,Strikeout,Jonathan India called out on strikes.,0,1,...,2150,0.14,2.76,0,0,2,2,0,1,top
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
663167,641584,Jake Fraley,L,655889,Manuel Rodriguez,R,Walk,Jake Fraley walks.,0,66,...,2162,-0.75,3.61,2,1,3,1,2,9,top
663167,641584,Jake Fraley,L,655889,Manuel Rodriguez,R,Walk,Jake Fraley walks.,0,66,...,1446,-0.48,4.18,2,1,4,1,2,9,top
663167,664670,Alejo Lopez,R,676714,Brandon Hughes,L,Strikeout,"Alejo Lopez strikes out swinging, catcher Will...",0,67,...,2445,-0.50,0.87,2,1,0,1,2,9,top
663167,664670,Alejo Lopez,R,676714,Brandon Hughes,L,Strikeout,"Alejo Lopez strikes out swinging, catcher Will...",0,67,...,2446,-0.98,1.09,2,1,0,2,2,9,top
