In [88]:
import sqlite3
import pandas as pd
import json
from urllib.request import urlopen
import numpy as np
from datetime import datetime

## Returns URL for Games Between Two Dates

In [89]:
# Function returns the JSON for the schedule of all games between startDate to endDate
# Dates must be formatted like "YYYY-MM-DD"
def getSchedule (startDate, endDate):
    schedule = "https://statsapi.mlb.com/api/v1/schedule?sportId=1&startDate=" + startDate + "&endDate=" + endDate
    return schedule

## Returns List of GamePks Associated to Games in Span of Time

In [90]:
# Function returns a list of gamePks (Game IDs) for all games between startDate to endDate
# Dates must be formatted like "YYYY-MM-DD"
def getGamePks (schedule):
    request = urlopen(schedule)
    data_json = json.loads(request.read())

    schedule_by_date = data_json['dates']

    games = []
    for date in range(0, len(schedule_by_date)):
        games_by_date = schedule_by_date[date]['games']
        for game in range(0, len(games_by_date)):
            games.append(games_by_date[game]['gamePk'])
    return games

## Scrapes the Pitch by Pitch Data for Given Game and Converts to Dataframe

In [91]:
def getGame(gamePk):
    game = "https://statsapi.mlb.com/api/v1.1/game/" + str(gamePk) + "/feed/live"
    request = urlopen(game)
    data_json = json.loads(request.read())

    at_bats = data_json['liveData']['plays']['allPlays']

    # Game Data
    game_pack = []
    date = []
    pitcher_team = []
    batter_team = []

    # At Bat Data
    batter_id = []
    batter_name = []
    batter_handedness = []
    pitcher_id = []
    pitcher_name = []
    pitch_handedness = []
    event = []
    rbi = []
    ab_index = []

    # Batted ball data
    exit_velo = []
    launch_angle = []
    total_distance = []

    # Pitch Data
    in_play = []
    is_strike = []
    is_ball = []
    pitch_type = []
    pitch_velo = []
    spin_rate = []
    pitch_location_x = []
    pitch_location_y = []

    # Inning Data
    balls = []
    strikes = []
    outs = []
    home_score = []
    away_score = []
    inning = []
    half_inning = []

    for i in range(0, len(at_bats)):
        ab = at_bats[i]
        for pitch in range(0, len(ab['playEvents'])):
            if (ab['playEvents'][pitch]['isPitch'] == True):
                if ("Automatic Ball" not in ab['playEvents'][pitch]['details']['description']):
                    # GamePk as a primary key
                    game_pack.append(gamePk)
                    date.append(data_json['gameData']['datetime']['officialDate'])
                    
                    if (ab['about']['halfInning'] == 'top'):
                        pitcher_team.append(data_json['gameData']['teams']['home']['name'])
                        batter_team.append(data_json['gameData']['teams']['away']['name'])
                    else:
                        pitcher_team.append(data_json['gameData']['teams']['away']['name'])
                        batter_team.append(data_json['gameData']['teams']['home']['name'])
                    
                    # At Bat Data
                    batter_id.append(ab['matchup']['batter']['id'])
                    batter_name.append(ab['matchup']['batter']['fullName'])
                    batter_handedness.append(ab['matchup']['batSide']['code'])
                    pitcher_id.append(ab['matchup']['pitcher']['id'])
                    pitcher_name.append(ab['matchup']['pitcher']['fullName'])
                    pitch_handedness.append(ab['matchup']['pitchHand']['code'])
                    event.append(ab['playEvents'][pitch]['details']['description'])
                    rbi.append(ab['result']['rbi'])
                    ab_index.append(i + 1)

                    # Batted Ball Data
                    if (ab['playEvents'][pitch]['details']['isInPlay'] == True):
                        # print(str(gamePk) + ": " + ab['result']['description'])
                        if ("launchSpeed" in ab['playEvents'][pitch]['hitData']):
                            exit_velo.append(ab['playEvents'][pitch]['hitData']['launchSpeed'])
                        else:
                            exit_velo.append(np.nan)

                        if ("launchAngle" in ab['playEvents'][pitch]['hitData']):
                            launch_angle.append(ab['playEvents'][pitch]['hitData']['launchAngle'])
                        else:
                            launch_angle.append(np.nan)

                        if ("totalDistance" in ab['playEvents'][pitch]['hitData']):
                            total_distance.append(ab['playEvents'][pitch]['hitData']['totalDistance'])
                        else:
                            total_distance.append(np.nan)


                    else:
                        exit_velo.append(np.nan)
                        launch_angle.append(np.nan)
                        total_distance.append(np.nan)

                    # Pitch Data
                    in_play.append(ab['playEvents'][pitch]['details']['isInPlay'])
                    is_strike.append(ab['playEvents'][pitch]['details']['isStrike'])
                    is_ball.append(ab['playEvents'][pitch]['details']['isBall'])

                    if ("type" in ab['playEvents'][pitch]['details']):
                        pitch_type.append(ab['playEvents'][pitch]['details']['type']['description'])
                    else:
                        pitch_type.append(np.nan)

                    if ("startSpeed" in ab['playEvents'][pitch]['pitchData']):
                        pitch_velo.append(ab['playEvents'][pitch]['pitchData']['startSpeed'])
                    else:
                        pitch_velo.append(np.nan)

                    if ("spinRate" in ab['playEvents'][pitch]['pitchData']['breaks']):
                        spin_rate.append(ab['playEvents'][pitch]['pitchData']['breaks']['spinRate'])
                    else:
                        spin_rate.append(np.nan)

                    if ("pX" in ab['playEvents'][pitch]['pitchData']['coordinates']):
                        pitch_location_x.append(ab['playEvents'][pitch]['pitchData']['coordinates']['pX'])
                    else:
                        pitch_location_x.append(np.nan)

                    if ("pZ" in ab['playEvents'][pitch]['pitchData']['coordinates']):
                        pitch_location_y.append(ab['playEvents'][pitch]['pitchData']['coordinates']['pZ'])
                    else:
                        pitch_location_y.append(np.nan)

                    # Inning Data
                    home_score.append(ab['result']['homeScore'])
                    away_score.append(ab['result']['awayScore'])
                    balls.append(ab['playEvents'][pitch]['count']['balls'])
                    strikes.append(ab['playEvents'][pitch]['count']['strikes'])
                    outs.append(ab['playEvents'][pitch]['count']['outs'])
                    inning.append(ab['about']['inning'])
                    half_inning.append(ab['about']['halfInning'])

    data = {
        # Overall Game Data
        'game_pack': game_pack,
        'date' : date,
        'pitcher_team' : pitcher_team,
        'batter_team' : batter_team,
        
        # At Bat Data
        'ab_index': ab_index,
        'batter_id': batter_id,
        'batter_name': batter_name,
        'bats': batter_handedness,
        'pitcher_id': pitcher_id,
        'pitcher_name': pitcher_name,
        'throws': pitch_handedness,
        'event': event,
        'rbi': rbi,
        
        # Batted Ball Data
        'exit_velo': exit_velo,
        'launch_angle': launch_angle,
        'total_distance': total_distance,

        # Pitch Data
        'in_play': in_play,
        'is_strike': is_strike,
        'is_ball': is_ball,
        'pitch_type': pitch_type,
        'pitch_velo': pitch_velo,
        'spin_rate': spin_rate,
        'pitch_location_x': pitch_location_x,
        'pitch_location_y': pitch_location_y,

        # Inning Data
        'home_score': home_score,
        'away_score': away_score,
        'balls': balls,
        'strikes': strikes,
        'outs': outs,
        'inning': inning,
        'half_inning': half_inning
    }

    game = pd.DataFrame(data)
    return game


## Perform API Scrape on All Pitches During 2021 and 2022 Seasons

In [None]:
schedule_2021 = getSchedule('2021-04-01', '2021-10-03')
schedule_2022 = getSchedule('2022-04-07', '2022-10-05')

gamePks_2021 = getGamePks(schedule_2021)
gamePks_2022 = getGamePks(schedule_2022)

# Scrape all pitches for 2021 season
count = 0
for game in gamePks_2021:
    if count == 0:
        df_2021 = getGame(game)
        count = 1
    else:
        df_2021 = pd.concat([df_2021, getGame(game)])

# Scrape all pitches for 2022 season
count = 0
for game in gamePks_2022:
    if count == 0:
        df_2022 = getGame(game)
        count = 1
    else:
        df_2022 = pd.concat([df_2022, getGame(game)])      
        

df = pd.concat([df_2021, df_2022])
df.to_csv("gameData.csv", index=False)

## Perform API Scrape of Player Info

In [93]:
def getPlayerInfo(gamePk):
    game = "https://statsapi.mlb.com/api/v1/people?personIds=" + str(gamePk)
    request = urlopen(game)
    data_json = json.loads(request.read())["people"]
    df = pd.json_normalize(data_json)
    
    return df[["id", "fullName", "birthDate"]]

In [94]:
df = pd.read_csv("gameData.csv")

# Update the batterInfo dataframe using all batters in the dataframe
count=0
for i in df["batter_id"].unique():
    if count == 0:
        batter_birthDates = getPlayerInfo(int(i))
        count += 1
    else:
        batter_birthDates = pd.concat([batter_birthDates,getPlayerInfo(int(i))])

batter_birthDates = batter_birthDates.set_index('id')

# Update the pitcherInfo dataframe using all pitchers in the dataframe
count=0
for i in df["pitcher_id"].unique():
    if count == 0:
        pitcher_birthDates = getPlayerInfo(int(i))
        count += 1
    else:
        pitcher_birthDates = pd.concat([pitcher_birthDates,getPlayerInfo(int(i))])
        
pitcher_birthDates = pitcher_birthDates.set_index('id')

# Save player info dataframes as CSV files
batter_birthDates.to_csv("batter_birthDates.csv", index=False)
pitcher_birthDates.to_csv("pitcher_birthDates.csv", index=False)

## Only Run This Section To the Dataframes With the Most Current Data

In [None]:
# Read old dataframe
gameData = pd.read_csv("gameData.csv")

# Get latest date in the data frame
startDate = datetime.strftime(datetime.strptime(max(gameData['date']), '%Y-%m-%d') + pd.DateOffset(1), '%Y-%m-%d')

# Get today's date (YYYY-MM-DD)
today = datetime.today().strftime('%Y-%m-%d')

# Get gamePks for all games played today
games = getGamePks(getSchedule(startDate, today))

# Iterate through gamePks and add each pitch to a new dataframe
count = 0
for game in games:
    if count == 0:
        df_today = getGame(game)
        count = 1
    else:
        df_today = pd.concat([df_today, getGame(game)])

# Concatenate the new data from today to the bottom of the old dataframe
gameData = pd.concat([gameData, df_today])

# Update the batterInfo dataframe using all batters through today's games
count=0
for i in gameData["batter_id"].unique():
    if count == 0:
        batter_birthDates = getPlayerInfo(int(i))
        count += 1
    else:
        batter_birthDates = pd.concat([batter_birthDates, getPlayerInfo(int(i))])

batter_birthDates = batter_birthDates.set_index('id')

# Update the pitcherInfo dataframe using all pitchers through today's games
count=0
for i in gameData["pitcher_id"].unique():
    if count == 0:
        pitcher_birthDates = getPlayerInfo(int(i))
        count += 1
    else:
        pitcher_birthDates = pd.concat([pitcher_birthDates, getPlayerInfo(int(i))])
        
pitcher_birthDates = pitcher_birthDates.set_index('id')


# Update CSV Files
gameData.to_csv("gameData.csv", index=False)
batter_birthDates.to_csv("batter_birthDates.csv", index=False)
pitcher_birthDates.to_csv("pitcher_birthDates.csv", index=False)

## Upload Game Data Dataframe to Google Cloud

In [96]:
import pandas as pd
from google.cloud import bigquery
import os

# Tell os where to look for the Google Cloud credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="service-account-key/baseball-database-365316-a1d891f3a3c2.json"

# Read the CSV file
gameData = pd.read_csv("gameData.csv")

client = bigquery.Client(project='baseball-database-365316')

table_id = 'baseball.gameData'

job_config = bigquery.LoadJobConfig(
    # BigQuery appends loaded rows to an existing table by default,
    # but with WRITE_TRUNCATE it replaces the table with the loaded data.
    write_disposition="WRITE_TRUNCATE",
)


# Make an API request
job = client.load_table_from_dataframe(
    gameData, table_id, job_config=job_config
)

job.result()  # Wait for the job to complete.

LoadJob<project=baseball-database-365316, location=us-east4, id=dad895a5-7227-4a88-99b3-b09dc19f6886>

## Upload Pitcher Info Dataframe to Google Cloud

In [97]:
import pandas as pd
from google.cloud import bigquery
import os

# Tell os where to look for the Google Cloud credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="service-account-key/baseball-database-365316-a1d891f3a3c2.json"

# Read the CSV file
pitcherInfo = pd.read_csv("pitcher_birthDates.csv")

client = bigquery.Client(project='baseball-database-365316')

table_id = 'baseball.pitcherInfo'

# Make an API request
job = client.load_table_from_dataframe(
    pitcherInfo, table_id
)

job.result()  # Wait for the job to complete.

LoadJob<project=baseball-database-365316, location=us-east4, id=4a2d2004-9e1e-4440-808c-383871a28f16>

## Upload Batter Info Dataframe to Google Cloud

In [98]:
import pandas as pd
from google.cloud import bigquery
import os

# Tell os where to look for the Google Cloud credentials
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="service-account-key/baseball-database-365316-a1d891f3a3c2.json"

# Read the CSV file
batterInfo = pd.read_csv("batter_birthDates.csv")

client = bigquery.Client(project='baseball-database-365316')

table_id = 'baseball.batterInfo'

# Make an API request
job = client.load_table_from_dataframe(
    batterInfo, table_id
)

job.result()  # Wait for the job to complete.

LoadJob<project=baseball-database-365316, location=us-east4, id=7252c628-89ff-4090-b910-0c252ebd18a5>

## Directions on How to Do Your Own Query

&nbsp; 1) Navigate to https://console.cloud.google.com/bigquery?cloudshell=false&project=baseball-database-365316&ws=!1m0

&nbsp; 2) In the editor tab, enter your query. \
&nbsp; &nbsp; Example Query: **SELECT  batter_name, bats, pitcher_name, throws, event, balls, strikes from baseball.gameData where batter_name = "Adley Rutschman" and event = "In play, no out" limit 10;** \
&nbsp; &nbsp; Please take note that the "baseball." prefix to "baseball.gameData" is required when using Big Query.

&nbsp; 3) Press **RUN**

&nbsp; 4) Query result will display in a terminal below the query entry.

&nbsp; 5) See the **example-queries.docx** file I sent for more example queries you can run!