# Daily update tool
Run once a day after all games are finished

- Scrape data
- Update categories
- Calculate FiFaX
- Save database

In [78]:
reset -fs

In [79]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.


In [80]:
from datetime import date, timedelta
from selenium import webdriver
from bs4 import BeautifulSoup
import time, os

In [81]:
today = date.today()
yesterday = today #+ timedelta(days = -1)
yesterday_date = '-'.join([str(yesterday.year), str(yesterday.month), str(yesterday.day)])

In [82]:
columns = ['date', 't1', 'pitcher', 't2', 'batter', 'inning', 'result', 'pitch_type', 'mph', 
           'rpm', 'vbreak', 'up_down', 'hbreak', 'left_right', 'count']

In [83]:
import pandas as pd
import numpy as np

In [84]:
chromedriver = '/Applications/chromedriver'
os.environ['webdriver.chrome.driver'] = chromedriver
driver = webdriver.Chrome(chromedriver)

  driver = webdriver.Chrome(chromedriver)


In [85]:
target_date = yesterday_date
path = 'https://baseballsavant.mlb.com/gamefeed?date=' + target_date + '&chartType=pitch&legendType=pitchName' + \
        '&playerType=pitcher&inning=&count=&pitchHand=&batSide=&descFilter=&ptFilter=&resultFilter=&hf=pitchVelocity#706856'
driver.get(path)
time.sleep(10)
soup = BeautifulSoup(driver.page_source, 'html.parser')

df = pd.DataFrame(columns = columns)
games = soup.find_all('div', {'class': 'game-container step'})
teams_left = soup.find_all('div', {'class':'team-left'})
teams_right = soup.find_all('div', {'class':'team-right'})
# Iterating through each game in the day
for game, team_left, team_right in zip(games, teams_left, teams_right):
    try:
        pitches = game.find('tbody').find_all('tr')

        t1 = team_left.find('div', {'class':'team-name'}).text.replace('\n', '').strip()

        t2 = team_right.find('div', {'class':'team-name'}).text.replace('\n', '').strip()

    except: # Breaks when no more games are found
        print('Break')
        break
    count_b = 0
    count_s = 0
    for pitch in pitches[::-1]: # Iterating through each pitch in the game
        try:
            row = pitch.find_all('span')

            pitcher = row[1].text
            batter = row[3].text
            inning = int(row[7].text)
            result = row[8].text
            pitch_type = row[9].text
            mph = float(row[10].text)
            rpm = int(row[11].text)

            vbreak = int(row[12].text)
            up_down = row[13].text
            hbreak = int(row[14].text)
            l_r = row[15].text
            entry = [target_date, t1, pitcher, t2, batter, inning, result, pitch_type, mph, rpm, vbreak, up_down, hbreak, l_r, (count_b, count_s)]
            df.loc[len(df)] = entry
        except:
            pass
        if ('Ball' in result) or ('Pitchout' in result): # Logic tree to figure out what the ball/strike count is
            count_b += 1
        elif ('Strike' in result) or ('Missed Bunt' == result) or ('Foul Bunt' == result):
            count_s += 1
        elif 'Foul' in result:
            if count_s < 2:
                count_s += 1
        if (count_b == 4) or (count_s == 3) or ('In play' in result) or ('Hit by Pitch' == result):
            count_b = 0
            count_s = 0

Break


In [86]:
# Distilling categories to the four we care about
result_dict = {'Ball': 'Ball',
               'Foul': 'Foul',
               'Called Strike': 'Strike',
               'Swinging Strike': 'Strike',
               'In play, out(s)': 'Contact',
               'In play, no out': 'Contact',
               'Ball In Dirt': 'Ball',
               'In play, run(s)': 'Contact',
               'Foul Tip': 'Strike',
               'Hit By Pitch': 'Ball',
               'Foul Bunt': 'Strike',
               'Missed Bunt': 'Strike',
               'Pitchout': 'Ball'}

# Distilling pitches into groups we care about
def group_pitches(x):
    if 'Fastball' in x:
        return 'Fastball'
    elif 'Curve' in x:
        return 'Curveball'
    elif ('Knuc' in x) or (x == 'Splitter'):
        return 'Splitter'
    else:
        return x


In [87]:
# Making sure we keep the raw data
df['left_right_raw'] = df['left_right']
df['result_raw'] = df['result']
df['pitch_type_raw'] = df['pitch_type']

# Narrowing down categories
df['left_right'] = df['left_right'].apply(lambda x: x == '←')
df['result'] = df['result'].map(result_dict)
df['pitch_type'] = df['pitch_type'].apply(group_pitches)

In [88]:
pitch_types = ['Fastball', 'Slider', 'Sinker', 'Changeup', 'Curveball', 'Splitter', 'Cutter']

In [89]:
from sklearn.ensemble import RandomForestClassifier
from joblib import load

In [90]:
rf_dict = {}
for pitch_type in pitch_types:
    rf_dict[pitch_type] = load(pitch_type + '.rf')

In [91]:
def calculate_fifax(args):
    p_type, mph, rpm, vbreak, hbreak, is_left = args[0], args[1], args[2], args[3], args[4], args[5]
    X = np.array([mph, rpm, vbreak, hbreak, is_left]).reshape(1,5)
    return rf_dict[p_type].predict_proba(X)[0][0]

In [92]:
# Calculating FiFaX for each pitch thrown
df['fifax'] = df[['pitch_type', 'mph', 'rpm', 'vbreak', 'hbreak', 'left_right']].apply(calculate_fifax, axis = 1)

In [93]:
# df.to_csv(target_date + '.csv')

In [94]:
pip install google-cloud-storage

Note: you may need to restart the kernel to use updated packages.


In [95]:
from google.cloud import storage

In [96]:
def upload_blob(bucket_name, source_file_name, destination_blob_name):
    
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    
    blob.upload_from_filename(source_file_name)
    
    print('{} with contents {} uploaded to {}'.format(destination_blob_name, source_file_name, bucket_name))
    
    

In [97]:
# Uploading to cloud database
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'marine-tractor-346520-426bbf8c0e09.json'
upload_blob('the-filthiest', target_date + '.csv', 'pitch-data/' + target_date + '.csv')

pitch-data/2022-4-7.csv with contents 2022-4-7.csv uploaded to the-filthiest
