In [2]:
import datetime
import re
from pathlib import Path

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm.auto import tqdm
from webdriver_manager.chrome import ChromeDriverManager

# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

In [3]:
# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [41]:
from IPython.display import HTML, display

# Inject a CSS rule that sets the notebook's `.container` element to 70% width.
display(HTML(data="<style>.container { width:70% !important; }</style>"))

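Data source (Baseball Savant Statcast park factors leaderboard, example query for the 2022 season): https://baseballsavant.mlb.com/leaderboard/statcast-park-factors?type=year&year=2022&batSide=&stat=index_wOBA&condition=All&rolling=no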

In [5]:
# Leaderboard page opened by the Selenium driver. No query string is given here;
# the scraping cell picks the filters by clicking the page's dropdown options.
url = 'https://baseballsavant.mlb.com/leaderboard/statcast-park-factors'

In [6]:
# Selenium options
options = webdriver.ChromeOptions()

### Uncomment the line below if you want to run in headless mode ###
# options.add_argument("--headless")

# Launch Chrome using the chromedriver path returned by
# ChromeDriverManager().install(), then load the leaderboard page so the
# scraping cell can interact with it through the shared `driver`.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get(url)

[WDM] - Downloading: 100%|█████████████████| 8.61M/8.61M [00:00<00:00, 32.1MB/s]


In [102]:
# Column names for the scraped table. The first 20 names label the <td> cells
# of a leaderboard row in the order they are read; the last three label the
# filter settings (bat side, condition, rolling) appended to every row.
headers = (
    ['Rank', 'team', 'venue', 'year', 'park_factor']
    + ['wOBACon', 'xwOBACon', 'BACON', 'xBACON', 'HardHit']
    + ['R', 'OBP', 'H', '1B', '2B', '3B', 'HR', 'BB', 'SO', 'PA']
    + ['bat_side', 'condition', 'rolling_3yrs']
)

In [129]:
# Each dict maps a filter value to the 1-based position of its <option> inside
# the corresponding dropdown; the scraping cell plugs these positions into
# `option:nth-child(...)` CSS selectors.
# NOTE(review): the positions are hard-coded, so they presumably have to be
# updated whenever the site adds or reorders options — confirm against the page.

# Seasons 2022 down to 2015 -> positions 1..8.
year_dict = {
    season: position
    for position, season in enumerate(range(2022, 2014, -1), start=1)
}

bat_side_dict = {
    side: position
    for position, side in enumerate(("Both", "R", "L"), start=1)
}

cond_dict = {
    condition: position
    for position, condition in enumerate(
        ('All', 'Day', 'Night', 'RoofClosed', 'OpenAir'), start=1
    )
}

roll_dict = {
    choice: position
    for position, choice in enumerate(('yes', 'no'), start=1)
}

In [140]:
def _click_when_present(css_selector, timeout=10):
    """Wait for the element matching ``css_selector`` to be present, then click it.

    Uses the notebook-level ``driver``. ``WebDriverWait.until`` raises
    selenium's ``TimeoutException`` if nothing matches within ``timeout`` seconds.
    """
    locator = (By.CSS_SELECTOR, css_selector)
    WebDriverWait(driver, timeout).until(EC.presence_of_element_located(locator)).click()


def _read_row(row):
    """Return the text of every <td> inside ``row``; empty cells become NaN."""
    cells = WebDriverWait(row, 5).until(
        EC.presence_of_all_elements_located((By.TAG_NAME, 'td'))
    )
    # Read .text exactly once per cell: every access is a WebDriver round trip.
    texts = [cell.text for cell in cells]
    return [text if text != '' else np.nan for text in texts]


# Scrape the leaderboard once per (year, bat side, condition) combination and
# collect every table row, tagged with the filters that produced it, in all_list.

# The rolling filter is identical for every request, so resolve it once.
roll = 'no'
roll_index = roll_dict[roll]

all_list = []
for year in tqdm([2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015]):
    # click year
    _click_when_present(f"#ddlSeason > option:nth-child({year_dict[year]})")
    for bat_side in ['Both', 'L', 'R']:
        # click bat side
        _click_when_present(f"#ddlBatSide > option:nth-child({bat_side_dict[bat_side]})")
        for cond in ['All', 'Day', 'Night', 'RoofClosed', 'OpenAir']:
            # click condition
            _click_when_present(f"#ddlCondition > option:nth-child({cond_dict[cond]})")
            # click rolling
            _click_when_present(f"#ddlRoll > option:nth-child({roll_index})")

            # click update
            _click_when_present('#btnUpdate')
            # NOTE(review): nothing here waits for the table to refresh after
            # Update is clicked. Confirm the click blocks until the new rows are
            # rendered; otherwise rows of the previous filter set could be read.
            table_rows = WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, 'default-table-row'))
            )

            # Build the whole page before extending all_list so an error in the
            # middle of a page never leaves a partial page behind.
            page_rows = [_read_row(row) + [bat_side, cond, roll] for row in table_rows]
            all_list += page_rows

  0%|          | 0/8 [00:00<?, ?it/s]

In [142]:
# Assemble every scraped row into a single DataFrame, using `headers` as the
# column names. No dtype conversion is applied here.
df = pd.DataFrame(all_list, columns=headers)

In [143]:
# Display the assembled table as the cell output.
df

Unnamed: 0,Rank,team,venue,year,park_factor,wOBACon,xwOBACon,BACON,xBACON,HardHit,R,OBP,H,1B,2B,3B,HR,BB,SO,PA,bat_side,condition,rolling_3yrs
0,1,Rockies,Coors Field,2022,115,113,103,110,103,107,132,110,118,111,125,143,143,95,84,6112,Both,All,no
1,2,Reds,Great American Ball Park,2022,109,112,101,108,101,101,119,106,107,102,105,70,143,107,102,6244,Both,All,no
2,3,Red Sox,Fenway Park,2022,107,108,103,107,102,103,114,104,110,105,113,261,118,93,96,6225,Both,All,no
3,4,Dodgers,Dodger Stadium,2022,104,107,98,105,98,97,108,101,103,98,104,94,125,94,106,6260,Both,All,no
4,5,Phillies,Citizens Bank Park,2022,104,105,99,103,99,102,108,101,102,99,102,116,120,95,98,6754,Both,All,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3022,25,Giants,Oracle Park,2015,95,95,100,96,99,108,90,97,93,94,94,156,75,115,108,3317,R,OpenAir,no
3023,26,Mariners,T-Mobile Park,2015,95,95,99,96,99,94,90,96,92,95,86,80,89,101,108,3313,R,OpenAir,no
3024,27,Yankees,Yankee Stadium,2015,93,94,101,95,100,100,86,94,91,96,70,53,102,97,110,2725,R,OpenAir,no
3025,28,Mets,Citi Field,2015,91,91,95,91,97,90,83,92,91,92,89,98,88,101,105,3722,R,OpenAir,no


In [144]:
# Persist the scraped table. `to_csv` does not create missing directories, so
# make sure the output folder exists first instead of failing after the scrape.
# The DataFrame index is written as the first (unnamed) column, as before.
output_path = Path('data/ParkFactors_full_15-22.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path)