In [1]:
# imports
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import time
import pandas as pd


In [2]:
# TODO: Update the scraper to click "Show more games" only a fixed number of times, then append
# the new results to the existing 2025 results and drop duplicates.
# This will be faster than re-pulling the entire season every time I want new games.

In [3]:
# Season to scrape, as a string; must be one of the keys of `season_urls` below.
target_year = '2025'

# Results pages per season.
# NOTE(review): the 2024 URL ends in a "table/overall" fragment, which looks like a
# standings view rather than a results list -- confirm it yields game rows.
url_2024 = "https://www.flashscoreusa.com/baseball/usa/mlb-2024/#/KID1DKF8/table/overall"
url_2025 = 'https://www.flashscoreusa.com/baseball/usa/mlb/results/'

# Pick the URL for the target year. An unsupported year fails here with a clear
# message instead of leaving `master_url` undefined and surfacing later as a
# NameError in the scraping cell.
season_urls = {'2024': url_2024, '2025': url_2025}
if target_year not in season_urls:
    raise ValueError(
        f"Unsupported target_year {target_year!r}; expected one of {sorted(season_urls)}"
    )
master_url = season_urls[target_year]


In [4]:
# Scrape every finished game (teams, scores, date + time) from the results page
# at `master_url` and collect them into the DataFrame `df`.
game_data = []

driver = webdriver.Chrome()
try:
    driver.get(master_url)
    time.sleep(5)  # fixed wait for the page to render

    # Click reject button if it shows up
    try:
        driver.find_element(By.ID, "onetrust-reject-all-handler").click()
        time.sleep(2)
    except NoSuchElementException:
        pass

    # Keep clicking "Show more games" until it's gone
    while True:
        try:
            driver.find_element(By.LINK_TEXT, "Show more games").click()
            time.sleep(5)  # wait for the next batch of games to load
        except NoSuchElementException:
            break

    # Get all game divs
    games = driver.find_elements(By.CSS_SELECTOR, "div.event__match.event__match--static")

    # get len of games
    len_games = len(games)

    for idx, game in enumerate(games):
        # progress marker every 100 games
        if idx % 100 == 0:
            print(f'{idx} of {len_games}')
        try:
            home_team = game.find_element(By.CSS_SELECTOR, "div.event__participant--home").text
            away_team = game.find_element(By.CSS_SELECTOR, "div.event__participant--away").text
            home_score = game.find_element(By.CSS_SELECTOR, "span.event__score--home").text
            away_score = game.find_element(By.CSS_SELECTOR, "span.event__score--away").text
            game_time = game.find_element(By.CSS_SELECTOR, "div.event__time.event__time--usFormat").text
        except NoSuchElementException:
            # If any field is missing, record the whole row as None; the next
            # cell filters null rows out.
            home_team = away_team = home_score = away_score = game_time = None

        # The site's "--home"/"--away" CSS classes are deliberately swapped here:
        # the "@" marker shows that the team in the "--away" slot is the actual
        # home team.
        # NOTE(review): only checked for MLB (and spot-checked for NPB) -- verify
        # home vs. away before reusing this for other leagues.
        game_data.append({
            "game_date_time": game_time,
            "Home Team": away_team,
            "Away Team": home_team,
            "Home Score": away_score,
            "Away Score": home_score
        })
finally:
    # Always close the browser, even if loading or parsing raised, so a failed
    # run does not leave a Chrome/chromedriver process behind.
    driver.quit()

# Convert to DataFrame
df = pd.DataFrame(game_data)

# inspect df
df.head()



0 of 1040
100 of 1040
200 of 1040
300 of 1040
400 of 1040
500 of 1040
600 of 1040
700 of 1040
800 of 1040
900 of 1040
1000 of 1040


Unnamed: 0,game_date_time,Home Team,Away Team,Home Score,Away Score
0,Jun 13\n10:10 PM,@\nSeattle Mariners,Cleveland Guardians,7,2
1,Jun 13\n10:10 PM,@\nLos Angeles Dodgers,San Francisco Giants,2,6
2,Jun 13\n09:40 PM,@\nArizona Diamondbacks,San Diego Padres,5,1
3,Jun 13\n08:10 PM,@\nKansas City Royals,Athletics,4,6
4,Jun 13\n08:10 PM,@\nHouston Astros,Minnesota Twins,10,3


In [5]:
# Filter out any row with a null field. The .copy() makes the result an
# independent frame, so the column assignment below is not a write into a slice
# of the scraped frame (the pattern behind SettingWithCopyWarning).
df = df[~df.isnull().any(axis=1)].copy()

# Strip the "@\n" marker from the home team name. regex=False pins a literal
# match, because the default of `regex` differs between pandas versions.
df['Home Team'] = df['Home Team'].str.replace("@\n", "", regex=False)

# inspect
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1040 entries, 0 to 1039
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   game_date_time  1040 non-null   object
 1   Home Team       1040 non-null   object
 2   Away Team       1040 non-null   object
 3   Home Score      1040 non-null   object
 4   Away Score      1040 non-null   object
dtypes: object(5)
memory usage: 40.8+ KB


In [6]:
# First pass of cleaning / enrichment on the scraped frame.

# "game_date_time" holds the date and the time separated by a newline;
# split on the first newline only and store the halves as 'Date' and 'Time'.
date_time_parts = df['game_date_time'].str.split('\n', n=1, expand=True)
df[['Date', 'Time']] = date_time_parts

# Scores were scraped as text, so cast each side to int before summing
# them into the combined total for the game.
home_runs = df['Home Score'].astype(int)
away_runs = df['Away Score'].astype(int)
df['Total Score'] = home_runs + away_runs


In [7]:
# Second pass of cleaning: tidy the split columns and build a real date.

# trim surrounding whitespace from both split columns
for col in ('Date', 'Time'):
    df[col] = df[col].str.strip()

# TODO: Enforce correct year in the date column
# The scraped date has no year, so prefix the target year and then parse
# the combined "YYYY Mon DD" text into a datetime.
year_prefix = f'{target_year} '
df['Date'] = year_prefix + df['Date'].astype(str)
df['Date'] = pd.to_datetime(df['Date'], format='%Y %b %d')

# Keep only rows whose 'Time' text does not contain 'Abn'
is_abn = df['Time'].str.contains('Abn', na=False)
df = df[~is_abn]

# inspect
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1040 entries, 0 to 1039
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   game_date_time  1040 non-null   object        
 1   Home Team       1040 non-null   object        
 2   Away Team       1040 non-null   object        
 3   Home Score      1040 non-null   object        
 4   Away Score      1040 non-null   object        
 5   Date            1040 non-null   datetime64[ns]
 6   Time            1040 non-null   object        
 7   Total Score     1040 non-null   int32         
dtypes: datetime64[ns](1), int32(1), object(6)
memory usage: 61.1+ KB


Unnamed: 0,game_date_time,Home Team,Away Team,Home Score,Away Score,Date,Time,Total Score
0,Jun 13\n10:10 PM,Seattle Mariners,Cleveland Guardians,7,2,2025-06-13,10:10 PM,9
1,Jun 13\n10:10 PM,Los Angeles Dodgers,San Francisco Giants,2,6,2025-06-13,10:10 PM,8
2,Jun 13\n09:40 PM,Arizona Diamondbacks,San Diego Padres,5,1,2025-06-13,09:40 PM,6
3,Jun 13\n08:10 PM,Kansas City Royals,Athletics,4,6,2025-06-13,08:10 PM,10
4,Jun 13\n08:10 PM,Houston Astros,Minnesota Twins,10,3,2025-06-13,08:10 PM,13


In [8]:
# The combined date + time text has been split into 'Date' and 'Time',
# so the raw column is no longer needed.
df = df.drop(columns=['game_date_time'])


In [9]:
# TODO: Move this script into Google Colab
# Write the cleaned results to an Excel workbook named after the target year
# (the index is left out of the file).
output_path = fr'G:\My Drive\Big League Chew\Data\flashscore_mlb_results_{target_year}.xlsx'
df.to_excel(output_path, index=False)