In [130]:
# Install any packages needed for model training
%pip install mlb-statsapi

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Set API Key from SportRadar and import needed libraries for data gathering
import pandas as pd
import time
from datetime import datetime, timedelta
from dotenv import load_dotenv
import os
import mysql.connector
import statsapi

load_dotenv()

db_password = os.getenv('DB_PASSWORD')

# Load SQL database created in mySQL (SQL code provided in "sql" folder)
db = mysql.connector.connect(
    host="localhost",
    user="dgiletto",
    password=db_password,
    database="mlb"
)

cursor = db.cursor()

In [70]:
def get_games_on_date(date_str: str):
    """
    Get scheduled MLB games for a certain date using mlbstatsapi
    Parameters:
        date_str (str): A string representation of the date in the format (YYYY-MM-DD)
    Returns:
        List containing a dict for each game on the day
    """
    games = statsapi.schedule(date=date_str)
    return games

get_games_on_date("2025-07-04")

[{'game_id': 777245,
  'game_datetime': '2025-07-04T15:05:00Z',
  'game_date': '2025-07-04',
  'game_type': 'R',
  'status': 'Final',
  'away_name': 'Boston Red Sox',
  'home_name': 'Washington Nationals',
  'away_id': 111,
  'home_id': 120,
  'doubleheader': 'N',
  'game_num': 1,
  'home_probable_pitcher': 'Michael Soroka',
  'away_probable_pitcher': 'Lucas Giolito',
  'home_pitcher_note': '',
  'away_pitcher_note': '',
  'away_score': 11,
  'home_score': 2,
  'current_inning': 9,
  'inning_state': 'Bottom',
  'venue_id': 3309,
  'venue_name': 'Nationals Park',
  'national_broadcasts': ['MLBN (out-of-market only)'],
  'series_status': 'BOS leads 1-0',
  'winning_team': 'Boston Red Sox',
  'losing_team': 'Washington Nationals',
  'winning_pitcher': 'Lucas Giolito',
  'losing_pitcher': 'Michael Soroka',
  'save_pitcher': None,
  'summary': '2025-07-04 - Boston Red Sox (11) @ Washington Nationals (2) (Final)'},
 {'game_id': 777255,
  'game_datetime': '2025-07-04T17:05:00Z',
  'game_date'

In [71]:
def get_total_runs(game_id: str):
    """
    Get total runs scored for a particular game
    Parameters:
        game_id (str): ID which identifies a unique game
    Returns:
        The total amount of runs scored between the home and away team
    """
    try:
        box = statsapi.boxscore_data(gamePk=game_id)
        home_runs = box['home']['teamStats']['batting']['runs']
        away_runs = box['away']['teamStats']['batting']['runs']
        return home_runs + away_runs
    except:
        return None

get_total_runs("777245")

13

In [101]:
def get_starting_pitcher_stats(game_id, team_type='home'):
    """
    Get the starting pitcher ERA and WHIP for a given team in a specific game
    
    Parameters:
        game_id (str): ID which identifies a unique game
        team_type (str): 'home' or 'away' to specify which team's starter to get

    Returns:
        dict: Dictionary with pitcher name, ERA, and WHIP.
    """
    # Get the full boxscore data
    boxscore = statsapi.get("game_boxscore", {"gamePk": game_id})
    players = boxscore['teams'][team_type]['players']

    for player_id, player_data in players.items():
        if 'stats' in player_data and 'pitching' in player_data['stats']:
            if player_data['position']['code'] == '1':  # Position code 1 = Pitcher
                if player_data['stats']['pitching'].get('gamesStarted', 0) > 0:
                    stats = player_data['seasonStats']['pitching']
                    return {
                        "name": player_data['person']['fullName'],
                        "ERA": stats.get('era', 'N/A'),
                        "WHIP": stats.get('whip', 'N/A'),
                        "HR/9": stats.get('homeRunsPer9', 'N/A'),
                        "Ks": stats.get('strikeOuts', 'N/A')
                    }

    return {"error": "Starting pitcher not found."}

get_starting_pitcher_stats("777245", team_type="away")

{'name': 'Lucas Giolito',
 'ERA': '3.66',
 'WHIP': '1.28',
 'HR/9': '1.09',
 'Ks': 58}

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
import time

def get_total_lines(date_str):
    url = f"https://www.sportsbookreview.com/betting-odds/mlb-baseball/totals/full-game/?date={date_str}"
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(url)

        # Wait up to 15 seconds for any span with "O" or "U" + number
        WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.TAG_NAME, "span"))
        )
        time.sleep(2)  # Add buffer for JavaScript to render fully

        # Now grab all bolds and spans and filter for total lines
        spans = driver.find_elements(By.TAG_NAME, "span")
        bolds = driver.find_elements(By.TAG_NAME, "b")
        totals = []
        pattern = re.compile(r'^[O]\s\d+(\.\d+)?$')

        for span in spans:
            text = span.text.strip()
            if pattern.match(text):
                totals.append({"home": "", "away": "", "line": text[2:]})
        
        totals_index = 0
        is_away = True
        # Use knowledge of <b> holding team abbreviations to access the matchup of the day
        for bold in bolds:
            text = bold.text.strip()
            if len(text) <= 3:
                if totals_index >= len(totals):
                    break  # Avoid IndexError
                
                if is_away:
                    totals[totals_index]["away"] = text
                else:
                    totals[totals_index]["home"] = text
                    totals_index += 1

                is_away = not is_away

        return totals

    finally:
        driver.quit()

get_total_lines("2025-07-05")

[{'home': 'MIN', 'away': 'TB', 'line': '9.5'},
 {'home': 'CHC', 'away': 'STL', 'line': '10.5'},
 {'home': 'TOR', 'away': 'LAA', 'line': '9'},
 {'home': 'PHI', 'away': 'CIN', 'line': '8.5'},
 {'home': 'WAS', 'away': 'BOS', 'line': '10'},
 {'home': 'AZ', 'away': 'KC', 'line': '8.5'},
 {'home': 'NYM', 'away': 'NYY', 'line': '9'},
 {'home': 'ATL', 'away': 'BAL', 'line': '10'},
 {'home': 'MIA', 'away': 'MIL', 'line': '8.5'},
 {'home': 'CLE', 'away': 'DET', 'line': '9'},
 {'home': 'LAD', 'away': 'HOU', 'line': '8.5'},
 {'home': 'COL', 'away': 'CHW', 'line': '11.5'},
 {'home': 'SD', 'away': 'TEX', 'line': '8.5'},
 {'home': 'ATH', 'away': 'SF', 'line': '9'},
 {'home': 'SEA', 'away': 'PIT', 'line': '7.5'}]

In [17]:
def abbreviation_conversion(abb):
    teams = {
        "ARI": "Arizona Diamondbacks",
        "ATL": "Atlanta Braves",
        "BAL": "Baltimore Orioles",
        "BOS": "Boston Red Sox",
        "CWS": "Chicago White Sox",
        "CHC": "Chicago Cubs",
        "CIN": "Cincinnati Reds",
        "CLE": "Cleveland Guardians",
        "COL": "Colorado Rockies",
        "DET": "Detroit Tigers",
        "HOU": "Houston Astros",
        "KC": "Kansas City Royals",
        "LAA": "Los Angeles Angels",
        "LAD": "Los Angeles Dodgers",
        "MIA": "Miami Marlins",
        "MIL": "Milwaukee Brewers",
        "MIN": "Minnesota Twins",
        "NYY": "New York Yankees",
        "NYM": "New York Mets",
        "OAK": "Oakland Athletics",
        "PHI": "Philadelphia Phillies",
        "PIT": "Pittsburgh Pirates",
        "SD": "San Diego Padres",
        "SF": "San Francisco Giants",
        "SEA": "Seattle Mariners",
        "STL": "St. Louis Cardinals",
        "TB": "Tampa Bay Rays",
        "TEX": "Texas Rangers",
        "TOR": "Toronto Blue Jays",
        "WSH": "Washington Nationals"
    }
    return teams.get(abb.upper(), "Unknown Team")

abbreviation_conversion("BOS")

'Boston Red Sox'

In [103]:
def get_team_stats(game_id, team_type='home'):
    """
    Fetch season stats for the given team type of the game inputted
    
    Parameters:
        game_id (str): ID which identifies a unique game
        team_type (str): 'home' or 'away' to specify which team's starter to get
    Returns:
        A dictionary which stores stats like batting average, era, obs, etc.
    """
    try:
        stats = statsapi.boxscore_data(gamePk=game_id)
        avg = stats[team_type]['teamStats']['batting']['avg']
        slg = stats[team_type]['teamStats']['batting']['slg']
        obp = stats[team_type]['teamStats']['batting']['obp']
        ops = stats[team_type]['teamStats']['batting']['ops']
        team_era = stats[team_type]['teamStats']['pitching']['era'] # Get bullpen data included
        pitcher_obp = stats[team_type]['teamStats']['pitching']['obp']

        return {
            'batting_avg': avg,
            'slg': slg,
            'obp': obp,
            'ops': ops,
            'team_era': team_era,
            'pitcher_obp': pitcher_obp
        }
    except:
        return None
    
get_team_stats("777245", team_type='away')

{'batting_avg': '.253',
 'slg': '.423',
 'obp': '.325',
 'ops': '.748',
 'team_era': '3.98',
 'pitcher_obp': '.257'}

In [None]:
def insert_game_record(db, cursor, record):
    try:
        query="""
        INSERT INTO mlb_game_data (
            game_date, home_team, away_team, 
            home_ba, away_ba, home_obp, away_obp, home_slg, away_slg, 
            home_era, away_era, home_whip, away_whip, 
            home_hr_allowed, away_hr_allowed, home_fpct, away_fpct, 
            total_runs
        )
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        values = (
            record["date"],
            record["home_team"],
            record["away_team"],
            record["home_ba"],
            record["away_ba"],
            record["home_obp"],
            record["away_obp"],
            record["home_slg"],
            record["away_slg"],
            record["home_era"],
            record["away_era"],
            record["home_whip"],
            record["away_whip"],
            record["home_hr_allowed"],
            record["away_hr_allowed"],
            record["home_fpct"],
            record["away_fpct"],
            record["total_runs"]
        )
        cursor.execute(query, values)
        db.commit()
    except mysql.connector.Error as err:
        print("Insert failed: ", err)

In [None]:
from datetime import datetime, timedelta

# Assemble Dataset using a combination of our past functions
def collect_game_data(start_date, end_date):
    dataset = []
    current = start_date
    # Iterate throught each day of games until we get to the last day
    while current <= end_date:
        # Get all of the games on the current day
        games = get_games_on_date(current.strftime("%m-%d-%Y"))
        for game in games:
            game_id = game["id"]
            home_id = game["home"]["id"]
            away_id = game["away"]["id"]
            home_name = game["home"]["name"]
            away_name = game["away"]["name"]
        
            # Obtain our target
            total_runs = get_total_runs(game_id)
            if total_runs is None:
                continue

            # Obtain our features
            home_stats = get_home_stats(home_id)
            away_stats = get_away_stats(away_id)
            if not home_stats or not away_stats:
                continue

            # Append it all into a dict
            record = {
                'date': current.strftime("%Y-%m-%d"),
                'home_team': home_name,
                'away_team': away_name,
                'home_ba': home_stats["batting_avg"],
                'away_ba': away_stats["batting_avg"],
                'home_obp': home_stats["batting_obp"],
                'away_obp': away_stats["batting_obp"],
                'home_slg': home_stats["batting_slg"],
                'away_slg': away_stats["batting_slg"],
                'home_era': home_stats["era"],
                'away_era': away_stats["era"],
                'home_whip': home_stats["whip"],
                'away_whip': away_stats["whip"],
                'home_hr_allowed': home_stats["hr_allowed"],
                'away_hr_allowed': away_stats["hr_allowed"],
                'home_fpct': home_stats["fpct"],
                'away_fpct': away_stats["fpct"],
                'total_runs': total_runs
            }

            insert_game_record(db, cursor, record)
            time.sleep(1) # to avoid wait limit
        current += timedelta(days=1)
    # Return a data frame with rows for each game
    return dataset

collect_game_data(
    start_date=datetime(2025, 7, 3),
    end_date=datetime(2025, 7, 4)
)

[]