In [1]:
import pandas as pd
import statsmodels.api as sm
import pybaseball as pyb
import numpy as np
import xgboost

In [None]:
# Season-level team batting table from pybaseball, trimmed to the columns this
# notebook reads, with OPS derived as OBP + SLG.
# Built as one chain so OPS is added to a fresh frame rather than assigned onto
# a column slice of the fetched table (which triggers SettingWithCopyWarning).
team_batting = (
    pyb.team_batting(2024)
    .loc[:, ["Team", "G", "AB", "R", "H", "HR", "RBI", "SB", "OBP", "SLG"]]
    .assign(OPS=lambda batting: batting["OBP"] + batting["SLG"])
)

# Season-level team pitching table, trimmed the same way.
# The earlier `end_date="2024-09-29"` keyword was removed: it does not appear
# among the parameters of pybaseball's team_pitching and would raise TypeError.
# NOTE(review): confirm the season table covers the regular season only, which
# is what the removed keyword was trying to enforce.
team_pitching = pyb.team_pitching(2024).loc[
    :, ["Team", "W", "L", "ERA", "IP", "SO", "WHIP", "FIP"]
]

In [3]:
# Distinct team codes from the batting table, in first-appearance order, as a
# plain Python list; these drive the per-team schedule downloads.
teams = team_batting["Team"].drop_duplicates().to_list()

In [4]:
# Build one record per game from each team's 2024 schedule. Only rows where the
# team played at home are kept, so a game is taken from its home team's log and
# not again from the visitor's.
games = []
for team in teams:
    try:
        log = pyb.schedule_and_record(2024, team)
        # Rows without a W/L value (no result recorded) are skipped; previously
        # such a row raised AttributeError part-way through a team and left
        # that team's games only partially appended.
        played_at_home = log[(log["Home_Away"] == "Home") & log["W/L"].notna()]
        team_games = pd.DataFrame(
            {
                "home_team": played_at_home["Tm"],
                "away_team": played_at_home["Opp"],
                # Prefix match so variants such as 'W-wo' count as wins.
                "home_win": played_at_home["W/L"]
                .astype(str)
                .str.startswith("W")
                .astype(int),
            }
        )
    except Exception as e:
        # Best effort: report the failing team and carry on with the rest.
        print(f"Error fetching log for {team}: {e}")
        continue
    # Extend only after the whole team was processed, so a failure never
    # contributes a partial set of rows.
    games.extend(team_games.to_dict("records"))

# Explicit columns keep the frame well-formed even if every fetch failed.
games_df = pd.DataFrame(games, columns=["home_team", "away_team", "home_win"])

In [None]:
# Attach season-level team stats to every game: OPS for both sides, plus ERA and
# WHIP for both sides. The home team's pitching columns keep their bare names
# ("ERA", "WHIP"); the away team's are prefixed with "away_".

# Start from the three game-log columns so re-running this cell does not stack
# suffixed duplicates (home_OPS_x, ERA_y, ...) onto an already-merged frame.
game_results = games_df[["home_team", "away_team", "home_win"]]

# Pre-rename each lookup table so every merge is a plain key join and no
# helper "Team" column has to be dropped afterwards.
# The earlier `G <= 162` filter was applied to the home-side batting merge only.
# With inner joins it could silently drop games by home team while leaving the
# away side unfiltered, so it is removed here.
# NOTE(review): confirm no team rows actually need excluding.
home_batting = team_batting[["Team", "OPS"]].rename(
    columns={"Team": "home_team", "OPS": "home_OPS"}
)
away_batting = team_batting[["Team", "OPS"]].rename(
    columns={"Team": "away_team", "OPS": "away_OPS"}
)
home_pitching = team_pitching[["Team", "ERA", "WHIP"]].rename(
    columns={"Team": "home_team"}
)
away_pitching = team_pitching[["Team", "ERA", "WHIP"]].rename(
    columns={"Team": "away_team", "ERA": "away_ERA", "WHIP": "away_WHIP"}
)

# Inner joins (the pandas default): a game is kept only when both of its teams
# are present in the batting and pitching tables.
games_df = (
    game_results.merge(home_batting, on="home_team")
    .merge(away_batting, on="away_team")
    .merge(home_pitching, on="home_team")
    .merge(away_pitching, on="away_team")
)

In [6]:
# Number of games drawn for training; the fixed random_state keeps the draw
# reproducible across runs.
N_TRAIN_GAMES = 130

# Cap the request at the number of available rows: DataFrame.sample raises
# ValueError when asked for more rows than exist (without replacement), which
# would happen if failed downloads or the merges left fewer games than expected.
games_df = games_df.sample(n=min(N_TRAIN_GAMES, len(games_df)), random_state=42)
print(f"Training on {len(games_df)} games")

Training on 130 games


In [7]:
# Model inputs, in the column order the booster is trained on. The unprefixed
# "ERA" and "WHIP" columns are the ones merged in on the home team.
features = [
    "home_OPS",
    "away_OPS",
    "ERA",
    "away_ERA",
    "WHIP",
    "away_WHIP",
]

# Design matrix and binary target (1 = home team won, 0 = otherwise).
X = games_df.loc[:, features]
y = games_df.loc[:, "home_win"]

In [8]:
# Training data wrapped in xgboost's DMatrix container. It gets its own name so
# that `model` only ever refers to the trained booster; previously `model` was
# bound to the DMatrix first and then rebound to the booster.
dtrain = xgboost.DMatrix(X, label=y)

# Binary classifier trained with the native xgboost API.
params = {
    "objective": "binary:logistic",  # predictions are P(home_win == 1)
    "max_depth": 3,
    "eta": 0.1,
    "eval_metric": "logloss",
}
model = xgboost.train(params, dtrain, num_boost_round=100)

In [9]:
def team_stat(table: pd.DataFrame, team: str, column: str):
    """Return the value of `column` for `team` in a per-team stats table.

    Parameters
    ----------
    table : DataFrame with a "Team" column and the requested stat column.
    team : team code to look up in the "Team" column.
    column : name of the stat column to read.

    Raises
    ------
    IndexError
        If `team` has no row in `table`. The exception type matches what the
        previous `.values[0]` lookups raised, but the message now names the
        missing team instead of reporting an out-of-bounds index.
    """
    matches = table.loc[table["Team"] == team, column]
    if matches.empty:
        raise IndexError(f"Team {team!r} not found in stats table (column {column!r})")
    # First match, mirroring the original `.values[0]`.
    return matches.values[0]


# Matchup to score; change the two codes here to evaluate a different game.
home_team = "LAD"
away_team = "NYY"

# Season stats for both sides, read through one helper instead of six
# copy-pasted boolean-mask lookups.
home_OPS = team_stat(team_batting, home_team, "OPS")
away_OPS = team_stat(team_batting, away_team, "OPS")
home_ERA = team_stat(team_pitching, home_team, "ERA")
away_ERA = team_stat(team_pitching, away_team, "ERA")
home_WHIP = team_stat(team_pitching, home_team, "WHIP")
away_WHIP = team_stat(team_pitching, away_team, "WHIP")

In [10]:
# Single-row feature frame for the matchup selected in the previous cell.
# That cell already sets home_team / away_team and looks up home_OPS, away_OPS,
# home_ERA, away_ERA, home_WHIP and away_WHIP; this cell used to repeat those
# eight lines verbatim, so the matchup had to be edited in two places.
# Column names and order mirror the training feature list: the unprefixed
# "ERA" / "WHIP" columns carry the home team's values.
new_data = pd.DataFrame(
    {
        "home_OPS": [home_OPS],
        "away_OPS": [away_OPS],
        "ERA": [home_ERA],
        "away_ERA": [away_ERA],
        "WHIP": [home_WHIP],
        "away_WHIP": [away_WHIP],
    }
)

In [11]:
# Score the single matchup row. Selecting by `features` pins the column order
# to the one the booster was trained on.
dtest = xgboost.DMatrix(new_data[features])

# With the binary:logistic objective, predict() returns P(home_win == 1);
# there is one row, so take element 0.
prob = model.predict(dtest)[0]

# The team codes come from the matchup variables, so the message cannot drift
# from the teams that were actually scored (they were hard-coded before).
print(
    f"Predicted probability that {home_team} wins against {away_team} "
    f"({home_team} home): {prob:.1%}"
)

Predicted probability that LAD wins against NYY (LAD home): 61.7%
