In [110]:
import statsapi
import pandas as pd

In [111]:
game_logs = statsapi.schedule(start_date='01/01/2025', end_date='08/31/2025')
game_logs = pd.DataFrame(game_logs)
pitching = pd.read_csv("../data/pitching_stats_2025.csv")
batting = pd.read_csv("../data/team_batting_stats_2025.csv")

In [112]:
# RUN ONLY ONCE
import re
pitching['Player'] = pitching['Player'].str.replace(r'[*#]$', '', regex=True).str.strip() # Remove * or #
pitching = pitching.drop_duplicates(subset='Player', keep='first')

batting = batting.rename(columns = {'SO' : 'SO_batting'})
batting = batting.rename(columns = {'BB' : 'BB_batting'})

In [113]:
game_logs = game_logs[['game_id','home_name', 'away_name', 'home_probable_pitcher','away_probable_pitcher', 'away_score', 'home_score']]
pitching_features = pitching[['Player', 'ERA', 'ER', 'R', 'SO','BB', 'SO/BB', 'WHIP']]
batting_features = batting[['Tm', 'OBP', 'SLG', 'HR', 'R/G', 'BB_batting', 'SO_batting', 'IBB']]

#Merge Home Pitcher
pitching_home = pitching_features.copy()
pitching_home.columns = ['home_' + col if col != 'Player' else col for col in pitching_home.columns]
game_logs = game_logs.merge(
    pitching_home,
    left_on = 'home_probable_pitcher',
    right_on = 'Player',
    how = 'left',
).drop(columns=['Player'])

#Merge Away Pitcher
pitching_away = pitching_features.copy()
pitching_away.columns = ['away_' + col if col != 'Player' else col for col in pitching_away.columns]
game_logs = game_logs.merge(
    pitching_away,
    left_on = 'away_probable_pitcher',
    right_on = 'Player',
    how = 'left',
).drop(columns=['Player'])

#Merge Home Batters
batting_home = batting_features.copy()
batting_home.columns = ['home_' + col if col != 'Tm' else col for col in batting_home.columns]
game_logs = game_logs.merge(
    batting_home,
    left_on = 'home_name',
    right_on = 'Tm',
    how = 'left',
).drop(columns=['Tm'])

#Merge Away Batters
batting_away = batting_features.copy()
batting_away.columns = ['away_' + col if col != 'Tm' else col for col in batting_home.columns]
game_logs = game_logs.merge(
    batting_away,
    left_on = 'away_name',
    right_on = 'Tm',
    how = 'left',
).drop(columns=['Tm'])

In [114]:
'''
import requests
def fetch_weather(gamePk: int) -> dict:
    url = f"https://statsapi.mlb.com/api/v1.1/game/{gamePk}/feed/live"
    resp = requests.get(url)
    if resp.status_code != 200:
        return {"temp": None, "condition": None, "wind": None}

    data = resp.json()
    weather = data.get("gameData", {}).get("weather", {})
    return {
        "game_id" : gamePk,
        "temp": weather.get("temp"),
        "condition": weather.get("condition"),
        "wind": weather.get("wind")

    }

weather_list = []
for i, gamePk in enumerate(game_logs["game_id"], start=1):
    w = fetch_weather(gamePk)
    weather_list.append(w)
    if i % 100 == 0:
        print(f"Saved {i}/{len(game_logs)} games")
'''

Saved 100/2574 games
Saved 200/2574 games
Saved 300/2574 games
Saved 400/2574 games
Saved 500/2574 games
Saved 600/2574 games
Saved 700/2574 games
Saved 800/2574 games
Saved 900/2574 games
Saved 1000/2574 games
Saved 1100/2574 games
Saved 1200/2574 games
Saved 1300/2574 games
Saved 1400/2574 games
Saved 1500/2574 games
Saved 1600/2574 games
Saved 1700/2574 games
Saved 1800/2574 games
Saved 1900/2574 games
Saved 2000/2574 games
Saved 2100/2574 games
Saved 2200/2574 games
Saved 2300/2574 games
Saved 2400/2574 games
Saved 2500/2574 games


In [115]:
weather_list = pd.DataFrame(weather_list)
game_logs = game_logs.merge(weather_list, on='game_id', how='left')

In [116]:
game_logs["condition"] = game_logs["condition"].astype("category")
condition_dummies = pd.get_dummies(game_logs["condition"], prefix="condition")
game_logs = pd.concat([game_logs, condition_dummies], axis=1)

In [117]:
game_logs.to_csv("../data/final_game_logs_2025.csv")

In [136]:
import time
def safe_float(v):
    try:
        return float(v)
    except:
        return None

def extract_ops(player_obj):
    batting = player_obj.get("seasonStats", {}).get("batting", {})
    ops = safe_float(batting.get("ops"))
    if ops is not None:
        return ops
    return None

def get_starters(players_dict):
    starters = []
    for p in players_dict.values():
        if "battingOrder" in p:
            starters.append(p)
    starters.sort(key=lambda x : int(x.get("battingOrder", "999"))) #999 = Fail Code
    return starters

def lineup_score(gameid):
    url = f"https://statsapi.mlb.com/api/v1/game/{gameid}/boxscore"
    r = requests.get(url, timeout=30)
    if r.status_code != 200:
        return None
    data = r.json()

    teams = data.get("teams", {})
    home_players = teams.get("home", {}).get("players", {})
    away_players = teams.get("away", {}).get("players", {})

    home_ops = []
    for p in get_starters(home_players):
        ops_val = extract_ops(p)
        if ops_val is not None:
            home_ops.append(ops_val)

    away_ops = []
    for p in get_starters(away_players):
        ops_val = extract_ops(p)
        if ops_val is not None:
            away_ops.append(ops_val)

    return {
        "game_id" : gameid,
        "home_lineup_score" : sum(home_ops)/len(home_ops) if home_ops else None,
        "away_lineup_score" : sum(away_ops)/len(away_ops) if home_ops else None,
        "home_n_starters" : len(home_ops),
        "away_n_starters" : len(away_ops)
    }

In [119]:
# Run Only Once, May take a while
results = []
total_games = len(game_logs)
for i, id in enumerate(game_logs['game_id'], start=1):
    try:
        row = lineup_score(id)
        results.append(row)
    except Exception as e:
        print(f"Error {id}: {e}")
    time.sleep(0.05)
    
    if i % 100 == 0 or i == total_games:
        print(f"{i}/{total_games} games processed")


100/2638 games processed
200/2638 games processed
300/2638 games processed
400/2638 games processed
500/2638 games processed
600/2638 games processed
700/2638 games processed
800/2638 games processed
900/2638 games processed
1000/2638 games processed
1100/2638 games processed
1200/2638 games processed
1300/2638 games processed
1400/2638 games processed
1500/2638 games processed
1600/2638 games processed
1700/2638 games processed
1800/2638 games processed
1900/2638 games processed
2000/2638 games processed
2100/2638 games processed
Error 777009: HTTPSConnectionPool(host='statsapi.mlb.com', port=443): Read timed out. (read timeout=30)
Error 777011: HTTPSConnectionPool(host='statsapi.mlb.com', port=443): Read timed out.
2200/2638 games processed
2300/2638 games processed
2400/2638 games processed
2500/2638 games processed
2600/2638 games processed
2638/2638 games processed


In [120]:
results = pd.DataFrame(results)
game_logs = game_logs.merge(results, on="game_id", how="left")

In [130]:
game_logs.shape

(3022, 52)

In [145]:
game_logs.drop_duplicates(subset=["game_id"], inplace=True)
print(len(game_logs))
print(game_logs["game_id"].nunique())

2542
2542


In [149]:
game_logs.to_csv("../data/final_game_logs_2025.csv", index=False)

-----------------------------------------------------------------CHECKPOINT------------------------------------------------------------------

In [162]:
games = statsapi.schedule(start_date='01/01/2025', end_date='08/31/2025')

In [166]:
games = pd.DataFrame(games)

In [170]:
games["game_datetime"] = pd.to_datetime(games["game_datetime"])

In [176]:
games["home_win"] = (games["home_score"] > games["away_score"]).astype(int)
games["away_win"] = (games["away_score"] > games["home_score"]).astype(int)

home_df = games[["game_id", "game_datetime", "home_name", "home_win"]].rename(
    columns={"home_name": "team", "home_win": "result", "game_datetime": "date"}
)
away_df = games[["game_id", "game_datetime", "away_name", "away_win"]].rename(
    columns={"away_name": "team", "away_win": "result", "game_datetime": "date"}
)

df = pd.concat([home_df, away_df], ignore_index=True)
df = df.sort_values(["team", "date"]).reset_index(drop=True)

In [178]:
df["last_7win_pct"] = None
for team, group in df.groupby("team"):
    shifted = group["result"].shift()
    win_pct = shifted.rolling(window=7).mean()
    df.loc[group.index, "last_7win_pct"] = win_pct

In [182]:
def compute_streak(results):
    streaks = []
    streak = 0
    for result in results:
        if result == 1: 
            streak = streak + 1 if streak > 0 else 1
        else:
            streak = streak - 1 if streak < 0 else -1
        streaks.append(streak)
    return streaks

In [184]:
df["streak"] = None
for team,group in df.groupby("team"):
    streaks = compute_streak(group["result"].tolist())
    df.loc[group.index, "streak"] = streaks

In [186]:
df

Unnamed: 0,game_id,date,team,result,last_7win_pct,streak
0,778566,2025-07-16 00:00:00+00:00,American League All-Stars,0,,-1
1,787940,2025-02-21 20:10:00+00:00,Arizona Diamondbacks,0,,-1
2,778808,2025-02-22 20:10:00+00:00,Arizona Diamondbacks,0,,-2
3,779020,2025-02-23 20:10:00+00:00,Arizona Diamondbacks,0,,-3
4,778806,2025-02-24 20:10:00+00:00,Arizona Diamondbacks,1,,1
...,...,...,...,...,...,...
5143,776588,2025-08-26 23:05:00+00:00,Washington Nationals,0,0.428571,-4
5144,776570,2025-08-27 17:05:00+00:00,Washington Nationals,0,0.428571,-5
5145,776545,2025-08-29 22:45:00+00:00,Washington Nationals,0,0.285714,-6
5146,776525,2025-08-30 20:05:00+00:00,Washington Nationals,0,0.142857,-7


In [188]:
df = df.drop_duplicates(subset=["game_id", "team"])

home = df.rename(columns={
    "last_7win_pct": "home_last7_win_pct",
    "streak": "home_streak"})[["game_id", "team", "home_last7_win_pct", "home_streak"]]

away = df.rename(columns={
    "last_7win_pct": "away_last7_win_pct",
    "streak": "away_streak"})[["game_id", "team", "away_last7_win_pct", "away_streak"]]


In [196]:
final_df = pd.read_csv("../data/final_game_logs_2025.csv")

In [198]:
final_df = final_df.merge(
    home, 
    left_on=["game_id", "home_name"], 
    right_on=["game_id", "team"], 
    how="left"
).drop(columns=["team"])

final_df = final_df.merge(
    away, 
    left_on=["game_id", "away_name"], 
    right_on=["game_id", "team"], 
    how="left"
).drop(columns=["team"])

In [202]:
final_df.to_csv("../data/final_game_logs_2025.csv", index=False)

In [212]:
import re
condition_cols = [col for col in game_logs.columns if col.startswith("condition_")]
condition_cols.sort()
rename_map = {old: f"condition_{i}" for i, old in enumerate(condition_cols)}
final_df.rename(columns=rename_map, inplace=True)

In [216]:
final_df["wind_x"] = (
    final_df["wind"]
    .astype(str)
    .str.extract(r"(\d+)")
    .astype(float)
)

In [222]:
final_df.rename(columns={'temp' : 'temp_x'}, inplace=True)

In [226]:
original_df = pd.read_csv("../data/final_game_logs.csv")

In [230]:
original_features = [col for col in original_df.columns if col not in ["target", "result", "win"]]
final_df = final_df.reindex(columns=original_features, fill_value=0)

In [234]:
final_df.to_csv("../data/final_game_logs_2025.csv", index=False)