In [None]:
from google.colab import drive
import pandas as pd
import os

# Make Google Drive visible to this Colab runtime.
drive.mount('/content/drive')

# Location of the master game table saved by the previous notebook.
project_dir = "/content/drive/MyDrive/mlb-project-data"
data_path = f"{project_dir}/mlb_data_master.parquet"

# Handle the missing-file case first so the reader sees the remedy up front;
# note that `df` is only defined when the file is present.
if not os.path.exists(data_path):
    print("File not found. Go back to Notebook 1 and run the 'Save to Drive' cell.")
else:
    df = pd.read_parquet(data_path)
    print(f"Loaded {len(df)} rows from Google Drive!")
    print(df.head(3))

In [None]:
def create_team_centric_df(df_master):
    """Reshape a one-row-per-game table into a one-row-per-team-per-game table.

    Every game in ``df_master`` yields two output rows: one from the home
    club's point of view (``is_home == 1``) and one from the away club's
    (``is_home == 0``). ``result`` is 1 when that club scored strictly more
    runs than it allowed, otherwise 0.

    Args:
        df_master: Game-level frame with ``date`` plus ``home_*`` / ``away_*``
            columns for team abbreviation, score, hits, errors and moneyline.

    Returns:
        A new frame sorted by ``team`` then ``date`` with a fresh 0..n-1 index.
        ``df_master`` itself is not modified.
    """
    output_columns = [
        'date', 'team', 'opponent', 'is_home', 'result',
        'runs_scored', 'runs_allowed', 'hits', 'errors',
        'moneyline_closing'
    ]

    # Column translations for each point of view.
    home_view = {
        'home_team_abbr': 'team',
        'away_team_abbr': 'opponent',
        'home_score': 'runs_scored',
        'away_score': 'runs_allowed',
        'home_hits': 'hits',
        'home_errors': 'errors',
        'away_hits': 'opp_hits',
        'away_errors': 'opp_errors',
        'home_moneyline': 'moneyline_closing',
        'away_moneyline': 'moneyline_opp'
    }
    away_view = {
        'away_team_abbr': 'team',
        'home_team_abbr': 'opponent',
        'away_score': 'runs_scored',
        'home_score': 'runs_allowed',
        'away_hits': 'hits',
        'away_errors': 'errors',
        'home_hits': 'opp_hits',
        'home_errors': 'opp_errors',
        'away_moneyline': 'moneyline_closing',
        'home_moneyline': 'moneyline_opp'
    }

    def team_rows(column_map, home_flag):
        # rename() hands back a new frame, so the caller's data stays intact.
        view = df_master.rename(columns=column_map)
        view['is_home'] = home_flag
        view['result'] = view['runs_scored'].gt(view['runs_allowed']).astype(int)
        return view[output_columns]

    stacked = pd.concat([team_rows(home_view, 1), team_rows(away_view, 0)])

    return stacked.sort_values(['team', 'date']).reset_index(drop=True)

# Expand the game-level frame into one row per team per game.
df_team = create_team_centric_df(df)
# Each game contributes a home row and an away row, so the team table
# should contain exactly twice as many rows as the original.
print(f"Original Rows: {len(df)}")
print(f"Team Rows: {len(df_team)}")
df_team.head()

In [None]:
def calculate_rolling_features(df_long, window_size=10):
    """Attach lagged per-team rolling statistics to a team-level game table.

    For each of runs scored, runs allowed, hits and errors, a column named
    ``rolling_{window_size}_{metric}`` holds the mean over the team's previous
    ``window_size`` rows. ``rolling_pythag_win_pct`` is the squared-runs ratio
    RS^2 / (RS^2 + RA^2) built from the run totals over the same window.

    Every window is shifted by one row so a game never sees its own outcome;
    rows without ``window_size`` prior rows for that team come out as NaN.
    Rows are consumed in the order given, so the input is assumed to already
    be ordered chronologically within each team.

    Args:
        df_long: Frame with ``team``, ``runs_scored``, ``runs_allowed``,
            ``hits`` and ``errors`` columns.
        window_size: Number of prior games in each window.

    Returns:
        A copy of ``df_long`` with the rolling columns appended.
    """
    def prior_games(series):
        # shift(1) drops the current game from its own window.
        return series.shift(1).rolling(window=window_size)

    per_team = df_long.groupby('team')
    enriched = df_long.copy()

    for stat in ('runs_scored', 'runs_allowed', 'hits', 'errors'):
        enriched[f'rolling_{window_size}_{stat}'] = per_team[stat].transform(
            lambda games: prior_games(games).mean()
        )

    scored_total = per_team['runs_scored'].transform(lambda games: prior_games(games).sum())
    allowed_total = per_team['runs_allowed'].transform(lambda games: prior_games(games).sum())

    scored_sq = scored_total ** 2
    allowed_sq = allowed_total ** 2
    # The small epsilon keeps the ratio defined when both totals are zero.
    enriched['rolling_pythag_win_pct'] = scored_sq / (scored_sq + allowed_sq + 1e-9)

    return enriched

# Add lagged 10-game rolling statistics to the team-level table.
df_features = calculate_rolling_features(df_team, window_size=10)

# Sanity check: with shift(1) and a 10-game window, a team's first 10 rows
# have no rolling value, so the first populated row should be the 11th.
print("Checking start of season (should be NaNs due to shift):")
print(df_features[df_features['team'] == 'NYY'][['date', 'runs_scored', 'rolling_10_runs_scored']].head(12))

In [None]:
def calculate_advanced_features(df, default_rest_days=5):
    """Add rest days, the opponent's Pythagorean win pct and a Log5 probability.

    Columns added:
        rest_days: Days since the team's previous row (0 for the second game
            of a doubleheader). Rows with no previous row for that team get
            ``default_rest_days``.
        opp_pythag_win_pct: The opponent's ``rolling_pythag_win_pct`` for the
            same game.
        log5_prob: Log5 estimate A(1-B) / (A(1-B) + B(1-A)), where A is the
            team's and B the opponent's rolling Pythagorean win pct. NaN when
            either input is NaN or the denominator is zero.

    The opponent lookup is keyed on date, team, opponent *and* the position of
    the game among same-day meetings of that pair. Joining on date and
    opponent alone is many-to-many whenever a club plays twice on one date,
    which duplicated rows and mixed up which game's value was attached.

    Args:
        df: Team-level frame with ``date`` (datetime-like, since ``.dt`` is
            used), ``team``, ``opponent`` and ``rolling_pythag_win_pct``.
            Same-day games between the same two clubs are assumed to appear in
            the same relative order for both clubs.
        default_rest_days: Value used when a team has no earlier row.

    Returns:
        A new frame with the same rows, in the same order, plus the three
        columns above.
    """
    df_adv = df.copy()

    df_adv['rest_days'] = (
        df_adv.groupby('team')['date'].diff().dt.days.fillna(default_rest_days)
    )

    # Number same-day meetings of the same pair (0, 1, ...) so each game can be
    # paired with exactly one mirror row. dropna=False keeps rows with missing
    # keys numbered instead of silently excluded.
    seq_col = '_matchup_seq'
    df_adv[seq_col] = (
        df_adv.groupby(['date', 'team', 'opponent'], dropna=False).cumcount()
    )

    # Mirror view: swapping team/opponent makes each row describe "the
    # opponent's stats" from the other club's perspective.
    opponent_stats = df_adv[
        ['date', 'team', 'opponent', seq_col, 'rolling_pythag_win_pct']
    ].rename(columns={
        'team': 'opponent',
        'opponent': 'team',
        'rolling_pythag_win_pct': 'opp_pythag_win_pct'
    })

    df_adv = pd.merge(
        df_adv,
        opponent_stats,
        on=['date', 'team', 'opponent', seq_col],
        how='left'
    ).drop(columns=seq_col)

    A = df_adv['rolling_pythag_win_pct']
    B = df_adv['opp_pythag_win_pct']

    numerator = A * (1 - B)
    denominator = numerator + (B * (1 - A))

    df_adv['log5_prob'] = numerator / denominator

    return df_adv

# Add rest days, opponent strength and the Log5 probability to the feature table.
df_final = calculate_advanced_features(df_features)

# Spot-check the newest rows, where both clubs should have full rolling windows.
cols = ['date', 'team', 'opponent', 'rolling_pythag_win_pct', 'opp_pythag_win_pct', 'log5_prob', 'result']
print(df_final[cols].tail())

In [None]:
# Number of rows per team; a team with far fewer rows than the rest may be missing games.
df_final['team'].value_counts()

In [None]:
# Eyeball a few random rows. The fixed seed keeps the displayed sample the
# same on every Restart & Run All, so notes written about it stay valid.
df_final.sample(n=5, random_state=42)

In [None]:
# Running game counter per team (1 = that team's first row in the table).
df_final['game_num'] = df_final.groupby('team').cumcount() + 1

# Fraction of rows at each game number that have no Log5 probability.
missing_by_game = df_final['log5_prob'].isna().groupby(df_final['game_num']).mean()

print("--- MISSING DATA REPORT ---")
print(f"Game 1 Missing %:   {missing_by_game.iloc[0]:.1%}")   # expected 100%: no history yet
print(f"Game 10 Missing %:  {missing_by_game.iloc[9]:.1%}")   # expected 100% with a 10-game window
print(f"Game 11 Missing %:  {missing_by_game.iloc[10]:.1%}")  # first game with a full window, should drop sharply
print(f"Game 20 Missing %:  {missing_by_game.iloc[19]:.1%}")  # expected 0.0% (or very close)

print(f"\nTotal Rows: {len(df_final)}")
print(f"Rows with valid Log5: {df_final['log5_prob'].count()}")

In [None]:
# Rows whose team label is missing or is one of the placeholder strings.
missing_team_rows = df_final[df_final['team'].isna() | df_final['team'].isin(['None', 'UNKNOWN'])]

print(f"Rows with corrupt Team Names: {len(missing_team_rows)}")
if not missing_team_rows.empty:
    print(missing_team_rows[['date', 'opponent', 'runs_scored', 'moneyline_closing']].head())

# value_counts() sorts descending, so the tail holds the teams with the fewest rows.
game_counts = df_final['team'].value_counts()
print("\n--- Game Count Report ---")
print(game_counts.tail(5))

# Pull the dates of the team with the fewest recorded games.
worst_team = game_counts.idxmin()
worst_team_games = df_final.loc[df_final['team'] == worst_team, 'date']

# Calendar for the 2023 regular season; not used by anything in this cell.
season_start = '2023-03-30'
season_end = '2023-10-01'
all_dates = pd.date_range(season_start, season_end)

print(f"\nAnalyzing missing games for: {worst_team}")
print(f"Total Games Found: {len(worst_team_games)}")

In [None]:
def finalize_training_data(df_final, window_size=10):
    """Select and encode the columns used for modelling.

    Steps:
        1. Drop rows where the team's own rolling window is not yet filled
           (``rolling_{window_size}_runs_scored`` is NaN) and print how many
           rows were removed. Only that column is checked, so opponent-derived
           columns such as ``log5_prob`` may still contain NaN.
        2. Add integer ``team_code`` / ``opponent_code`` columns. Both are
           encoded against one shared category list built from the union of
           the two columns, so a club has the same code wherever it appears.
           Encoding each column on its own gives mismatched codes as soon as
           the set of teams differs from the set of opponents.
        3. Return the feature columns, the ``result`` target and the
           ``date`` / ``team`` / ``opponent`` identifiers.

    Args:
        df_final: Feature table produced by the earlier steps.
        window_size: Window used when the rolling columns were created; it
            determines the ``rolling_{window_size}_*`` column names.

    Returns:
        A new frame; ``df_final`` is not modified.
    """
    rolling_cols = [
        f'rolling_{window_size}_{metric}'
        for metric in ('runs_scored', 'runs_allowed', 'hits', 'errors')
    ]

    original_len = len(df_final)
    df_train = df_final.dropna(subset=[rolling_cols[0]])
    print(f"Dropped {original_len - len(df_train)} rows (Early Season Warm-up).")

    # One category list for both columns keeps the integer codes consistent.
    club_categories = (
        pd.concat([df_train['team'], df_train['opponent']], ignore_index=True)
        .astype('category')
        .cat.categories
    )
    club_dtype = pd.CategoricalDtype(categories=club_categories)
    df_train = df_train.assign(
        team_code=df_train['team'].astype(club_dtype).cat.codes,
        opponent_code=df_train['opponent'].astype(club_dtype).cat.codes,
    )

    features = [
        'is_home',
        'rest_days',
        'log5_prob',
        *rolling_cols,
        'rolling_pythag_win_pct',
        'opp_pythag_win_pct',
        'team_code',
        'opponent_code',
        'moneyline_closing' ]

    target = 'result'

    return df_train[features + [target, 'date', 'team', 'opponent']]

# Drop warm-up rows, encode team identifiers and keep only the modelling columns.
df_model_ready = finalize_training_data(df_final)

# Persist next to the master file in the project's Drive folder so a later
# notebook can load the result without re-running this one.
save_path = f"{project_dir}/model_ready_data.parquet"
df_model_ready.to_parquet(save_path)
print(f"Modeling Data Saved: {len(df_model_ready)} rows.")
print(df_model_ready.head())