In [None]:
import sys, os
from google.colab import drive
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Mount Google Drive only when the mount point does not exist yet, so that
# re-running this cell does not trigger another mount.
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# Location of the modeling table on the mounted drive.
project_dir = "/content/drive/MyDrive/mlb-project-data"
data_path = f"{project_dir}/model_ready_data.parquet"

# Fail fast when the input is missing. Merely printing a message would let the
# notebook keep running and either crash later with a NameError on `df_train`
# or, worse, silently reuse a stale `df_train` left over in the kernel.
if not os.path.exists(data_path):
    raise FileNotFoundError(
        f"Data not found at {data_path}. Run Notebook 2 first."
    )

df_train = pd.read_parquet(data_path)
print(f"Loaded {len(df_train)} rows for modeling.")

from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import brier_score_loss, accuracy_score, log_loss
from sklearn.calibration import calibration_curve

In [None]:
# Preview the first rows of the loaded modeling table as a quick sanity check.
df_train.head()

In [None]:
# Columns of df_train fed to every model in this notebook.
# NOTE(review): 'team_code' / 'opponent_code' look like integer-encoded
# identifiers; downstream they are imputed, scaled and used as ordinary numeric
# inputs -- confirm that an ordinal treatment is intended.
feature_cols = [
    'rest_days',
    'log5_prob',
    'rolling_10_runs_scored',
    'rolling_10_runs_allowed',
    'rolling_10_hits',
    'rolling_10_errors',
    'rolling_pythag_win_pct',
    'opp_pythag_win_pct',
    'team_code',
    'opponent_code',
    'is_home'
]

# TimeSeriesSplit cuts folds by row position, so the rows must be in
# chronological order before X / y are extracted. 'team' is the tie-breaker
# that keeps the order deterministic within a date.
df_train = df_train.sort_values(['date', 'team']).reset_index(drop=True)

X = df_train[feature_cols]
y = df_train['result']

# Kept so this cell still runs on its own.
from sklearn.model_selection import TimeSeriesSplit

# Single source of truth for the number of folds; the log line below reads it
# back from the splitter so the message cannot drift from the configuration.
N_SPLITS = 5
tscv = TimeSeriesSplit(n_splits=N_SPLITS)

print(f"Features configured: {len(feature_cols)}")
print(f"Splitter configured: {tscv.n_splits}-Fold Time Series")

In [None]:
# Show the date window and size of every train/test fold produced by `tscv`.
print("--- CROSS VALIDATION PLAN ---")
separator = "-" * 30
game_dates = df_train['date']

for split_idx, (train_index, test_index) in enumerate(tscv.split(X)):
    # Positional lookup: the split indices are row positions, not labels.
    train_dates = game_dates.iloc[train_index]
    test_dates = game_dates.iloc[test_index]

    print(f"Split {split_idx + 1}:")
    print(f"  Train: {train_dates.min().date()} -> {train_dates.max().date()} ({len(train_index)} games)")
    print(f"  Test:  {test_dates.min().date()} -> {test_dates.max().date()} ({len(test_index)} games)")
    print(separator)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

# Out-of-fold collectors: each test fold appends its predicted win
# probabilities, true outcomes and game dates.
all_predictions, all_actuals, all_dates = [], [], []

# Mean-impute -> standardise -> logistic regression, refit from scratch on
# every fold's training rows.
model = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    LogisticRegression(),
)

print("Starting Time-Series Training Loop...")

for split_idx, (train_index, test_index) in enumerate(tscv.split(X)):

    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    model.fit(X_train, y_train)

    # Second column of predict_proba is the probability of class 1.
    probs = model.predict_proba(X_test)[:, 1]

    all_predictions.extend(probs)
    all_actuals.extend(y_test)
    all_dates.extend(df_train['date'].iloc[test_index])

    # Hard 0/1 calls at a 0.5 cut-off, used only for the accuracy printout.
    predicted_labels = (probs > 0.5).astype(int)
    acc = accuracy_score(y_test, predicted_labels)
    print(f"  Split {split_idx + 1}: Accuracy = {acc:.3f}")

# After the loop `model` holds the fit from the last fold only.
print("Training Loop Complete.")

In [None]:
# Count missing values per feature column of X (the values the pipeline's
# SimpleImputer replaces with the column mean).
print(X.isna().sum())

In [None]:
# Coefficients of the logistic-regression step. `model` was last fit inside the
# cross-validation loop, so these weights come from the final fold's fit.
logit_step = model.named_steps['logisticregression']
coeffs = (
    pd.DataFrame({'Feature': feature_cols, 'Weight': logit_step.coef_[0]})
    .sort_values('Weight', ascending=False)
)

print("--- WHAT DID THE MODEL LEARN? ---")
print(coeffs)

# Reliability curve over the pooled out-of-fold predictions, in 10 bins.
prob_true, prob_pred = calibration_curve(all_actuals, all_predictions, n_bins=10)

fig, ax = plt.subplots(figsize=(8, 8))
ax.plot([0, 1], [0, 1], linestyle='--', label='Perfect Calibration')
ax.plot(prob_pred, prob_true, marker='.', label='Your Model')
ax.set_xlabel('Predicted Probability')
ax.set_ylabel('Actual Win Percentage')
ax.set_title('Calibration Curve: Does 60% mean 60%?')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
from sklearn.impute import SimpleImputer

# Plain NumPy views of the full feature matrix and target for the Bayesian fit.
X_raw = df_train[feature_cols].to_numpy()
y_train_final = df_train['result'].to_numpy()

# Same preprocessing as the sklearn pipeline (mean-impute, then standardise),
# but fitted once on every row instead of per fold.
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_raw)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

remaining_nans = np.isnan(X_scaled).sum()

print(f"Original shape: {X_raw.shape}")
print(f"Processed shape: {X_scaled.shape}")
print(f"Any NaNs left? {remaining_nans}")

In [None]:
import pymc as pm
import arviz as az

X_train_final = X_scaled

# Fixed seed so that Restart & Run All reproduces the same posterior draws
# (and therefore the same downstream probabilities and betting results).
RANDOM_SEED = 42

print(f"Training Bayesian Model on {len(X_train_final)} games...")

# Bayesian logistic regression on the standardised features:
#   alpha, betas ~ Normal(0, 1)
#   y ~ Bernoulli(sigmoid(alpha + X @ betas))
with pm.Model() as bayesian_model:
    alpha = pm.Normal("alpha", mu=0, sigma=1)

    # One coefficient per column of the feature matrix.
    betas = pm.Normal("betas", mu=0, sigma=1, shape=X_train_final.shape[1])

    mu = alpha + pm.math.dot(X_train_final, betas)

    theta = pm.math.sigmoid(mu)

    y_obs = pm.Bernoulli("y_obs", p=theta, observed=y_train_final)

    print("Thinking (Sampling)... this might take 2-3 minutes...")
    trace = pm.sample(
        1000,
        tune=1000,
        chains=2,
        random_seed=RANDOM_SEED,
        return_inferencedata=True,
    )

print("Bayesian Model Trained!")

In [None]:
import matplotlib.pyplot as plt

# Forest plot of the posterior for every coefficient. ArviZ returns the axes it
# drew on; the first one is the forest panel.
forest_axes = az.plot_forest(trace, var_names=["betas"], combined=True, figsize=(10, 6))
forest_ax = np.ravel(forest_axes)[0]

# Replace the generic "betas[i]" row labels with feature names. The labels are
# attached to the tick positions ArviZ actually used (which need not be
# 0..n-1) and materialised as a list, because a `reversed` iterator can only
# be consumed once.
# NOTE(review): assumes ArviZ draws betas[0] at the top, hence the reversed
# order when labelling from the bottom tick upwards -- confirm visually.
tick_positions = sorted(forest_ax.get_yticks())
if len(tick_positions) != len(feature_cols):
    # Unexpected tick layout: fall back to one integer position per feature.
    tick_positions = list(range(len(feature_cols)))
forest_ax.set_yticks(tick_positions)
forest_ax.set_yticklabels(list(reversed(feature_cols)))

forest_ax.set_title("Bayesian Feature Weights (94% Credible Interval)")
forest_ax.grid(True, alpha=0.3)
forest_ax.set_xlabel("Impact on Winning (Negative <---> Positive)")
# Zero line: intervals that cross it are compatible with "no effect".
forest_ax.axvline(0, color='red', linestyle='--')
plt.show()

In [None]:
# Sampler diagnostics: posterior densities and per-draw traces.
posterior_vars = ["alpha", "betas"]
az.plot_trace(trace, var_names=posterior_vars, compact=True)
plt.tight_layout()
plt.show()

summary = az.summary(trace, var_names=posterior_vars)

# Keep only the coefficient rows and attach the feature name for each one
# (rows are in the same order as feature_cols).
is_beta_row = summary.index.str.contains("betas")
summary_betas = summary.loc[is_beta_row].copy()
summary_betas['Feature'] = feature_cols

ranked_betas = summary_betas[['Feature', 'mean', 'sd', 'r_hat']].sort_values('mean', ascending=False)

print("Bayesian feature importance:")
print(ranked_betas)

In [None]:
# Posterior-mean win probability for every game:
#     E[ sigmoid(alpha + x @ betas) ]   averaged over all posterior draws.
# This is the same quantity the mean of posterior-predictive 0/1 draws
# estimates, but without the extra Monte Carlo noise of resampling Bernoulli
# outcomes (and without another unseeded sampling step).
n_games, n_features = X_train_final.shape

# Flatten (chain, draw) into one sample axis; C-order reshape keeps alpha and
# betas aligned draw-for-draw.
alpha_draws = trace.posterior["alpha"].values.reshape(-1)               # (S,)
beta_draws = trace.posterior["betas"].values.reshape(-1, n_features)    # (S, K)

# Work in row chunks so the (rows x draws) logit matrix stays small.
ROW_CHUNK = 5_000
bayesian_probs = np.empty(n_games)
for start in range(0, n_games, ROW_CHUNK):
    rows = slice(start, start + ROW_CHUNK)
    logits = X_train_final[rows] @ beta_draws.T + alpha_draws           # (rows, S)
    bayesian_probs[rows] = (1.0 / (1.0 + np.exp(-logits))).mean(axis=1)

# These are in-sample probabilities: the model was fit on these same rows.
df_simulation = df_train.copy()
df_simulation['my_prob'] = bayesian_probs

print(f"Generated predictions for {len(df_simulation)} games.")
print(df_simulation[['date', 'team', 'opponent', 'my_prob', 'result']].head())

In [None]:
def simulate_betting(df, threshold=0.05, stake=100):
    """Simulate flat-stake moneyline betting on games where the model sees an edge.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'moneyline_closing' (American odds), 'my_prob' (model win
        probability) and 'result' (1 for a win). The frame is not modified.
    threshold : float, default 0.05
        A bet is placed when ``my_prob - vegas_prob`` is strictly greater than
        this value.
    stake : float, default 100
        Amount risked on every bet placed.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with added columns:
        'vegas_prob' (implied probability of the odds, vig included),
        'edge', 'bet_placed', 'pnl' (per-row profit/loss) and
        'bankroll' (running sum of 'pnl' in row order).

    Notes
    -----
    Rows with missing odds never trigger a bet. Odds of exactly 0 are not a
    valid American price and are treated as missing. Any 'result' other than 1
    on a placed bet counts as a loss of the stake. The computation is
    vectorised, so an empty frame is handled and returns an empty result.
    """
    sim = df.copy()

    odds = sim['moneyline_closing'].astype(float)
    # 0 is not a valid price; dropping it avoids an infinite payout below.
    odds = odds.where(odds != 0)
    is_underdog = odds > 0          # False for favourites and for missing odds
    abs_odds = odds.abs()

    # Implied probability: 100 / (odds + 100) for positive odds,
    # |odds| / (|odds| + 100) for negative odds; NaN propagates for missing odds.
    sim['vegas_prob'] = np.where(
        is_underdog,
        100 / (abs_odds + 100),
        abs_odds / (abs_odds + 100),
    )

    sim['edge'] = sim['my_prob'] - sim['vegas_prob']

    # Comparisons with NaN are False, so rows without usable odds are skipped.
    sim['bet_placed'] = sim['edge'] > threshold

    # Net winnings per unit staked (decimal odds minus one).
    net_multiple = np.where(is_underdog, abs_odds / 100, 100 / abs_odds)

    won = sim['result'] == 1
    pnl_if_bet = np.where(won, stake * net_multiple, -float(stake))
    sim['pnl'] = np.where(sim['bet_placed'], pnl_if_bet, 0.0)
    sim['bankroll'] = sim['pnl'].cumsum()

    return sim

# Simulation settings in one place: the ROI denominator and the plot title are
# derived from these, so they cannot drift from what was actually simulated.
THRESHOLD = 0.05
STAKE = 100

results = simulate_betting(df_simulation, threshold=THRESHOLD, stake=STAKE)

total_bets = results['bet_placed'].sum()
total_profit = results['pnl'].sum()
# ROI = profit as a percentage of total money staked; 0 when nothing was bet.
roi = (total_profit / (total_bets * STAKE)) * 100 if total_bets > 0 else 0

print("--- BETTING SIMULATION RESULTS ---")
print(f"Total Bets Placed: {total_bets}")
print(f"Total Profit: ${total_profit:.2f}")
print(f"ROI: {roi:.2f}%")

# Plot the running profit only at the rows where a bet was actually placed.
placed_bets = results[results['bet_placed']]

plt.figure(figsize=(10, 6))
plt.plot(placed_bets['date'], placed_bets['bankroll'])
plt.title(f"Bankroll Simulation (Threshold={THRESHOLD:.0%}) - Profit: ${total_profit:.0f}")
plt.xlabel("Date")
plt.ylabel("Profit ($)")
plt.grid(True)
plt.show()