
# Rocket League Match Winner Predictor (Rebuilt)

This notebook trains a **no‑leak match‑winner predictor** using your two CSVs:

- `matches_by_teams.csv` — per-team stats per match (includes `winner`)
- `main.csv` — match/game metadata with a date column

**Design choices:**
- Uses **only past data** via `shift(1)` + rolling means
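- Builds **match‑level features** by subtracting Team B's rolling stats from Team A's, where Team A is the team that sorts first by name within each match (so the label means "Team A won")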
- **Time‑ordered** 80/20 split for evaluation
- Trains **Logistic Regression** (with scaling) and **Random Forest**
- Saves artifacts for reuse (`rl_match_predictor.pkl`, `test_set_predictions.csv`)
- Includes an **inference helper** (predict probability for two teams as of a date)


In [None]:

# 1) Imports & Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, ConfusionMatrixDisplay, RocCurveDisplay

import pickle
import warnings
# Suppress every warning category for the rest of the session so cell output stays compact.
warnings.simplefilter("ignore")

print(f"Ready. pandas: {pd.__version__}")


## Load Data

In [None]:

# 2) Load Data (adjust paths if needed)
# Both paths are relative to the notebook's working directory.
matches_path, main_path = 'matches_by_teams.csv', 'main.csv'

# Per-team stats per match (includes `winner`).
matches = pd.read_csv(matches_path)
# Match/game metadata with a date column.
main = pd.read_csv(main_path)

print(f"matches shape: {matches.shape}")
print(f"main shape: {main.shape}")


## Dates & Per-Match Metadata

In [None]:

# 3) Dates & Per-Match Metadata
# Choose the date column: 'game_date' is preferred, 'match_date' is the fallback.
date_col = next((name for name in ('game_date', 'match_date') if name in main.columns), None)
if date_col is None:
    raise ValueError("No date column found in main.csv. Expected 'game_date' or 'match_date'.")

# Parse as timezone-aware UTC timestamps; values that cannot be parsed become NaT.
main[date_col] = pd.to_datetime(main[date_col], errors='coerce', utc=True)

# Of the metadata columns we would like, keep only those present in main.csv.
meta_candidates = ['match_id', date_col, 'event', 'event_region', 'event_tier',
                   'match_format', 'stage', 'stage_is_lan', 'stage_is_qualifier']
meta_cols = [col for col in meta_candidates if col in main.columns]

# Reduce to one row per match_id: after sorting by date, the earliest row is kept.
main_match = (
    main
    .sort_values([date_col, 'match_id'])
    .drop_duplicates(subset=['match_id'], keep='first')
    .loc[:, meta_cols]
)

main_match.head(3)


## Per-Team Rows & Feature Selection

In [None]:

# 4) Per-Team Rows & Feature Selection
# Identifier / descriptive columns to carry along, limited to those present in the CSV.
team_cols_pref = ['match_id','team_id','team_slug','team_name','team_region','color','winner']
team_cols = [c for c in team_cols_pref if c in matches.columns]
if 'match_id' not in team_cols or 'team_id' not in team_cols:
    raise ValueError("Expected 'match_id' and 'team_id' in matches_by_teams.csv.")

# Auto-select numeric features (exclude ids/target).
# pandas' dtype helpers are used instead of np.issubdtype because the latter
# raises TypeError on pandas extension dtypes (category, string, nullable ints).
# Boolean columns are excluded explicitly, as they were never counted as numeric here.
exclude_exact = {'match_id','team_id','winner'}
numeric_cols = [c for c in matches.columns
                if c not in exclude_exact
                and pd.api.types.is_numeric_dtype(matches[c])
                and not pd.api.types.is_bool_dtype(matches[c])]

# Prefer typical RL stat families; fall back to all numeric if none match
preferred_prefixes = ['core_', 'boost_', 'movement_', 'positioning_', 'demo_']
rl_like = [c for c in numeric_cols if any(c.startswith(p) for p in preferred_prefixes)]
feature_candidates = rl_like if rl_like else numeric_cols
if len(feature_candidates) == 0:
    raise ValueError("No usable numeric feature columns found in matches_by_teams.csv.")

print(f"Using {len(feature_candidates)} numeric features (first 12): {sorted(feature_candidates)[:12]}")

# Build per_team INCLUDING feature columns
per_team = matches[team_cols + feature_candidates].copy()

# Normalize winner to 0/1 (bool and integer inputs both cast cleanly).
if 'winner' not in per_team.columns:
    raise ValueError("Target column 'winner' missing in matches_by_teams.csv.")
if per_team['winner'].isna().any():
    # Fail with a clear message instead of an opaque integer-cast error.
    raise ValueError("Column 'winner' contains missing values; cannot build 0/1 labels.")
per_team['winner'] = per_team['winner'].astype(int)

# Attach date/meta and sort so each team's rows are in chronological order,
# which the rolling features computed later rely on.
per_team = (per_team.merge(main_match, on='match_id', how='left')
                    .sort_values(['team_id', date_col])
                    .reset_index(drop=True))

per_team.head(3)


## Rolling (Past-Only) Features

In [None]:

# 5) Rolling (Past-Only) Features
ROLL_N = 5

def _roll_mean(s):
    # last N matches BEFORE current match
    return s.shift(1).rolling(ROLL_N, min_periods=1).mean()

# Apply the past-only rolling mean to every feature, separately for each team.
# `transform` returns a frame aligned to per_team's own index, so the result
# can be concatenated directly and does not depend on how a particular pandas
# version labels the output of `groupby.apply`.
rolled = (per_team.groupby('team_id')[feature_candidates]
                  .transform(_roll_mean)
                  .rename(columns=lambda c: f'{c}_roll{ROLL_N}_mean'))
per_team_rolled = pd.concat([per_team, rolled], axis=1)

roll_cols = [c for c in per_team_rolled.columns if c.endswith(f'_roll{ROLL_N}_mean')]

# Rows with no usable history (e.g. a team's first match) are NaN at this point.
# NOTE(review): the fill value is each column's mean over ALL rows, including
# matches that fall later in time than the row being filled, so this imputation
# is not strictly past-only. Consider imputing from training-period rows only.
per_team_rolled[roll_cols] = per_team_rolled[roll_cols].fillna(per_team_rolled[roll_cols].mean())

print(f"Built {len(roll_cols)} rolling features over last {ROLL_N} matches.")
per_team_rolled[roll_cols].head(3)


## Build Match-Level Examples (Team A − Team B)

In [None]:

# 6) Build Match-Level Examples (Team A − Team B)
def build_pairs(df, date_col, roll_cols):
    """Turn per-team rows into one feature-difference example per match.

    Matches that do not have exactly two rows in `df` are skipped. Within a
    match the two rows are ordered by 'team_name' (or 'team_id' when there is
    no 'team_name' column); the first is Team A, the second Team B.

    Parameters
    ----------
    df : DataFrame with 'match_id', 'team_id', 'winner', `date_col` and `roll_cols`.
    date_col : name of the column holding the match date.
    roll_cols : list of feature columns to difference (Team A minus Team B).

    Returns
    -------
    (X_df, y, meta_df)
        X_df    : float DataFrame, one 'diff__<col>' column per entry of `roll_cols`.
        y       : integer array, 1 when Team A's 'winner' value is truthy.
        meta_df : match_id, date and both teams' ids/names, row-aligned with X_df.
        When no match qualifies: (empty DataFrame, empty array, empty DataFrame).
    """
    sort_key = 'team_name' if 'team_name' in df.columns else 'team_id'
    diffs, labels, meta_rows = [], [], []

    for mid, grp in df.groupby('match_id', sort=False):
        if len(grp) != 2:
            continue
        grp = grp.sort_values(sort_key)
        a, b = grp.iloc[0], grp.iloc[1]

        # Take the features from the frame (not from the row Series, which is
        # object-dtype for mixed-type rows) so the differences are real floats.
        feats = grp[roll_cols].to_numpy(dtype=float)
        diffs.append(feats[0] - feats[1])
        labels.append(int(a['winner']))  # whether Team A (alphabetical) won
        meta_rows.append({
            'match_id': mid,
            'date': a[date_col],
            'teamA_id': a['team_id'],
            'teamA_name': a.get('team_name', str(a['team_id'])),
            'teamB_id': b['team_id'],
            'teamB_name': b.get('team_name', str(b['team_id']))
        })

    if not diffs:
        return pd.DataFrame(), np.array([]), pd.DataFrame()

    X_df = pd.DataFrame(np.vstack(diffs), columns=[f'diff__{c}' for c in roll_cols])
    y = np.array(labels)
    meta_df = pd.DataFrame(meta_rows)
    return X_df, y, meta_df

X_df, y_vec, meta_df = build_pairs(per_team_rolled, date_col, roll_cols)
if X_df.empty:
    raise RuntimeError("No valid matches with exactly two teams found after processing.")

data = pd.concat([meta_df.reset_index(drop=True), X_df.reset_index(drop=True)], axis=1)

# Attach the labels while the rows are still in the order build_pairs produced
# them. Assigning after dropna/sort would pair each label with a different
# match (or fail outright if dropna removed any row).
data['label'] = np.asarray(y_vec).astype(int)

# Drop undated matches, then order chronologically; a stable sort keeps
# same-timestamp matches in a deterministic order.
data = (data.dropna(subset=['date'])
            .sort_values('date', kind='mergesort')
            .reset_index(drop=True))

feature_cols = [c for c in data.columns if c.startswith('diff__')]
# Guarantee a numeric matrix for the models regardless of upstream dtypes.
data[feature_cols] = data[feature_cols].astype(float)

print(f"Final training rows: {len(data)} | Features per row: {len(feature_cols)}")
data.head(3)


## Train/Test Split, Train Models, Evaluate

In [None]:

# 7) Train/Test Split, Train Models, Evaluate
# Positional 80/20 cut: the first 80% of rows in `data` train, the remainder test.
split_idx = int(len(data) * 0.80)
train_df, test_df = data.iloc[:split_idx].copy(), data.iloc[split_idx:].copy()

X_train = train_df[feature_cols].values
y_train = train_df['label'].values
X_test = test_df[feature_cols].values
y_test = test_df['label'].values

# Logistic regression on standardised features.
logreg = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=1000)),
])
logreg.fit(X_train, y_train)

# Random forest with a fixed seed, using all available cores.
rf = RandomForestClassifier(
    n_estimators=400,
    min_samples_split=4,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)
rf.fit(X_train, y_train)

def evaluate(name, model):
    """Score a fitted classifier on the notebook's train and test splits.

    Reads X_train / y_train / X_test / y_test from the enclosing scope.

    Returns
    -------
    (report, (test_predictions, test_probabilities))
        report is a dict with the model name plus accuracy, ROC AUC and log
        loss for both splits; the probabilities are those of class index 1.
    """
    truth = {'train': y_train, 'test': y_test}
    inputs = {'train': X_train, 'test': X_test}

    hard = {split: model.predict(inputs[split]) for split in ('train', 'test')}
    soft = {split: model.predict_proba(inputs[split])[:, 1] for split in ('train', 'test')}

    report = {'model': name}
    scorers = (
        ('acc', accuracy_score, hard),      # needs class predictions
        ('auc', roc_auc_score, soft),       # needs probabilities
        ('logloss', log_loss, soft),        # needs probabilities
    )
    for suffix, scorer, outputs in scorers:
        for split in ('train', 'test'):
            report[f'{split}_{suffix}'] = scorer(truth[split], outputs[split])

    return report, (hard['test'], soft['test'])

r1, (yhat_log_te, proba_log_te) = evaluate(name='LogisticRegression', model=logreg)
r2, (yhat_rf_te, proba_rf_te) = evaluate(name='RandomForest', model=rf)

# One row per model, metrics rounded for display.
perf = pd.DataFrame([r1, r2]).round(4)
perf


## Plots

In [None]:

# 8) Plots
# Draw both ROC curves on ONE axes. Without an explicit `ax`, each
# from_predictions call opens its own figure, so the curves would not be
# overlaid and a bare plt.figure() would be left behind as an empty plot.
roc_fig, roc_ax = plt.subplots()
RocCurveDisplay.from_predictions(y_test, proba_log_te, name='LogReg', ax=roc_ax)
RocCurveDisplay.from_predictions(y_test, proba_rf_te,  name='RandomForest', ax=roc_ax)
roc_ax.set_title('ROC Curve (Test)')
plt.show()

# Confusion matrix for whichever model has the higher test AUC (ties go to LogReg).
best_name = 'LogisticRegression' if r1['test_auc'] >= r2['test_auc'] else 'RandomForest'
best_pred = yhat_log_te if best_name == 'LogisticRegression' else yhat_rf_te
cm_fig, cm_ax = plt.subplots()
ConfusionMatrixDisplay.from_predictions(y_test, best_pred, ax=cm_ax)
cm_ax.set_title(f'Confusion Matrix - {best_name} (Test)')
plt.show()


## Save Artifacts

In [None]:

# 9) Save Artifacts
# Pick the winner once (ties go to logistic regression).
# NOTE(review): the choice is made on TEST-set AUC, so the reported test
# metrics of the chosen model are optimistically biased; a separate
# validation split would avoid that.
logreg_is_best = r1['test_auc'] >= r2['test_auc']
best_model = logreg if logreg_is_best else rf
artifact = {
    'model_name': 'LogisticRegression' if logreg_is_best else 'RandomForest',
    'model': best_model,
    'feature_cols': feature_cols,
    'roll_cols': roll_cols,
    # Record the window actually used, so the artifact stays truthful if ROLL_N changes.
    'roll_window': ROLL_N,
    # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated since Python 3.12.
    'built_on': pd.Timestamp.now(tz='UTC').strftime('%Y-%m-%dT%H:%M:%S.%fZ')
}
with open('rl_match_predictor.pkl', 'wb') as f:
    pickle.dump(artifact, f)

# Test-set predictions next to the true outcome, one row per match.
out_preds = (
    test_df[['match_id','date','teamA_name','teamB_name','label']]
    .rename(columns={'label': 'actual_A_won'})
    .assign(prob_A_wins=best_model.predict_proba(test_df[feature_cols].values)[:, 1])
)
out_preds.to_csv('test_set_predictions.csv', index=False)

print("Saved -> rl_match_predictor.pkl")
print("Saved -> test_set_predictions.csv")


## Inference Helper

In [None]:

# 10) Inference Helper
def _latest_roll_for_team(team_name, as_of=None):
    """Rolling-feature vector for `team_name` built from its most recent matches.

    Parameters
    ----------
    team_name : value to match against per_team_rolled['team_name'].
    as_of : optional date-like; only matches strictly before it are used.
        Parsed as UTC. When None, all dated matches are used.

    Returns
    -------
    1-D float array ordered like `roll_cols`, or None when the team has no
    dated match in range.

    The vector is the mean of the team's last ROLL_N matches, including the
    most recent one. Reading the stored rolling columns of the last row would
    leave that match out, because those columns are shifted by one match so
    that training rows never see their own stats.
    """
    when_col = 'game_date' if 'game_date' in per_team_rolled.columns else 'match_date'

    # Undated rows cannot be placed in time, so they never count as "latest".
    history = per_team_rolled.loc[per_team_rolled['team_name'] == team_name]
    history = history.dropna(subset=[when_col])
    if as_of is not None:
        cutoff = pd.to_datetime(as_of, utc=True)
        history = history[history[when_col] < cutoff]
    if history.empty:
        return None
    history = history.sort_values(when_col)

    # Map each rolling column back to the raw stat it was derived from.
    suffix = f'_roll{ROLL_N}_mean'
    raw_cols = [c[:-len(suffix)] for c in roll_cols]

    recent = history[raw_cols].tail(ROLL_N).mean()   # NaNs are skipped, as in the rolling mean
    recent.index = roll_cols
    # If a stat has no observed value in the window, fall back to the stored
    # (already imputed) rolling value of the latest row.
    recent = recent.fillna(history[roll_cols].iloc[-1])
    return recent.to_numpy(dtype=float)

def predict_match_prob(teamA_name, teamB_name, as_of=None):
    """Probability that `teamA_name` beats `teamB_name`, using history before `as_of`.

    Parameters
    ----------
    teamA_name, teamB_name : team names as they appear in the 'team_name' column.
    as_of : optional date-like cutoff passed to _latest_roll_for_team.

    Returns
    -------
    float in [0, 1]: the model's probability that `teamA_name` wins.

    Raises
    ------
    ValueError if either team has no history before the given date.
    """
    vecA = _latest_roll_for_team(teamA_name, as_of=as_of)
    vecB = _latest_roll_for_team(teamB_name, as_of=as_of)
    if vecA is None or vecB is None:
        raise ValueError("Missing history for one or both teams before the given date.")

    # Training examples always put the alphabetically-first team in the "A"
    # slot, so query the model in that same orientation and flip the answer
    # when the caller passed the teams the other way round.
    swapped = teamA_name > teamB_name
    first, second = (vecB, vecA) if swapped else (vecA, vecB)

    diff = (np.asarray(first, dtype=float) - np.asarray(second, dtype=float)).reshape(1, -1)
    diff_df = pd.DataFrame(diff, columns=[f'diff__{c}' for c in roll_cols])
    X = diff_df[artifact['feature_cols']].to_numpy(dtype=float)

    # Index the single element explicitly; float() on a 1-element array is deprecated in NumPy.
    p_first_wins = float(artifact['model'].predict_proba(X)[0, 1])
    return 1.0 - p_first_wins if swapped else p_first_wins

# Example (adjust names/date to your dataset):
# predict_match_prob('G2 ESPORTS', 'NRG', as_of='2022-06-01')
