# This notebook is for training an XGBRegressor

## Goal
Predict a Valorant player’s **expected kills** for an upcoming match using only **pre-match information**.  
This keeps the model realistic for live use (no leakage from in-match facts like agent or map).

---

## Data Source
- **Table:** `player_matches` (≈33k–37k rows) from Supabase Postgres  
- **Key columns used:** `player_name`, `opposite_team`, `player_team`, `match_date`, `kills`  
- **Access pattern:** loaded directly with SQLAlchemy (no CSV dependencies)

### Cleaning and Training XGBoost


In [None]:
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit

from xgboost import XGBRegressor
from scipy.stats import randint, uniform

#Connect & Load
# The connection URL is taken from the environment so credentials never live in
# the notebook. An empty URL would otherwise fail inside SQLAlchemy with an
# opaque URL-parsing error, so it is rejected here with an actionable message.
import os

db_url = os.environ.get("DATABASE_URL", "").strip()
if not db_url:
    raise RuntimeError(
        "Set the DATABASE_URL environment variable to the Postgres connection "
        "string for the player_matches database before running this cell."
    )

engine = create_engine(db_url, pool_pre_ping=True)
df = pd.read_sql("SELECT * FROM player_matches;", engine)
print(f"Loaded {len(df):,} rows from player_matches")

#Basic checks & cleaning
# Fail fast if the table does not expose every column the feature code reads.
required = ["player_name", "opposite_team", "player_team", "match_date", "kills"]
missing = [c for c in required if c not in df.columns]
if missing:
    raise ValueError(f"Missing expected columns: {missing}")

# Unparseable dates / non-numeric kills are coerced to NaT / NaN and dropped below.
df["match_date"] = pd.to_datetime(df["match_date"], errors="coerce")
df["kills"]      = pd.to_numeric(df["kills"], errors="coerce")
df = (
    df.dropna(subset=["match_date","kills"])
      # Stable sort: rows that share a match_date keep their loaded order.
      # Every downstream feature is built with shift(1) over row order, so an
      # unstable sort could permute same-date rows and change feature values.
      .sort_values("match_date", kind="mergesort")
      .reset_index(drop=True)
)

#Time-aware features (all shifted so they use only prior info)
# Each helper receives one group's `kills` in row order (the frame is sorted by
# match_date above) and shifts by one row so a row never sees its own target.
# They are applied with groupby(...).transform, which always returns a Series
# aligned to df's index; groupby(...).apply with a Series-returning lambda can
# change its result shape depending on how many groups exist.
def _prior_rolling_mean(kills, window=5):
    """Mean of up to `window` kills values from the rows preceding each row."""
    return kills.shift(1).rolling(window, min_periods=1).mean()


def _prior_expanding_mean(kills):
    """Mean of every kills value from the rows preceding each row."""
    return kills.shift(1).expanding(min_periods=1).mean()


def _prior_ewm_mean(kills, alpha=0.4):
    """Exponentially weighted mean of preceding kills (recent rows weigh more)."""
    return kills.shift(1).ewm(alpha=alpha, adjust=False).mean()


#Player rolling/cumulative
df["player_recent_kills_5"] = (
    df.groupby("player_name")["kills"].transform(_prior_rolling_mean)
)
df["player_cum_kills_avg"] = (
    df.groupby("player_name")["kills"].transform(_prior_expanding_mean)
)

# NOTE(review): the team-level features below shift by one *row*, not one
# match. If player_matches holds one row per player per match, the preceding
# row within an opposite_team / player_team group can be a teammate's row from
# the same match, which would leak in-match information. Confirm the table
# grain; a match identifier would be needed to shift by match instead.

#Opponent historical conceded (across all players)
df["opponent_allowed_kills_avg"] = (
    df.groupby("opposite_team")["kills"].transform(_prior_expanding_mean)
)

#Player vs this opponent (matchup)
df["player_vs_opp_recent_kills_5"] = (
    df.groupby(["player_name","opposite_team"])["kills"].transform(_prior_rolling_mean)
)

#Player trend with exponential decay (more weight to most recent)
df["player_trend_ewm"] = (
    df.groupby("player_name")["kills"].transform(_prior_ewm_mean)
)

#Team offense (avg kills by players on player's team, prior only)
df["player_team_offense_recent5"] = (
    df.groupby("player_team")["kills"].transform(_prior_rolling_mean)
)

#Opponent defense (allowed across all players vs opponent), prior only
df["opponent_defense_allowed_recent5"] = (
    df.groupby("opposite_team")["kills"].transform(_prior_rolling_mean)
)

#Time-based split (last 20% for test)
# The cut point is fixed first so the cold-start fallbacks below can be derived
# from the training period only.
n = len(df)
cut = int(n * 0.80)

#Fallbacks for cold starts
num_cols = [
    "player_recent_kills_5","player_cum_kills_avg","opponent_allowed_kills_avg",
    "player_vs_opp_recent_kills_5","player_trend_ewm",
    "player_team_offense_recent5","opponent_defense_allowed_recent5"
]
# First appearances have no history, so their shifted features are NaN.
# The fill value is the mean over the first 80% of rows (the training rows):
# averaging over the whole frame would let hold-out rows influence the values
# imputed into both the training and the test features.
global_means = {c: float(df[c].iloc[:cut].mean(skipna=True)) for c in num_cols}
for c in num_cols:
    df[c] = df[c].fillna(global_means[c])

# df is sorted by match_date, so a positional split is a forward-in-time split.
train_df = df.iloc[:cut].copy()
test_df  = df.iloc[cut:].copy()

features_cat = ["player_name","opposite_team"]
features_num = num_cols
target = "kills"

# Preprocessing: one-hot encode the two identifier columns and forward the
# engineered numeric features unchanged. handle_unknown="ignore" lets the
# encoder accept players/teams that were not present when it was fitted.
id_encoder = OneHotEncoder(handle_unknown="ignore")
pre = ColumnTransformer(
    transformers=[
        ("cat", id_encoder, features_cat),
        ("num", "passthrough", features_num),
    ]
)

# Starting configuration for the regressor; the search below overrides the
# tuned entries. Kills are non-negative counts, hence the Poisson objective.
xgb_defaults = dict(
    objective="count:poisson",
    n_estimators=800,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.1,
    random_state=42,
    n_jobs=4,
)
base = XGBRegressor(**xgb_defaults)

pipe = Pipeline(steps=[("pre", pre), ("model", base)])

#Small random search
# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from low up to high - 1.
param_dist = {
    "model__n_estimators": randint(low=500, high=1200),
    "model__max_depth": randint(low=4, high=10),
    "model__learning_rate": uniform(loc=0.02, scale=0.06),    # 0.02–0.08
    "model__subsample": uniform(loc=0.7, scale=0.3),          # 0.7–1.0
    "model__colsample_bytree": uniform(loc=0.7, scale=0.3),   # 0.7–1.0
    "model__reg_lambda": uniform(loc=0.8, scale=0.8),         # 0.8–1.6
}

# Folds respect row order, so each validation fold follows its training rows.
tscv = TimeSeriesSplit(n_splits=3)

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=20,
    scoring="neg_mean_absolute_error",
    cv=tscv,
    n_jobs=4,
    verbose=0,
    random_state=42,
)

X_train = train_df[features_cat + features_num]
y_train = train_df[target]
search.fit(X_train, y_train)

# Pipeline refitted on the full training slice with the best sampled parameters.
pipe_best = search.best_estimator_

#Evaluate on true forward holdout
# Predictions are floored at zero because a kill count cannot be negative.
pred_test = np.clip(pipe_best.predict(test_df[features_cat + features_num]), 0, None)
mae  = mean_absolute_error(test_df[target], pred_test)
# RMSE is computed as sqrt(MSE): the `squared=False` flag of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6, where passing it
# raises a TypeError. Taking the square root works on every version.
rmse = float(np.sqrt(mean_squared_error(test_df[target], pred_test)))
r2   = r2_score(test_df[target], pred_test)

print("Best params:", search.best_params_)
print("Evaluation on last 20% (time-based holdout):")
print(f"  MAE : {mae:.3f}")
print(f"  RMSE: {rmse:.3f}")
print(f"  R^2 : {r2:.3f}")

#objects for prediction cell
pipe = pipe_best

# The feature columns in df are shifted by one row, i.e. each row describes the
# history *before* that match. Taking the last row of those columns would
# therefore ignore every group's most recent match (and give players with a
# single match nothing but the imputed fallback). For an upcoming match the
# history must include the latest match, so the lookups are rebuilt from the
# unshifted kills with the same window / decay settings as the training
# features. df is sorted by match_date, so tail() is the most recent history.
def _mean_last5(kills):
    """Mean of the last (up to) five kills values of one group."""
    return kills.tail(5).mean()


def _ewm_latest(kills):
    """Latest value of the alpha=0.4 exponentially weighted mean of one group."""
    return kills.ewm(alpha=0.4, adjust=False).mean().iloc[-1]


# One row per player: recent form, career average and decayed trend.
player_lookup = (
    df.groupby("player_name")["kills"]
      .agg(
          player_recent_kills_5=_mean_last5,
          player_cum_kills_avg="mean",
          player_trend_ewm=_ewm_latest,
      )
      .reset_index()
)
# One row per opponent: kills conceded overall and over the last five rows.
opponent_lookup = (
    df.groupby("opposite_team")["kills"]
      .agg(
          opponent_allowed_kills_avg="mean",
          opponent_defense_allowed_recent5=_mean_last5,
      )
      .reset_index()
)
# One row per (player, opponent) pair that has met at least once.
pair_lookup = (
    df.groupby(["player_name","opposite_team"])["kills"]
      .agg(player_vs_opp_recent_kills_5=_mean_last5)
      .reset_index()
)
print("Training v2 complete.")


Loaded 37,149 rows from player_matches
Best params: {'model__colsample_bytree': 0.8075397185632818, 'model__learning_rate': 0.026952143571507783, 'model__max_depth': 4, 'model__n_estimators': 532, 'model__reg_lambda': 0.8508466802288189, 'model__subsample': 0.7932946965146986}
Evaluation on last 20% (time-based holdout):
  MAE : 4.103
  RMSE: 5.165
  R^2 : 0.069
Training v2 complete.




### Prediction for choice of player and opponent

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error

def _row_features(player_name: str, opposite_team: str) -> pd.DataFrame:
    """Build the single-row feature frame for one player/opponent pairing.

    Values are read from the module-level ``player_lookup``,
    ``opponent_lookup`` and ``pair_lookup`` tables. Anything that cannot be
    found (unknown player, unknown opponent, first-time matchup) falls back to
    the corresponding entry of ``global_means``, mirroring how missing history
    is imputed in the training frame.

    Args:
        player_name: Player to predict for.
        opposite_team: Team the player is about to face.

    Returns:
        A one-row DataFrame holding ``player_name``, ``opposite_team`` and
        every engineered feature that is listed in ``features_num``.
    """

    def _stat(rows, column, default=None):
        # First-row value of `column` when the lookup matched; otherwise the
        # global mean (or `default` if even that key is absent).
        if not rows.empty and column in rows.columns:
            return float(rows[column].iloc[0])
        if default is None:
            return global_means[column]
        return global_means.get(column, default)

    # Player summaries
    rp = player_lookup[player_lookup["player_name"] == player_name]
    pr5 = _stat(rp, "player_recent_kills_5")
    pcum = _stat(rp, "player_cum_kills_avg")
    ptr = _stat(rp, "player_trend_ewm", pr5)

    # Opponent summaries. The lookup itself is checked for emptiness: indexing
    # row 0 of an empty match raised IndexError for opponents never seen.
    ro = opponent_lookup[opponent_lookup["opposite_team"] == opposite_team]
    oavg = _stat(ro, "opponent_allowed_kills_avg")
    odef5 = _stat(ro, "opponent_defense_allowed_recent5", oavg)

    # Matchup summary
    rpair = pair_lookup[(pair_lookup["player_name"] == player_name) &
                        (pair_lookup["opposite_team"] == opposite_team)]
    pvo = _stat(rpair, "player_vs_opp_recent_kills_5", pr5)

    # Team offense: the model was trained on a real per-team value, so feeding
    # it the global mean for every prediction skews known players. Use the
    # recent kills of the team the player last appeared for (df is sorted by
    # match_date), and keep the global mean only as the cold-start fallback.
    team_off5 = global_means.get("player_team_offense_recent5", oavg)
    team_history = df.loc[df["player_name"] == player_name, "player_team"]
    if not team_history.empty:
        team_kills = df.loc[df["player_team"] == team_history.iloc[-1], "kills"]
        if not team_kills.empty:
            team_off5 = float(team_kills.tail(5).mean())

    row = {
        "player_name": player_name,
        "opposite_team": opposite_team,
        "player_recent_kills_5": pr5,
        "player_cum_kills_avg": pcum,
        "opponent_allowed_kills_avg": oavg,
        "player_vs_opp_recent_kills_5": pvo,
        "player_trend_ewm": ptr,
        "player_team_offense_recent5": team_off5,
        "opponent_defense_allowed_recent5": odef5,
    }

    # Keep only the features the pipeline expects
    keep = {"player_name", "opposite_team", *features_num}
    return pd.DataFrame([{k: v for k, v in row.items() if k in keep}])

def predict_with_interval(player_name: str, opposite_team: str, q_lo=0.05, q_hi=0.95, min_slice_n=20):
    """Predict kills and attach an empirical interval from hold-out residuals.

    The point prediction comes from ``pipe``. The interval adds the ``q_lo`` /
    ``q_hi`` quantiles of the test-set residuals (actual - predicted) to it,
    using the most specific residual slice that is large enough:
    exact pair, then player-only, then opponent-only, then all test rows.

    Args:
        player_name: Player to predict for.
        opposite_team: Team the player is about to face.
        q_lo: Lower residual quantile, in [0, 1).
        q_hi: Upper residual quantile, in (0, 1]; must exceed ``q_lo``.
        min_slice_n: Minimum number of residuals a slice needs before it is
            used. Quantiles of a handful of residuals are not a meaningful
            interval (a single residual gives a zero-width interval that may
            not even contain the prediction), so smaller slices are skipped
            in favour of the next broader one. Pass 1 to accept any
            non-empty slice.

    Returns:
        dict with ``prediction``, ``interval`` (lo, hi), ``slice_basis``,
        ``slice_n`` and ``slice_mae``.

    Raises:
        ValueError: If the quantiles are not ordered within [0, 1].
    """
    if not 0.0 <= q_lo < q_hi <= 1.0:
        raise ValueError(f"Expected 0 <= q_lo < q_hi <= 1, got q_lo={q_lo}, q_hi={q_hi}")

    X_row = _row_features(player_name, opposite_team)
    # Kill counts cannot be negative, so the prediction is floored at zero.
    y_hat = max(0.0, float(pipe.predict(X_row)[0]))

    #Build residuals on test set
    X_test = test_df[features_cat + features_num]
    y_pred_test = np.clip(pipe.predict(X_test), 0, None)
    residuals = test_df["kills"].values - y_pred_test

    #Choose the best slice available (most specific first, global as last resort)
    is_player = (test_df["player_name"] == player_name).values
    is_opponent = (test_df["opposite_team"] == opposite_team).values
    candidates = (
        ("pair (player vs opponent)", is_player & is_opponent),
        ("player-only", is_player),
        ("opponent-only", is_opponent),
    )
    required_n = max(1, int(min_slice_n))
    res_slice, basis = residuals, "global"
    for label, mask in candidates:
        if int(mask.sum()) >= required_n:
            res_slice, basis = residuals[mask], label
            break

    #Compute slice MAE and interval
    if res_slice.size:
        slice_mae = float(np.mean(np.abs(res_slice)))
        lo = float(np.quantile(res_slice, q_lo))
        hi = float(np.quantile(res_slice, q_hi))
    else:
        # No hold-out rows at all: report a degenerate interval at the prediction.
        slice_mae, lo, hi = float("nan"), 0.0, 0.0

    return {
        "prediction": y_hat,
        "interval": (max(0.0, y_hat + lo), max(0.0, y_hat + hi)),
        "slice_basis": basis,
        "slice_n": int(res_slice.size),
        "slice_mae": slice_mae,
    }

#Inputs (edit these)
player_name_input, opposite_team_input = "TenZ", "LOUD"

#Run
res = predict_with_interval(player_name_input, opposite_team_input)

# The report is assembled first and emitted line by line afterwards: the
# prediction summary, then the hold-out metrics computed in the training cell.
report_lines = [
    "Kill Prediction",
    f"Player          : {player_name_input}",
    f"Opposing Team   : {opposite_team_input}",
    f"Predicted Kills : {res['prediction']:.2f}",
    f"90% Interval    : [{res['interval'][0]:.2f}, {res['interval'][1]:.2f}]  (basis: {res['slice_basis']}, n={res['slice_n']}, slice MAE={res['slice_mae']:.2f})",
    "\n Overall Test Metrics (last 20%)",
    f"MAE  : {mae:.3f}",
    f"RMSE : {rmse:.3f}",
    f"R^2  : {r2:.3f}",
]
for report_line in report_lines:
    print(report_line)


=== Kill Prediction ===
Player          : TenZ
Opposing Team   : LOUD
Predicted Kills : 17.05
90% Interval    : [9.14, 24.06]  (basis: player-only, n=10, slice MAE=4.58)

=== Overall Test Metrics (last 20%) ===
MAE  : 4.103
RMSE : 5.165
R^2  : 0.069
