In [None]:
from dotenv import load_dotenv
import os
from typing import Any, Dict, List
import pickle

import numpy as np
import pymysql
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [15]:

load_dotenv()

#  TODO: add timeouts for pos and def teams
# Read the query text before connecting so that a missing SQL file cannot
# leave a database connection open.
with open('../sql/play_call.sql', 'r') as f:
    play_type_query: str = f.read()

# Credentials are taken from the environment (populated by load_dotenv above);
# os.getenv yields None for any variable that is not set.
connection: pymysql.connections.Connection = pymysql.connect(
    user=os.getenv('DB_USER'),
    password=os.getenv('DB_PASSWORD'),
    host=os.getenv('DB_HOST'),
    database=os.getenv('DB_NAME')
)

# The connection is only needed for this one query. Close it afterwards so
# re-running the cell does not accumulate open MySQL sessions; the `open`
# check avoids masking a query error with an "already closed" error.
try:
    play_type: pd.DataFrame = pd.read_sql(play_type_query, con=connection)
finally:
    if connection.open:
        connection.close()

play_type

  play_type: pd.DataFrame = pd.read_sql(play_type_query, con=connection)


Unnamed: 0,clock,score_diff,down,distance,yards_to_endzone,abbreviation
0,900,0.0,0,0,65,K
1,891,0.0,1,10,28,REC
2,858,0.0,2,1,19,PEN
3,847,0.0,1,10,13,RUSH
4,808,0.0,2,2,5,
...,...,...,...,...,...,...
1331476,21,0.0,1,10,0,PASS
1331477,21,0.0,2,4,0,
1331478,7,3.0,3,4,0,
1331479,0,3.0,1,0,0,EP


In [16]:
# Map play types to simplified categories.
# Abbreviations that are not listed here map to NaN and are dropped below.
play_type_mapping: Dict[str, str] = {
    'PASS': 'pass',
    'INT': 'pass',
    'INTR': 'pass',
    'REC': 'pass',

    'RUSH': 'run',

    'FG': 'fg',
    'FGM': 'fg',
    'AFG': 'fg',
    'BFG': 'fg',

    'PUNT': 'punt',
    'BP': 'punt'
}

# Apply mapping and filter to only the 4 categories.
# Note: this adds the 'play_category' column to `play_type` in place.
play_type['play_category'] = play_type['abbreviation'].map(play_type_mapping)
play_type_filtered: pd.DataFrame = play_type[play_type['play_category'].notna()].copy()

print(f"Original rows: {len(play_type)}")
print(f"Filtered rows: {len(play_type_filtered)}")
print("\nPlay categories distribution:")
# Only the last expression of a cell is displayed automatically, so the
# distribution has to be printed explicitly to appear under its header.
print(play_type_filtered['play_category'].value_counts())

play_type_filtered

Original rows: 1331481
Filtered rows: 861463

Play categories distribution:


Unnamed: 0,clock,score_diff,down,distance,yards_to_endzone,abbreviation,play_category
1,891,0.0,1,10,28,REC,pass
3,847,0.0,1,10,13,RUSH,run
5,774,0.0,3,3,6,RUSH,run
10,707,-7.0,1,20,75,RUSH,run
12,663,-7.0,3,18,73,RUSH,run
...,...,...,...,...,...,...,...
1331470,81,0.0,1,10,0,PASS,pass
1331471,56,0.0,2,5,0,PASS,pass
1331473,41,0.0,2,10,0,PASS,pass
1331475,29,0.0,2,10,0,PASS,pass


In [20]:
# Every column except the raw abbreviation and the derived label is a feature.
feature_cols: List[str] = [
    col
    for col in play_type_filtered.columns
    if col not in ('abbreviation', 'play_category')
]
X: pd.DataFrame = play_type_filtered[feature_cols]
y: pd.Series = play_type_filtered['play_category']

# Kept for inspection; the pipeline below is applied to all of X.
numeric_cols: List[str] = list(X.select_dtypes(include=['number']).columns)

model: LogisticRegression = LogisticRegression(max_iter=3000)

# Median-impute missing values, then scale without centering (with_mean=False).
preprocess: Pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler(with_mean=False)),
])

clf: Pipeline = Pipeline([
    ('preprocess', preprocess),
    ('model', model),
])

# Stratified 80/20 split with a fixed seed so the split is repeatable.
split: tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series] = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split

clf.fit(X_train, y_train)

# Per-class probabilities on the held-out rows, one column per class label.
probs: np.ndarray = clf.predict_proba(X_test)
probs_df: pd.DataFrame = pd.DataFrame(
    data=probs,
    columns=clf.named_steps['model'].classes_,
)

probs_df.head()

Unnamed: 0,fg,pass,punt,run
0,0.000124,0.49816,1.657021e-10,0.501715
1,2e-06,0.545032,1.147948e-16,0.454966
2,1e-06,0.371394,1.213574e-15,0.628605
3,0.010164,0.064861,0.9016052,0.02337
4,0.0038,0.012577,0.9798662,0.003758


In [22]:
# Save model + metadata (no external dependency): the fitted pipeline, the
# feature column order it was trained on, and the class labels as a plain list.
model_bundle = dict(
    pipeline=clf,
    feature_cols=feature_cols,
    model_classes=clf.named_steps['model'].classes_.tolist(),
)

with open('play_type_model.pkl', mode='wb') as f:
    pickle.dump(model_bundle, f)

print('Saved to play_type_model.pkl')

Saved to play_type_model.pkl


In [10]:
# Score every filtered row (X covers train and test rows alike) and attach
# the predictions to a copy of the filtered frame.
probs_all: np.ndarray = clf.predict_proba(X)
preds_all: np.ndarray = clf.predict(X)

pred_df: pd.DataFrame = pd.DataFrame(
    probs_all,
    columns=[f"prob_{label}" for label in clf.named_steps['model'].classes_],
)

# The index is reset so rows line up positionally with pred_df's RangeIndex.
play_type_with_preds: pd.DataFrame = pd.concat(
    [
        play_type_filtered.reset_index(drop=True).assign(
            predicted_category=preds_all,
            predicted_confidence=pred_df.max(axis=1),
        ),
        pred_df,
    ],
    axis=1,
)

play_type_with_preds.head()

Unnamed: 0,clock,score_diff,down,distance,yards_to_endzone,abbreviation,play_category,predicted_category,predicted_confidence,prob_fg,prob_pass,prob_punt,prob_run
0,891,0.0,1,10,28,REC,pass,run,0.671907,1.1e-05,0.328082,2.31802e-19,0.671907
1,847,0.0,1,10,13,RUSH,run,run,0.684116,2.1e-05,0.315863,3.1059819999999996e-20,0.684116
2,774,0.0,3,3,6,RUSH,run,run,0.512092,0.032955,0.454952,8.959645e-09,0.512092
3,707,-7.0,1,20,75,RUSH,run,pass,0.721648,7e-06,0.721648,3.885226e-16,0.278346
4,663,-7.0,3,18,73,RUSH,run,pass,0.88954,0.010795,0.88954,0.000351224,0.099314


In [23]:
# Accuracy over every row of play_type_with_preds, i.e. all filtered plays.
# This includes the rows the model was fitted on, so it is not a held-out score.
play_type_with_preds["play_category"].eq(play_type_with_preds["predicted_category"]).mean()

np.float64(0.661320335290082)

In [24]:
def apply_tuning_probs(
    prob_dict: Dict[str, float],
    down: int,
    fourth_down_aggression: float = 0.0,
    pass_run_bias: float = 0.0
) -> Dict[str, float]:
    """Re-weight a probability mapping with additive log-space biases.

    The probabilities are clipped to [1e-9, 1 - 1e-9], converted to logs,
    shifted, and renormalised with a softmax so the result sums to 1.
    Because of the clipping, the output with zero biases is the clipped and
    renormalised input, which can differ marginally from the input itself.

    Biases are applied only when both 'prob_pass' and 'prob_run' are keys of
    ``prob_dict``; otherwise the input is merely clipped and renormalised.

    Args:
        prob_dict: Mapping from column name (e.g. 'prob_pass') to probability.
        down: Current down; ``fourth_down_aggression`` applies only when 4.
        fourth_down_aggression: Extra shift applied on 4th down, added to the
            pass log-probability and subtracted from the run log-probability.
        pass_run_bias: Shift added to the pass log-probability and subtracted
            from the run log-probability on every down.

    Returns:
        A dict with the same keys, in the same order, holding the tuned
        probabilities. An empty ``prob_dict`` yields an empty dict.
    """
    # Nothing to tune; the max() reduction below would raise on an empty array.
    if not prob_dict:
        return {}

    prob_cols: List[str] = list(prob_dict.keys())
    probs: np.ndarray = np.array([prob_dict[c] for c in prob_cols], dtype=float)

    # convert to log space for easy additive bias; clipping keeps log() finite
    eps = 1e-9
    logits = np.log(np.clip(probs, eps, 1 - eps))

    if 'prob_pass' in prob_cols and 'prob_run' in prob_cols:
        pass_idx = prob_cols.index('prob_pass')
        run_idx = prob_cols.index('prob_run')

        # global pass-vs-run bias, plus extra aggression on 4th down
        # NOTE(review): the 4th-down term also *lowers* run, so it behaves as
        # an extra pass-vs-run bias rather than favouring both go-for-it
        # options over fg/punt. Confirm this is the intended meaning.
        shift = pass_run_bias
        if down == 4:
            shift += fourth_down_aggression

        logits[pass_idx] += shift
        logits[run_idx] -= shift

    # softmax back to probabilities (max subtracted for numerical stability)
    exp_logits = np.exp(logits - logits.max())
    tuned_probs = exp_logits / exp_logits.sum()

    return {c: float(tuned_probs[i]) for i, c in enumerate(prob_cols)}

# predict a single scenario with optional bias tuning
def predict_with_bias(
    clock: float,
    score_diff: float,
    down: int,
    distance: float,
    yards_to_endzone: float,
    fourth_down_aggression: float = 0.0,
    pass_run_bias: float = 0.0
) -> pd.DataFrame:
    """Predict play-type probabilities for one game situation, then tune them.

    The scenario is run through the notebook-level fitted pipeline ``clf`` and
    the resulting class probabilities are adjusted by ``apply_tuning_probs``.

    Args:
        clock, score_diff, down, distance, yards_to_endzone: Scenario inputs,
            passed to the model under those same column names.
        fourth_down_aggression: Forwarded to ``apply_tuning_probs``.
        pass_run_bias: Forwarded to ``apply_tuning_probs``.

    Returns:
        A single-row DataFrame holding the scenario inputs, one tuned
        ``prob_<class>`` column per model class, ``predicted_category_tuned``
        (the class with the highest tuned probability) and
        ``predicted_confidence_tuned`` (that probability).
    """
    scenario: Dict[str, Any] = dict(
        clock=clock,
        score_diff=score_diff,
        down=down,
        distance=distance,
        yards_to_endzone=yards_to_endzone,
    )

    # Arrange the row in the training column order; any feature column not
    # supplied by the scenario is left as NaN by reindex.
    model_input = pd.DataFrame([scenario]).reindex(columns=feature_cols)

    # Raw class probabilities from the trained model, keyed as 'prob_<class>'.
    raw_probs: np.ndarray = clf.predict_proba(model_input)
    class_labels = clf.named_steps['model'].classes_
    raw_by_col: Dict[str, float] = {
        f"prob_{label}": float(value)
        for label, value in zip(class_labels, raw_probs[0])
    }

    tuned: Dict[str, float] = apply_tuning_probs(
        raw_by_col,
        down=down,
        fourth_down_aggression=fourth_down_aggression,
        pass_run_bias=pass_run_bias,
    )

    # Scenario inputs first, tuned probabilities after, then the summary columns.
    row: Dict[str, Any] = dict(scenario)
    row.update(tuned)
    result = pd.DataFrame([row])
    result['predicted_category_tuned'] = pd.Series(tuned).idxmax().replace('prob_', '')
    result['predicted_confidence_tuned'] = max(tuned.values())

    return result

In [25]:
# Quick check: baseline vs biased pass probability for the same scenario.
# Both calls share the same game situation; only the tuning knobs differ.
demo_scenario = dict(
    clock=600,
    score_diff=1,
    down=4,
    distance=2,
    yards_to_endzone=50,
)

baseline = predict_with_bias(
    **demo_scenario,
    fourth_down_aggression=0.0,
    pass_run_bias=0.0,
)

biased = predict_with_bias(
    **demo_scenario,
    fourth_down_aggression=1.0,
    pass_run_bias=0.2,
)

cols = [
    'prob_pass',
    'prob_run',
    'prob_fg',
    'prob_punt',
    'predicted_category_tuned',
    'predicted_confidence_tuned',
]
print('Baseline:')
print(baseline[cols])
print('\nBiased:')
print(biased[cols])

Baseline:
   prob_pass  prob_run   prob_fg  prob_punt predicted_category_tuned  \
0   0.141077  0.067751  0.074396   0.716776                     punt   

   predicted_confidence_tuned  
0                    0.716776  

Biased:
   prob_pass  prob_run   prob_fg  prob_punt predicted_category_tuned  \
0    0.36594  0.015943  0.058123   0.559995                     punt   

   predicted_confidence_tuned  
0                    0.559995  
