# NFL Offensive Scheme Classifier
This notebook trains a machine learning model to classify NFL teams into one of 8 offensive schemes based on their play data.

In [2]:

import nfl_data_py as nfl
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
    

## Load and Preprocess Data

In [3]:

# Load play-by-play data for 2024
pbp_data = nfl.import_pbp_data([2024])

# Filter for offensive plays only (exclude special teams & defensive plays)
pbp_offense = pbp_data[
    (pbp_data['play_type'].isin(['pass', 'run'])) & 
    (pbp_data['posteam'].notna())
]

# Select relevant columns
columns_to_keep = [
    "posteam", "week", "pass_attempt", "rush_attempt", "shotgun", "no_huddle",
    "qb_scramble", "first_down_rush", "first_down_pass", "qb_dropback",
    "pass_length", "pass_location", "run_location", "run_gap", "yards_after_catch",
    "play_type", "yards_gained", "air_yards", "epa", "ydstogo", "down"
]
pbp_offense = pbp_offense[columns_to_keep]
    

2024 done.
Downcasting floats.


## Feature Engineering

In [4]:

# Function to calculate per-game team stats
def process_team_data(df):
    num_plays = df["qb_dropback"].sum() + df["rush_attempt"].sum()

    return pd.Series({
        "pass_to_run": df["qb_dropback"].sum() / df["rush_attempt"].sum(),
        "shotgun_freq": df["shotgun"].sum() / num_plays,
        "no_huddle_freq": df["no_huddle"].sum() / num_plays,
        "short_passes_freq": (df["pass_length"] == "short").sum() / num_plays,
        "deep_passes_freq": (df["pass_length"] == "deep").sum() / num_plays,
        "middle_passes": (df["pass_location"] == "middle").sum() / df["pass_attempt"].sum(),
        "side_passes": (df["pass_location"].isin(["left", "right"])).sum() / df["pass_attempt"].sum(),
        "scramble_freq": df["qb_scramble"].sum() / df["rush_attempt"].sum(),
        "first_down_rush_pct": df["first_down_rush"].sum() / (df["first_down_rush"].sum() + df["first_down_pass"].sum()),
        "first_down_pass_pct": df["first_down_pass"].sum() / (df["first_down_rush"].sum() + df["first_down_pass"].sum()),
        "epa_pass": df[df["play_type"] == "pass"]["epa"].mean(),
        "epa_run": df[df["play_type"] == "run"]["epa"].mean(),
        "yac": df[df["play_type"] == "pass"]["yards_after_catch"].mean(),
        "inside_run_pct": df[(df["play_type"] == "run") & (df["run_gap"] == "guard") & (df["run_location"] == "middle")].shape[0] / df[df["play_type"] == "run"].shape[0],
        "outside_run_pct": df[(df["play_type"] == "run") & (df["run_gap"].isin(["tackle", "end"])) & (df["run_location"] != "middle")].shape[0] / df[df["play_type"] == "run"].shape[0],
        "yards_gained_1": df[df["down"] == 1]["yards_gained"].mean(),
        "yards_gained_2": df[df["down"] == 2]["yards_gained"].mean(),
        "yards_gained_3": df[df["down"] == 3]["yards_gained"].mean(),
        "yards_gained_4": df[df["down"] == 4]["yards_gained"].mean(),
        "ydstogo_3rd_down": df[df["down"] == 3]["ydstogo"].mean()
    })

# Aggregate stats per game for each team
team_weekly_data = pbp_offense.groupby(["posteam", "week"]).apply(process_team_data).reset_index()

# Compute seasonal averages for each team
team_seasonal_data = team_weekly_data.groupby("posteam").mean().reset_index()
    

## Assign Offensive Schemes

In [6]:

# Manually assign offensive schemes to teams
team_schemes = {
    "MIN": "McVay System", "LAR": "McVay System",
    "WAS": "Air Raid", "PHI": "Spread Option",
    "LAC": "Coryell Vertical", "GB": "West Coast",
    "PIT": "Run Power", "NYG": "Pistol Power Spread",
    "SF": "Shanahan Wide Zone"
}

# Assign labels
team_seasonal_data["scheme"] = team_seasonal_data["posteam"].map(team_schemes)

# Remove teams with unknown schemes
team_seasonal_data = team_seasonal_data.dropna(subset=["scheme"])

# Encode target variable (scheme)
label_encoder = LabelEncoder()
team_seasonal_data["scheme"] = label_encoder.fit_transform(team_seasonal_data["scheme"])
    

## Train Machine Learning Model

In [7]:

X = team_seasonal_data.drop(columns=["posteam", "scheme"])
y = team_seasonal_data["scheme"]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a random forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Evaluate the model
try:
    y_pred = clf.predict(X_test)
    if len(y_test) > 0:
        print("Model Accuracy:", accuracy_score(y_test, y_pred))
        print("\nDetailed Classification Report:")
        print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
    else:
        print("Warning: No test data available for evaluation")
except Exception as e:
    print(f"Error evaluating model: {str(e)}")
    

Model Accuracy: 0.0


ValueError: Number of classes, 4, does not match size of target_names, 8. Try specifying the labels parameter

## Predict Bears' Offensive Scheme

In [None]:

if "CHI" in pbp_offense["posteam"].values:
    # Process Bears data the same way as training data
    bears_weekly = pbp_offense[pbp_offense["posteam"] == "CHI"].groupby("week").apply(process_team_data).mean()
    bears_data = pd.DataFrame([bears_weekly])
    
    # Ensure features match training data
    bears_features = bears_data[X.columns]
    bears_scaled = scaler.transform(bears_features)
    
    # Predict and decode
    bears_scheme = clf.predict(bears_scaled)
    print("Predicted Scheme for Bears:", label_encoder.inverse_transform(bears_scheme)[0])
else:
    print("No data available for the Bears.")
    