In [20]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import matplotlib.animation as animation
from IPython.display import HTML
from IPython.display import Image

#Code to always set working directory one level up so we can access /data easily
import os
# Capture the notebook's own directory only the first time this cell runs.
# Once the name exists, re-running the cell reuses it instead of re-reading
# the (already changed) working directory, so the chdir below never climbs
# more than one level.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
    print('Notebook Directory Set:', NOTEBOOK_DIR)

# Move to the parent of the notebook directory and report where we ended up.
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))
print("Current working directory:", os.getcwd())

Current working directory: c:\Users\aidan\EAS-508-Data-Science-Project


In [21]:
# Empty placeholder frame. NOTE(review): nothing in the cells visible here reads
# `df` -- confirm whether a later cell populates it or whether this cell can go.
df = pd.DataFrame()

In [38]:
# ========================================
# Predict Play Type & Simulate Play Prediction
# ========================================

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, mean_absolute_error, accuracy_score, classification_report

# -------------------------------
# Load and filter play-by-play data
# -------------------------------
# Relative to the working directory. The setup cell at the top of the notebook
# changes into the parent of the notebook directory (the project root), so this
# resolves to the same file without tying the notebook to one user's machine.
PBP_CSV_PATH = "DATA/pbp_data/pbp_2016.csv"

pbp = pd.read_csv(PBP_CSV_PATH, low_memory=False)

# Keep only run and pass plays that have an EPA value. The .copy() gives an
# independent frame so the column assignments below do not write into a slice.
pbp = pbp[(pbp['play_type'].isin(['run', 'pass'])) & (pbp['epa'].notna())].copy()

# Basic cleanup: force both columns to numeric; unparseable entries become NaN.
pbp['yards_gained'] = pd.to_numeric(pbp['yards_gained'], errors='coerce')
# 'yards_to_go' is a new column derived from the source column 'ydstogo'.
pbp['yards_to_go'] = pd.to_numeric(pbp['ydstogo'], errors='coerce')

# -------------------------------
# Calculate Yards to Success
# -------------------------------
def calc_yards_to_success(row):
    """
    Return yards gained minus the down-specific success threshold.

    The threshold is a share of the yards to go:
    - down 1: 50%
    - down 2: 70%
    - any other value of 'down' (3, 4, missing, ...): 100%

    `row` is looked up by key and must provide 'down', 'yards_to_go' and
    'yards_gained'. A result >= 0 means the threshold was reached; a negative
    result is how many yards short the play fell.
    """
    down = row['down']
    # Start from the full distance and scale it down on early downs only.
    needed = row['yards_to_go']
    if down == 1:
        needed = needed * 0.5
    elif down == 2:
        needed = needed * 0.7
    return row['yards_gained'] - needed

# Evaluate the per-play calculation once per row and store it as a new column.
pbp['yards_to_success'] = pbp.apply(calc_yards_to_success, axis="columns")

# -------------------------------
# Pre-snap feature selection
# -------------------------------
# Order matters: it determines the column order of the encoded matrix below.
pre_snap_features = [
    'down',
    'yards_to_go',
    'yardline_100',
    'wp',
    'def_wp',
    'offense_formation',
    'offense_personnel',
    'defense_personnel',
    'shotgun',
    'no_huddle',
    'roof',
    'surface',
    'temp',
    'wind',
    'weather',
]

# One-hot encode the selected columns (dropping the first level of each encoded
# column), then replace every remaining missing value with 0.
X = pd.get_dummies(pbp[pre_snap_features], drop_first=True).fillna(0)

# -------------------------------
# Model 1: Predict Play Type (Run/Pass)
# -------------------------------
# Binary target: 1 for a pass play, 0 otherwise. Only 'run' and 'pass' rows
# remain after the filter applied when the data was loaded, so 0 means run.
y_play_type = (pbp['play_type'] == 'pass').astype(int)

# Hold out 20% of the plays for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_play_type, test_size=0.2, random_state=42
)

model_play = RandomForestClassifier(n_estimators=200, random_state=42)
model_play.fit(X_train, y_train)

y_pred_play = model_play.predict(X_test)

# The banner previously contained the mis-decoded bytes of the football emoji.
print("🏈 Model 1: Play Type (Run=0, Pass=1)")
print(f"Accuracy: {accuracy_score(y_test, y_pred_play):.3f}")
print(classification_report(y_test, y_pred_play, target_names=['Run', 'Pass']))
print("-" * 50)

# -------------------------------
# Simulation: Predict a Hypothetical Play Type
# -------------------------------
# One hand-written play described with the same raw (un-encoded) feature names
# used for training.
example_play = pd.DataFrame([{
    'down': 1,
    'yards_to_go': 10,
    'yardline_100': 50,
    'wp': 0.5,
    'def_wp': 0.5,
    'offense_formation': 'SHOTGUN',
    'offense_personnel': '11 personnel',
    'defense_personnel': 'nickel',
    'shotgun': 1,
    'no_huddle': 0,
    'roof': 'outdoors',
    'surface': 'grass',
    'temp': 70,
    'wind': 5,
    'weather': 'clear'
}])

# Guard against silent mis-encoding. The reindex() below discards every dummy
# column the model was not trained on, so a categorical value that never occurs
# in the training data would be scored as all zeros for that feature without any
# signal. Report such values explicitly instead.
for feature in pre_snap_features:
    if pd.api.types.is_numeric_dtype(pbp[feature]):
        continue  # numeric features are passed through, not one-hot encoded
    example_value = example_play.at[0, feature]
    if not pbp[feature].isin([example_value]).any():
        print(
            f"Warning: {feature}={example_value!r} does not occur in the training data; "
            "it will be encoded as all zeros for that feature."
        )

# Encode the example to match training features. Training used drop_first=True,
# so a value equal to a feature's dropped first level correctly ends up as all
# zeros after the reindex; columns missing from the example are filled with 0.
example_encoded = pd.get_dummies(example_play)
example_encoded = example_encoded.reindex(columns=X.columns, fill_value=0)

# Predict play type: column 1 of predict_proba is the probability of class 1 (pass).
pred_play_prob = model_play.predict_proba(example_encoded)[0][1]
pred_play_type = model_play.predict(example_encoded)[0]

# The banner previously contained the mis-decoded bytes of the football emoji.
print("🏈 Example Play Prediction")
print(f"Predicted Play Type: {'Pass' if pred_play_type == 1 else 'Run'}")
print(f"Probability of Pass: {pred_play_prob:.2%}")


🏈 Model 1: Play Type (Run=0, Pass=1)
Accuracy: 0.733
              precision    recall  f1-score   support

         Run       0.67      0.64      0.65      2652
        Pass       0.77      0.80      0.78      4106

    accuracy                           0.73      6758
   macro avg       0.72      0.72      0.72      6758
weighted avg       0.73      0.73      0.73      6758

--------------------------------------------------
🏈 Example Play Prediction
Predicted Play Type: Pass
Probability of Pass: 81.00%
