# NFL Analytics


In [None]:
import nfl_data_py as nfl

import pandas as pd
import numpy as np

import plotly.graph_objects as go

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [None]:
# Seasons to pull play-by-play data for
desired_seasons = [2021, 2022, 2023]

# Play-by-play columns requested from the loader
pbp_columns = [
    'game_id',
    'posteam', 'defteam',
    'home_team', 'away_team',
    'posteam_score', 'defteam_score',
    'yardline_100',
    'game_date',
    'quarter_seconds_remaining', 'game_seconds_remaining',
    'down',
    'wp',
    'score_differential',
    'ydstogo',
]

# Fetch the play-by-play frame through nfl_data_py
df = nfl.import_pbp_data(years=desired_seasons, columns=pbp_columns)

# Report the (rows, columns) size of what was loaded
print(df.shape)

2021 done.
2022 done.
2023 done.
Downcasting floats.
(149021, 36)


In [None]:
# Preview the first rows of the loaded play-by-play frame.
df.head()

Unnamed: 0,game_id,posteam,defteam,home_team,away_team,posteam_score,defteam_score,yardline_100,game_date,quarter_seconds_remaining,...,offense_players,defense_players,n_offense,n_defense,ngs_air_yards,time_to_throw,was_pressure,route,defense_man_zone_type,defense_coverage_type
0,2021_01_ARI_TEN,,,TEN,ARI,,,,2021-09-12,900.0,...,,,0.0,0.0,,,,,,
1,2021_01_ARI_TEN,TEN,ARI,TEN,ARI,0.0,0.0,35.0,2021-09-12,900.0,...,00-0032560;00-0036356;00-0035705;00-0036896;00...,00-0032496;00-0036652;00-0032355;00-0033455;00...,11.0,11.0,,,,,,
2,2021_01_ARI_TEN,TEN,ARI,TEN,ARI,0.0,0.0,75.0,2021-09-12,900.0,...,00-0029413;00-0032764;00-0029701;00-0027648;00...,00-0036356;00-0035705;00-0036933;00-0035236;00...,11.0,11.0,,,,,,
3,2021_01_ARI_TEN,TEN,ARI,TEN,ARI,0.0,0.0,78.0,2021-09-12,863.0,...,00-0029413;00-0032764;00-0029701;00-0027648;00...,00-0036356;00-0035705;00-0036933;00-0035236;00...,11.0,11.0,2.13,2.536,False,ANGLE,ZONE_COVERAGE,COVER_6
4,2021_01_ARI_TEN,TEN,ARI,TEN,ARI,0.0,0.0,75.0,2021-09-12,822.0,...,00-0032355;00-0029413;00-0029701;00-0033455;00...,00-0036356;00-0032127;00-0035705;00-0030528;00...,11.0,11.0,9.82,2.703,False,HITCH,ZONE_COVERAGE,COVER_6


In [None]:
# Show five randomly chosen rows to get a feel for plays beyond the first game.
df.sample(5)

Unnamed: 0,game_id,posteam,defteam,home_team,away_team,posteam_score,defteam_score,yardline_100,game_date,quarter_seconds_remaining,...,offense_players,defense_players,n_offense,n_defense,ngs_air_yards,time_to_throw,was_pressure,route,defense_man_zone_type,defense_coverage_type
98430,2022_20_DAL_SF,DAL,SF,SF,DAL,6.0,9.0,52.0,2023-01-22,717.0,...,00-0033077;00-0035679;00-0034764;00-0036358;00...,00-0036563;00-0034573;00-0035279;00-0031298;00...,11.0,11.0,-0.8,2.169,False,FLAT,ZONE_COVERAGE,COVER_3
6079,2021_03_BAL_DET,DET,BAL,DET,BAL,0.0,13.0,58.0,2021-09-26,660.0,...,00-0032464;00-0036258;00-0036963;00-0035328;00...,00-0032770;00-0033294;00-0034265;00-0026190;00...,11.0,11.0,,,,,,
41033,2021_16_LA_MIN,LA,MIN,MIN,LA,20.0,13.0,11.0,2021-12-26,751.0,...,00-0032242;00-0033110;00-0034114;00-0036603;00...,00-0035150;00-0033546;00-0033579;00-0036335;00...,11.0,11.0,,,,,,
11611,2021_05_CHI_LV,LV,CHI,LV,CHI,3.0,14.0,27.0,2021-10-10,705.0,...,00-0034759;00-0036357;00-0036369;00-0036892;00...,00-0030527;00-0036292;00-0032265;00-0032107;00...,11.0,11.0,,,,,,
136290,2023_15_CHI_CLE,CHI,CLE,CLE,CHI,17.0,7.0,67.0,2023-12-17,284.0,...,00-0037097;00-0033757;00-0035386;00-0033765;00...,00-0038098;00-0036133;00-0037332;00-0034639;00...,11.0,11.0,,,,,,


In [None]:
# List every column name actually present in df as a NumPy array
# (the frame's reported shape has more columns than the 15 requested at load time).
df.columns.values

array(['game_id', 'posteam', 'defteam', 'home_team', 'away_team',
       'posteam_score', 'defteam_score', 'yardline_100', 'game_date',
       'quarter_seconds_remaining', 'game_seconds_remaining', 'down',
       'wp', 'score_differential', 'ydstogo', 'play_id', 'old_game_id',
       'season', 'nflverse_game_id', 'possession_team',
       'offense_formation', 'offense_personnel', 'defenders_in_box',
       'defense_personnel', 'number_of_pass_rushers', 'players_on_play',
       'offense_players', 'defense_players', 'n_offense', 'n_defense',
       'ngs_air_yards', 'time_to_throw', 'was_pressure', 'route',
       'defense_man_zone_type', 'defense_coverage_type'], dtype=object)

In [None]:
valid_point_increments = {1, 2, 3, 6, 7, 8}


def validate_scores(df):
    """Drop rows whose home/away score change is not a legal scoring increment.

    A row is invalid when either ``home_team_score`` or ``away_team_score``
    changes from the previous row by an amount that is neither 0 nor one of
    ``valid_point_increments`` (this also covers any score decrease).

    When the frame has a ``game_id`` column the change is measured against the
    previous row *of the same game*, so the first play of a game is never
    compared with the last play of the game stored just before it. Without a
    ``game_id`` column the whole frame is treated as one sequence.

    Args:
        df: DataFrame with ``home_team_score`` and ``away_team_score`` columns,
            rows in chronological order (within each game).

    Returns:
        ``df`` unchanged when every row is valid; otherwise a copy with the
        invalid rows removed and the index reset. The labels of removed rows
        are printed.
    """
    score_columns = ['home_team_score', 'away_team_score']
    allowed_changes = sorted(valid_point_increments | {0})

    # Score change since the previous row; the first row of each sequence has
    # no predecessor, so its change is treated as 0.
    if 'game_id' in df.columns:
        score_changes = df.groupby('game_id', sort=False)[score_columns].diff()
    else:
        score_changes = df[score_columns].diff()
    score_changes = score_changes.fillna(0)

    # A row is invalid if either team's change is outside the allowed set.
    invalid_rows = ~score_changes.isin(allowed_changes).all(axis=1)

    # Get the indices of invalid rows
    invalid_indices = df.index[invalid_rows.to_numpy()]

    if not invalid_indices.empty:
        print(f"Invalid datapoints found at indices: {list(invalid_indices)}")
        df = df.drop(invalid_indices).reset_index(drop=True)

    return df

In [None]:
# Keep only plays that carry a win probability and both teams' scores, then
# renumber the rows from 0.
has_target_and_scores = (
    df['wp'].notnull()
    & df['posteam_score'].notnull()
    & df['defteam_score'].notnull()
)
valid_df = df[has_target_and_scores].reset_index(drop=True)

# Re-express the possession-relative scores as home/away scores.
home_has_ball = valid_df['posteam'] == valid_df['home_team']
valid_df['home_team_score'] = valid_df['posteam_score'].where(home_has_ball, valid_df['defteam_score'])
valid_df['away_team_score'] = valid_df['defteam_score'].where(home_has_ball, valid_df['posteam_score'])

valid_df['score_differential'] = valid_df['posteam_score'] - valid_df['defteam_score']  # Score differential

# Lag the differential within each game so the first plays of a game do not
# inherit values from the game stored just before it; the resulting NaNs are
# zero-filled below.
differential_by_game = valid_df.groupby('game_id', sort=False)['score_differential']
valid_df['score_differential_lag1'] = differential_by_game.shift(1)
valid_df['score_differential_lag2'] = differential_by_game.shift(2)

valid_df['field_position'] = valid_df['yardline_100']
valid_df['possession_status'] = home_has_ball.astype(int)  # 1 when the home team has the ball
valid_df.fillna(0, inplace=True)

# Drop rows whose score change is not a legal scoring increment
valid_df = validate_scores(valid_df)

# Model inputs: clock, scores, field position, possession flag, down and distance
feature_columns = [
    'quarter_seconds_remaining',
    'home_team_score',
    'away_team_score',
    'field_position',
    'possession_status',
    'game_seconds_remaining',
    'down',
    'ydstogo',
]
X = valid_df[feature_columns]
y = valid_df['wp']  # Win probability

# Remove NaN values and keep the target aligned with the surviving rows
X = X.dropna()
y = y.loc[X.index]

# Train-test split
# NOTE(review): this splits individual plays at random, so plays from the same
# game land in both sets and the reported error is likely optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest Regressor model
model = RandomForestRegressor(max_depth=16, random_state=42)
model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Invalid datapoints found at indices: [168, 362, 507, 660, 851, 1000, 1146, 1338, 1503, 1657, 1850, 2013, 2193, 2370, 2528, 2710, 2882, 3049, 3198, 3357, 3518, 3669, 3819, 3977, 4133, 4289, 4453, 4613, 4766, 4950, 5103, 5287, 5456, 5614, 5768, 5924, 6081, 6243, 6407, 6561, 6735, 6939, 7107, 7256, 7425, 7581, 7754, 7920, 8100, 8270, 8437, 8610, 8764, 8910, 9066, 9208, 9383, 9553, 9714, 9875, 10043, 10214, 10414, 10591, 10771, 10932, 11115, 11274, 11438, 11605, 11776, 11941, 12103, 12257, 12430, 12610, 12771, 12945, 13095, 13261, 13422, 13587, 13755, 13932, 14077, 14230, 14400, 14550, 14724, 14896, 15070, 15275, 15454, 15610, 15778, 15945, 16111, 16284, 16431, 16595, 16753, 16912, 17077, 17235, 17408, 17572, 17722, 17881, 18057, 18231, 18390, 18553, 18719, 18891, 19062, 19232, 19388, 19547, 19704, 19876, 20066, 20213, 20373, 20542, 20700, 20876, 21033, 21200, 21356, 21529, 21682, 21837, 22026, 22179, 22353, 22523, 22684, 22866, 23034, 23194, 23348, 23546, 23715, 23882, 24035, 24205, 24371

In [None]:
# Columns fed to the model, in the order it was trained on
feature_columns = [
    'quarter_seconds_remaining',
    'home_team_score',
    'away_team_score',
    'field_position',
    'possession_status',
    'game_seconds_remaining',
    'down',
    'ydstogo',
]

# Use SimpleImputer to replace any missing feature values with the column mean
imputer = SimpleImputer(strategy='mean')

# Randomly select a game_id from valid_df
random_game_id = valid_df['game_id'].sample(1).values[0]

# Filter for the random game's data
game_df = valid_df[valid_df['game_id'] == random_game_id].reset_index(drop=True)

# Number the plays 1..N in the order they appear
game_df['chronological_play_number'] = game_df.index + 1

home_team = game_df['home_team'].unique()[0]
away_team = game_df['away_team'].unique()[0]
game_date = game_df['game_date'].unique()[0]

# Impute missing values for the selected features in the random game
game_df_imputed = pd.DataFrame(
    imputer.fit_transform(game_df[feature_columns]),
    columns=feature_columns,
)

# Predict win probability for the random game
game_df['predicted_win_prob'] = model.predict(game_df_imputed)

# Rebuild home/away scores from the possession-relative scores so they stay
# consistent across possession changes...
game_df['home_team_score'] = np.where(game_df['posteam'] == home_team, game_df['posteam_score'], game_df['defteam_score'])
game_df['away_team_score'] = np.where(game_df['posteam'] == away_team, game_df['posteam_score'], game_df['defteam_score'])

# ...and only then forward-fill, so any gap left by the rebuild is closed
# (filling before the rebuild would be overwritten and have no effect).
game_df['home_team_score'] = game_df['home_team_score'].ffill()
game_df['away_team_score'] = game_df['away_team_score'].ffill()

# Add the final play at 0:00 with a settled win probability
final_real_play = game_df.iloc[-1]

# NOTE(review): these are the scores recorded on the last play row; if the
# score columns hold the score at the start of a play, points scored on the
# final play itself are not reflected here -- confirm against the data source.
final_home_score = final_real_play['home_team_score']
final_away_score = final_real_play['away_team_score']

# Determine the final outcome from the home team's point of view
if final_home_score > final_away_score:
    final_win_prob = 1  # Home team wins
elif final_home_score < final_away_score:
    final_win_prob = 0  # Away team wins
else:
    final_win_prob = 0.5  # Tie: neither team is shown as the winner

final_play = pd.DataFrame({
    'chronological_play_number': [final_real_play['chronological_play_number'] + 1],
    'game_seconds_remaining': [0],  # Time = 0:00
    'home_team_score': [final_home_score],
    'away_team_score': [final_away_score],
    'predicted_win_prob': [final_win_prob],
})

# Append final play to the game dataframe (columns it lacks become NaN)
game_df = pd.concat([game_df, final_play], ignore_index=True)

# Convert game_seconds_remaining to "mm:ss" format
def format_time_remaining(seconds_remaining):
    """Format a number of seconds as an "m:ss" clock string.

    Args:
        seconds_remaining: Seconds left, as an int or float. Fractions of a
            second are truncated.

    Returns:
        A string such as ``"14:05"``; minutes are not zero-padded, seconds are
        padded to two digits.
    """
    minutes, seconds = divmod(seconds_remaining, 60)
    return f"{int(minutes)}:{int(seconds):02d}"


# Forward-fill the score columns so no row is left without a score. These
# statements run at cell level: they were previously indented under the
# function above at a depth that is not valid Python.
game_df['home_team_score'] = game_df['home_team_score'].ffill()
game_df['away_team_score'] = game_df['away_team_score'].ffill()


In [None]:
# Prepare the transformed win probability for display (100-50-100 scale)
def transform_win_prob_for_display(combined_win_prob):
    """Fold a 0-1 win probability onto a 50-100 percentage scale.

    Values of 0.5 or more are reported as ``p * 100``; values below 0.5 are
    reported as ``(1 - p) * 100``, i.e. the percentage of whichever side is
    currently favoured.
    """
    at_least_even = combined_win_prob >= 0.5
    percent_as_given = combined_win_prob * 100
    percent_of_other_side = (1 - combined_win_prob) * 100
    return np.where(at_least_even, percent_as_given, percent_of_other_side)

# Prepare data for interactive graph. A stable sort keeps plays that share the
# same clock value in their original (chronological) order.
game_df_sorted = game_df.sort_values(by='game_seconds_remaining', ascending=False, kind='stable')

time_remaining = game_df_sorted['game_seconds_remaining']  # Use the actual seconds remaining as the x-axis
home_team_scores = game_df_sorted['home_team_score']
away_team_scores = game_df_sorted['away_team_score']

# The y-axis runs from "away 100%" at 0 to "home 100%" at 100, so both curves
# must be home-team probabilities.
# NOTE(review): nflverse documents `wp` as the win probability of the team in
# possession, and the model was fitted to `wp`; both are therefore flipped to
# the home perspective on plays where the away team has the ball. Rows without
# a possession team (the synthetic final row) are left as they are.
away_has_ball = game_df_sorted['posteam'] == away_team
combined_win_prob = game_df_sorted['predicted_win_prob'].where(
    ~away_has_ball, 1 - game_df_sorted['predicted_win_prob']
)
dataset_win_prob = game_df_sorted['wp'].where(~away_has_ball, 1 - game_df_sorted['wp'])

# Apply the transformation for display (hovertext shows the favoured side's percentage)
display_win_prob = transform_win_prob_for_display(combined_win_prob)
dataset_wp_transformed = transform_win_prob_for_display(dataset_win_prob)

# Detect possession changes; rows with no possession team are ignored so the
# synthetic final row does not register as a change.
game_df['possession_change'] = game_df['posteam'].notna() & (game_df['posteam'] != game_df['posteam'].shift(1))

# Get the time remaining for each possession change
possession_change_times = game_df.loc[game_df['possession_change'], 'game_seconds_remaining']

# Create interactive plot with Plotly for the selected random game
fig = go.Figure()

# Add the model's win probability line
fig.add_trace(go.Scatter(
    x=time_remaining,  # X-axis is time remaining in the game (in seconds)
    y=combined_win_prob * 100,  # Rescale to 0% to 100% on y-axis
    mode='lines',
    name='Win Probability',
    hovertext=[
        f"{home_team} Score: {hs}<br>{away_team} Score: {as_}<br>Win Probability: {wp:.2f}%<br>Time Remaining: {format_time_remaining(tr)}"
        for hs, as_, wp, tr in zip(home_team_scores, away_team_scores, display_win_prob, time_remaining)
    ],
    hoverinfo='text',
    line=dict(color='firebrick', width=3),
))

# Add the actual win probability line (from the dataset)
fig.add_trace(go.Scatter(
    x=time_remaining,
    y=dataset_win_prob * 100,  # Dataset WP rescaled to 0%-100%
    mode='lines',
    hovertext=[
        "Dataset Win Probability: n/a" if pd.isna(wp)
        else f"Dataset Win Probability: {round(float(wp), 2)}%"
        for wp in dataset_wp_transformed
    ],
    name='Real Dataset WP',
    hoverinfo='text',
    line=dict(color='blue', width=2, dash='dash'),
))

# Plot all possession changes as a single trace (avoid multiple legend entries).
# Each change contributes a vertical segment; the trailing None breaks the line
# so consecutive segments are not joined.
possession_change_x = [
    x_value
    for possession_time in possession_change_times
    for x_value in (possession_time, possession_time, None)
]

fig.add_trace(go.Scatter(
    x=possession_change_x,
    y=[0, 100, None] * len(possession_change_times),  # Repeat y-range for each possession change
    mode='lines',
    name='Possession Change',
    line=dict(color="green", width=1, dash="dash"),
    showlegend=True,
))

# Generate x-axis tick values every 5 minutes (300 seconds)
total_seconds = int(game_df_sorted['game_seconds_remaining'].max())  # Get the maximum seconds remaining
tickvals = list(range(0, total_seconds + 1, 300))

# Convert tick values to "mm:ss" format for display
ticktext = [format_time_remaining(seconds) for seconds in tickvals]

# Update layout with y-axis scaled 0%-100% and time remaining on the x-axis
fig.update_layout(
    title=f"Learning Win Probability for {home_team} vs {away_team} on {game_date}",
    xaxis_title="Time Remaining (mm:ss)",
    yaxis_title="Win Probability",
    yaxis=dict(
        range=[-1, 101],
        tickvals=[0, 25, 50, 75, 100],
        ticktext=[f"{away_team} 100%", "75%", "50%", "75%", f"{home_team} 100%"],
    ),
    xaxis=dict(
        tickvals=tickvals,
        ticktext=ticktext,
        # Descending range puts kickoff on the left and 0:00 on the right.
        # `autorange` is not set because it would take precedence over this
        # explicit range.
        range=[total_seconds, 0],
    ),
    width=1000,
    hovermode='closest',
    showlegend=True
)

fig.show()