In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib


In [2]:
# Load data
df = pd.read_csv('../pbp_data.csv')

# Filter for relevant columns
cols_to_keep = [
    'posteam', 'defteam', 'play_type', 'yards_gained', 'pass_attempt', 
    'rush_attempt', 'touchdown', 'pass_touchdown', 'rush_touchdown', 
    'game_seconds_remaining', 'yardline_100', 'down', 'ydstogo', 
    'score_differential', 'air_yards', 'yards_after_catch', 
    'passer_player_name', 'rusher_player_name', 'receiver_player_name',
    'game_id', 'shotgun', 'qb_scramble', 'play_type', 'pass_length', 'pass_location'
]
df = df[cols_to_keep]

# Drop rows with missing player names
df.dropna(subset=['passer_player_name', 'rusher_player_name', 'receiver_player_name'], how='all', inplace=True)

# Fill NaNs in numerical columns with 0
df.fillna(0, inplace=True)

print("Initial data shape:", df.shape)
print("\nSample of yards_gained:", df['yards_gained'].head())
print("\nSample of player names:", df['passer_player_name'].head())


  df = pd.read_csv('../pbp_data.csv')


Initial data shape: (72066, 20)

Sample of yards_gained: 2    19.0
3     0.0
4     5.0
5     0.0
7     4.0
Name: yards_gained, dtype: float64

Sample of player names: 2            0
3     J.Flacco
4            0
5     J.Flacco
7    L.Jackson
Name: passer_player_name, dtype: object


In [3]:
df['play_total_yards'] = df['yards_gained']  # Use yards_gained instead of trying to sum non-existent columns

# Create game-level statistics for each player first
player_game_stats = df.groupby(['passer_player_name', 'game_id']).agg({
    'yards_gained': 'sum',
    'pass_attempt': 'sum',
    'rush_attempt': 'sum',
    'touchdown': 'sum'
}).reset_index()

# Then calculate historical averages from game-level stats
player_historical_stats = player_game_stats.groupby('passer_player_name').agg({
    'yards_gained': ['mean', 'std', 'count'],
    'pass_attempt': 'mean',
    'rush_attempt': 'mean',
    'touchdown': 'mean'
}).reset_index()

# Flatten column names
player_historical_stats.columns = ['player_name', 'avg_yards', 'std_yards', 'num_games', 
                                 'avg_pass_att', 'avg_rush_att', 'avg_td']

# Filter out players with too few games
player_historical_stats = player_historical_stats[player_historical_stats['num_games'] >= 5]

#TODO: filter out players who are inactive/retired

# Fill NaN values with 0 for std_yards
player_historical_stats['std_yards'] = player_historical_stats['std_yards'].fillna(0)

# Print some stats to verify
print("\nSample of player stats:")
print(player_historical_stats.sort_values('avg_yards', ascending=False).head())
print("\nAverage yards range:", 
      player_historical_stats['avg_yards'].min(), 
      "to", 
      player_historical_stats['avg_yards'].max())


Sample of player stats:
      player_name   avg_yards   std_yards  num_games  avg_pass_att  \
145       T.Brady  270.777778   75.127778         18     46.166667   
128     P.Mahomes  269.675000   67.841213         40     38.950000   
157  T.Tagovailoa  258.903226   80.320755         31     34.000000   
99      K.Cousins  258.384615   86.675984         26     40.769231   
76       J.Flacco  255.818182  103.996941         11     42.181818   

     avg_rush_att    avg_td  
145           0.0  1.500000  
128           0.0  2.075000  
157           0.0  1.903226  
99            0.0  2.000000  
76            0.0  2.090909  

Average yards range: 15.25 to 270.77777777777777


In [4]:
# First create game-level statistics
player_game_stats = df.groupby(['passer_player_name', 'game_id', 'posteam', 'defteam']).agg({
    'yards_gained': 'sum',
    'pass_attempt': 'sum',
    'rush_attempt': 'sum',
    'touchdown': 'sum'
}).reset_index()

# Rename yards_gained to total_yards for clarity
player_game_stats = player_game_stats.rename(columns={'yards_gained': 'total_yards'})

# Create features for the model
features = player_game_stats[[
    'total_yards', 
    'pass_attempt', 
    'rush_attempt',
    'posteam',
    'defteam'
]].copy()

# Create dummy variables for teams
features = pd.get_dummies(features, columns=['posteam', 'defteam'])

# Create historical stats for prediction
player_historical_stats = player_game_stats.groupby('passer_player_name').agg({
    'total_yards': ['mean', 'std', 'count'],
    'pass_attempt': 'mean',
    'rush_attempt': 'mean',
    'touchdown': 'mean'
}).reset_index()

# Flatten column names
player_historical_stats.columns = ['player_name', 'avg_yards', 'std_yards', 'num_games', 
                                 'avg_pass_att', 'avg_rush_att', 'avg_td']

# Filter out players with too few games
player_historical_stats = player_historical_stats[player_historical_stats['num_games'] >= 5]

#TODO: Filter out players who are inactive/retired
#TODO: Look at only REG season games

# Fill NaN values with 0 for std_yards
player_historical_stats['std_yards'] = player_historical_stats['std_yards'].fillna(0)

# Print some stats to verify
print("\nTop 10 players by average yards:")
print(player_historical_stats.sort_values('avg_yards', ascending=False).head(10))

target = player_game_stats['total_yards']


Top 10 players by average yards:
      player_name   avg_yards   std_yards  num_games  avg_pass_att  \
145       T.Brady  270.777778   75.127778         18     46.166667   
128     P.Mahomes  269.675000   67.841213         40     38.950000   
157  T.Tagovailoa  258.903226   80.320755         31     34.000000   
99      K.Cousins  258.384615   86.675984         26     40.769231   
76       J.Flacco  255.818182  103.996941         11     42.181818   
80         J.Goff  255.621622   57.973821         37     37.000000   
67        J.Allen  249.594595   74.056457         37     36.972973   
36       C.Stroud  248.588235   92.288988         17     35.000000   
83      J.Herbert  248.096774   69.842850         31     40.967742   
55     D.Prescott  246.531250   73.345838         32     36.906250   

     avg_rush_att    avg_td  
145           0.0  1.500000  
128           0.0  2.075000  
157           0.0  1.903226  
99            0.0  2.000000  
76            0.0  2.090909  
80            0

In [5]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)


In [6]:
# Initialize and train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

# Save model
joblib.dump(model, 'nfl_player_stat_predictor.joblib')
print("\nMost important features:")
feature_importance = pd.DataFrame({
    'feature': features.columns,
    'importance': model.feature_importances_
})
print(feature_importance.sort_values('importance', ascending=False).head(10))


Mean Squared Error: 0.1591984313725493

Most important features:
         feature  importance
0    total_yards    0.999897
22   posteam_MIA    0.000043
1   pass_attempt    0.000020
48   defteam_IND    0.000006
37   defteam_BAL    0.000006
9    posteam_CIN    0.000005
18    posteam_KC    0.000003
32    posteam_TB    0.000003
52   defteam_LAC    0.000002
65   defteam_TEN    0.000002


In [7]:
def predict_player_stats(player_name, team_name, opponent_name, model, player_historical_stats):
    """
    Predicts NFL player stats based on historical data and team matchups.
    
    Args:
        player_name (str): Name of the player (e.g., 'P.Mahomes')
        team_name (str): Team code (e.g., 'KC')
        opponent_name (str): Opponent team code (e.g., 'LV')
        model: Trained machine learning model
        player_historical_stats (DataFrame): Historical player statistics
    
    Returns:
        float: Predicted total yards for the player
    """
    # Convert player_name to string and ensure player_historical_stats names are strings
    player_name = str(player_name)
    player_historical_stats['player_name'] = player_historical_stats['player_name'].astype(str)
    
    # Check if player exists in historical stats
    if player_name not in player_historical_stats['player_name'].values:
        print(f"\nPlayer '{player_name}' not found. Available similar players:")
        similar_players = [str(p) for p in player_historical_stats['player_name'].unique() 
                         if player_name.lower() in str(p).lower()]
        print(similar_players[:5])
        return 0
    
    # Get historical stats for the player
    player_stats = player_historical_stats[player_historical_stats['player_name'] == player_name]
    
    # Check if player has enough historical data
    if len(player_stats) == 0 or player_stats['num_games'].iloc[0] < 5:
        print(f"\nNot enough historical data for player '{player_name}'")
        return 0
    
    # Create feature vector matching training data
    input_features = np.zeros(len(model.feature_names_in_))
    feature_dict = {name: i for i, name in enumerate(model.feature_names_in_)}
    
    # Set team indicators
    team_feature = f'posteam_{team_name}'
    opp_feature = f'defteam_{opponent_name}'
    
    # Validate team names
    if team_feature not in feature_dict:
        print(f"Warning: Team {team_name} not found in training data")
        return 0
    
    if opp_feature not in feature_dict:
        print(f"Warning: Opponent {opponent_name} not found in training data")
        return 0
    
    # Set team indicators
    input_features[feature_dict[team_feature]] = 1
    input_features[feature_dict[opp_feature]] = 1
    
    # Set player stats using max between 0 and the actual value for yards
    if 'total_yards' in feature_dict:
        input_features[feature_dict['total_yards']] = max(0, player_stats['avg_yards'].iloc[0])
    if 'pass_attempt' in feature_dict:
        input_features[feature_dict['pass_attempt']] = max(0, player_stats['avg_pass_att'].iloc[0])
    if 'rush_attempt' in feature_dict:
        input_features[feature_dict['rush_attempt']] = max(0, player_stats['avg_rush_att'].iloc[0])
    
    # Debug prints
    print("\nPlayer historical stats:")
    print(player_stats[['player_name', 'avg_yards', 'std_yards', 'num_games', 'avg_pass_att', 'avg_rush_att', 'avg_td']])
    print("\nFeatures being used:")
    for fname, idx in feature_dict.items():
        if input_features[idx] != 0:
            print(f"{fname}: {input_features[idx]}")
    
    # Make prediction and ensure it's not negative
    prediction = model.predict(input_features.reshape(1, -1))
    return max(0, prediction[0])

In [10]:
# Update the main execution block
if __name__ == "__main__":
    # Load the model
    model = joblib.load('nfl_player_stat_predictor.joblib')
    
    # Print some helpful information
    print("\nExample format for names:")
    print("Player name: P.Mahomes")
    print("Team name:", df['posteam'].unique()[0], "(use exactly as shown)")
    print("Opponent name:", df['defteam'].unique()[0], "(use exactly as shown)")
    print("\n")
    
    # Show available players and teams
    print("Sample of available players:")
    print(player_historical_stats['player_name'].head(10))
    print("\nSample of available teams:")
    print(df['posteam'].unique())
    
    # Show model's features
    print("\nModel's expected features:")
    print(model.feature_names_in_)
    
    # Get user input
    player_name = input("Enter player name: ")
    team_name = input("Enter player's team: ")
    opponent_name = input("Enter opponent team: ")
    
    # Make prediction
    prediction = predict_player_stats(player_name, team_name, opponent_name, model, player_historical_stats)
    
    print(f"\nPredicted stats for {player_name} of {team_name} against {opponent_name}: {prediction:.1f} total yards")


Example format for names:
Player name: P.Mahomes
Team name: NYJ (use exactly as shown)
Opponent name: BAL (use exactly as shown)


Sample of available players:
0               0
3        A.Dalton
6     A.O'Connell
8       A.Rodgers
16     B.Mayfield
18        B.Purdy
19       B.Rypien
20        B.Young
21        B.Zappe
23     C.Beathard
Name: player_name, dtype: object

Sample of available teams:
['NYJ' 'BAL' 'BUF' 'LA' 'CAR' 'CLE' 'SEA' 'DEN' 'MIN' 'GB' 'IND' 'HOU'
 'JAX' 'WAS' 'KC' 'ARI' 'LAC' 'LV' 'NE' 'MIA' 'ATL' 'NO' 'NYG' 'TEN' 'DET'
 'PHI' 'PIT' 'CIN' 'CHI' 'SF' 'DAL' 'TB']

Model's expected features:
['total_yards' 'pass_attempt' 'rush_attempt' 'posteam_ARI' 'posteam_ATL'
 'posteam_BAL' 'posteam_BUF' 'posteam_CAR' 'posteam_CHI' 'posteam_CIN'
 'posteam_CLE' 'posteam_DAL' 'posteam_DEN' 'posteam_DET' 'posteam_GB'
 'posteam_HOU' 'posteam_IND' 'posteam_JAX' 'posteam_KC' 'posteam_LA'
 'posteam_LAC' 'posteam_LV' 'posteam_MIA' 'posteam_MIN' 'posteam_NE'
 'posteam_NO' 'posteam_NYG' 'p

