In [56]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib


In [57]:
# Load data
df = pd.read_csv('../pbp_data.csv')

# Filter for relevant columns
cols_to_keep = [
    'posteam', 'defteam', 'play_type', 'yards_gained', 'pass_attempt', 
    'rush_attempt', 'touchdown', 'pass_touchdown', 'rush_touchdown', 
    'game_seconds_remaining', 'yardline_100', 'down', 'ydstogo', 
    'score_differential', 'air_yards', 'yards_after_catch', 
    'passer_player_name', 'rusher_player_name', 'receiver_player_name',
    'game_id', 'shotgun', 'qb_scramble', 'play_type', 'pass_length', 'pass_location', 'complete_pass'
]
df = df[cols_to_keep]

# Drop rows with missing player names
df.dropna(subset=['passer_player_name', 'rusher_player_name', 'receiver_player_name'], how='all', inplace=True)

# Fill NaNs in numerical columns with 0
df.fillna(0, inplace=True)

print("Initial data shape:", df.shape)
print("\nSample of yards_gained:", df['yards_gained'].head())
print("\nSample of player names:", df['passer_player_name'].head())


  df = pd.read_csv('../pbp_data.csv')


Initial data shape: (72066, 26)

Sample of yards_gained: 2    19.0
3     0.0
4     5.0
5     0.0
7     4.0
Name: yards_gained, dtype: float64

Sample of player names: 2            0
3     J.Flacco
4            0
5     J.Flacco
7    L.Jackson
Name: passer_player_name, dtype: object


In [58]:
# First create game-level statistics with passing-specific metrics
player_game_stats = df.groupby(['passer_player_name', 'game_id', 'posteam', 'defteam']).agg({
    'yards_gained': lambda x: x[df['pass_attempt'] == 1].sum(),  # Only passing yards
    'pass_attempt': 'sum',
    'pass_touchdown': 'sum',  # Only passing TDs
    'air_yards': 'mean',      # Average air yards per attempt
    'yards_after_catch': 'mean',  # Added YAC
    'shotgun': 'mean',        # % of plays in shotgun
    'qb_scramble': 'sum'      # Number of scrambles
}).reset_index()

# Add categorical distributions for passing plays only
pass_plays = df[df['pass_attempt'] == 1]  # Filter for passing plays only
pass_length_dist = pass_plays.groupby(['passer_player_name', 'game_id'])['pass_length'].value_counts(normalize=True).unstack(fill_value=0)
pass_location_dist = pass_plays.groupby(['passer_player_name', 'game_id'])['pass_location'].value_counts(normalize=True).unstack(fill_value=0)

# Merge these distributions back to player_game_stats
player_game_stats = player_game_stats.merge(pass_length_dist, on=['passer_player_name', 'game_id'])
player_game_stats = player_game_stats.merge(pass_location_dist, on=['passer_player_name', 'game_id'])

# Rename yards_gained to passing_yards for clarity
player_game_stats = player_game_stats.rename(columns={'yards_gained': 'passing_yards'})

# Print feature information
print("Features included in the model:")
print(features.columns.tolist())
print("\nShape of feature matrix:", features.shape)

Features included in the model:
['pass_attempt', 'pass_touchdown', 'air_yards', 'yards_after_catch', 'shotgun', 'qb_scramble', '0_x', 'deep', 'short', '0_y', 'left', 'middle', 'right', 'pass_yards_allowed_per_game', 'pass_td_allowed_per_game', 'completion_pct_allowed', 'posteam_ARI', 'posteam_ATL', 'posteam_BAL', 'posteam_BUF', 'posteam_CAR', 'posteam_CHI', 'posteam_CIN', 'posteam_CLE', 'posteam_DAL', 'posteam_DEN', 'posteam_DET', 'posteam_GB', 'posteam_HOU', 'posteam_IND', 'posteam_JAX', 'posteam_KC', 'posteam_LA', 'posteam_LAC', 'posteam_LV', 'posteam_MIA', 'posteam_MIN', 'posteam_NE', 'posteam_NO', 'posteam_NYG', 'posteam_NYJ', 'posteam_PHI', 'posteam_PIT', 'posteam_SEA', 'posteam_SF', 'posteam_TB', 'posteam_TEN', 'posteam_WAS', 'defteam_ARI', 'defteam_ATL', 'defteam_BAL', 'defteam_BUF', 'defteam_CAR', 'defteam_CHI', 'defteam_CIN', 'defteam_CLE', 'defteam_DAL', 'defteam_DEN', 'defteam_DET', 'defteam_GB', 'defteam_HOU', 'defteam_IND', 'defteam_JAX', 'defteam_KC', 'defteam_LA', 'defte

In [59]:
# Create historical stats for prediction
player_historical_stats = player_game_stats.groupby('passer_player_name').agg({
    'passing_yards': ['mean', 'std', 'count'],
    'pass_attempt': 'mean',
    'pass_touchdown': 'mean',
    'air_yards': 'mean',
    'yards_after_catch': 'mean',
    'shotgun': 'mean',
    'qb_scramble': 'mean'
}).reset_index()

# Flatten multi-level columns
player_historical_stats.columns = [
    'player_name',
    'avg_passing_yards', 'std_passing_yards', 'num_games',
    'avg_pass_att', 'avg_pass_td', 'avg_air_yards',
    'avg_yac', 'avg_shotgun', 'avg_qb_scramble'
]

# Calculate passing play distributions
pass_length_means = (pass_plays.groupby('passer_player_name')['pass_length']
                     .value_counts(normalize=True)
                     .unstack(fill_value=0)
                     .reset_index())

pass_location_means = (pass_plays.groupby('passer_player_name')['pass_location']
                       .value_counts(normalize=True)
                       .unstack(fill_value=0)
                       .reset_index())

# Merge the distribution averages
player_historical_stats = (player_historical_stats
    .merge(pass_length_means, left_on='player_name', right_on='passer_player_name', how='left')
    .merge(pass_location_means, left_on='player_name', right_on='passer_player_name', how='left')
)

# Drop duplicate passer_player_name columns and fill NaNs
player_historical_stats = player_historical_stats.loc[:, ~player_historical_stats.columns.duplicated()]
player_historical_stats = player_historical_stats.fillna(0)

print("Final columns in historical stats:")
print(player_historical_stats.columns.tolist())

Final columns in historical stats:
['player_name', 'avg_passing_yards', 'std_passing_yards', 'num_games', 'avg_pass_att', 'avg_pass_td', 'avg_air_yards', 'avg_yac', 'avg_shotgun', 'avg_qb_scramble', 'passer_player_name_x', '0_x', 'deep', 'short', 'passer_player_name_y', '0_y', 'left', 'middle', 'right']


In [60]:
# First calculate defensive stats for each team
team_defense_stats = df.groupby('defteam').agg({
    'yards_gained': lambda x: x[df['pass_attempt'] == 1].mean(),  # Avg passing yards allowed per attempt
    'pass_attempt': 'count',  # Number of passes faced
    'pass_touchdown': 'sum',  # Passing TDs allowed
    'yards_after_catch': 'mean',  # YAC allowed
    'air_yards': 'mean',  # Air yards allowed
    'complete_pass': lambda x: x[df['pass_attempt'] == 1].mean()  # Completion percentage allowed (1 or 0)
}).reset_index()

# Calculate per game stats
games_per_team = df.groupby('defteam')['game_id'].nunique()
team_defense_stats['games'] = team_defense_stats['defteam'].map(games_per_team)

# Calculate derived defensive metrics
team_defense_stats = team_defense_stats.assign(
    pass_yards_allowed_per_game=team_defense_stats['yards_gained'] * team_defense_stats['pass_attempt'] / team_defense_stats['games'],
    pass_td_allowed_per_game=team_defense_stats['pass_touchdown'] / team_defense_stats['games'],
    completion_pct_allowed=team_defense_stats['complete_pass'] * 100  # Convert to percentage
)

# Now merge these defensive stats back into player_game_stats
player_game_stats = player_game_stats.merge(
    team_defense_stats[['defteam', 'pass_yards_allowed_per_game', 'pass_td_allowed_per_game', 
                       'completion_pct_allowed']],
    on='defteam',
    how='left'
)

In [61]:
# These defensive features will now be available for the model training
features = player_game_stats.drop(['passer_player_name', 'game_id', 'passing_yards'], axis=1)

# Create dummy variables for teams
features = pd.get_dummies(features, columns=['posteam', 'defteam'])

# Define target variable
target = player_game_stats['passing_yards']

In [62]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)


In [63]:
# Initialize and train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

# Save model
joblib.dump(model, 'nfl_player_stat_predictor.joblib')
print("\nMost important features:")
feature_importance = pd.DataFrame({
    'feature': features.columns,
    'importance': model.feature_importances_
})
print(feature_importance.sort_values('importance', ascending=False).head(10))


Mean Squared Error: 1244.169719787986

Most important features:
                        feature  importance
0                  pass_attempt    0.725871
3             yards_after_catch    0.092307
1                pass_touchdown    0.040247
2                     air_yards    0.029117
6                           0_x    0.018289
9                           0_y    0.016507
13  pass_yards_allowed_per_game    0.008533
4                       shotgun    0.008154
7                          deep    0.007719
11                       middle    0.007717


In [64]:
def predict_player_stats(player_name, team_name, opponent_name, model, player_historical_stats, team_defense_stats):
    """
    Predicts NFL QB passing yards based on historical data, team matchups, and defensive stats.
    
    Args:
        player_name (str): Name of the player (e.g., 'P.Mahomes')
        team_name (str): Team code (e.g., 'KC')
        opponent_name (str): Opponent team code (e.g., 'LV')
        model: Trained machine learning model
        player_historical_stats (DataFrame): Historical player statistics
        team_defense_stats (DataFrame): Team defensive statistics
    
    Returns:
        float: Predicted passing yards for the QB
    """
    # Convert player_name to string and ensure player_historical_stats names are strings
    player_name = str(player_name)
    player_historical_stats['player_name'] = player_historical_stats['player_name'].astype(str)
    
    # Check if player exists in historical stats
    if player_name not in player_historical_stats['player_name'].values:
        print(f"\nPlayer '{player_name}' not found. Available similar players:")
        similar_players = [str(p) for p in player_historical_stats['player_name'].unique() 
                         if player_name.lower() in str(p).lower()]
        print(similar_players[:5])
        return 0
    
    # Check if opponent exists in defensive stats
    if opponent_name not in team_defense_stats['defteam'].values:
        print(f"\nOpponent '{opponent_name}' not found in defensive stats.")
        return 0
    
    # Get historical stats for the player
    player_stats = player_historical_stats[player_historical_stats['player_name'] == player_name].iloc[0]
    
    # Get defensive stats for the opponent
    defense_stats = team_defense_stats[team_defense_stats['defteam'] == opponent_name].iloc[0]
    
    # Check if player has enough historical data
    if player_stats['num_games'] < 5:
        print(f"\nNot enough historical data for player '{player_name}'")
        return 0
    
    # Create feature vector matching training data
    input_features = np.zeros(len(model.feature_names_in_))
    feature_dict = {name: i for i, name in enumerate(model.feature_names_in_)}
    
    # Set team indicators
    team_feature = f'posteam_{team_name}'
    opp_feature = f'defteam_{opponent_name}'
    
    # Validate team names
    if team_feature not in feature_dict:
        print(f"Warning: Team {team_name} not found in training data")
        return 0
    
    if opp_feature not in feature_dict:
        print(f"Warning: Opponent {opponent_name} not found in training data")
        return 0
    
    # Set team indicators
    input_features[feature_dict[team_feature]] = 1
    input_features[feature_dict[opp_feature]] = 1
    
    # Set basic stats using historical averages
    basic_stats = {
        'pass_attempt': 'avg_pass_att',
        'pass_touchdown': 'avg_pass_td',
        'air_yards': 'avg_air_yards',
        'yards_after_catch': 'avg_yac',
        'shotgun': 'avg_shotgun',
        'qb_scramble': 'avg_qb_scramble'
    }
    
    for model_feat, stat_col in basic_stats.items():
        if model_feat in feature_dict:
            input_features[feature_dict[model_feat]] = max(0, player_stats[stat_col])
    
    # Set categorical distributions (pass length and pass location)
    for col in model.feature_names_in_:
        if col.startswith(('pass_length_', 'pass_location_')):
            if col in feature_dict and col in player_stats:
                input_features[feature_dict[col]] = player_stats[col]
    
    # Print player historical analysis
    print(f"\nPlayer Analysis for {player_name}:")
    print(f"Average Passing Yards: {player_stats['avg_passing_yards']:.1f}")
    print(f"Games Played: {player_stats['num_games']}")
    print(f"Avg Pass Attempts: {player_stats['avg_pass_att']:.1f}")
    print(f"Avg Pass TDs: {player_stats['avg_pass_td']:.1f}")
    print(f"Avg Air Yards: {player_stats['avg_air_yards']:.1f}")
    print(f"Avg YAC: {player_stats['avg_yac']:.1f}")
    print(f"Shotgun %: {player_stats['avg_shotgun']*100:.1f}%")
    print(f"Avg Scrambles: {player_stats['avg_qb_scramble']:.1f}")
    
    # Print defensive analysis
    print(f"\nDefensive Analysis for {opponent_name}:")
    print(f"Pass Yards Allowed/Game: {defense_stats['pass_yards_allowed_per_game']:.1f}")
    print(f"Pass TD Allowed/Game: {defense_stats['pass_td_allowed_per_game']:.2f}")
    print(f"Completion % Allowed: {defense_stats['completion_pct_allowed']:.1f}%")
    
    print("\nKey features being used:")
    for fname, idx in feature_dict.items():
        if input_features[idx] > 0:
            print(f"{fname}: {input_features[idx]:.3f}")
    
    # Make base prediction
    base_prediction = model.predict(input_features.reshape(1, -1))[0]
    
    # Calculate defensive adjustment factor
    defense_factor = defense_stats['pass_yards_allowed_per_game'] / team_defense_stats['pass_yards_allowed_per_game'].mean()
    
    # Adjust prediction based on defensive strength (with dampening)
    adjusted_prediction = base_prediction * (0.7 + 0.3 * defense_factor)
    
    print(f"\nPrediction Breakdown:")
    print(f"Base Prediction: {base_prediction:.1f} yards")
    print(f"Defensive Adjustment Factor: {defense_factor:.2f}")
    print(f"Adjusted Prediction: {adjusted_prediction:.1f} yards")
    
    return max(0, adjusted_prediction)

In [67]:
# Update the main execution block
if __name__ == "__main__":
    # Load the model
    model = joblib.load('nfl_player_stat_predictor.joblib')
    
    # Print some helpful information
    print("\nExample format for names:")
    print("Player name: P.Mahomes")
    print("Team name:", df['posteam'].unique()[0], "(use exactly as shown)")
    print("Opponent name:", df['defteam'].unique()[0], "(use exactly as shown)")
    print("\n")
    
    # Show available players and teams
    print("Sample of available players:")
    print(player_historical_stats['player_name'].head(10))
    print("\nSample of available teams:")
    print(df['posteam'].unique())
    
    # Show model's features
    print("\nModel's expected features:")
    print(model.feature_names_in_)
    
    # Get user input
    player_name = input("Enter player name: ")
    team_name = input("Enter player's team: ")
    opponent_name = input("Enter opponent team: ")
    
    # Make prediction
    prediction = predict_player_stats(player_name, team_name, opponent_name, model, player_historical_stats, team_defense_stats)
    
    print(f"\nPredicted stats for {player_name} of {team_name} against {opponent_name}: {prediction:.1f} total passing yards")


Example format for names:
Player name: P.Mahomes
Team name: NYJ (use exactly as shown)
Opponent name: BAL (use exactly as shown)


Sample of available players:
0         A.Brown
1        A.Cooper
2        A.Dalton
3           A.Lee
4      A.McCarron
5     A.O'Connell
6    A.Richardson
7       A.Rodgers
8     A.St. Brown
9         B.Allen
Name: player_name, dtype: object

Sample of available teams:
['NYJ' 'BAL' 'BUF' 'LA' 'CAR' 'CLE' 'SEA' 'DEN' 'MIN' 'GB' 'IND' 'HOU'
 'JAX' 'WAS' 'KC' 'ARI' 'LAC' 'LV' 'NE' 'MIA' 'ATL' 'NO' 'NYG' 'TEN' 'DET'
 'PHI' 'PIT' 'CIN' 'CHI' 'SF' 'DAL' 'TB']

Model's expected features:
['pass_attempt' 'pass_touchdown' 'air_yards' 'yards_after_catch' 'shotgun'
 'qb_scramble' '0_x' 'deep' 'short' '0_y' 'left' 'middle' 'right'
 'pass_yards_allowed_per_game' 'pass_td_allowed_per_game'
 'completion_pct_allowed' 'posteam_ARI' 'posteam_ATL' 'posteam_BAL'
 'posteam_BUF' 'posteam_CAR' 'posteam_CHI' 'posteam_CIN' 'posteam_CLE'
 'posteam_DAL' 'posteam_DEN' 'posteam_DET' '

