In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib


In [2]:
#Load data, replace this with API call for "live" data.
df = pd.read_csv('../pbp_data.csv')

#Filter for relevant columns
cols_to_keep = [
    'posteam', 'defteam', 'play_type', 'yards_gained', 'pass_attempt', 
    'rush_attempt', 'touchdown', 'pass_touchdown', 'rush_touchdown', 
    'game_seconds_remaining', 'yardline_100', 'down', 'ydstogo', 
    'score_differential', 'air_yards', 'yards_after_catch', 
    'passer_player_name', 'rusher_player_name', 'receiver_player_name',
    'game_id', 'shotgun', 'qb_scramble', 'play_type', 'pass_length', 'pass_location', 'complete_pass'
]
df = df[cols_to_keep]

#Drop rows with missing player names, we only want plays where there was a passer, rusher, or receiver.
df.dropna(subset=['passer_player_name', 'rusher_player_name', 'receiver_player_name'], how='all', inplace=True)

#Fill NaNs in numerical columns with 0
df.fillna(0, inplace=True)

print("Initial data shape:", df.shape)


  df = pd.read_csv('../pbp_data.csv')


Initial data shape: (72066, 26)

Sample of yards_gained: 2    19.0
3     0.0
4     5.0
5     0.0
7     4.0
Name: yards_gained, dtype: float64

Sample of player names: 2            0
3     J.Flacco
4            0
5     J.Flacco
7    L.Jackson
Name: passer_player_name, dtype: object


In [3]:
#Calculate total defensive stats for each team from the play-by-play data.
team_defense_stats = df.groupby('defteam').agg({
    'yards_gained': lambda x: x[df['pass_attempt'] == 1].mean(),
    'pass_attempt': 'count',
    'pass_touchdown': 'sum',
    'yards_after_catch': 'mean',
    'air_yards': 'mean',
    'complete_pass': lambda x: x[df['pass_attempt'] == 1].mean()
}).reset_index()

# Calculate per game stats
games_per_team = df.groupby('defteam')['game_id'].nunique()
team_defense_stats['games'] = team_defense_stats['defteam'].map(games_per_team)

#something is fucky here
team_defense_stats = team_defense_stats.assign(
    pass_yards_allowed_per_game=team_defense_stats['yards_gained'] / team_defense_stats['games'],
    pass_td_allowed_per_game=team_defense_stats['pass_touchdown'] / team_defense_stats['games'],
    completion_pct_allowed=team_defense_stats['complete_pass'] / team_defense_stats['pass_attempt'] * 100
)

#Create game-by-game dataset with only the columns we have
game_data = df[df['pass_attempt'] == 1].groupby(
    ['passer_player_name', 'game_id', 'posteam', 'defteam']
).agg({
    'yards_gained': 'sum',
    'pass_attempt': 'count',
    'pass_touchdown': 'sum',
    'air_yards': 'mean',
    'yards_after_catch': 'mean',
    'shotgun': 'mean',
    'qb_scramble': 'sum'
}).reset_index()

#Merge in defensive stats, so every game will have these columns of the team defense overall, there is definitely a better way to do this
game_data = game_data.merge(
    team_defense_stats[['defteam', 'pass_yards_allowed_per_game', 
                       'pass_td_allowed_per_game', 'completion_pct_allowed']],
    on='defteam',
    how='left'
)

#Create features for each game, passer_player is just a name, game_id an identifier, and we want to predict yards_gained
X = game_data.drop(['passer_player_name', 'game_id', 'yards_gained'], axis=1)
#this creates binary columns for each team (posteam_KC 1 if team is KC, 0 otherwise)
X = pd.get_dummies(X, columns=['posteam', 'defteam'])

# Target variable is passing yards for each game
y = game_data['yards_gained']




In [4]:
#train
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)

#save and run some analysis
joblib.dump(model, 'nfl_player_stat_predictor.joblib')
print("\nMost important features:")
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
})
print(feature_importance.sort_values('importance', ascending=False).head(10))



Most important features:
                        feature  importance
0                  pass_attempt    0.720738
3             yards_after_catch    0.113899
2                     air_yards    0.049265
1                pass_touchdown    0.045284
6   pass_yards_allowed_per_game    0.012632
4                       shotgun    0.012341
8        completion_pct_allowed    0.009001
7      pass_td_allowed_per_game    0.007946
28                  posteam_MIA    0.001341
19                  posteam_DET    0.000847


In [5]:
def predict_player_stats(player_name, team_name, opponent_name, model, game_data, team_defense_stats):
    """
    Predicts NFL QB passing yards using individual game data and recent trends.
    
    Args:
        player_name (str): Name of the QB player, First initial<.><Last name> (e.g., 'P.Mahomes')
        team_name (str): Team code (e.g., 'KC')
        opponent_name (str): Opponent team code (e.g., 'LV')
        model: Trained machine learning model
        game_data (DataFrame): DataFrame containing individual game statistics
        team_defense_stats (DataFrame): DataFrame containing team defensive statistics
    
    Returns:
        float: Predicted passing yards for the QB
    """
    # Convert player_name to string
    player_name = str(player_name)
    
    # Get player's recent games for analysis
    player_games = game_data[game_data['passer_player_name'] == player_name].sort_values('game_id')
    
    if len(player_games) == 0:
        print(f"\nPlayer '{player_name}' not found in game data.")
        return 0
    
    # Check if opponent exists in defensive stats
    if opponent_name not in team_defense_stats['defteam'].values:
        print(f"\nOpponent '{opponent_name}' not found in defensive stats.")
        return 0
    
    # Get last 5 games for trend analysis
    recent_games = player_games.tail(5)
    
    print(f"\nRecent Games Analysis for {player_name}:")
    print("\nLast 5 Games:")
    for _, game in recent_games.iterrows():
        print(f"vs {game['defteam']}: {game['yards_gained']:.0f} yards, "
              f"{game['pass_attempt']} attempts, {game['pass_touchdown']} TDs")
    
    print(f"\nRecent Averages (Last 5 Games):")
    print(f"Passing Yards: {recent_games['yards_gained'].mean():.1f}")
    print(f"Pass Attempts: {recent_games['pass_attempt'].mean():.1f}")
    print(f"Pass TDs: {recent_games['pass_touchdown'].mean():.1f}")
    print(f"Air Yards/Attempt: {recent_games['air_yards'].mean():.1f}")
    print(f"YAC/Attempt: {recent_games['yards_after_catch'].mean():.1f}")
    
    print(f"\nSeason Stats ({len(player_games)} games):")
    print(f"Avg Passing Yards: {player_games['yards_gained'].mean():.1f}")
    print(f"Std Dev: {player_games['yards_gained'].std():.1f}")
    print(f"Min Yards: {player_games['yards_gained'].min():.1f}")
    print(f"Max Yards: {player_games['yards_gained'].max():.1f}")
    
    # Get defensive stats for the opponent
    defense_stats = team_defense_stats[team_defense_stats['defteam'] == opponent_name].iloc[0]
    
    print(f"\nDefensive Analysis for {opponent_name}:")
    print(f"Pass Yards Allowed/Game: {defense_stats['pass_yards_allowed_per_game']:.1f}")
    print(f"Pass TD Allowed/Game: {defense_stats['pass_td_allowed_per_game']:.2f}")
    print(f"Completion % Allowed: {defense_stats['completion_pct_allowed']:.1f}%")
    
    # Create prediction features using most recent game's style metrics
    latest_game = player_games.iloc[-1]
    input_features = pd.DataFrame({
        'pass_attempt': [latest_game['pass_attempt']],
        'pass_touchdown': [latest_game['pass_touchdown']],
        'air_yards': [latest_game['air_yards']],
        'yards_after_catch': [latest_game['yards_after_catch']],
        'shotgun': [latest_game['shotgun']],
        'qb_scramble': [latest_game['qb_scramble']],
        'pass_yards_allowed_per_game': [defense_stats['pass_yards_allowed_per_game']],
        'pass_td_allowed_per_game': [defense_stats['pass_td_allowed_per_game']],
        'completion_pct_allowed': [defense_stats['completion_pct_allowed']]
    })
    
    # Add team indicators
    for col in model.feature_names_in_:
        if col.startswith('posteam_'):
            input_features[col] = 1 if col == f'posteam_{team_name}' else 0
        elif col.startswith('defteam_'):
            input_features[col] = 1 if col == f'defteam_{opponent_name}' else 0
    
    # Ensure all model features are present
    for feature in model.feature_names_in_:
        if feature not in input_features.columns:
            input_features[feature] = 0
    
    # Reorder columns to match model's expected feature order
    input_features = input_features[model.feature_names_in_]
    
    # Make prediction
    prediction = model.predict(input_features)[0]
    
    print(f"\nPrediction Components:")
    print(f"Recent Form: {recent_games['yards_gained'].mean():.1f} yards/game")
    print(f"Model Prediction: {prediction:.1f} yards")
    
    return max(0, prediction)

In [11]:
# Update the main execution block
if __name__ == "__main__":
    # Load the model
    model = joblib.load('nfl_player_stat_predictor.joblib')
    
    # Show model's features
    print("\nModel's expected features:")
    print(model.feature_names_in_)
    
    # Get user input
    player_name = input("Enter QB name: ")
    team_name = input("Enter team: ")
    opponent_name = input("Enter opponent team: ")

    prediction = predict_player_stats(player_name, team_name, opponent_name, model, game_data, team_defense_stats)

    print(f"\nFinal Prediction for {player_name} vs {opponent_name}: {prediction:.1f} passing yards")


Example format for names:
Player name: P.Mahomes
Team name: NYJ (use exactly as shown)
Opponent name: BAL (use exactly as shown)



Model's expected features:
['pass_attempt' 'pass_touchdown' 'air_yards' 'yards_after_catch' 'shotgun'
 'qb_scramble' 'pass_yards_allowed_per_game' 'pass_td_allowed_per_game'
 'completion_pct_allowed' 'posteam_ARI' 'posteam_ATL' 'posteam_BAL'
 'posteam_BUF' 'posteam_CAR' 'posteam_CHI' 'posteam_CIN' 'posteam_CLE'
 'posteam_DAL' 'posteam_DEN' 'posteam_DET' 'posteam_GB' 'posteam_HOU'
 'posteam_IND' 'posteam_JAX' 'posteam_KC' 'posteam_LA' 'posteam_LAC'
 'posteam_LV' 'posteam_MIA' 'posteam_MIN' 'posteam_NE' 'posteam_NO'
 'posteam_NYG' 'posteam_NYJ' 'posteam_PHI' 'posteam_PIT' 'posteam_SEA'
 'posteam_SF' 'posteam_TB' 'posteam_TEN' 'posteam_WAS' 'defteam_ARI'
 'defteam_ATL' 'defteam_BAL' 'defteam_BUF' 'defteam_CAR' 'defteam_CHI'
 'defteam_CIN' 'defteam_CLE' 'defteam_DAL' 'defteam_DEN' 'defteam_DET'
 'defteam_GB' 'defteam_HOU' 'defteam_IND' 'defteam_JAX' 'defteam_