In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib


In [11]:
#Load data, replace this with API call for "live" data.
df = pd.read_csv('../pbp_data.csv')

#Filter for relevant columns
cols_to_keep = [
    'posteam', 'defteam', 'play_type', 'yards_gained', 'pass_attempt', 
    'rush_attempt', 'touchdown', 'pass_touchdown', 'rush_touchdown', 
    'game_seconds_remaining', 'yardline_100', 'down', 'ydstogo', 
    'score_differential', 'air_yards', 'yards_after_catch', 
    'passer_player_name', 'rusher_player_name', 'receiver_player_name',
    'game_id', 'shotgun', 'qb_scramble', 'play_type', 'pass_length', 'pass_location', 'complete_pass'
]
df = df[cols_to_keep]

#Drop rows with missing player names, we only want plays where there was a passer, rusher, or receiver.
df.dropna(subset=['passer_player_name', 'rusher_player_name', 'receiver_player_name'], how='all', inplace=True)

#Fill NaNs in numerical columns with 0
df.fillna(0, inplace=True)

print("Initial data shape:", df.shape)


  df = pd.read_csv('../pbp_data.csv')


Initial data shape: (72066, 26)


In [12]:
#Calculate total defensive stats for each team from the play-by-play data.
team_defense_stats = df.groupby('defteam').agg({
    'yards_gained': lambda x: x[df['pass_attempt'] == 1].mean(),
    'pass_attempt': 'count',
    'pass_touchdown': 'sum',
    'yards_after_catch': 'mean',
    'air_yards': 'mean',
    'complete_pass': lambda x: x[df['pass_attempt'] == 1].mean()
}).reset_index()

# Calculate per game stats
games_per_team = df.groupby('defteam')['game_id'].nunique()
team_defense_stats['games'] = team_defense_stats['defteam'].map(games_per_team)

#something is fucky here
team_defense_stats = team_defense_stats.assign(
    pass_yards_allowed_per_game=team_defense_stats['yards_gained'] / team_defense_stats['games'],
    pass_td_allowed_per_game=team_defense_stats['pass_touchdown'] / team_defense_stats['games'],
    completion_pct_allowed=team_defense_stats['complete_pass'] / team_defense_stats['pass_attempt'] * 100
)

#Create game-by-game dataset with only the columns we have
game_data = df[df['pass_attempt'] == 1].groupby(
    ['passer_player_name', 'game_id', 'posteam', 'defteam']
).agg({
    'yards_gained': 'sum',
    'pass_attempt': 'count',
    'pass_touchdown': 'sum',
    'air_yards': 'mean',
    'yards_after_catch': 'mean',
    'shotgun': 'mean',
    'qb_scramble': 'sum'
}).reset_index()

#Merge in defensive stats, so every game will have these columns of the team defense overall, there is definitely a better way to do this
game_data = game_data.merge(
    team_defense_stats[['defteam', 'pass_yards_allowed_per_game', 
                       'pass_td_allowed_per_game', 'completion_pct_allowed']],
    on='defteam',
    how='left'
)

#Create features for each game, passer_player is just a name, game_id an identifier, and we want to predict yards_gained
X = game_data.drop(['passer_player_name', 'game_id', 'yards_gained'], axis=1)
#this creates binary columns for each team (posteam_KC 1 if team is KC, 0 otherwise)
X = pd.get_dummies(X, columns=['posteam', 'defteam'])

# Target variable is passing yards for each game
y = game_data['yards_gained']




In [13]:
#train
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)

#save and run some analysis
joblib.dump(model, 'nfl_player_stat_predictor.joblib')
print("\nMost important features:")
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
})
print(feature_importance.sort_values('importance', ascending=False).head(10))



Most important features:
                        feature  importance
0                  pass_attempt    0.720949
3             yards_after_catch    0.114145
2                     air_yards    0.050372
1                pass_touchdown    0.045195
4                       shotgun    0.012392
6   pass_yards_allowed_per_game    0.009973
7      pass_td_allowed_per_game    0.008356
8        completion_pct_allowed    0.008259
28                  posteam_MIA    0.001315
19                  posteam_DET    0.000856


What this code does is take each of those 40 games and asks: 
"What if this exact game pattern happened against the new opponent?"

For example:
Game 1: Mahomes threw 35 times, averaged 8 air yards per attempt, more cols
Game 2: Mahomes threw 42 times, averaged 6 air yards per attempt, more cols

The model predicts for each of these patterns: "If Mahomes plays with this pattern against the new opponent, he would throw for X yards."

In [14]:
def predict_player_stats(player_name, team_name, opponent_name, model, game_data, team_defense_stats):
    """
    Predicts NFL QB passing yards using individual game data.
    
    Args:
        player_name (str): Name of the player (e.g., 'P.Mahomes')
        team_name (str): Team code (e.g., 'KC')
        opponent_name (str): Opponent team code (e.g., 'LV')
        model: Trained machine learning model
        game_data (DataFrame): DataFrame containing individual game statistics
        team_defense_stats (DataFrame): DataFrame containing team defensive statistics
    
    Returns:
        float: Predicted passing yards for the QB
    """
    # Convert player_name to string
    player_name = str(player_name)
    
    # Get all player's games for analysis
    player_games = game_data[game_data['passer_player_name'] == player_name].sort_values('game_id')
    
    if len(player_games) == 0:
        print(f"\nPlayer '{player_name}' not found in game data.")
        return 0
    
    # Check if opponent exists in defensive stats
    if opponent_name not in team_defense_stats['defteam'].values:
        print(f"\nOpponent '{opponent_name}' not found in defensive stats.")
        return 0
    
    # Get defensive stats for the opponent
    defense_stats = team_defense_stats[team_defense_stats['defteam'] == opponent_name].iloc[0]
    
    # Create a DataFrame for each game with the new opponent
    prediction_games = player_games.copy()
    prediction_games['defteam'] = opponent_name
    prediction_games['pass_yards_allowed_per_game'] = defense_stats['pass_yards_allowed_per_game']
    prediction_games['pass_td_allowed_per_game'] = defense_stats['pass_td_allowed_per_game']
    prediction_games['completion_pct_allowed'] = defense_stats['completion_pct_allowed']
    prediction_games['posteam'] = team_name
    
    # Drop non-feature columns
    X_pred = prediction_games.drop(['passer_player_name', 'game_id', 'yards_gained'], axis=1)
    
    # Create dummy variables
    X_pred = pd.get_dummies(X_pred, columns=['posteam', 'defteam'])
    
    # Ensure all model features are present
    for feature in model.feature_names_in_:
        if feature not in X_pred.columns:
            X_pred[feature] = 0
            
    # Reorder columns to match model's expected feature order
    X_pred = X_pred[model.feature_names_in_]
    
    # Make predictions for all games
    predictions = model.predict(X_pred)
    
    # Print summary statistics
    print(f"\nPrediction Analysis for {player_name} vs {opponent_name}:")
    print(f"Based on {len(predictions)} historical game patterns")
    print(f"Mean Prediction: {predictions.mean():.1f} yards")
    print(f"Median Prediction: {np.median(predictions):.1f} yards")
    print(f"Std Dev: {predictions.std():.1f} yards")
    print(f"Range: {predictions.min():.1f} - {predictions.max():.1f} yards")
    
    # Return the mean prediction
    return max(0, predictions.mean())

In [16]:
# the fantastic user interface
if __name__ == "__main__":
    # Load the model
    model = joblib.load('nfl_player_stat_predictor.joblib')
    
    # Show model's features
    print("\nModel's expected features:")
    print(model.feature_names_in_)
    
    # Get user input
    player_name = input("Enter QB name: ")
    team_name = input("Enter team: ")
    opponent_name = input("Enter opponent team: ")

    prediction = predict_player_stats(player_name, team_name, opponent_name, model, game_data, team_defense_stats)

    print(f"\nFinal Prediction for {player_name} vs {opponent_name}: {prediction:.1f} passing yards")


Model's expected features:
['pass_attempt' 'pass_touchdown' 'air_yards' 'yards_after_catch' 'shotgun'
 'qb_scramble' 'pass_yards_allowed_per_game' 'pass_td_allowed_per_game'
 'completion_pct_allowed' 'posteam_ARI' 'posteam_ATL' 'posteam_BAL'
 'posteam_BUF' 'posteam_CAR' 'posteam_CHI' 'posteam_CIN' 'posteam_CLE'
 'posteam_DAL' 'posteam_DEN' 'posteam_DET' 'posteam_GB' 'posteam_HOU'
 'posteam_IND' 'posteam_JAX' 'posteam_KC' 'posteam_LA' 'posteam_LAC'
 'posteam_LV' 'posteam_MIA' 'posteam_MIN' 'posteam_NE' 'posteam_NO'
 'posteam_NYG' 'posteam_NYJ' 'posteam_PHI' 'posteam_PIT' 'posteam_SEA'
 'posteam_SF' 'posteam_TB' 'posteam_TEN' 'posteam_WAS' 'defteam_ARI'
 'defteam_ATL' 'defteam_BAL' 'defteam_BUF' 'defteam_CAR' 'defteam_CHI'
 'defteam_CIN' 'defteam_CLE' 'defteam_DAL' 'defteam_DEN' 'defteam_DET'
 'defteam_GB' 'defteam_HOU' 'defteam_IND' 'defteam_JAX' 'defteam_KC'
 'defteam_LA' 'defteam_LAC' 'defteam_LV' 'defteam_MIA' 'defteam_MIN'
 'defteam_NE' 'defteam_NO' 'defteam_NYG' 'defteam_NYJ' 'def