In [1]:
import polars as pl
import pandas as pd



In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load your dataframe
run_id = '20240809'
df = pd.read_parquet(f'./data/outputs/player_influence/fantasy_points_{run_id}.parquet')

# One-hot encode categorical features
df_encoded = pd.get_dummies(df, columns=['player_id', 'team', 'home_away', 'opponent', 'position', 'curr_team'])

# Define inputs and outputs
X = df_encoded[['season', 'week'] + [col for col in df_encoded.columns if col.startswith(('team_', 'home_away_', 'opponent_', 'position_', 'curr_team_'))]]
y = df_encoded[['passing_attempts', 'passing_touchdowns', 'passing_yards', 'rushing_attempts', 'rushing_yards', 'rushing_touchdowns', 'receptions', 'receiving_yards', 'receiving_touchdowns', 'fumbles', 'interceptions']]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 242.44965052039697


In [3]:
import pandas as pd
run_id = '20240809'
df = pd.read_parquet(f'./data/outputs/player_influence/fantasy_points_{run_id}.parquet')

# Filter data for training and testing
train_data = df[df['season'].isin([2020, 2021, 2022])]
test_data = df[df['season'] == 2023]

# Split into input features and target variables
X_train = train_data[['player_id', 'team', 'home_away', 'opponent', 'season', 'week']]
y_train = train_data[['passing_attempts', 'passing_touchdowns', 'passing_yards', 'rushing_attempts', 'rushing_yards', 'rushing_touchdowns', 'receptions', 'receiving_yards', 'receiving_touchdowns', 'fumbles', 'interceptions']]

X_test = test_data[['player_id', 'team', 'home_away', 'opponent', 'season', 'week']]
y_test = test_data[['passing_attempts', 'passing_touchdowns', 'passing_yards', 'rushing_attempts', 'rushing_yards', 'rushing_touchdowns', 'receptions', 'receiving_yards', 'receiving_touchdowns', 'fumbles', 'interceptions']]

# One-hot encode categorical features
X_train_encoded = pd.get_dummies(X_train, columns=['player_id', 'team', 'home_away', 'opponent'])
X_test_encoded = pd.get_dummies(X_test, columns=['player_id', 'team', 'home_away', 'opponent'])

# Ensure both train and test sets have the same feature columns
X_train_encoded, X_test_encoded = X_train_encoded.align(X_test_encoded, join='left', axis=1, fill_value=0)

from sklearn.ensemble import RandomForestRegressor

# Initialize and train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_encoded, y_train)

# Make predictions on the 2023 season
y_pred = model.predict(X_test_encoded)

# Evaluate the model
from sklearn.metrics import mean_squared_error, mean_absolute_error

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error on 2023 data: {mse}")
print(f"Mean Absolute Error on 2023 data: {mae}")

Mean Squared Error on 2023 data: 310.0727653463621
Mean Absolute Error on 2023 data: 4.204484425597186


In [6]:
import pandas as pd

# Define the column names for the target variables
target_columns = ['passing_attempts', 'passing_touchdowns', 'passing_yards', 
                  'rushing_attempts', 'rushing_yards', 'rushing_touchdowns', 
                  'receptions', 'receiving_yards', 'receiving_touchdowns', 
                  'fumbles', 'interceptions']

# Convert the predictions array to a DataFrame with column names
y_pred_df = pd.DataFrame(y_pred, columns=target_columns)

# Reset the index of y_test to align with y_pred_df
y_test_reset = y_test.reset_index(drop=True)

# Combine the actual and predicted DataFrames
comparison_df = pd.concat([X_test.reset_index(drop=True), y_test_reset, y_pred_df], axis=1)

# Add suffixes to differentiate between actual and predicted values
comparison_df.columns = list(X_test.columns) + [f"{col}_actual" for col in target_columns] + [f"{col}_predicted" for col in target_columns]

# Display the comparison DataFrame
comparison_df

Unnamed: 0,player_id,team,home_away,opponent,season,week,passing_attempts_actual,passing_touchdowns_actual,passing_yards_actual,rushing_attempts_actual,...,passing_touchdowns_predicted,passing_yards_predicted,rushing_attempts_predicted,rushing_yards_predicted,rushing_touchdowns_predicted,receptions_predicted,receiving_yards_predicted,receiving_touchdowns_predicted,fumbles_predicted,interceptions_predicted
0,00-0033943,DET,away,DET,2023,14,0.0,0.0,0.0,0.0,...,0.0,0.00,0.01,0.05,0.00,5.40,38.86,0.23,0.00,0.0
1,00-0031687,MIA,away,MIA,2023,4,0.0,0.0,0.0,7.0,...,0.0,0.00,14.05,64.55,0.09,2.88,12.08,0.00,0.00,0.0
2,00-0034184,BAL,home,BAL,2023,17,0.0,0.0,0.0,16.0,...,0.0,0.00,5.66,27.76,0.00,0.20,3.53,0.00,0.00,0.0
3,00-0033282,WAS,home,WAS,2023,18,0.0,0.0,0.0,1.0,...,0.0,0.00,1.02,-1.79,0.00,2.05,9.02,0.02,0.00,0.0
4,00-0033009,CIN,home,CIN,2023,6,0.0,0.0,0.0,0.0,...,0.0,1.84,0.02,0.29,0.00,7.45,75.97,0.28,0.01,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5702,00-0036139,DAL,home,DAL,2023,8,0.0,0.0,0.0,5.0,...,0.0,0.00,0.97,3.89,0.01,0.92,2.08,0.13,0.00,0.0
5703,00-0035261,DAL,home,DAL,2023,2,0.0,0.0,0.0,26.0,...,0.0,0.00,9.30,48.27,0.70,5.16,37.69,0.00,0.00,0.0
5704,00-0035535,NYG,away,NYG,2023,10,0.0,0.0,0.0,0.0,...,0.0,0.00,0.00,0.00,0.00,6.05,70.83,0.27,0.06,0.0
5705,00-0037744,ARI,away,ARI,2023,17,0.0,0.0,0.0,0.0,...,0.0,0.00,0.02,0.08,0.00,8.14,65.56,0.70,0.01,0.0


In [12]:
players = pd.read_parquet('./data/outputs/rosters/rosters_20240809.parquet')

In [13]:
comparison_df.merge(players[['full_name', 'player_id', 'position']].drop_duplicates(), on='player_id')

Unnamed: 0,player_id,team,home_away,opponent,season,week,passing_attempts_actual,passing_touchdowns_actual,passing_yards_actual,rushing_attempts_actual,...,rushing_attempts_predicted,rushing_yards_predicted,rushing_touchdowns_predicted,receptions_predicted,receiving_yards_predicted,receiving_touchdowns_predicted,fumbles_predicted,interceptions_predicted,full_name,position
0,00-0033943,DET,away,DET,2023,14,0.0,0.0,0.0,0.0,...,0.01,0.05,0.00,5.40,38.86,0.23,0.00,0.0,Josh Reynolds,WR
1,00-0031687,MIA,away,MIA,2023,4,0.0,0.0,0.0,7.0,...,14.05,64.55,0.09,2.88,12.08,0.00,0.00,0.0,Raheem Mostert,RB
2,00-0034184,BAL,home,BAL,2023,17,0.0,0.0,0.0,16.0,...,5.66,27.76,0.00,0.20,3.53,0.00,0.00,0.0,Gus Edwards,RB
3,00-0033282,WAS,home,WAS,2023,18,0.0,0.0,0.0,1.0,...,1.02,-1.79,0.00,2.05,9.02,0.02,0.00,0.0,Curtis Samuel,WR
4,00-0033009,CIN,home,CIN,2023,6,0.0,0.0,0.0,0.0,...,0.02,0.29,0.00,7.45,75.97,0.28,0.01,0.0,Tyler Boyd,WR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6100,00-0036139,DAL,home,DAL,2023,8,0.0,0.0,0.0,5.0,...,0.97,3.89,0.01,0.92,2.08,0.13,0.00,0.0,Rico Dowdle,RB
6101,00-0035261,DAL,home,DAL,2023,2,0.0,0.0,0.0,26.0,...,9.30,48.27,0.70,5.16,37.69,0.00,0.00,0.0,Tony Pollard,RB
6102,00-0035535,NYG,away,NYG,2023,10,0.0,0.0,0.0,0.0,...,0.00,0.00,0.00,6.05,70.83,0.27,0.06,0.0,Darius Slayton,WR
6103,00-0037744,ARI,away,ARI,2023,17,0.0,0.0,0.0,0.0,...,0.02,0.08,0.00,8.14,65.56,0.70,0.01,0.0,Trey McBride,TE
