In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Show every column when a wide frame is displayed (None removes the column cap).
pd.options.display.max_columns = None

In [71]:
# pd.read_parquet below needs a parquet engine; if pyarrow is missing, run: %pip install pyarrow

In [72]:
# Season-level player stats read from the nflverse parquet file; a later cell
# filters this frame by its `position` and `season_type` columns.
player_data_full = pd.read_parquet('../data/raw/nflverse/player_stats_season.parquet')
# RB contract table: one row per (player, year_signed) with the raw per-year
# salary (`apy`) and its median-, mean- and smooth-adjusted variants.
adj_salaries = pd.read_csv('../data/raw/salaries/RB_adjusted_salaries.csv')

In [63]:
# Spot check: every contract row recorded for a single player.
adj_salaries.loc[adj_salaries['player'] == 'Nick Chubb']

Unnamed: 0,player,year_signed,apy,med_adjusted_apy,mean_adjusted_apy,smooth_adjusted_apy
38,Nick Chubb,2021,12.2,28.284314,6.550837,18.147493
325,Nick Chubb,2018,1.845774,6.658977,0.481372,0.858696
363,Nick Chubb,2024,2.275,3.059399,0.732977,1.77989


In [74]:
## filtering for relevant rb metrics
rb_features = [
    'season', 'player_display_name', 'games',
    'carries', 'rushing_yards', 'rushing_tds',
    'rushing_fumbles', 'rushing_fumbles_lost', 'rushing_first_downs', 'rushing_epa',
    'receptions', 'targets', 'receiving_yards_after_catch',
    'receiving_first_downs', 'receiving_epa', 'target_share',
    'fantasy_points',
]
## rushing-only alternative feature set, kept for reference:
#rb_features = ['season', 'player_display_name', 'games', 'carries', 'rushing_yards', 'rushing_tds', 'rushing_fumbles', 'rushing_fumbles_lost', 'rushing_first_downs', 'rushing_epa', 'fantasy_points']

## running backs only, using the combined regular + postseason rows ('REG+POST').
## Rows and columns are selected in a single .loc call and the result is copied:
## salary columns are assigned onto `rb` in a later cell, and assigning onto a
## chained slice of player_data_full raises SettingWithCopyWarning.
rb = player_data_full.loc[
    (player_data_full['position'] == 'RB')
    & (player_data_full['season_type'] == 'REG+POST'),
    rb_features,
].copy()
rb.sample(2)

Unnamed: 0,season,player_display_name,games,carries,rushing_yards,rushing_tds,rushing_fumbles,rushing_fumbles_lost,rushing_first_downs,rushing_epa,receptions,targets,receiving_yards_after_catch,receiving_first_downs,receiving_epa,target_share,fantasy_points
11849,2008,Brian Westbrook,17,283,1055.0,9,1.0,0.0,54.0,-21.171722,61,61,487.0,23.0,30.258866,0.168975,243.6
7083,2004,Dominic Rhodes,13,58,266.0,2,1.0,1.0,17.0,-6.48187,3,3,0.0,1.0,-0.280214,0.045455,42.8


In [75]:
## removing rookie contracts from salary data
## A player's earliest `year_signed` in the table is treated as the rookie deal and
## dropped, so players with only one listed contract disappear entirely.
## NOTE(review): this assumes each player's rookie contract is present in the file;
## if it is not, the first veteran contract is removed instead. Confirm against the data.

## Guard makes the cell safe to re-run. Without it a second run merges `rookie_year`
## again (producing rookie_year_x / rookie_year_y and a KeyError on the filter) and
## recomputes the minimum on already-filtered rows, stripping one more contract per player.
if 'rookie_year' not in adj_salaries.columns:
    rookie_years = (
        adj_salaries.groupby('player')['year_signed']
        .min()
        .reset_index()
        .rename(columns={'year_signed': 'rookie_year'})
    )

    adj_salaries = adj_salaries.merge(rookie_years, on='player')
    adj_salaries = adj_salaries[adj_salaries['year_signed'] > adj_salaries['rookie_year']]

In [76]:
## Join each season to the first contract the player signed AFTER that season.
## e.g. a contract signed in 2018 is attached to that player's 2013-2017 seasons, a 2013 contract to 2008-2012, etc.

def next_salary(player_id, year, contract_df):
    """Return the per-year salary of the first contract signed after `year`.

    Parameters
    ----------
    player_id : value compared against ``contract_df['player']``.
    year : season of play; only contracts with ``year_signed > year`` qualify.
    contract_df : DataFrame with ``player``, ``year_signed`` and ``apy`` columns.

    Returns
    -------
    The ``apy`` of the qualifying contract with the smallest ``year_signed``,
    or ``np.nan`` when the player has no contract signed after `year`.
    """
    same_player = contract_df['player'] == player_id
    signed_later = contract_df['year_signed'] > year
    upcoming = contract_df[same_player & signed_later]

    # Nothing signed after this season -> no target value for the row.
    if len(upcoming) == 0:
        return np.nan

    # Earliest signing year first; its row is the "next" contract.
    earliest = upcoming.sort_values('year_signed').iloc[0]
    return earliest['apy']
    

def next_salary_smoothed(player_id, year, contract_df, type):
    """Return an adjusted per-year salary from the first contract signed after `year`.

    Parameters
    ----------
    player_id : value compared against ``contract_df['player']``.
    year : season of play; only contracts with ``year_signed > year`` qualify.
    contract_df : DataFrame with ``player``, ``year_signed`` and the adjusted
        salary columns listed below.
    type : which adjusted figure to return:
        ``'smoothed'`` -> ``smooth_adjusted_apy``,
        ``'mean'``     -> ``mean_adjusted_apy``,
        ``'median'``   -> ``med_adjusted_apy``.
        (The name shadows the builtin; it is kept so existing callers still work.)

    Returns
    -------
    The requested value from the qualifying contract with the smallest
    ``year_signed``, or ``np.nan`` when no contract was signed after `year`.

    Raises
    ------
    ValueError
        If `type` is not one of the three supported names. Previously an
        unknown name fell through every branch and silently produced ``None``
        (or NaN when there was no future contract), hiding typos at call sites.
    """
    column_for_type = {
        'smoothed': 'smooth_adjusted_apy',
        'mean': 'mean_adjusted_apy',
        'median': 'med_adjusted_apy',
    }
    # Validate before any lookup so a bad name fails regardless of the data.
    if type not in column_for_type:
        raise ValueError(
            f"unknown salary type {type!r}; expected one of {sorted(column_for_type)}"
        )

    future_contracts = contract_df[(contract_df['player'] == player_id) &
                                   (contract_df['year_signed'] > year)]
    if len(future_contracts) == 0:
        return np.nan

    # Earliest signing year first; its row is the "next" contract.
    next_contract = future_contracts.sort_values('year_signed').iloc[0]
    return next_contract[column_for_type[type]]

In [77]:
### For every row of rb, look up the player's next contract (the first one signed
### after that season) and store its salary figures as new columns.

# Raw per-year salary of the next contract.
rb['salary_per_year'] = rb.apply(
    lambda season_row: next_salary(
        season_row['player_display_name'], season_row['season'], adj_salaries
    ),
    axis=1,
)

# Adjusted variants: target column -> `type` argument understood by next_salary_smoothed.
# Dict order fixes the order in which the columns are appended.
adjusted_targets = {
    'smoothed_salary_per_year': 'smoothed',
    'mean_adj_salary_per_year': 'mean',
    'med_adj_salary_per_year': 'median',
}
for target_column, adjustment in adjusted_targets.items():
    # `adjustment=adjustment` binds the current value into the lambda.
    rb[target_column] = rb.apply(
        lambda season_row, adjustment=adjustment: next_salary_smoothed(
            season_row['player_display_name'], season_row['season'], adj_salaries, adjustment
        ),
        axis=1,
    )

In [78]:
# Shape check after the four salary columns were added to the 17 selected features.
rb.shape

(3560, 21)

In [80]:
## Recover the signing year of each row's next contract by matching on
## (player, smoothed salary).
## NOTE: this rebinds adj_salaries with renamed columns; next_salary /
## next_salary_smoothed read the original 'player' column, so re-run from the
## load cell before calling them again.
adj_salaries = adj_salaries.rename(columns={'player': 'player_display_name','smooth_adjusted_apy': 'smoothed_salary_per_year'})

merge_keys = ['player_display_name', 'smoothed_salary_per_year']

## Keep one contract per key pair. If the salary table repeats a
## (player, smoothed salary) pair, a plain left merge would emit one rb row per
## match and silently duplicate seasons in the train/test files.
## When a pair repeats, the earliest signing year is kept.
contract_years = (
    adj_salaries[merge_keys + ['year_signed']]
    .sort_values('year_signed')
    .drop_duplicates(subset=merge_keys, keep='first')
)

rb2 = pd.merge(
    rb,
    contract_years,
    on=merge_keys,
    how='left',
    validate='many_to_one',  # raise instead of fanning out if keys are not unique on the right
)

assert len(rb2) == len(rb), "left merge must not add or drop player-season rows"

In [86]:
# Spot check: two random merged rows for one player, to eyeball the attached contract fields.
rb2.loc[rb2['player_display_name'] == 'Christian McCaffrey'].sample(2)

Unnamed: 0,season,player_display_name,games,carries,rushing_yards,rushing_tds,rushing_fumbles,rushing_fumbles_lost,rushing_first_downs,rushing_epa,receptions,targets,receiving_yards_after_catch,receiving_first_downs,receiving_epa,target_share,fantasy_points,salary_per_year,smoothed_salary_per_year,mean_adj_salary_per_year,med_adj_salary_per_year,year_signed
3135,2021,Christian McCaffrey,7,99,442.0,1,1.0,0.0,20.0,1.778066,37,41,258.0,21.0,21.0834,0.18552,90.5,19.0,23.530694,10.536877,39.673679,2024.0
2977,2020,Christian McCaffrey,3,59,225.0,5,0.0,0.0,14.0,3.933632,17,19,123.0,8.0,8.911715,0.161017,73.4,19.0,23.530694,10.536877,39.673679,2024.0


In [92]:
## Training set: seasons whose next contract was signed in 2023 or earlier,
## restricted to rows that actually have a salary target.
rb_train_data = (
    rb2.loc[rb2['year_signed'] <= 2023]
    .dropna(subset=['salary_per_year'])
)
rb_train_data.to_csv('../data/cleaned/rb_train.csv', index=False)

In [93]:
## Test set: seasons whose next contract was signed in 2024,
## restricted to rows that actually have a salary target.
rb_test_data = (
    rb2.loc[rb2['year_signed'] == 2024]
    .dropna(subset=['salary_per_year'])
)
rb_test_data.to_csv('../data/cleaned/rb_test.csv', index=False)