In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)

In [3]:
#pip install pyarrow

In [55]:
player_data_full = pd.read_parquet('../data/raw/nflverse/player_stats_season.parquet')
adj_salaries = pd.read_csv('../data/raw/salaries/RB_adjusted_salaries.csv')

In [56]:
rb = player_data_full[player_data_full['position'] == 'RB']

## including postgame data 
rb = rb[rb.season_type == 'REG+POST']

## filtering for relevant qb metrics
rb_features = ['season', 'player_display_name', 'games', 'carries', 'rushing_yards', 'rushing_tds', 'rushing_fumbles', 'rushing_fumbles_lost', 'rushing_first_downs', 'rushing_epa', 'receptions', 'targets', 'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa', 'target_share', 'fantasy_points']
#rb_features = ['season', 'player_display_name', 'games', 'carries', 'rushing_yards', 'rushing_tds', 'rushing_fumbles', 'rushing_fumbles_lost', 'rushing_first_downs', 'rushing_epa', 'fantasy_points']
rb = rb[rb_features]
rb.sample(2)

Unnamed: 0,season,player_display_name,games,carries,rushing_yards,rushing_tds,rushing_fumbles,rushing_fumbles_lost,rushing_first_downs,rushing_epa,receptions,targets,receiving_yards_after_catch,receiving_first_downs,receiving_epa,target_share,fantasy_points
26335,2019,Jonathan Williams,4,49,235.0,1,0.0,0.0,13.0,2.484749,5,5,64.0,3.0,5.649887,0.060976,35.4
26315,2019,Peyton Barber,16,154,470.0,6,1.0,1.0,23.0,-33.68279,16,24,118.0,4.0,-3.242034,0.055556,100.5


In [57]:
## removing rookie contracts from salary data

rookie_years = adj_salaries.groupby('player')['year_signed'].min().reset_index()
rookie_years.columns = ['player', 'rookie_year']

adj_salaries = adj_salaries.merge(rookie_years, on='player')
adj_salaries = adj_salaries[adj_salaries['year_signed'] > adj_salaries['rookie_year']]

In [48]:
## joining salaries to data prior to the year contract was signed. 
## ex joining Aaron Rodgers 2018 contract information to all years 2013-2017, 2013 to 2008-2012 etc

def next_salary(player_id, year, contract_df):

    future_contracts = contract_df[(contract_df['player'] == player_id) &  
                                   (contract_df['year_signed'] > year)]


    ### getting contract signed most recently after current year of play, returning salary/yr
    if len(future_contracts) > 0:
        next_contract = future_contracts.sort_values('year_signed').iloc[0]
        return next_contract['apy']
    else:
        return np.nan 
    

def next_salary_smoothed(player_id, year, contract_df):

    future_contracts = contract_df[(contract_df['player'] == player_id) &  
                                   (contract_df['year_signed'] > year)]
    if len(future_contracts) > 0:
        next_contract = future_contracts.sort_values('year_signed').iloc[0]
        return next_contract['smooth_adjusted_apy']
    else:
        return np.nan 

In [49]:
### applying next salary function to all rows in qb_data, joining the salary from their next contract 

rb['salary_per_year'] = rb.apply(
    lambda row: next_salary(row['player_display_name'], row['season'], adj_salaries), 
    axis=1
)

rb['smoothed_salary_per_year'] = rb.apply(
    lambda row: next_salary_smoothed(row['player_display_name'], row['season'], adj_salaries), 
    axis=1
)

In [50]:
adj_salaries[adj_salaries.player == 'Nick Chubb']

Unnamed: 0,player,year_signed,apy,med_adjusted_apy,mean_adjusted_apy,smooth_adjusted_apy,rookie_year_x,rookie_year_y
32,Nick Chubb,2021,12.2,28.284314,6.550837,18.147493,2018,2021
301,Nick Chubb,2024,2.275,3.059399,0.732977,1.77989,2018,2021


In [51]:
rb[rb.player_display_name == 'Nick Chubb'].sample(3)

Unnamed: 0,season,player_display_name,games,carries,rushing_yards,rushing_tds,rushing_fumbles,rushing_fumbles_lost,rushing_first_downs,rushing_epa,receptions,targets,receiving_yards_after_catch,receiving_first_downs,receiving_epa,target_share,fantasy_points,salary_per_year,smoothed_salary_per_year
30576,2022,Nick Chubb,17,302,1525.0,12,1.0,1.0,69.0,13.225027,27,37,259.0,12.0,7.964626,0.072978,254.4,2.275,1.77989
27876,2020,Nick Chubb,14,221,1212.0,12,1.0,1.0,63.0,17.199724,22,27,218.0,8.0,8.068899,0.072193,219.5,12.2,18.147493
26542,2019,Nick Chubb,16,299,1513.0,8,2.0,2.0,63.0,-21.48616,36,49,318.0,13.0,-2.660069,0.094595,221.1,12.2,18.147493


In [52]:
rb_train_data = rb[rb.season <= 2023]
rb_train_data = rb_train_data.dropna(subset=['salary_per_year'])
rb_train_data.to_csv('../data/final/rb_train_updated.csv', index=False)

In [53]:
rb_test_data = rb[rb.season > 2023]
rb_test_data = rb_test_data.dropna(subset=['salary_per_year'])
rb_test_data.to_csv('../data/final/rb_test_updated.csv', index=False)

In [54]:
rb_test_data.sample(7)

Unnamed: 0,season,player_display_name,games,carries,rushing_yards,rushing_tds,rushing_fumbles,rushing_fumbles_lost,rushing_first_downs,rushing_epa,receptions,targets,receiving_yards_after_catch,receiving_first_downs,receiving_epa,target_share,fantasy_points,salary_per_year,smoothed_salary_per_year
33243,2024,Patrick Taylor,5,39,183.0,1,0.0,0.0,11.0,2.139868,3,11,19.0,2.0,-10.156942,0.083333,26.8,1.17,0.164746
33281,2024,Clyde Edwards-Helaire,2,13,46.0,0,0.0,0.0,2.0,-1.958973,3,5,35.0,1.0,0.48212,0.068493,7.0,1.17,0.164746
33218,2024,Ty Johnson,20,61,307.0,1,0.0,0.0,18.0,0.926307,22,29,184.0,17.0,27.64656,0.063457,92.5,2.5,1.267339
33410,2024,Aaron Shampklin,3,6,17.0,0,0.0,0.0,1.0,-2.41447,0,0,0.0,0.0,,,1.7,0.96,-0.009348
33360,2024,Tyler Goodson,9,32,153.0,1,0.0,0.0,6.0,2.687648,11,15,67.0,3.0,1.303773,0.069124,33.4,1.03,0.048683
33108,2024,Samaje Perine,20,21,100.0,1,0.0,0.0,8.0,0.036445,29,38,330.0,17.0,16.01346,0.0608,55.9,1.8,0.687027
33504,2024,Emanuel Wilson,18,106,508.0,4,0.0,0.0,27.0,7.135807,12,17,129.0,4.0,-2.96274,0.054839,87.9,1.03,0.048683
