In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Display option: with max_columns=None pandas shows every column of a
# DataFrame instead of eliding the middle ones with "...".
pd.set_option('display.max_columns', None)

In [3]:
# pd.read_parquet below needs a parquet engine such as pyarrow; if it is missing, run: %pip install pyarrow

In [15]:
# Raw inputs: the WR salary table (CSV) and the season-level player stats (parquet).
adj_salaries = pd.read_csv(
    '../data/raw/salaries/WR_adjusted_salaries.csv'
)
player_data_full = pd.read_parquet(
    '../data/raw/nflverse/player_stats_season.parquet'
)

In [16]:
## keep wide receivers only
wr = player_data_full[player_data_full['position'] == 'WR']

## keep only the rows labelled 'REG+POST'
## (presumably regular season + postseason combined -- confirm against the nflverse data dictionary)
wr = wr[wr.season_type == 'REG+POST']

## filtering for relevant wr metrics
wr_features = ['season', 'player_display_name', 'games','receptions', 'targets', 'receiving_yards', 'receiving_tds', 'receiving_yards_after_catch', 'receiving_first_downs', 'receiving_epa', 'target_share', 'fantasy_points']

## .copy() makes `wr` an independent frame instead of a slice of player_data_full,
## so the salary columns assigned onto it further down do not raise SettingWithCopyWarning
wr = wr[wr_features].copy()
wr.sample(2)

Unnamed: 0,season,player_display_name,games,receptions,targets,receiving_yards,receiving_tds,receiving_yards_after_catch,receiving_first_downs,receiving_epa,target_share,fantasy_points
13422,2009,Stefan Logan,1,1,1,5.0,0,9.0,0.0,0.075517,0.021739,0.5
33189,2024,Jakobi Meyers,15,87,129,1027.0,4,293.0,52.0,31.348306,0.248077,131.0


In [17]:
## removing rookie contracts from salary data
## Each player's earliest `year_signed` in the salary file is treated as the rookie deal;
## only contracts signed in a strictly later year are kept.
## NOTE(review): assumes the file contains every player's first contract, and that
## `player` identifies one person -- confirm against the salary source.

## Guard so this cell can be re-run on the same kernel: without it a second run merges a
## second `rookie_year` column (pandas suffixes the pair _x/_y), and the filter below then
## raises KeyError on 'rookie_year'.
if 'rookie_year' not in adj_salaries.columns:
    rookie_years = (
        adj_salaries.groupby('player')['year_signed']
        .min()
        .reset_index()
        .rename(columns={'year_signed': 'rookie_year'})
    )

    adj_salaries = adj_salaries.merge(rookie_years, on='player')
    adj_salaries = adj_salaries[adj_salaries['year_signed'] > adj_salaries['rookie_year']]

In [18]:
## Join each contract's salary to the seasons played before that contract was signed.
## e.g. Aaron Rodgers' 2018 contract is joined to seasons 2013-2017, his 2013 contract to seasons 2008-2012, etc.

def _next_contract_value(player_id, year, contract_df, value_col):
    """Return `value_col` from the first contract `player_id` signed after `year`.

    Parameters
    ----------
    player_id : value compared for equality against contract_df['player']
    year : value compared against contract_df['year_signed'] (strictly greater wins)
    contract_df : DataFrame with 'player', 'year_signed' and `value_col` columns
    value_col : name of the column whose value is returned

    Returns
    -------
    The `value_col` entry of the matching contract with the smallest
    'year_signed', or np.nan when the player has no contract signed after `year`.
    """
    later_contracts = contract_df[(contract_df['player'] == player_id) &
                                  (contract_df['year_signed'] > year)]

    if later_contracts.empty:
        return np.nan

    # Positional argmin picks the earliest signing year (first listed on ties)
    # without sorting the whole subset, and works even if the index is not unique.
    earliest_pos = later_contracts['year_signed'].to_numpy().argmin()
    return later_contracts[value_col].iloc[earliest_pos]


def next_salary(player_id, year, contract_df):
    """Salary per year ('apy') of the first contract the player signed after `year`.

    Returns np.nan when no such contract exists in `contract_df`.
    """
    return _next_contract_value(player_id, year, contract_df, 'apy')


def next_salary_smoothed(player_id, year, contract_df):
    """'smooth_adjusted_apy' of the first contract the player signed after `year`.

    Returns np.nan when no such contract exists in `contract_df`.
    """
    return _next_contract_value(player_id, year, contract_df, 'smooth_adjusted_apy')

In [19]:
### For every row of wr, look up the first contract the player signed after that season
### in adj_salaries and attach its salary figures (NaN when there is no later contract).
### NOTE(review): players are matched on display name only, so two players who share a
### name would be treated as one -- confirm this is acceptable for the salary data.

### assign() returns a new frame rather than writing columns onto a slice of
### player_data_full, which avoids SettingWithCopyWarning.
wr = wr.assign(
    salary_per_year=wr.apply(
        lambda row: next_salary(row['player_display_name'], row['season'], adj_salaries),
        axis=1,
    ),
    smoothed_salary_per_year=wr.apply(
        lambda row: next_salary_smoothed(row['player_display_name'], row['season'], adj_salaries),
        axis=1,
    ),
)

In [20]:
## spot-check ten random rows now that the salary columns are attached
wr.sample(n=10)

Unnamed: 0,season,player_display_name,games,receptions,targets,receiving_yards,receiving_tds,receiving_yards_after_catch,receiving_first_downs,receiving_epa,target_share,fantasy_points,salary_per_year,smoothed_salary_per_year
24781,2018,Aldrick Robinson,14,17,35,231.0,5,42.0,12.0,9.123689,0.068762,53.1,1.155,-0.086307
23509,2017,Marvin Jones,16,61,107,1101.0,9,195.0,44.0,58.864088,0.189381,164.1,6.25,5.858813
25163,2018,Corey Davis,16,65,112,891.0,4,267.0,47.0,30.047387,0.263529,118.6,12.5,12.95014
21237,2015,Allen Robinson,16,80,151,1400.0,14,356.0,61.0,43.216319,0.251248,224.0,14.0,9.933433
14760,2010,Dezmon Briscoe,2,6,7,93.0,1,33.0,4.0,5.098042,0.127273,15.3,,
22323,2016,Jaron Brown,6,11,22,187.0,1,58.0,6.0,2.285043,0.096491,24.7,2.75,1.094817
33187,2024,Steven Sims,3,1,1,4.0,0,7.0,0.0,-0.099836,0.04,1.9,,
16819,2012,Reggie Wayne,17,115,212,1469.0,5,386.0,80.0,4.804487,0.315946,174.4,2.3,0.289933
33086,2024,Tyler Boyd,15,39,57,390.0,0,189.0,20.0,3.825874,0.132251,39.3,,
23852,2017,Mike Thomas,4,5,7,93.0,0,29.0,4.0,2.187294,0.052632,9.3,0.85,-0.336551


In [21]:
## training set: seasons up to and including 2023, keeping only rows that have a salary_per_year
wr_train_data = wr.loc[wr['season'] <= 2023].dropna(subset=['salary_per_year'])
wr_train_data.to_csv('../data/final/wr_train_updated.csv', index=False)

In [22]:
## test set: seasons after 2023, keeping only rows that have a salary_per_year
wr_test_data = wr.loc[wr['season'] > 2023].dropna(subset=['salary_per_year'])
wr_test_data.to_csv('../data/final/wr_test_updated.csv', index=False)