In [224]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

In [176]:
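# Note: pd.read_parquet (used below) needs a parquet engine such as pyarrow.
# If it is missing, install it into the kernel's environment with: %pip install pyarrow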

In [225]:
## raw inputs: season-level player stats (parquet) and the adjusted QB salaries (csv)
player_stats_path = '../data/raw/nflverse/player_stats_season.parquet'
qb_salaries_path = '../data/raw/salaries/QB_adjusted_salaries.csv'

player_data_full = pd.read_parquet(path=player_stats_path)
adj_salaries = pd.read_csv(filepath_or_buffer=qb_salaries_path)

In [226]:
## keep quarterback rows only, restricted to the combined 'REG+POST' season_type
## (presumably regular season + postseason totals -- confirm against the nflverse schema)
is_qb = player_data_full['position'] == 'QB'
is_reg_post = player_data_full['season_type'] == 'REG+POST'

## filtering for relevant qb metrics
qb_features = ['season', 'player_display_name', 'games', 'completions', 'attempts', 'passing_yards', 'passing_tds', 'interceptions', 'sacks', 'sack_fumbles', 'passing_first_downs', 'passing_epa', 'passing_2pt_conversions', 'pacr', 'dakota', 'rushing_yards', 'rushing_tds', 'fantasy_points']

## one .loc selection + explicit copy: later cells add columns to qb, and assigning
## into a chained slice of player_data_full triggers SettingWithCopyWarning
qb = player_data_full.loc[is_qb & is_reg_post, qb_features].copy()
qb.sample(2)

Unnamed: 0,season,player_display_name,games,completions,attempts,passing_yards,passing_tds,interceptions,sacks,sack_fumbles,passing_first_downs,passing_epa,passing_2pt_conversions,pacr,dakota,rushing_yards,rushing_tds,fantasy_points
23919,2017,Patrick Mahomes,1,22,35,284.0,0,1.0,2.0,0,14.0,7.888877,0,1.088123,0.132096,10.0,0,10.36
5615,2003,Todd Bouman,3,7,13,81.0,1,0.0,1.0,0,4.0,2.858155,1,0.0,0.0349,1.0,0,7.34


In [227]:
## dropping each player's earliest contract in the salary data
## (the earliest year_signed per player is treated as the rookie contract year)

rookie_years = (
    adj_salaries
    .groupby('player')['year_signed']
    .min()
    .rename('rookie_year')
    .reset_index()
)

## attach the rookie year to every contract row, then keep only later signings
adj_salaries = adj_salaries.merge(rookie_years, on='player')
signed_after_rookie_deal = adj_salaries['year_signed'].gt(adj_salaries['rookie_year'])
adj_salaries = adj_salaries[signed_after_rookie_deal]

In [229]:
## Joining each season to the player's next contract, i.e. the first contract signed after that season.
## e.g. Aaron Rodgers' 2018 contract is attached to his 2013-2017 seasons, his 2013 contract to 2008-2012, etc.

def next_salary(player_id, year, contract_df):
    """Return the 'apy' of the first contract a player signed after ``year``.

    Parameters
    ----------
    player_id : value compared against ``contract_df['player']``.
    year : season of play; only contracts with ``year_signed`` strictly
        greater than this are considered.
    contract_df : DataFrame with 'player', 'year_signed' and 'apy' columns.

    Returns
    -------
    The 'apy' value of the matching contract with the smallest
    ``year_signed``, or ``np.nan`` when the player has no later contract.
    """
    same_player = contract_df['player'] == player_id
    signed_later = contract_df['year_signed'] > year
    candidates = contract_df[same_player & signed_later]

    # Nothing signed after this season -> no salary to attach.
    if candidates.empty:
        return np.nan

    # Earliest signing year after the season is the "next" contract.
    upcoming = candidates.sort_values('year_signed').iloc[0]
    return upcoming['apy']
    

def next_salary_smoothed(player_id, year, contract_df, type):
    """Return an adjusted APY from the first contract a player signed after ``year``.

    Parameters
    ----------
    player_id : value compared against ``contract_df['player']``.
    year : season of play; only contracts with ``year_signed`` strictly
        greater than this are considered.
    contract_df : DataFrame with 'player', 'year_signed' and the adjusted
        salary columns listed below.
    type : which adjusted column to read:
        'smoothed' -> 'smooth_adjusted_apy',
        'mean'     -> 'mean_adjusted_apy',
        'median'   -> 'med_adjusted_apy'.
        (The name shadows the builtin but is kept for caller compatibility.)

    Returns
    -------
    The selected column's value for the matching contract with the smallest
    ``year_signed``, or ``np.nan`` when the player has no later contract.

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported options. Previously an unknown
        value fell through every branch and silently returned ``None``.
    """
    apy_column_by_type = {
        'smoothed': 'smooth_adjusted_apy',
        'mean': 'mean_adjusted_apy',
        'median': 'med_adjusted_apy',
    }
    # Validate up front so a typo fails loudly instead of producing an empty column.
    if type not in apy_column_by_type:
        raise ValueError(
            f"unknown salary type {type!r}; expected one of {sorted(apy_column_by_type)}"
        )

    future_contracts = contract_df[(contract_df['player'] == player_id) &
                                   (contract_df['year_signed'] > year)]
    if len(future_contracts) == 0:
        return np.nan

    # Earliest signing year after the season is the "next" contract.
    next_contract = future_contracts.sort_values('year_signed').iloc[0]
    return next_contract[apy_column_by_type[type]]

In [230]:
### for every row of qb, look up the player's next contract and attach its salary figures

## raw salary per year of the next contract
qb['salary_per_year'] = qb.apply(
    lambda season_row: next_salary(
        season_row['player_display_name'], season_row['season'], adj_salaries
    ),
    axis=1,
)

## adjusted variants: output column -> `type` option of next_salary_smoothed
adjusted_salary_columns = {
    'smoothed_salary_per_year': 'smoothed',
    'mean_adj_salary_per_year': 'mean',
    'med_adj_salary_per_year': 'median',
}

for salary_column, adjustment in adjusted_salary_columns.items():
    qb[salary_column] = qb.apply(
        lambda season_row, adjustment=adjustment: next_salary_smoothed(
            season_row['player_display_name'], season_row['season'], adj_salaries, adjustment
        ),
        axis=1,
    )

In [231]:
## recover the signing year of the contract attached to each qb row by joining back
## to the salary table on (player name, smoothed salary)
## NOTE(review): this reassigns adj_salaries with renamed columns, so cells that expect
## the original 'player' / 'smooth_adjusted_apy' names cannot be re-run after this one.
## NOTE(review): the join key includes a float salary; if one player has two contracts
## with the same smoothed value, the left join will emit duplicate qb rows -- confirm
## the key is unique in the salary data.
salary_column_renames = {
    'player': 'player_display_name',
    'smooth_adjusted_apy': 'smoothed_salary_per_year',
}
adj_salaries = adj_salaries.rename(columns=salary_column_renames)

join_keys = ['player_display_name', 'smoothed_salary_per_year']
contract_years = adj_salaries[join_keys + ['year_signed']]

## left join keeps every qb row; rows without a match get NaN in year_signed
qb = qb.merge(contract_years, how='left', on=join_keys)

In [234]:
## training set: rows whose next contract was signed before 2024 and has a known salary
## (rows with a missing year_signed fail the comparison and are excluded)
signed_before_2024 = qb['year_signed'] < 2024
qb_train_data = qb.loc[signed_before_2024].dropna(subset=['salary_per_year'])
qb_train_data.to_csv('../data/cleaned/qb_train.csv', index=False)

In [235]:
## test set: rows whose next contract was signed in 2024 and has a known salary
signed_in_2024 = qb['year_signed'].eq(2024)
qb_test_data = qb.loc[signed_in_2024].dropna(subset=['salary_per_year'])
qb_test_data.to_csv('../data/cleaned/qb_test.csv', index=False)