In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost 

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 100)

import xgboost
from sklearn.linear_model import LassoLars
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import TweedieRegressor

from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer

from prepare import x_y_split, rmse, select_kbest, rfe

from sklearn.model_selection import GridSearchCV

from hypopt import GridSearch

In [2]:
df = pd.read_csv('season1.csv', index_col=0)

In [3]:
s_2018 = df[df['year']>2017]

In [4]:
passing = pd.read_csv('passing.csv')
rec = pd.read_csv('receive.csv')
rush = pd.read_csv('rush.csv')

In [5]:
passing['date'] = pd.to_datetime(passing['date'])
rush['date'] = pd.to_datetime(rush['date'])
rec['date'] = pd.to_datetime(rec['date'])

In [6]:
passing['year'] = passing['date'].dt.year
rec['year'] = rec['date'].dt.year
rush['year'] = rush['date'].dt.year

In [7]:
passing.drop(columns=['Unnamed: 0'], inplace=True)
rush.drop(columns=['Unnamed: 0'], inplace=True)
rec.drop(columns=['Unnamed: 0'], inplace=True)

In [88]:
def find_pts_averages(player):
    pts_var_avg = player.pts_var.abs().mean()
    return pts_var_avg

In [90]:
weekly = pd.DataFrame(columns=['player', 'pts_var'])
pts_var_avg = pd.DataFrame(columns=['player', 'pts_var', 'year'])

for year in range(2022, 2017, -1):
    res = rec[rec['year']==year].groupby('player')['tgt'].mean()> 5
    res = res[res==True].index.to_list()
    
    rec_temp = rec[(rec['year']==year) & (rec['player'].isin(res))]
    weekly = weekly.append(rec_temp[['player'] + ['pts_var']])

    qbs = passing[passing['year']==year].groupby('player')['att'].mean()> 5
    qbs = qbs[qbs==True].index.to_list()
    
    pass_temp = passing[(passing['year']==year) & (passing['player'].isin(qbs))]
    weekly = weekly.append(pass_temp[['player'] + ['pts_var']])

    rbs = rush[rush['year']==year].groupby('player')['att'].mean()> 5
    rbs = rbs[rbs==True].index.to_list()
    
    rush_temp = rush[(rush['year']==year) & (rush['player'].isin(rbs))]
    weekly = weekly.append(rush_temp[['player'] + ['pts_var']])
    
    pts_avg = weekly.groupby('player', group_keys=False).apply(find_pts_averages)
    weekly_temp = pd.DataFrame(pts_avg, columns=['pts_var_avg'])

    weekly_temp['year'] = year
    
    pts_var_avg = pts_var_avg.append(weekly_temp)

In [93]:
pts_var_avg = pts_var_avg.reset_index()

In [95]:
pts_var_avg.drop(columns=['player','pts_var'],inplace=True)

In [98]:
pts_var_avg.rename(columns={'index':'player'}, inplace=True)

In [101]:
df1 = pd.merge(left=s_2018, right=pts_var_avg, on=['player','year'], how='left')
df1.rename(columns={'pts_var':'pts_var_avg'}, inplace=True)

In [102]:
qb_df = df1[df1['pos']=='QB']
rb_df = df1[df1['pos']=='RB']
wr_df = df1[df1['pos']=='WR']
te_df = df1[df1['pos']=='TE']

In [103]:
qb_df.drop(columns=['rk','pos','tgt','rec','rec_yards','y/r','rec_tds','vbd', 'team'], inplace=True)

In [104]:
qb_df['rating'] = round(((((((qb_df['cmp']/qb_df['pass_att'])-.3)*5) + 
                  ((qb_df['pass_yds']/qb_df['pass_att']-3)*.25) +
                  ((qb_df['pass_tds']/qb_df['pass_att'])*20) +
                  (2.375-((qb_df['int']/qb_df['pass_att'])*25)))/6)*100),2)

In [105]:
def add_target(group):
    group['target'] = group['ppr_pts'].shift(-1)
    group = group.fillna(0)
    return group

In [106]:
qb_df = add_target(qb_df)

In [107]:
qb_df = qb_df[(qb_df['player']!= 'Tom Brady') & (qb_df['player']!='Marcus Mariota')]

In [108]:
qb_df['comp%'] = round((qb_df['cmp'] / qb_df['pass_att']) * 100, 2)
qb_df['int%'] = round((qb_df['int'] / qb_df['pass_att']) * 100, 2)

In [109]:
qb_df.drop(columns=['cmp','pass_att','int'], inplace=True)

In [118]:
rb_df.drop(columns=['rk','team','pos','cmp','pass_att','pass_yds','pass_tds','int','vbd'], inplace=True)

In [119]:
rb_df = add_target(rb_df)

In [121]:
rb_df['rec%'] = round(((rb_df['rec']/rb_df['tgt'])*100),2)

In [127]:
wr_df.drop(columns=['rk','team','pos','cmp','pass_tds','pass_att','pass_yds','int','rush_att','rush_yard',
                    'y/a','rush_tds','vbd'],inplace=True)

In [128]:
wr_df = add_target(wr_df)

In [130]:
wr_df['rec%'] = round(((wr_df['rec']/wr_df['tgt'])*100),2)

In [131]:
wr_df = wr_df[wr_df['player']!='KaVontae Turpin']

In [138]:
te_df = add_target(te_df)

In [140]:
te_df['rec%'] = round(((te_df['rec']/te_df['tgt'])*100),2)

In [142]:
te_df = te_df[(te_df['player']!='Richard Rodgers') & (te_df['player']!='Feleipe Franks')]

In [148]:
parameters = {'learning_rate': [.03, .07, .1, .15, .2],
              'max_depth': [3, 4, 5, 6, 7],
              'min_child_weight': [1, 5, 15, 200],
              'subsample': [.65, 0.7, .8, .85],
              'colsample_bytree': [0.7, .8, .85],
              'n_estimators': [250, 500],
              'gamma':[0, 1, 10]}

In [149]:
xgb = xgboost.XGBRegressor()

In [150]:
xgb_grid = GridSearchCV(xgb,
                        parameters,
                        cv = 4,
                        n_jobs = -1,
                        verbose=True,
                        scoring='neg_root_mean_squared_error')


In [184]:
X_train = qb_df[qb_df['year']<2020]
X_val = qb_df[qb_df['year']<2022]
X_test = qb_df[qb_df['year']==2022]

y_train = X_train['target']
X_train.drop(columns=['target'], inplace = True)

y_val = X_val['target']
X_val.drop(columns=['target'], inplace = True)

X_test.drop(columns=['target'], inplace = True)

ss = StandardScaler()

X_train[qb_cols] = ss.fit_transform(X_train[qb_cols])
X_val[qb_cols] = ss.transform(X_val[qb_cols])
X_test[qb_cols] = ss.transform(X_test[qb_cols])

In [185]:
qb_cols = ['age','g','gs','pass_yds','pass_tds','rush_att','rush_yard','y/a','rush_tds',
           'fmb','fl','rush_rec_tds','pos_rank','adp','adp_by_pos','round','ppr_pts',
           'comp%', 'int%', 'rating','success', 'pts_var_avg']

In [None]:
xgb_grid.fit(X_train[qb_cols],
             y_train)

#print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

Fitting 4 folds for each of 7200 candidates, totalling 28800 fits


In [154]:
qb_xgb = xgboost.XGBRegressor(colsample_bytree=.7, gamma=10, eta=.1, max_depth=3,
                           min_child_weight=5,n_estimators=500,subsample.8)

In [None]:
rb_cols = ['age','g','gs','rush_att','rush_yard','y/a','rush_tds','tgt','rec','rec_yards','y/r','rec_tds','fmb',
           'fl','rush_rec_tds','ppr_pts','pos_rank','adp','adp_by_pos','round', 'rec%', 'pts_var_avg']

In [None]:
X_train = rb_df[rb_df['year']<2020]
X_val = rb_df[rb_df['year']<2022]
X_test = rb_df[rb_df['year']==2022]

y_train = X_train['target']
X_train.drop(columns=['target'], inplace = True)

y_val = X_val['target']
X_val.drop(columns=['target'], inplace = True)

X_test.drop(columns=['target'], inplace = True)

ss = StandardScaler()

X_train[rb_cols] = ss.fit_transform(X_train[rb_cols])
X_val[rb_cols] = ss.transform(X_val[rb_cols])
X_test[rb_cols] = ss.transform(X_test[rb_cols])

In [None]:
rb_xgb = xgboost.XGBRegressor(colsample_bytree=.7, gamma=10, eta=.1, max_depth=3,
                            min_child_weight=5,n_estimators=500,subsample.8)

In [None]:
wr_cols = ['age','g','gs','tgt','rec','rec_yards','y/r','rec_tds','fmb','fl','rush_rec_tds','ppr_pts','pos_rank',
           'adp','adp_by_pos','round', 'rec%','pts_var_avg']

In [None]:
X_train = wr_df[wr_df['year']<2020]
X_val = wr_df[wr_df['year']<2022]
X_test = wr_df[wr_df['year']==2022]

y_train = X_train['target']
X_train.drop(columns=['target'], inplace = True)

y_val = X_val['target']
X_val.drop(columns=['target'], inplace = True)

X_test.drop(columns=['target'], inplace = True)

ss = StandardScaler()

X_train[wr_cols] = ss.fit_transform(X_train[wr_cols])
X_val[wr_cols] = ss.transform(X_val[wr_cols])
X_test[wr_cols] = ss.transform(X_test[wr_cols])

In [None]:
wrte_xgb = xgboost.XGBRegressor(colsample_bytree=.7, gamma=10, eta=.1, max_depth=3,
                           min_child_weight=5,n_estimators=500,subsample.8)

In [155]:
def qb_xgb_modeling(df, cols):
    
    X_train = df[df['year']<2020]
    X_val = df[df['year']<2022]
    X_test = df[df['year']==2022]
    
    y_train = X_train['target']
    X_train.drop(columns=['target'], inplace = True)
    
    y_val = X_val['target']
    X_val.drop(columns=['target'], inplace = True)

    X_test.drop(columns=['target'], inplace = True)
    
    ss = StandardScaler()
    
    X_train[cols] = ss.fit_transform(X_train[cols])
    X_val[cols] = ss.transform(X_val[cols])
    X_test[cols] = ss.transform(X_test[cols])
    
    cols.append('success')
    
    xgb = xgboost.XGBRegressor(colsample_bytree=.7, gamma=10, eta=.1, max_depth=3,
                               min_child_weight=5,n_estimators=500,subsample=.8)
    
    xgb.fit(X_train[cols], y_train, eval_set=[(X_train[cols], y_train), (X_val[cols], y_val)],
           early_stopping_rounds=25)
    xgb_preds = xgb.predict(X_train[cols])
    
    preds = pd.DataFrame({'actual':y_train,
                          'baseline':y_train.mean(),
                          'xgb_preds':xgb_preds})
    
    xgb_val_preds = xgb.predict(X_val[cols])
    
    val_preds = pd.DataFrame({'actual':y_val,
                              'baseline':y_train.mean(),
                              'xgb_val_preds':xgb_val_preds})
    
    
    pos_2023 = pd.DataFrame({'player':X_test['player'],
                             'preds':xgb.predict(X_test[cols])})
    
    val_2022 = X_val[X_val['year']==2021]
    pos_2022 = pd.DataFrame({'player':val_2022['player'],
                             'actual':val_2022['ppr_pts'],
                             'preds':xgb.predict(val_2022[cols])})
    
    return preds, val_preds, pos_2023, pos_2022

In [156]:
qb_cols = ['age','g','gs','pass_yds','pass_tds','rush_att','rush_yard','y/a','rush_tds',
           'fmb','fl','rush_rec_tds','pos_rank','adp','adp_by_pos','round','ppr_pts',
           'comp%', 'int%', 'rating','pts_var_avg']

In [157]:
qb_preds, qb_val_preds, qb_2023, qb_2022 = xgb_modeling(qb_df, qb_cols)

[0]	validation_0-rmse:153.79402	validation_1-rmse:152.76804
[1]	validation_0-rmse:140.25306	validation_1-rmse:139.63573
[2]	validation_0-rmse:127.88357	validation_1-rmse:127.66720
[3]	validation_0-rmse:117.14615	validation_1-rmse:117.36296
[4]	validation_0-rmse:107.84204	validation_1-rmse:108.56321
[5]	validation_0-rmse:98.92421	validation_1-rmse:100.17391
[6]	validation_0-rmse:90.93965	validation_1-rmse:92.80046
[7]	validation_0-rmse:83.59905	validation_1-rmse:85.98719
[8]	validation_0-rmse:77.10436	validation_1-rmse:80.04100
[9]	validation_0-rmse:71.15029	validation_1-rmse:74.60838
[10]	validation_0-rmse:65.93092	validation_1-rmse:69.84244
[11]	validation_0-rmse:61.30069	validation_1-rmse:65.63480
[12]	validation_0-rmse:56.77654	validation_1-rmse:61.86896
[13]	validation_0-rmse:53.24280	validation_1-rmse:58.71708
[14]	validation_0-rmse:49.88896	validation_1-rmse:55.63968
[15]	validation_0-rmse:47.13386	validation_1-rmse:53.42689
[16]	validation_0-rmse:44.58650	validation_1-rmse:51.20

In [161]:
qb_2023.sort_values('preds',ascending=False).head()

Unnamed: 0,player,preds
2282,Patrick Mahomes,331.213684
2283,Josh Allen,329.217285
2285,Joe Burrow,327.581146
2284,Jalen Hurts,325.271179
2286,Geno Smith,295.66983


In [162]:
rmse(qb_preds, 'xgb_preds')

14.005851528540592

In [163]:
rmse(qb_val_preds, 'xgb_val_preds')

36.03477700939183

In [164]:
rb_cols = ['age','g','gs','rush_att','rush_yard','y/a','rush_tds','tgt','rec','rec_yards','y/r','rec_tds','fmb',
           'fl','rush_rec_tds','ppr_pts','pos_rank','adp','adp_by_pos','round', 'rec%', 'pts_var_avg']

In [165]:
rb_preds, rb_val_preds, rb_2023,rb_2022 = xgb_modeling(rb_df, rb_cols)

[0]	validation_0-rmse:108.73431	validation_1-rmse:103.98682
[1]	validation_0-rmse:99.24092	validation_1-rmse:94.82802
[2]	validation_0-rmse:90.74731	validation_1-rmse:86.61286
[3]	validation_0-rmse:83.30524	validation_1-rmse:79.44119
[4]	validation_0-rmse:76.91528	validation_1-rmse:73.49552
[5]	validation_0-rmse:70.85677	validation_1-rmse:67.61542
[6]	validation_0-rmse:65.38230	validation_1-rmse:62.47749
[7]	validation_0-rmse:60.49426	validation_1-rmse:57.83991
[8]	validation_0-rmse:56.51700	validation_1-rmse:54.08494
[9]	validation_0-rmse:52.43739	validation_1-rmse:50.18230
[10]	validation_0-rmse:49.01122	validation_1-rmse:46.91396
[11]	validation_0-rmse:45.89651	validation_1-rmse:43.97239
[12]	validation_0-rmse:43.17317	validation_1-rmse:41.44732
[13]	validation_0-rmse:40.74615	validation_1-rmse:39.16703
[14]	validation_0-rmse:38.60132	validation_1-rmse:37.34500
[15]	validation_0-rmse:36.48034	validation_1-rmse:35.78148
[16]	validation_0-rmse:34.83145	validation_1-rmse:34.33157
[17]	

In [175]:
rb_2023.sort_values('preds',ascending=False).head()

Unnamed: 0,player,preds
2366,Christian McCaffrey,343.766418
2365,Austin Ekeler,339.474304
2367,Josh Jacobs,309.225494
2368,Derrick Henry,295.166412
2369,Saquon Barkley,265.912476


In [176]:
rmse(rb_preds, 'xgb_preds')

19.46248136533273

In [178]:
rmse(rb_val_preds, 'xgb_val_preds')

24.92291809949817

In [167]:
wr_cols = ['age','g','gs','tgt','rec','rec_yards','y/r','rec_tds','fmb','fl','rush_rec_tds','ppr_pts','pos_rank',
           'adp','adp_by_pos','round', 'rec%','pts_var_avg']

In [168]:
wr_preds, wr_val_preds, wr_2023, wr_2022 = xgb_modeling(wr_df, wr_cols)

[0]	validation_0-rmse:105.26974	validation_1-rmse:106.78987
[1]	validation_0-rmse:95.86515	validation_1-rmse:97.50085
[2]	validation_0-rmse:87.37274	validation_1-rmse:89.10282
[3]	validation_0-rmse:79.56367	validation_1-rmse:81.38940
[4]	validation_0-rmse:72.66629	validation_1-rmse:74.45000
[5]	validation_0-rmse:66.50489	validation_1-rmse:68.35147
[6]	validation_0-rmse:61.03557	validation_1-rmse:62.82956
[7]	validation_0-rmse:55.97034	validation_1-rmse:57.83032
[8]	validation_0-rmse:51.43390	validation_1-rmse:53.30497
[9]	validation_0-rmse:47.51513	validation_1-rmse:49.35296
[10]	validation_0-rmse:43.89812	validation_1-rmse:45.74389
[11]	validation_0-rmse:40.68483	validation_1-rmse:42.55061
[12]	validation_0-rmse:37.78742	validation_1-rmse:39.66707
[13]	validation_0-rmse:35.33665	validation_1-rmse:37.19939
[14]	validation_0-rmse:33.03965	validation_1-rmse:34.89573
[15]	validation_0-rmse:31.19844	validation_1-rmse:33.10091
[16]	validation_0-rmse:29.46744	validation_1-rmse:31.37431
[17]	

In [170]:
wr_2023.sort_values('preds',ascending=False).head()

Unnamed: 0,player,preds
2529,Davante Adams,319.385406
2530,Stefon Diggs,319.385406
2527,Justin Jefferson,306.659302
2528,Tyreek Hill,305.748993
2532,A.J. Brown,303.102478


In [179]:
rmse(wr_preds, 'xgb_preds')

9.42040150541751

In [180]:
rmse(wr_val_preds, 'xgb_val_preds')

16.97214030975949

In [171]:
te_cols = ['age','g','gs','tgt','rec','rec_yards','y/r','rec_tds','fmb','fl','rush_rec_tds','ppr_pts','pos_rank',
           'adp','adp_by_pos','round', 'rec%','pts_var_avg']

In [172]:
te_preds, te_val_preds, te_2023, te_2022 = xgb_modeling(te_df, te_cols)

[0]	validation_0-rmse:72.42422	validation_1-rmse:74.11671
[1]	validation_0-rmse:66.82328	validation_1-rmse:68.32488
[2]	validation_0-rmse:61.71639	validation_1-rmse:63.03041
[3]	validation_0-rmse:56.92671	validation_1-rmse:58.34917
[4]	validation_0-rmse:52.89070	validation_1-rmse:54.37743
[5]	validation_0-rmse:49.12649	validation_1-rmse:50.71610
[6]	validation_0-rmse:45.88527	validation_1-rmse:47.39326
[7]	validation_0-rmse:42.83148	validation_1-rmse:44.40353
[8]	validation_0-rmse:40.01579	validation_1-rmse:41.62098
[9]	validation_0-rmse:37.64660	validation_1-rmse:39.30435
[10]	validation_0-rmse:35.46673	validation_1-rmse:37.19305
[11]	validation_0-rmse:33.59105	validation_1-rmse:35.47216
[12]	validation_0-rmse:31.94049	validation_1-rmse:33.92504
[13]	validation_0-rmse:30.49483	validation_1-rmse:32.55304
[14]	validation_0-rmse:29.27580	validation_1-rmse:31.33737
[15]	validation_0-rmse:28.13314	validation_1-rmse:30.25280
[16]	validation_0-rmse:27.11428	validation_1-rmse:29.28546
[17]	va

[139]	validation_0-rmse:10.81032	validation_1-rmse:19.53487
[140]	validation_0-rmse:10.75814	validation_1-rmse:19.54778
[141]	validation_0-rmse:10.76607	validation_1-rmse:19.60753
[142]	validation_0-rmse:10.60495	validation_1-rmse:19.58301


In [174]:
te_2023.sort_values('preds',ascending=False).head()

Unnamed: 0,player,preds
2745,Travis Kelce,248.688889
2746,T.J. Hockenson,212.971268
2747,George Kittle,188.65741
2748,Mark Andrews,184.977936
2749,Evan Engram,174.655853


In [181]:
rmse(te_preds, 'xgb_preds')

11.917962012062912

In [183]:
rmse(te_val_preds, 'xgb_val_preds')

19.410786132894856