In [1]:
###################### Import Packages #############################################
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import statsmodels.formula.api as smf

####################################### Functions ######################################
#Pull data from Pro Football Reference
def data_assembly(start_year, current_year):
    """Scrape season-level fantasy stats from Pro Football Reference.

    Downloads the fantasy table for every season in
    ``range(start_year, current_year)`` (``current_year`` itself is
    excluded), tags each row with its season in a ``Year`` column and
    stacks all seasons into a single DataFrame.

    Parameters
    ----------
    start_year : int
        First season to download.
    current_year : int
        Exclusive upper bound of the seasons to download.

    Returns
    -------
    pandas.DataFrame
        One row per table row across all seasons, with the passing /
        rushing / receiving columns renamed to unambiguous names
        (``PaYds``, ``RuYds``, ``ReYds``, ...), repeated header rows
        removed and trailing ``*`` / ``+`` markers stripped from ``Name``.
        An empty DataFrame is returned when the year range is empty.

    Raises
    ------
    requests.HTTPError
        If a season page responds with an HTTP error status.
    """
    from io import StringIO  # local import: wraps the HTML text for pd.read_html

    season_frames = []
    for year in range(start_year, current_year):
        page = requests.get("https://www.pro-football-reference.com/years/%d/fantasy.htm" % year)
        # Fail with a clear HTTP error instead of an IndexError on a missing table.
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')
        table = soup.find_all('table')[0]
        # header=1: take the column names from the table's second header row.
        # StringIO: passing literal HTML strings to read_html is deprecated.
        dfyear = pd.read_html(StringIO(str(table)), header = 1)[0]
        dfyear['Year'] = year
        season_frames.append(dfyear)

    # Nothing requested (start_year >= current_year): return an empty frame
    # rather than failing on the column operations below.
    if not season_frames:
        return pd.DataFrame()

    # Single concat instead of growing the frame with DataFrame.append
    # (removed in pandas 2.0, and quadratic when used in a loop).
    database = pd.concat(season_frames, ignore_index = True)

    #Rename columns, eliminate duplicate column titles as rows
    database = database.rename(columns = {'Player':'Name', 'Att':'PaAtt', 'Yds':'PaYds', 'TD':'PaTD','Att.1':'RuAtt', 'Yds.1':'RuYds', 'TD.1':'RuTD', 'Y/A':'RuY/A', 'Y/R':'ReYds/R', 'Att.2':'ReAtt', 'Yds.2':'ReYds', 'TD.2':'ReTD'})
    # .copy() so the Name assignment below writes to an independent frame,
    # not to a filtered view of the concatenated one.
    database = database[database.Rk != 'Rk'].copy()

    # Strip trailing '*' / '+' markers in any order; .str is NaN- and
    # empty-string-safe, unlike indexing the last character by hand.
    database['Name'] = database['Name'].str.rstrip('*+')
    return database

def shift_col(frame, new_name, col_to_shift, magnitude):
    """Add a per-player lagged/led copy of a column.

    The frame is sorted by ``Name`` then ``Year`` and re-indexed from 0;
    ``col_to_shift`` is then shifted by ``magnitude`` rows *within each
    player* and stored in ``new_name``. Rows whose shifted value would come
    from a different player get NaN.

    Note that the shift is by row position within a player, so adjacent rows
    are the player's adjacent recorded seasons, which need not be
    consecutive calendar years.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``Name``, ``Year`` and ``col_to_shift`` columns.
    new_name : str
        Name of the column to create.
    col_to_shift : str
        Column whose values are shifted.
    magnitude : int
        Rows to shift by; negative pulls values from later rows
        (e.g. -1 = next season), positive from earlier rows.

    Returns
    -------
    pandas.DataFrame
        A sorted, re-indexed copy of ``frame`` with ``new_name`` added;
        the input frame is not modified.
    """
    frame1 = frame.sort_values(['Name', 'Year']).reset_index(drop = True)
    # Group-wise shift keeps values from crossing player boundaries for any
    # magnitude, not just +/-1, and avoids a Python-level row loop.
    frame1[new_name] = frame1.groupby('Name')[col_to_shift].shift(magnitude)
    return frame1


In [20]:
############## Assemble the database #########################################
# Scrape the seasons (the end year is exclusive), then attach two shifted
# columns per player: next row's PPR points (regression target) and the
# previous row's games played (injury history).
database = data_assembly(2012, 2019)
shifted = shift_col(database, 'pts_next_year', 'PPR', -1)
shifted = shift_col(shifted, 'G_prev_year', 'G', 1)

# Columns that are cast to float before any arithmetic is done on them.
cols_to_change = [
    'VBD', 'Age', 'PPR', 'pts_next_year', 'G_prev_year', 'GS', 'G',
    'PaTD', 'RuTD', 'ReTD',
    'PaYds', 'RuYds', 'ReYds',
    'Rec', 'RuY/A', 'ReYds/R',
]
shifted = shifted.astype(dict.fromkeys(cols_to_change, float))

# Derived metrics. The *_per_* columns are plain copies under names without
# a '/' so they can be referenced by name in a regression formula string.
shifted = shifted.assign(
    season_frac_1 = shifted['G'] / 16,                             # 1 year injury history
    season_frac_2 = (shifted['G'] + shifted['G_prev_year']) / 32,  # 2 year injury history
    agesq = shifted['Age'] * shifted['Age'],                       # Age squared
    ReYds_per_R = shifted['ReYds/R'],
    RuYds_per_A = shifted['RuY/A'],
)

# Per-game versions of the counting stats.
per_game_stats = ['PaTD', 'RuTD', 'ReTD', 'PaYds', 'RuYds', 'ReYds', 'Rec']
shifted = shifted.assign(
    **{stat + '_pergame': shifted[stat] / shifted['G'] for stat in per_game_stats}
)





In [13]:
# Preview only the first rows; rendering the whole frame as cell output
# bloats the notebook without adding information.
shifted.head(10)

Unnamed: 0,Rk,Name,Tm,FantPos,Age,G,GS,Cmp,PaAtt,PaYds,...,season_frac_1,season_frac_2,agesq,PaTD_pergame,RuTD_pergame,ReTD_pergame,PaYds_pergame,RuYds_pergame,ReYds_pergame,Rec_pergame
0,357,A.J. Derby,2TM,TE,25.0,10.0,3.0,0,0,0.0,...,0.6250,,625.0,0.000000,0.000000,0.000000,0.000000,0.000000,16.000000,1.600000
1,271,A.J. Derby,2TM,TE,26.0,11.0,1.0,0,0,0.0,...,0.6875,0.65625,676.0,0.000000,0.000000,0.181818,0.000000,0.000000,22.181818,1.909091
2,388,A.J. Derby,MIA,TE,27.0,4.0,0.0,0,0,0.0,...,0.2500,0.46875,729.0,0.000000,0.000000,0.250000,0.000000,0.000000,12.000000,0.750000
3,13,A.J. Green,CIN,WR,24.0,16.0,16.0,0,0,0.0,...,1.0000,,576.0,0.000000,0.000000,0.687500,0.000000,2.375000,84.375000,6.062500
4,12,A.J. Green,CIN,WR,25.0,16.0,16.0,0,0,0.0,...,1.0000,1.00000,625.0,0.000000,0.000000,0.687500,0.000000,0.000000,89.125000,6.125000
5,59,A.J. Green,CIN,WR,26.0,13.0,13.0,0,0,0.0,...,0.8125,0.90625,676.0,0.000000,0.000000,0.461538,0.000000,0.153846,80.076923,5.307692
6,17,A.J. Green,CIN,WR,27.0,16.0,16.0,0,0,0.0,...,1.0000,0.90625,729.0,0.000000,0.000000,0.625000,0.000000,0.000000,81.062500,5.375000
7,101,A.J. Green,CIN,WR,28.0,10.0,10.0,0,0,0.0,...,0.6250,0.81250,784.0,0.000000,0.000000,0.400000,0.000000,0.000000,96.400000,6.600000
8,27,A.J. Green,CIN,WR,29.0,16.0,16.0,0,0,0.0,...,1.0000,0.81250,841.0,0.000000,0.000000,0.500000,0.000000,0.000000,67.375000,4.687500
9,118,A.J. Green,CIN,WR,30.0,9.0,9.0,0,0,0.0,...,0.5625,0.78125,900.0,0.000000,0.000000,0.666667,0.000000,0.000000,77.111111,5.111111


In [21]:
################ Separate database by position ####################################################
# Map each distinct FantPos value to its own frame, re-indexed from 0.
# Per-position mean-centering of season_frac_1, season_frac_2, Age and agesq
# is not applied.
pos_dict = {
    position: shifted[shifted['FantPos'] == position].reset_index(drop = True)
    for position in shifted['FantPos'].unique()
}
pos_dict['RB']



Unnamed: 0,Rk,Name,Tm,FantPos,Age,G,GS,Cmp,PaAtt,PaYds,...,agesq,ReYds_per_R,RuYds_per_A,PaTD_pergame,RuTD_pergame,ReTD_pergame,PaYds_pergame,RuYds_pergame,ReYds_pergame,Rec_pergame
0,164,Aaron Jones,GNB,RB,23.0,12.0,4.0,0,0,0.0,...,529.0,2.44,5.53,0.0,0.333333,0.000000,0.0,37.333333,1.833333,0.750000
1,62,Aaron Jones,GNB,RB,24.0,12.0,8.0,0,0,0.0,...,576.0,7.92,5.47,0.0,0.666667,0.083333,0.0,60.666667,17.166667,2.166667
2,493,Aaron Ripkowski,GNB,RB,23.0,15.0,0.0,0,0,0.0,...,529.0,18.00,,0.0,0.000000,0.000000,0.0,0.000000,1.200000,0.066667
3,248,Aaron Ripkowski,GNB,RB,24.0,16.0,8.0,0,0,0.0,...,576.0,5.11,4.41,0.0,0.125000,0.062500,0.0,9.375000,2.875000,0.562500
4,450,Aaron Ripkowski,GNB,RB,25.0,16.0,2.0,0,0,0.0,...,625.0,5.57,2.60,0.0,0.000000,0.000000,0.0,0.812500,2.437500,0.437500
5,1,Adrian Peterson,MIN,RB,27.0,16.0,16.0,0,0,0.0,...,729.0,5.43,6.03,0.0,0.750000,0.062500,0.0,131.062500,13.562500,2.500000
6,17,Adrian Peterson,MIN,RB,28.0,14.0,14.0,0,0,0.0,...,784.0,5.90,4.54,0.0,0.714286,0.071429,0.0,90.428571,12.214286,2.071429
7,405,Adrian Peterson,MIN,RB,29.0,1.0,1.0,0,0,0.0,...,841.0,9.00,3.57,0.0,0.000000,0.000000,0.0,75.000000,18.000000,2.000000
8,4,Adrian Peterson,MIN,RB,30.0,16.0,16.0,0,0,0.0,...,900.0,7.40,4.54,0.0,0.687500,0.000000,0.0,92.812500,13.875000,1.875000
9,438,Adrian Peterson,MIN,RB,31.0,3.0,3.0,0,0,0.0,...,961.0,2.67,1.95,0.0,0.000000,0.000000,0.0,24.000000,2.666667,1.000000


In [24]:
## to help with multicollinearity, can make ryds, tds, rec all per game metrics

# Regressors shared by every position's model.
all_cols = ['Age', 'agesq', 'season_frac_2']
# Position-specific regressors.
xcols_dict = {
    'TE': ['ReTD_pergame','ReYds_pergame', 'Rec_pergame', 'ReYds_per_R'],
    'WR': ['ReTD_pergame','ReYds_pergame', 'Rec_pergame', 'ReYds_per_R'],
    'QB': ['PaTD_pergame','PaYds_pergame', 'RuTD_pergame','RuYds_pergame', 'RuYds_per_A'],
    'RB': ['RuTD_pergame','RuYds_pergame', 'ReTD_pergame','ReYds_pergame', 'Rec_pergame', 'RuYds_per_A', 'ReYds_per_R'],
}

# Same dependent variable for every position.
ycol = 'pts_next_year'

for pos, position_cols in xcols_dict.items():
    print(pos)
    xcols = position_cols + all_cols
    # Keep only rows where the target and every regressor are present.
    frame = pos_dict[pos][['Name', ycol] + xcols].dropna()
    formula = "%s ~ %s" % (ycol, ' + '.join(xcols))
    model = smf.ols(formula, frame)
    # Cluster-robust covariance, grouped by player name.
    results = model.fit(cov_type='cluster', cov_kwds={'groups': frame['Name']})
    print(results.summary())

TE
                            OLS Regression Results                            
Dep. Variable:          pts_next_year   R-squared:                       0.408
Model:                            OLS   Adj. R-squared:                  0.395
Method:                 Least Squares   F-statistic:                     37.15
Date:                Thu, 01 Aug 2019   Prob (F-statistic):           4.07e-27
Time:                        16:47:41   Log-Likelihood:                -1659.3
No. Observations:                 308   AIC:                             3335.
Df Residuals:                     300   BIC:                             3365.
Df Model:                           7                                         
Covariance Type:              cluster                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept       189.6617    185.043      1.

['Rk',
 'Name',
 'Tm',
 'FantPos',
 'Age',
 'G',
 'GS',
 'Cmp',
 'PaAtt',
 'PaYds',
 'PaTD',
 'Int',
 'RuAtt',
 'RuYds',
 'RuY/A',
 'RuTD',
 'Tgt',
 'Rec',
 'ReYds',
 'ReYds/R',
 'ReTD',
 'Fmb',
 'FL',
 'TD.3',
 '2PM',
 '2PP',
 'FantPt',
 'PPR',
 'DKPt',
 'FDPt',
 'VBD',
 'PosRank',
 'OvRank',
 'Year']