In [79]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
import statsmodels.formula.api as smf
from bs4 import BeautifulSoup

#Pull data from Pro Football Reference
def data_assembly(start_year, current_year):
    """Download and stack Pro Football Reference fantasy tables.

    One season page is fetched for every year in
    ``range(start_year, current_year)`` (``current_year`` itself is excluded),
    tagged with a ``Year`` column, and concatenated into a single frame.

    Parameters
    ----------
    start_year : int
        Earliest season to download.
    current_year : int
        Exclusive upper bound of the seasons to download.

    Returns
    -------
    pandas.DataFrame
        All seasons stacked, with the passing/rushing/receiving columns
        renamed to unambiguous names, repeated in-table header rows removed,
        and trailing ``*`` / ``+`` markers stripped from ``Name``.

    Raises
    ------
    ValueError
        If the year range is empty (``start_year >= current_year``).
    requests.HTTPError
        If a season page answers with an error status.
    """
    if start_year >= current_year:
        raise ValueError(
            "start_year (%d) must be earlier than current_year (%d)"
            % (start_year, current_year)
        )

    season_frames = []
    for year in range(start_year, current_year):
        page = requests.get(
            "https://www.pro-football-reference.com/years/%d/fantasy.htm" % year,
            timeout=30,
        )
        # Fail loudly on a 4xx/5xx response instead of parsing an error page.
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')
        table = soup.find_all('table')[0]
        # header=1 takes the column names from the second header row.
        # StringIO: recent pandas deprecates passing literal HTML strings.
        dfyear = pd.read_html(StringIO(str(table)), header=1)[0]
        dfyear['Year'] = year
        season_frames.append(dfyear)

    # One concat at the end replaces DataFrame.append (removed in pandas 2.0)
    # and avoids re-copying the accumulated frame on every iteration.
    database = pd.concat(season_frames, ignore_index=True)

    # Rename columns, eliminate duplicate column titles as rows
    database = database.rename(columns = {'Player':'Name', 'Att':'PaAtt', 'Yds':'PaYds', 'TD':'PaTD','Att.1':'RuAtt', 'Yds.1':'RuYds', 'TD.1':'RuTD', 'Y/A':'RuY/A', 'Y/R':'ReYds/R', 'Att.2':'ReAtt', 'Yds.2':'ReYds', 'TD.2':'ReTD'})
    # .copy() so the column assignment below works on an independent frame.
    database = database[database.Rk != 'Rk'].copy()

    # Strip any trailing '*' / '+' markers regardless of their order; the
    # .str accessor also leaves missing names as NaN instead of raising.
    database['Name'] = database['Name'].str.rstrip('*+')
    return database

def shift_col(frame, new_name, col_to_shift, magnitude):
    """Return a sorted copy of ``frame`` with a per-player shifted column.

    The frame is sorted by ``Name`` then ``Year`` and re-indexed from 0.
    ``new_name`` then holds ``col_to_shift`` shifted by ``magnitude`` rows
    *within each player*, so values never cross from one player to another.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``Name``, ``Year`` and ``col_to_shift``. Not modified.
    new_name : str
        Name of the column to create (or overwrite).
    col_to_shift : str
        Column whose values are shifted.
    magnitude : int
        Number of rows to shift: negative pulls values from later seasons
        (``-1`` = next row), positive pulls from earlier ones (``1`` =
        previous row). Any integer is supported.

    Returns
    -------
    pandas.DataFrame
        The sorted, re-indexed copy with ``new_name`` added. Rows without a
        source row inside the same player's block get NaN.

    Notes
    -----
    Players are identified by ``Name`` only, and adjacency is by sort order:
    different players sharing a name are treated as one, and a gap in a
    player's seasons is not detected.
    """
    frame1 = frame.sort_values(['Name', 'Year']).reset_index(drop = True)
    # Group-wise shift replaces the manual boundary loop, which only masked
    # player boundaries for magnitude == 1 or -1.
    frame1[new_name] = frame1.groupby('Name')[col_to_shift].shift(magnitude)
    return frame1


In [93]:
# Seasons 2014 through 2018: data_assembly treats its second argument as an
# exclusive upper bound.
database = data_assembly(2014, 2019)

# Attach, per player, the PPR points from the following row (the regression
# target) and the games played from the preceding row.
shifted = shift_col(
    shift_col(database, 'pts_next_year', 'PPR', -1),
    'G_prev_year', 'G', 1,
)

# Columns that take part in arithmetic or in the regressions below.
cols_to_change = [
    'VBD', 'Age', 'PPR', 'pts_next_year', 'G_prev_year', 'GS', 'G',
    'PaTD', 'RuTD', 'ReTD',
    'PaYds', 'RuYds', 'ReYds',
    'Rec',
]
shifted = shifted.astype(dict.fromkeys(cols_to_change, float))

# Games played as a share of 16 (this season) and of 32 (this season plus the
# previous one) -- 16 is presumably the regular-season length for these years.
# agesq is the squared age, kept as an explicit column for the models.
shifted = shifted.assign(
    season_frac_1=shifted['G'] / 16,
    season_frac_2=(shifted['G'] + shifted['G_prev_year']) / 32,
    agesq=shifted['Age'] * shifted['Age'],
)

In [94]:
# Preview the first rows only instead of rendering the whole frame.
shifted.head(10)

Unnamed: 0,Rk,Name,Tm,FantPos,Age,G,GS,Cmp,PaAtt,PaYds,...,FDPt,VBD,PosRank,OvRank,Year,pts_next_year,G_prev_year,season_frac_1,season_frac_2,agesq
0,357,A.J. Derby,2TM,TE,25.0,10.0,3.0,0,0,0.0,...,22.0,,62,,2016,55.4,,0.6250,,625.0
1,271,A.J. Derby,2TM,TE,26.0,11.0,1.0,0,0,0.0,...,44.9,,42,,2017,13.8,10.0,0.6875,0.65625,676.0
2,388,A.J. Derby,MIA,TE,27.0,4.0,0.0,0,0,0.0,...,12.3,,77,,2018,,11.0,0.2500,0.46875,729.0
3,59,A.J. Green,CIN,WR,26.0,13.0,13.0,0,0,0.0,...,170.8,13.0,23,59,2014,273.7,,0.8125,,676.0
4,17,A.J. Green,CIN,WR,27.0,16.0,16.0,0,0,0.0,...,230.7,63.0,8,17,2015,186.4,13.0,1.0000,0.90625,729.0
5,101,A.J. Green,CIN,WR,28.0,10.0,10.0,0,0,0.0,...,153.4,,35,,2016,226.8,16.0,0.6250,0.81250,784.0
6,27,A.J. Green,CIN,WR,29.0,16.0,16.0,0,0,0.0,...,189.3,41.0,10,27,2017,149.4,10.0,1.0000,0.81250,841.0
7,118,A.J. Green,CIN,WR,30.0,9.0,9.0,0,0,0.0,...,126.4,,37,,2018,,16.0,0.5625,0.78125,900.0
8,403,A.J. Jenkins,KAN,WR,25.0,9.0,2.0,0,0,0.0,...,13.6,,155,,2014,,,0.5625,,625.0
9,620,A.J. McCarron,CIN,,24.0,0.0,,0,0,0.0,...,,,70,,2014,55.3,,0.0000,,576.0


In [100]:
# Build one frame per fantasy position, with the regressors below centred on
# their within-position mean.
centered_cols = ['season_frac_1', 'season_frac_2', 'Age', 'agesq']
pos_dict = {}
# dropna(): rows with a missing FantPos are skipped. `FantPos == NaN` never
# matches, so a NaN key would only add an empty, unusable entry.
for pos in shifted.FantPos.dropna().unique():
    pos_frame = shifted.loc[shifted.FantPos == pos].reset_index(drop = True)
    # DataFrame - Series aligns on column labels, so each column has its own
    # mean subtracted (NaNs are ignored when computing the mean).
    pos_frame[centered_cols] = pos_frame[centered_cols] - pos_frame[centered_cols].mean()
    pos_dict[pos] = pos_frame
# Preview the running backs; head() keeps the rendered output small.
pos_dict['RB'].head(10)



Unnamed: 0,Rk,Name,Tm,FantPos,Age,G,GS,Cmp,PaAtt,PaYds,...,FDPt,VBD,PosRank,OvRank,Year,pts_next_year,G_prev_year,season_frac_1,season_frac_2,agesq
0,164,Aaron Jones,GNB,RB,-2.61834,12.0,4.0,0,0,0.0,...,75.5,,50,,2017,171.4,,0.035936,,-135.071871
1,62,Aaron Jones,GNB,RB,-1.61834,12.0,8.0,0,0,0.0,...,158.4,10.0,23,62,2018,,12.0,0.035936,0.011947,-88.071871
2,493,Aaron Ripkowski,GNB,RB,-2.61834,15.0,0.0,0,0,0.0,...,2.3,,146,,2015,46.6,,0.223436,,-135.071871
3,248,Aaron Ripkowski,GNB,RB,-1.61834,16.0,8.0,0,0,0.0,...,42.1,,71,,2016,12.2,15.0,0.285936,0.230697,-88.071871
4,450,Aaron Ripkowski,GNB,RB,-0.61834,16.0,2.0,0,0,0.0,...,8.7,,127,,2017,,16.0,0.285936,0.261947,-39.071871
5,405,Adrian Peterson,MIN,RB,3.38166,1.0,1.0,0,0,0.0,...,10.3,,123,,2014,260.7,,-0.651564,,176.928129
6,4,Adrian Peterson,MIN,RB,4.38166,16.0,16.0,0,0,0.0,...,245.7,107.0,2,4,2015,9.0,1.0,0.285936,-0.206803,235.928129
7,438,Adrian Peterson,MIN,RB,5.38166,3.0,3.0,0,0,0.0,...,7.5,,126,,2016,78.9,16.0,-0.526564,-0.144303,296.928129
8,171,Adrian Peterson,2TM,RB,6.38166,10.0,7.0,0,0,0.0,...,73.4,,53,,2017,189.0,3.0,-0.089064,-0.331803,359.928129
9,42,Adrian Peterson,WAS,RB,7.38166,16.0,16.0,0,0,0.0,...,179.0,34.0,16,42,2018,,10.0,0.285936,0.074447,424.928129


In [104]:
## To help with multicollinearity, ryds, TDs, and rec could all be converted to per-game metrics.

# Regressors shared by every position's model.
all_cols = ['Age', 'agesq', 'season_frac_2']
# Position-specific regressors, keyed by FantPos code.
xcols_dict = dict(
    TE=['ReTD', 'ReYds', 'Rec'],
    WR=['ReTD', 'ReYds', 'Rec'],
    QB=['PaTD', 'PaYds'],
    RB=['RuTD', 'RuYds', 'ReTD', 'ReYds', 'Rec'],
)
# Dependent variable for every model.
ycol = 'pts_next_year'

# Fit and print one OLS model per position, with standard errors clustered on
# the player's name.
for pos, pos_cols in xcols_dict.items():
    print(pos)
    xcols = pos_cols + all_cols
    frame1 = pos_dict[pos]
    # Keep only rows where the target and every regressor are present.
    frame = frame1.loc[:, ['Name', ycol] + xcols].dropna()
    xform = ' + '.join(xcols)
    formula = '{} ~ {}'.format(ycol, xform)
    model = smf.ols(formula, frame)
    results = model.fit(cov_type='cluster', cov_kwds={'groups': frame['Name']})
    print(results.summary())

TE
                            OLS Regression Results                            
Dep. Variable:          pts_next_year   R-squared:                       0.354
Model:                            OLS   Adj. R-squared:                  0.333
Method:                 Least Squares   F-statistic:                     13.67
Date:                Wed, 31 Jul 2019   Prob (F-statistic):           4.21e-11
Time:                        18:21:54   Log-Likelihood:                -1020.2
No. Observations:                 189   AIC:                             2054.
Df Residuals:                     182   BIC:                             2077.
Df Model:                           6                                         
Covariance Type:              cluster                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        22.7225      8.009      2.

['Rk',
 'Name',
 'Tm',
 'FantPos',
 'Age',
 'G',
 'GS',
 'Cmp',
 'PaAtt',
 'PaYds',
 'PaTD',
 'Int',
 'RuAtt',
 'RuYds',
 'RuY/A',
 'RuTD',
 'Tgt',
 'Rec',
 'ReYds',
 'ReYds/R',
 'ReTD',
 'Fmb',
 'FL',
 'TD.3',
 '2PM',
 '2PP',
 'FantPt',
 'PPR',
 'DKPt',
 'FDPt',
 'VBD',
 'PosRank',
 'OvRank',
 'Year']