# Import packages and define functions

In [49]:
###################### Import Packages #############################################
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import statsmodels.formula.api as smf
from sklearn.ensemble import RandomForestRegressor

## import custom and scraper functions ######################
from ffb_functions import *


def greater_year(yr1, yr2):
    if pd.notna(yr1):
        if pd.notna(yr2):
            val = max(yr1, yr2)
        else:
            val = yr1
    elif pd.notna(yr2):
        val = yr2
    else:
        val = np.nan
    return val



# Assemble the database and create metrics

In [50]:
############## Assemble the database #########################################
database = pd.read_csv('full_college_df.csv')          # save as csv just in case
big_db = pd.read_csv('ffb_db.csv')[['Year', 'Name', 'FantPos', 'PPR']]
database['Year_True'] = database.apply(lambda row:
                                  greater_year(
                                      row['Year']
                                      , row['Year_y']
                                  ), axis = 1)


frame2 = pd.merge(big_db, database, on=['Name'], how='outer')
frame2 = frame2.drop_duplicates(['Name'], keep = 'first')
frame2 = frame2.rename(columns = {'FantPos_y':'FantPos'})


frame2['Final_Year'] = frame2.apply(lambda row:
                                  greater_year(
                                      row['Year_x']
                                      , row['Year_True']
                                  ), axis = 1)


frame2 = frame2.drop(columns = ['Year_True', 'Year_x'], axis = 1)
frame2 = frame2.rename(columns = {'Final_Year':'Year'})

In [51]:
database.loc[database.Name == 'Josh Jacobs']

Unnamed: 0,Pick,Tm,Name,Age,College,Year,FantPos,School_x,Ht,Wt,Dash,Vertical,Bench,Broad_Jump,Three_Cone,Shuttle,Year_y,School_y,Conf,G,Cmp,PaAtt,Pct,PaYds,PaTD,Int,RuYds,RuTD,RuAtt,ReYds,ReTD,Rec,Year_True
6758,24.0,OAK,Josh Jacobs,21.0,Alabama,2019.0,RB,Alabama,5-10,220.0,4.55651,34.404968,19.395455,118.065789,7.063914,4.265,2018.0,Alabama,SEC,15.0,0.0,0.0,0.0,0.0,0.0,0.0,640.0,11.0,120.0,247.0,3.0,20.0,2019.0


In [52]:
frame2 = frame2[['Name', 'Year', 'FantPos', 'College'
                  , 'Pick'
                  , 'Age', 'G'
                  , 'PPR'
                  , 'Wt', 'Ht'
                  , 'Dash', 'Vertical', 'Bench', 'Three_Cone', 'Broad_Jump'
                    , 'Tm'
                    , 'Conf', 'Cmp', 'PaAtt', 'Pct', 'PaYds', 'PaTD', 'Int'
                    , 'RuYds', 'RuTD', 'RuAtt', 'ReYds', 'ReTD', 'Rec']]


In [53]:
pd.set_option('display.max_columns', 500)

database.dtypes

Pick          float64
Tm             object
Name           object
Age           float64
College        object
Year          float64
FantPos        object
School_x       object
Ht             object
Wt            float64
Dash          float64
Vertical      float64
Bench         float64
Broad_Jump    float64
Three_Cone    float64
Shuttle       float64
Year_y        float64
School_y       object
Conf           object
G             float64
Cmp           float64
PaAtt         float64
Pct           float64
PaYds         float64
PaTD          float64
Int           float64
RuYds         float64
RuTD          float64
RuAtt         float64
ReYds         float64
ReTD          float64
Rec           float64
Year_True     float64
dtype: object

In [54]:
frame2.loc[frame2.Name == 'Josh Jacobs']

Unnamed: 0,Name,Year,FantPos,College,Pick,Age,G,PPR,Wt,Ht,Dash,Vertical,Bench,Three_Cone,Broad_Jump,Tm,Conf,Cmp,PaAtt,Pct,PaYds,PaTD,Int,RuYds,RuTD,RuAtt,ReYds,ReTD,Rec
12787,Josh Jacobs,2019.0,RB,Alabama,24.0,21.0,15.0,,220.0,5-10,4.55651,34.404968,19.395455,7.063914,118.065789,OAK,SEC,0.0,0.0,0.0,0.0,0.0,0.0,640.0,11.0,120.0,247.0,3.0,20.0


In [55]:
frame2


Unnamed: 0,Name,Year,FantPos,College,Pick,Age,G,PPR,Wt,Ht,Dash,Vertical,Bench,Three_Cone,Broad_Jump,Tm,Conf,Cmp,PaAtt,Pct,PaYds,PaTD,Int,RuYds,RuTD,RuAtt,ReYds,ReTD,Rec
0,DeAngelo Williams,2008.0,RB,Memphis,27.0,23.0,11.0,307.6,214.0,5-9,4.45,35.500000,25.000000,7.063914,121.000000,CAR,CUSA,0.0,0.0,0.0,0.0,0.0,0.0,1964.0,18.0,310.0,78.0,1.0,12.0
9,Michael Turner,2008.0,RB,Northern Illinois,154.0,22.0,12.0,282.0,237.0,5-10,4.49,31.000000,22.000000,7.500000,114.000000,SDG,MAC,0.0,0.0,0.0,0.0,0.0,0.0,1648.0,14.0,310.0,230.0,3.0,19.0
14,Adrian Peterson,2008.0,RB,Georgia Southern,199.0,23.0,7.0,269.5,214.0,5-10,4.68,34.000000,14.000000,7.590000,119.000000,CHI,Big 12,0.0,0.0,0.0,0.0,0.0,0.0,1012.0,12.0,188.0,136.0,1.0,10.0
66,Larry Fitzgerald,2008.0,WR,Pittsburgh,3.0,21.0,13.0,311.1,225.0,6-3,4.48,35.287858,14.556322,6.972027,120.427972,ARI,Big East,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,1.0,1672.0,22.0,92.0
77,Matt Forte,2008.0,RB,Tulane,44.0,22.0,11.0,306.5,217.0,6-1,4.44,33.000000,23.000000,6.840000,118.000000,CHI,CUSA,0.0,0.0,0.0,0.0,0.0,0.0,624.0,4.0,140.0,180.0,2.0,20.0
87,Thomas Jones,2008.0,RB,Virginia,7.0,22.0,,277.9,216.0,5-10,4.45,34.404968,19.395455,7.063914,118.065789,ARI,,,,,,,,,,,,,
91,Tony Gonzalez,2008.0,,,,,12.0,261.8,,,,,,,,,ACC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,491.0,5.0,43.0
97,Andre Johnson,2008.0,WR,Miami (FL),3.0,22.0,12.0,322.5,230.0,6-2,4.41,39.000000,14.556322,6.972027,132.000000,HOU,Big East,0.0,0.0,0.0,0.0,0.0,0.0,-2.0,0.0,1.0,1092.0,9.0,52.0
106,Calvin Johnson,2008.0,WR,Georgia Tech,2.0,21.0,14.0,285.0,239.0,6-5,4.35,35.287858,14.556322,6.972027,120.427972,DET,ACC,0.0,0.0,0.0,0.0,0.0,0.0,30.0,0.0,7.0,1202.0,15.0,76.0
114,Drew Brees,2008.0,QB,Purdue,32.0,22.0,12.0,304.7,213.0,6-0,4.83,32.000000,19.210526,7.090000,105.000000,SDG,Big Ten,309.0,512.0,60.4,3668.0,26.0,12.0,521.0,5.0,95.0,0.0,0.0,0.0


In [56]:
# height in inches
frame2['height'] = frame2['Ht'].apply(lambda x: 12 * float(x.split('-')[0]) + float(x.split('-')[1]) \
                                          if isinstance(x, str) else np.nan)
database = frame2
# change columns to floats
cols_to_change = ['Pick'
                  , 'Age'
                  , 'G'
                  , 'PPR'
                  , 'Wt'
                  , 'height'
                  , 'Dash'
                  , 'Vertical'
                  , 'Bench'
                  , 'Three_Cone'
                  , 'Broad_Jump'
                  , 'Cmp'
                    , 'PaAtt' 
                    , 'Pct' 
                    , 'PaYds'   
                , 'PaTD'    
                , 'Int'     
                , 'RuYds'          
                , 'RuTD'           
                , 'RuAtt'          
                , 'ReYds'          
                , 'ReTD'           
                , 'Rec'         
                  ]
for col in cols_to_change:
    database[col] = database[col].astype(float)
    
# create new metrics
shifted = database


shifted['ReYds_per_R'] = shifted['ReYds']/shifted['Rec']                                         # ReYds per reception
shifted['RuYds_per_A'] = shifted['RuYds']/shifted['RuAtt']                              # RuYds per attempt
shifted['RuTD_per_Att'] = shifted['RuTD'] / shifted['RuAtt']                          # RuTds per attempt
shifted['PaYds_per_PaAtt'] = shifted['PaYds'] / shifted['PaAtt']                      # PaYds per attempt
shifted['PaTD_per_PaAtt'] = shifted['PaTD'] / shifted['PaAtt']                        # PaTds per attempt
shifted['Cmp_per_PaAtt'] = shifted['Cmp'] / shifted['PaAtt']                        # Completions per attempt
shifted['Int_per_PaAtt'] = shifted['Int'] / shifted['PaAtt']                        # Interceptions per attempt
shifted['ReTD_per_rec'] = shifted['ReTD'] / shifted['Rec']              # ReTD's per reception



In [57]:
################ Seperate database by position ####################################################
pos_dict = {}
for pos in ['TE', 'WR', 'RB', 'QB']:
    pos_dict[pos] = shifted.loc[shifted.FantPos == pos].reset_index(drop = True)
    
    # center age to prepare for power relationship to reduce multicollinearity
    pos_dict[pos]['Age'] = pos_dict[pos]['Age'] - np.mean(pos_dict[pos]['Age']) 
    pos_dict[pos]['agesq'] = pos_dict[pos]['Age'] * pos_dict[pos]['Age']




In [58]:
pos_dict['RB']

Unnamed: 0,Name,Year,FantPos,College,Pick,Age,G,PPR,Wt,Ht,Dash,Vertical,Bench,Three_Cone,Broad_Jump,Tm,Conf,Cmp,PaAtt,Pct,PaYds,PaTD,Int,RuYds,RuTD,RuAtt,ReYds,ReTD,Rec,height,ReYds_per_R,RuYds_per_A,RuTD_per_Att,PaYds_per_PaAtt,PaTD_per_PaAtt,Cmp_per_PaAtt,Int_per_PaAtt,ReTD_per_rec,agesq
0,DeAngelo Williams,2008.0,RB,Memphis,27.0,0.699482,11.0,307.6,214.0,5-9,4.45000,35.500000,25.000000,7.063914,121.000000,CAR,CUSA,0.0,0.0,0.0,0.0,0.0,0.0,1964.0,18.0,310.0,78.0,1.0,12.0,69.0,6.500000,6.335484,0.058065,,,,,0.083333,0.489275
1,Michael Turner,2008.0,RB,Northern Illinois,154.0,-0.300518,12.0,282.0,237.0,5-10,4.49000,31.000000,22.000000,7.500000,114.000000,SDG,MAC,0.0,0.0,0.0,0.0,0.0,0.0,1648.0,14.0,310.0,230.0,3.0,19.0,70.0,12.105263,5.316129,0.045161,,,,,0.157895,0.090311
2,Adrian Peterson,2008.0,RB,Georgia Southern,199.0,0.699482,7.0,269.5,214.0,5-10,4.68000,34.000000,14.000000,7.590000,119.000000,CHI,Big 12,0.0,0.0,0.0,0.0,0.0,0.0,1012.0,12.0,188.0,136.0,1.0,10.0,70.0,13.600000,5.382979,0.063830,,,,,0.100000,0.489275
3,Matt Forte,2008.0,RB,Tulane,44.0,-0.300518,11.0,306.5,217.0,6-1,4.44000,33.000000,23.000000,6.840000,118.000000,CHI,CUSA,0.0,0.0,0.0,0.0,0.0,0.0,624.0,4.0,140.0,180.0,2.0,20.0,73.0,9.000000,4.457143,0.028571,,,,,0.100000,0.090311
4,Thomas Jones,2008.0,RB,Virginia,7.0,-0.300518,,277.9,216.0,5-10,4.45000,34.404968,19.395455,7.063914,118.065789,ARI,,,,,,,,,,,,,,70.0,,,,,,,,,0.090311
5,Steve Slaton,2008.0,RB,West Virginia,89.0,-0.300518,10.0,275.9,197.0,5-9,4.45000,33.500000,19.000000,7.063914,118.000000,HOU,Big East,0.0,0.0,0.0,0.0,0.0,0.0,1128.0,17.0,205.0,95.0,2.0,12.0,69.0,7.916667,5.502439,0.082927,,,,,0.166667,0.090311
6,LaDainian Tomlinson,2008.0,RB,,,,11.0,277.6,221.0,5-10,4.46000,40.500000,18.000000,6.840000,124.000000,,WAC,0.0,0.0,0.0,0.0,0.0,0.0,2158.0,22.0,369.0,40.0,0.0,10.0,70.0,4.000000,5.848238,0.059621,,,,,0.000000,
7,Clinton Portis,2008.0,RB,Miami (FL),51.0,-1.300518,11.0,252.5,204.0,5-11,4.42000,34.404968,19.395455,7.063914,118.065789,DEN,Big East,0.0,0.0,0.0,0.0,0.0,0.0,1200.0,10.0,220.0,125.0,1.0,12.0,71.0,10.416667,5.454545,0.045455,,,,,0.083333,1.691347
8,Maurice Jones-Drew,2008.0,RB,UCLA,60.0,-1.300518,,284.9,207.0,5-7,4.39000,36.000000,18.000000,7.080000,116.000000,JAX,,,,,,,,,,,,,,67.0,,,,,,,,,1.691347
9,Brian Westbrook,2008.0,RB,Villanova,91.0,0.699482,,271.8,200.0,5-8,4.57000,37.000000,26.000000,7.090000,118.000000,PHI,,,,,,,,,,,,,,68.0,,,,,,,,,0.489275


In [59]:
predict_year = 2019
#### replace na's with mean of column ###########################
model_dict = {}
predict_dict = {}
predict_names = {}
for pos in pos_dict:
    print(pos)
    test = pos_dict[pos]

    test = test.replace(np.inf, 'NULL', regex=True)
    test = test.replace(-np.inf, 'NULL', regex=True)
    test = test.replace('NULL', np.nan)

    test = test.fillna(0)
    target = test[[x for x in test.columns if ('Rk' not in x) 
                   & ('FantPos' not in x)
                  & ('Pt' not in x)
                  & ('Tm_prev_year' not in x)
                  & ('2PP' not in x) & ('delta_ppg' not in x)
                  & ('ppg_next_year' not in x) & ('g_next_year' not in x)
                  & ('College' not in x) & ('Conf' not in x)
                  & ('Ht' not in x)]]
    
    ## ad in predict_set here, after a lot of the stuff. or take out year later most likely
    
    dum = pd.get_dummies(target.Tm)
    target = target[[x for x in target.columns if ('Tm' not in x)]]
    target = pd.concat([target, dum], axis=1)
    
    predict_names[pos] = target.loc[target.Year == predict_year]\
        .reset_index(drop = True)['Name']
    predict_dict[pos] = target.loc[target.Year == predict_year]\
        .reset_index(drop = True)\
        .drop(['Year', 'PPR', 'Name'], axis=1)
    target = target.loc[target.Year != predict_year]\
        .reset_index(drop = True)\
        .drop(['Year', 'Name'], axis=1)
    
    labels = target['PPR']
    target = target.drop(['PPR'], axis = 1)
    
    feature_list = list(target.columns)
    features = np.array(target)
    labels = np.array(labels)

    # Using Skicit-learn to split data into training and testing sets
    from sklearn.model_selection import train_test_split
    # Split the data into training and testing sets
    train_features, test_features, train_labels, test_labels = train_test_split(features
                                                                                , labels
                                                                                , test_size = 0.25
                                                                                , random_state = 42)
    #train_features[train_features < -10000]
    # Import the model we are using
    from sklearn.ensemble import RandomForestRegressor
    # Instantiate model with 1000 decision trees
    rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
    # Train the model on training data
    rf.fit(train_features, train_labels)
    model_dict[pos] = rf
    
    # Get numerical feature importances
    importances = list(rf.feature_importances_)
    # List of tuples with variable and importance
    feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
    # Sort the feature importances by most important first
    feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
    # Print out the feature and importances 
    [print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances]
    

TE
Variable: Pick                 Importance: 0.2
Variable: ReYds                Importance: 0.15
Variable: agesq                Importance: 0.07
Variable: Age                  Importance: 0.05
Variable: Vertical             Importance: 0.05
Variable: Rec                  Importance: 0.05
Variable: ReYds_per_R          Importance: 0.05
Variable: Broad_Jump           Importance: 0.04
Variable: Wt                   Importance: 0.03
Variable: ReTD_per_rec         Importance: 0.03
Variable: NYG                  Importance: 0.03
Variable: G                    Importance: 0.02
Variable: Dash                 Importance: 0.02
Variable: Bench                Importance: 0.02
Variable: height               Importance: 0.02
Variable: Three_Cone           Importance: 0.01
Variable: RuYds                Importance: 0.01
Variable: RuTD                 Importance: 0.01
Variable: RuAtt                Importance: 0.01
Variable: ReTD                 Importance: 0.01
Variable: RuYds_per_A          Importa

Variable: Age                  Importance: 0.15
Variable: Pick                 Importance: 0.14
Variable: PaTD_per_PaAtt       Importance: 0.08
Variable: Cmp_per_PaAtt        Importance: 0.05
Variable: Wt                   Importance: 0.04
Variable: Dash                 Importance: 0.04
Variable: Vertical             Importance: 0.04
Variable: Pct                  Importance: 0.04
Variable: agesq                Importance: 0.04
Variable: Broad_Jump           Importance: 0.03
Variable: PaAtt                Importance: 0.03
Variable: PaYds_per_PaAtt      Importance: 0.03
Variable: MIN                  Importance: 0.03
Variable: Three_Cone           Importance: 0.02
Variable: PaYds                Importance: 0.02
Variable: Int                  Importance: 0.02
Variable: RuYds                Importance: 0.02
Variable: height               Importance: 0.02
Variable: RuTD_per_Att         Importance: 0.02
Variable: DAL                  Importance: 0.02
Variable: G                    Importanc

In [60]:
final_dict = {}
for pos in predict_dict:
    names = predict_names[pos]
    final_dict[pos] = predict_dict[pos]
    #print(frame)
    #print([x for x in list(final_dict[pos].columns) if x not in feature_list])
    model = model_dict[pos]
    final_dict[pos]['prediction'] = model.predict(final_dict[pos])
    final_dict[pos]['Names'] = names

In [72]:
predict_dict['WR'].sort_values(['prediction'], ascending = False)[['Names', 'Pick', 'PaYds', 'PaTD','RuYds', 'RuTD', 'ReYds', 'ReTD', 'Rec', 'prediction']]

Unnamed: 0,Names,Pick,PaYds,PaTD,RuYds,RuTD,ReYds,ReTD,Rec,prediction
8,D.K. Metcalf,64.0,0.0,0.0,0.0,0.0,569.0,5.0,26.0,139.9717
7,Andy Isabella,62.0,0.0,0.0,79.0,1.0,1698.0,13.0,102.0,117.2681
4,Mecole Hardman,56.0,0.0,0.0,36.0,0.0,543.0,7.0,35.0,104.4049
1,N'Keal Harry,32.0,0.0,0.0,10.0,1.0,1088.0,9.0,73.0,101.0239
3,A.J. Brown,51.0,0.0,0.0,0.0,0.0,1320.0,6.0,85.0,97.8544
5,JJ Arcega-Whiteside,57.0,0.0,0.0,0.0,0.0,1059.0,14.0,63.0,89.6151
13,Hakeem Butler,103.0,0.0,0.0,0.0,0.0,1318.0,9.0,60.0,88.5532
6,Parris Campbell,59.0,0.0,0.0,24.0,0.0,1063.0,12.0,90.0,85.1816
0,Marquise Brown,25.0,0.0,0.0,0.0,0.0,1318.0,10.0,75.0,83.8809
9,Diontae Johnson,66.0,0.0,0.0,17.0,0.0,761.0,8.0,49.0,79.2496


# PPG Regressions

In [41]:
# Toggles
pos_dict_reg = pos_dict

#pos_dict_reg = {}
#for pos in pos_dict:
 #   pos_dict_reg[pos] = pos_dict[pos].loc[pd.notna(pos_dict[pos].VBD)].reset_index(drop = True)

# Non-position specific variables

all_cols = ['Age'
            #, 'Rnd'
            , 'Pick'
            #, 'College'
            #, 'Tm'
            , 'height'
            , 'Wt'
            , 'Dash'
            , 'Vertical'
            #, 'Bench'
            , 'Three_Cone'
            , 'Broad_Jump'
           ]

xcols_dict = {
    'TE': ['ReTD_per_rec', 'ReYds_per_R', 'Rec'],
    'WR': ['ReTD_per_rec', 'Rec', 'ReYds_per_R'],
    'QB': ['RuAtt_per_game', 'RuYds_per_A', 'RuTD_per_Att',\
           'PaYds_per_PaAtt', 'PaTD_per_PaAtt', 'PaAtt_per_game', 'Cmp_per_PaAtt', 'Int_per_PaAtt'],
    'RB': ['RuTD_per_Att','RuAtt_per_game', 'RuYds_per_A',\
           'ReTD_per_rec', 'ReYds_per_R', 'Rec'],
}



# Position specific variables
for pos in pos_dict_reg:
    print(pos)
    ycol = 'PPR'
    xcols = xcols_dict[pos] + all_cols
    xcols = all_cols + xcols
    frame1 = pos_dict_reg[pos]
    frame = frame1[['Name', ycol] + xcols].dropna()
    xform = ' + '.join(xcols)
    formula = ycol + " ~ " + xform
    model = smf.ols(formula, frame)
    results = model.fit(cov_type='cluster', cov_kwds={'groups': frame['Name']})
    print(results.summary())
    print('\n')

WR
                            OLS Regression Results                            
Dep. Variable:                    PPR   R-squared:                       0.518
Model:                            OLS   Adj. R-squared:                  0.444
Method:                 Least Squares   F-statistic:                     7.145
Date:                Sun, 18 Aug 2019   Prob (F-statistic):           2.09e-08
Time:                        19:15:04   Log-Likelihood:                -437.80
No. Observations:                  84   AIC:                             899.6
Df Residuals:                      72   BIC:                             928.8
Df Model:                          11                                         
Covariance Type:              cluster                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        11.9958    538.786      0.



KeyError: "['RuAtt_per_game'] not in index"

In [None]:
database.sort_values('PPR', ascending = False)

In [None]:
college = college.drop_duplicates(subset = 'Name', keep = 'last')
college = college.reset_index(drop = True, inplace = True)

# change columns to floats
cols_to_change = ['G'
                  , 'PaTD'
                  , 'Cmp'
                  , 'Int'
                  , 'RuTD'
                  , 'ReTD'
                  , 'PaYds'
                  , 'RuYds'
                  , 'ReYds'
                 , 'Rec'
                 , 'ReYds/R'
                 , 'PaAtt'
                 , 'RuAtt'
                 ]
for col in cols_to_change:
    college[col] = college[col].astype(float)



In [None]:
college