In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

import seaborn as sns

In [2]:
!ls ../ffmdl/data/interim

base_fantasy_df.pkl


In [3]:
import pickle

In [438]:
base_df = pd.read_pickle("../ffmdl/data/interim/base_fantasy_df.pkl")

In [440]:
base_df.head()

RecursionError: maximum recursion depth exceeded while calling a Python object

In [29]:
from ffmdl.features import build_features
from ffmdl.models import train as mdl

In [19]:
base_df = build_features.build_base_df()

In [20]:
base_df.head()

Unnamed: 0,player_id,team,season_year,full_name,position,games_played,total_passing_twopt,total_rushing_twopt,total_rushing_yds,total_rushing_tds,total_fum_lost,total_rushing_att,total_passing_att,total_passing_yds,total_passing_tds,total_ints,total_rec_yds,total_rec_tds,total_recs,total_targets
0,00-0019596,TB,2009,Tom Brady,QB,16,1,0,44,1,2,29,565,4398,28,13,0,0,0,0
1,00-0020531,NO,2009,Drew Brees,QB,15,0,0,33,2,6,22,514,4388,34,11,-4,0,1,1
2,00-0022127,OAK,2009,Jason Witten,TE,16,0,0,0,0,0,0,0,0,0,0,1030,2,94,124
3,00-0022787,ATL,2009,Matt Schaub,QB,16,1,0,61,0,2,47,583,4770,29,15,0,0,0,0
4,00-0022921,ARI,2009,Larry Fitzgerald,WR,16,0,0,0,0,0,0,0,0,0,0,1092,13,97,154


In [21]:
base_df.columns

Index(['player_id', 'team', 'season_year', 'full_name', 'position',
       'games_played', 'total_passing_twopt', 'total_rushing_twopt',
       'total_rushing_yds', 'total_rushing_tds', 'total_fum_lost',
       'total_rushing_att', 'total_passing_att', 'total_passing_yds',
       'total_passing_tds', 'total_ints', 'total_rec_yds', 'total_rec_tds',
       'total_recs', 'total_targets'],
      dtype='object')

In [278]:
pos_cols = {
    'WR': ["games_played", "total_rec_yds", "total_rec_tds", "total_recs", "total_targets"],
    'TE': ["games_played", "total_rec_yds", "total_rec_tds", "total_recs", "total_targets"],
    'RB': ["games_played", "total_rushing_yds", "total_rushing_tds", "total_rushing_att"],
    'QB': ["games_played", "total_passing_yds", "total_passing_tds", "total_passing_att"],
}

In [282]:
pos_target_map = {
    'TE': 'total_rec_yds',
    'WR': 'total_rec_yds',
    'RB': 'total_rushing_yds',
    'QB': 'total_passing_yds',
}

In [288]:
def prepare_df(dataframe, pos):
    """
    Returns training features as data and labels are the players' performance
    in the following season. For example, if the training example was for year 2015,
    the label would be the fantasy performance in 2016 (avg per game).
    """

    target = pos_target_map[pos]
    # get all seasons except for last one which we use for inference
    dataframe = dataframe.loc[dataframe["position"] == pos]
    past_ssn = dataframe.season_year.unique()[:-1]
    print(f"preparing data in range: {np.min(past_ssn)} - {np.max(past_ssn)}")

    train_data = dataframe.loc[
        (dataframe.season_year >= np.min(past_ssn)) &
        (dataframe.season_year <= np.max(past_ssn))]
    train_df = pd.DataFrame()
    for year in past_ssn:
        train_df = train_df.append(train_data
                                   .loc[train_data.season_year == year]
                                   .merge(
                                       dataframe.loc[
                                           dataframe.season_year == year + 1,
                                           (target, "player_id")],
                                          on="player_id"))
    train_df.dropna(inplace=True)
    train_df.rename(columns={f"{target}_y": "y", f"{target}_x": target}, inplace=True)

    return train_df

In [414]:
pos = 'QB'
train_data = prepare_df(base_df, pos)

preparing data in range: 2009 - 2015


In [415]:
X, y = train_data.drop("y", axis=1), train_data["y"]

In [317]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [416]:
x_train, x_val, y_train, y_val = train_test_split(X[pos_cols[pos]], y)

In [437]:
x_train.columns

Index(['games_played', 'total_passing_yds', 'total_passing_tds',
       'total_passing_att'],
      dtype='object')

In [427]:
models = {
    "LinearRegression": LinearRegression(normalize=False),
    "RandomForest": RandomForestRegressor(n_estimators=50, max_depth=None, min_samples_split=8),
    "SVR": SVR(gamma='scale')
}

In [428]:
scl = StandardScaler()
scl.fit(x_train)
standardize = False

if standardize:
    x_train = scl.transform(x_train)
    x_val = scl.transform(x_val)

best_score = float('inf')
best_mdl = None
best_mdl_name = None

for mdl_name, mdl in models.items():
    mdl.fit(x_train, y_train)
    y_pred = mdl.predict(x_val)
    mse = mean_squared_error(y_pred, y_val)
    print(f"{mdl_name} -- {mse}")

    if mse < best_score:
        print(f"Best score: {mse} best model: {mdl_name}")
        best_score = mse
        best_mdl = mdl
        best_mdl_name = mdl_name


LinearRegression -- 1228154.1448682635
Best score: 1228154.1448682635 best model: LinearRegression
RandomForest -- 1023891.7970912714
Best score: 1023891.7970912714 best model: RandomForest
SVR -- 3959394.8367820424


In [429]:
mdl = RandomForestRegressor()
params = {
    'n_estimators':[10, 25, 50, 100],
    'max_depth':[1, 2, 5, 10, 100, None],
    'min_samples_split': [2, 4, 8],
}
clf = GridSearchCV(mdl, params, cv=5)

In [298]:
mdl = LinearRegression()
params = {
    'normalize': [True, False]
}

clf = GridSearchCV(mdl, params, cv=5)

In [420]:
clf.fit(X[pos_cols[pos]], y)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [1, 2, 5, 10, 100,

In [421]:
clf.best_params_

{'max_depth': None, 'min_samples_split': 8, 'n_estimators': 50}

In [431]:
x_test = base_df.loc[(base_df["season_year"] == 2016) & (base_df["position"] == pos), pos_cols[pos]]

In [432]:
y_pred = best_mdl.predict(x_test)

In [433]:
y_pred

array([4093.39809768, 4964.71237718,  103.69917527, 4053.52298657,
       4142.98356159, 2975.1485    , 4160.45516883, 2436.63516367,
       4199.24665429, 2038.46900794, 3551.96791275, 1256.11444733,
       2694.07020352, 2386.48571867, 3106.24372378, 3031.14895843,
       2121.58290482, 3343.34660473,  653.61780664, 4217.24367832,
       1224.43931247, 3245.51890376, 2286.95374681, 1192.46044733,
       3961.56251321,  532.73063736, 4527.37675103,   96.1190555 ,
       1192.46044733, 3713.9999844 , 3006.93427073, 3595.98545238,
       2263.93030958,  472.25443751,  194.14212438])

In [434]:
x_test["y"] = y_pred

In [435]:
x_test.index

Int64Index([477, 478, 480, 482, 483, 484, 485, 486, 491, 493, 495, 496, 497,
            507, 509, 516, 519, 524, 529, 530, 533, 534, 553, 567, 572, 575,
            585, 591, 628, 630, 665, 673, 677, 678, 684],
           dtype='int64')

In [436]:
pd.merge(base_df.iloc[x_test.index], x_test, on=x_test.index).sort_values(by="y", ascending=False)[:10][["full_name", "y"]]

Unnamed: 0,full_name,y
1,Drew Brees,4964.712377
26,Jameis Winston,4527.376751
19,Kirk Cousins,4217.243678
8,Matt Ryan,4199.246654
6,Aaron Rodgers,4160.455169
4,Philip Rivers,4142.983562
0,Tom Brady,4093.398098
3,Ben Roethlisberger,4053.522987
24,Derek Carr,3961.562513
29,Marcus Mariota,3713.999984


In [411]:
pred["full_name"].to_list()

['Ezekiel Elliott',
 'David Johnson',
 "Le'Veon Bell",
 'Melvin Gordon',
 'Latavius Murray',
 'Jordan Howard',
 'Todd Gurley',
 'Mark Ingram',
 'Tevin Coleman',
 'Derrick Henry']

In [406]:
top_rbs = (
    "Kareem Hunt",
    "Todd Gurley",
    "Le\'Veon Bell",
    "LeSean McCoy",
    "Mark Ingram",
    "Jordan Howard",
    "Melvin Gordon",
    "Leonard Fournette",
    "C.J. Anderson",
    "Ezekiel Elliot",
)

In [413]:
for x in pred["full_name"].to_list():
    if x not in top_rbs:
        print(x)

Ezekiel Elliott
David Johnson
Latavius Murray
Tevin Coleman
Derrick Henry


# Drop-Off Modeling

# Deep Learning Approachs

In [91]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

In [94]:
class StatNN(nn.Module):
    def __init__(self):
        pass
    
    def forward(self, x):
        pass