In [1]:
import pandas as pd

In [2]:
# Load data/player_data.csv.
# NOTE(review): `players` is not referenced by any later cell visible in this notebook.
players = pd.read_csv("data/player_data.csv")

In [3]:
# Raw offensive stats; this frame is the input to add_sma() below.
offense = pd.read_csv("data/offensive_stats.csv")

In [4]:
from pandas.api.types import is_numeric_dtype


In [5]:
def add_sma(df: pd.DataFrame) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Compute per-player smoothed versions of every numeric column.

    Rows are ordered by ``player`` then ``Season``. Within each player, every
    numeric column gets an exponentially weighted mean (``com=0.4``) and a
    3-row rolling mean.

    Args:
        df: Frame with at least ``player`` and ``Season`` columns. It is not
            modified.

    Returns:
        A ``(sorted_df, features)`` tuple. ``sorted_df`` is ``df`` sorted by
        ``["player", "Season"]``. ``features`` holds one ``<col>_ewm`` and one
        ``<col>_sma`` column per numeric column and is indexed by
        ``(player, original row label)``.

    Note:
        Both windows include the current row, so each feature for a season
        contains that season's own value. The rolling mean is NaN for the
        first two rows of every player (default ``min_periods`` equals the
        window size).
    """
    x = df.sort_values(["player", "Season"])

    # The group key is never a feature, even if it happens to be numeric.
    numeric_cols = [
        col for col in x.columns if col != "player" and is_numeric_dtype(x[col])
    ]
    grouped = x[["player"] + numeric_cols].groupby("player")

    # The groupby window results do not carry the grouping column, so a plain
    # suffix rename covers every output column.
    e = grouped.ewm(com=0.4).mean().add_suffix("_ewm")
    s = grouped.rolling(3).mean().add_suffix("_sma")

    return x, pd.concat([e, s], axis=1)

In [6]:
# x: offense sorted by player/Season; f: the *_ewm / *_sma features,
# indexed by (player, original row label).
x, f = add_sma(offense)

In [7]:
# Attach the smoothed features to the sorted stats.
# `f` is indexed by (player, original row label). Dropping the player level
# leaves the original row labels, so the join aligns features to rows by
# label. This avoids relying on two independently sorted frames lining up;
# sorting on `Season_sma` would be unreliable because it is NaN for the first
# two rows of every player.
features = f.droplevel(0).drop(columns=["Unnamed: 0_ewm"])
dat = x.join(features).reset_index(drop=True)
assert len(dat) == len(x), "feature join changed the row count"
dat.shape

(6492, 86)

In [8]:
# Discard every column whose name contains one of the markers below
# (e.g. the remaining "Unnamed: 0*" columns).
# NOTE(review): "season_" is lowercase, while later cells still drop
# "Season_ewm" / "Season_sma" from this data, so it presumably matches nothing
# here. Confirm whether a case-insensitive match was intended.
unwanted_markers = ("Unnamed", "season_")
dat = dat.drop(
    columns=[
        name
        for name in dat.columns
        if any(marker in name for marker in unwanted_markers)
    ]
)

In [9]:
# NOTE(review): this re-reads the same file already loaded as `offense`;
# `offstats` is used later only for its pid/pos columns.
offstats = pd.read_csv("data/offensive_stats.csv")

In [10]:
# Correlation of every numeric column with fantasy points, computed
# separately for each position; "metric" names the correlated column.
per_position_corr = dat.groupby("pos").corr(numeric_only=True)
correlations = (
    per_position_corr["misc_fpts"]
    .reset_index()
    .rename(columns={"level_1": "metric"})
)

In [11]:
import plotly.express as px

In [12]:
# Preview: one row per (pos, metric) with its correlation to misc_fpts.
correlations.head()

Unnamed: 0,pos,metric,misc_fpts
0,qb,passing_cmp,0.975382
1,qb,passing_att,0.970395
2,qb,passing_pct,0.576386
3,qb,passing_yds,0.98223
4,qb,passing_y/a,0.438854


In [13]:
# One bar per metric showing its correlation with misc_fpts, with a separate
# facet row (and colour) for each position.
px.bar(
    correlations,
    x="metric",
    y="misc_fpts",
    color="pos",
    facet_row="pos",
    height=800,
    title="Corr between features and fantasy points",
)

#### Pipeline
1. No look-ahead (train for 2021, 2022, 2023).
2. Models to compare: random forest (RF), ridge, LASSO, and possibly XGBoost (XGB).

In [14]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.model_selection import train_test_split

In [15]:
def evaluate(y_test, y_pred):
    """Score predictions against observed values.

    Args:
        y_test: Observed target values.
        y_pred: Predicted values, in the same order as ``y_test``.

    Returns:
        ``[mae, mse, r2]``: mean absolute error, mean squared error and R²,
        in that order.
    """
    scorers = (mean_absolute_error, mean_squared_error, r2_score)
    return [scorer(y_test, y_pred) for scorer in scorers]

In [35]:
# Modelling frame: identifiers, the target (misc_fpts), position, and every
# smoothed (*_sma / *_ewm) column.
# NOTE(review): the suffix filter also keeps misc_fpts_sma / misc_fpts_ewm,
# i.e. smoothed versions of the target whose windows include the row's own
# season. Confirm this is intended before trusting the model scores.
id_and_target_cols = ["player", "pid", "Season", "misc_fpts", "pos"]
smoothed_cols = [col for col in dat.columns if col.endswith(("_sma", "_ewm"))]
X = dat[id_and_target_cols + smoothed_cols]

In [36]:
import logging

# NOTE(review): `logger` is not used by any cell visible in this notebook.
logger = logging.getLogger(__name__)
from sklearn.preprocessing import OrdinalEncoder
import xgboost as xgb


In [37]:
# set up RF
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Hyper-parameter search space: 3 * 1 * 4 * 3 = 36 combinations.
random_grid = {
    "n_estimators": [300, 500, 700],
    "max_features": ["sqrt"],
    "max_depth": [2, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
}

# NOTE(review): `rf_random` is configured here but never fit in any cell
# visible in this notebook. n_iter=100 also exceeds the 36 combinations in
# `random_grid`.
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=5,
    verbose=99,
    random_state=21,
    n_jobs=-1,
)

# Fixed-configuration forest used by the training cells below.
# NOTE(review): these values all come from `random_grid`; presumably they are
# the best parameters from an earlier search run. Confirm.
rf_opt = RandomForestRegressor(
    max_depth=20,
    max_features="sqrt",
    min_samples_split=5,
    n_estimators=700,
    random_state=0,
)


In [38]:
# Encoder for the "pos" column; it is (re)fit inside the training cells below.
enc = OrdinalEncoder()

In [39]:
from sklearn.preprocessing import LabelEncoder

# Only referenced by the "xgb" branch of the training cells, whose model entry
# is currently commented out.
le = LabelEncoder()


In [40]:
# Cumulative evaluation: for each cut-off season, fit every model on a random
# 70/30 split of all rows up to and including that season, and record
# MAE / MSE / R2 on the held-out 30%. The per-model scores end up in `res`.
# NOTE(review): the split is random within the cumulative window, and the
# feature matrix keeps `pid` and the smoothed target columns. Confirm this
# matches the "no look ahead" goal stated above.

# Identifier, target and bookkeeping columns that must not be model inputs.
NON_FEATURE_COLS = [
    "player",
    "Season",
    "misc_fpts",
    "Season_ewm",
    "Season_sma",
    "pid_ewm",
    "pid_sma",
]

# Collect one score frame per (season, model) and concatenate once at the end
# instead of growing `res` inside the loop.
score_frames = []

for season in range(2019, 2024):
    print(f"calculating for {season}...")

    X_t = X.query(f"Season <= {season}").fillna(0)
    X_t["pos"] = enc.fit_transform(X_t[["pos"]])

    X_ = X_t.drop(columns=NON_FEATURE_COLS)
    y_ = X_t["misc_fpts"]

    X_train, X_test, y_train, y_test = train_test_split(
        X_, y_, random_state=0, test_size=0.3
    )

    # train models

    for modname, mod in [
        ("ridge", Ridge(alpha=1.0)),
        ("lasso", Lasso(alpha=1.0)),
        ("ols", LinearRegression()),
        ("rf", rf_opt),
        # ("xgb", xgb.XGBClassifier(tree_method="hist")),
    ]:
        # Only reachable if the "xgb" entry above is re-enabled.
        # NOTE(review): this fits the label encoder separately on train and
        # test targets; revisit before re-enabling.
        if modname == "xgb":
            y_train = le.fit_transform(y_train)
            y_test = le.fit_transform(y_test)

        print(f"calculating for {modname}")
        clf = mod
        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        err = evaluate(y_test, y_pred)
        err_df = pd.DataFrame([err], columns=["mae", "mse", "r2"])

        print(err)

        err_df["model_name"] = modname
        err_df["season"] = season
        score_frames.append(err_df)

# Same rows, order and (all-zero) index as concatenating inside the loop.
res = pd.concat(score_frames)


calculating for 2019...
calculating for ridge
[0.2096476759283043, 0.18270920771681376, 0.9999709554752617]
calculating for lasso
[0.15708525345617533, 0.10859896577334688, 0.9999827364729595]
calculating for ols
[0.2095003518414477, 0.1825591041057583, 0.99997097933661]
calculating for rf
[3.1674476432943672, 104.60555072531724, 0.9833712895821305]
calculating for 2020...
calculating for ridge
[0.1307627018851323, 0.0977421219825542, 0.9999802726367888]
calculating for lasso
[0.16070065795164054, 5.391533360922916, 0.9989118229201585]
calculating for ols
[0.12494310692115179, 0.08214605844358942, 0.999983420401579]
calculating for rf
[1.4423165567613545, 23.529597541697083, 0.9952510043008639]
calculating for 2021...
calculating for ridge
[0.22517891426652353, 1.9143507772937918, 0.9996350643481978]
calculating for lasso
[0.12531644858783336, 3.1583817720219565, 0.9993979127941002]
calculating for ols
[0.22670570219649144, 1.890258813611387, 0.9996396570365254]
calculating for rf
[1.2

In [22]:
# test vs projections
# Projections to compare against actual points (merged on pid/Season below).
projs = pd.read_csv("data/offensive_projections.csv")

In [23]:
# Pair each actual score with its projection for the same pid and Season.
# The actual keeps the name misc_fpts; the projection becomes misc_fpts_proj.
# Rows without a match on either side are dropped (inner join).
actual_pts = dat[["pid", "Season", "pos", "misc_fpts"]]
projected_pts = projs[["pid", "Season", "misc_fpts"]]
datp = actual_pts.merge(
    projected_pts,
    on=["pid", "Season"],
    how="inner",
    suffixes=["", "_proj"],
)

In [28]:
# Baseline: score the projections against actual points for every cumulative
# window "all seasons up to and including `season`".
# The columns are passed to evaluate() directly so this cell no longer
# overwrites the `y_pred` / `y_test` globals left by the training cell.
proj_score_frames = []

for season in range(2019, 2024):
    datp_ = datp.query(f"Season <= {season}")

    err = evaluate(datp_["misc_fpts"], datp_["misc_fpts_proj"])
    err_df = pd.DataFrame([err], columns=["mae", "mse", "r2"])

    err_df["season"] = season

    proj_score_frames.append(err_df)

# Concatenate once; rows, order and index match the in-loop concatenation.
projerr = pd.concat(proj_score_frames)

In [29]:
# Projection error per cumulative season window.
projerr

Unnamed: 0,mae,mse,r2,season
0,56.027073,6501.777732,0.259348,2019
0,54.066426,6003.549349,0.306118,2020
0,52.857037,5748.448407,0.328028,2021
0,51.17681,5442.751611,0.350575,2022
0,42.093522,4136.535471,0.488646,2023


In [30]:
# Model error per (model, season).
# NOTE(review): this cell's execution count (In [30]) is lower than that of
# the training cells (In [40], In [46]), so the table shown below presumably
# comes from an earlier run. Re-run top to bottom before relying on it.
res

Unnamed: 0,mae,mse,r2,model_name,season
0,0.013852,0.001044,0.999963,ridge,2019
0,0.342252,0.443361,0.98414,lasso,2019
0,0.013568,0.001015,0.999964,ols,2019
0,0.40968,1.047254,0.962537,rf,2019
0,0.008619,0.000511,0.999979,ridge,2020
0,0.295594,0.337794,0.985955,lasso,2020
0,0.007908,0.000345,0.999986,ols,2020
0,0.189455,0.196167,0.991844,rf,2020
0,0.014394,0.007527,0.99969,ridge,2021
0,0.330037,0.46751,0.980717,lasso,2021


In [46]:
# Fit every model on a random 70/30 split of all rows up to and including
# `season`, record the hold-out error in `res`, and predict every 2024 row
# into `est_pts` (one block of rows per model, each indexed 0..n-1).
# NOTE(review): the features include smoothed versions of the target, and
# the 2024 rows already carry their own misc_fpts inside those windows.
# Confirm this is intended.
season = 2023

# Identifier, target and bookkeeping columns that must not be model inputs.
NON_FEATURE_COLS = [
    "player",
    "Season",
    "misc_fpts",
    "Season_ewm",
    "Season_sma",
    "pid_ewm",
    "pid_sma",
]

X_t = X.query(f"Season <= {season}").fillna(0)
X_t["pos"] = enc.fit_transform(X_t[["pos"]])

X_ = X_t.drop(columns=NON_FEATURE_COLS)
y_ = X_t["misc_fpts"]

X_train, X_test, y_train, y_test = train_test_split(
    X_, y_, random_state=0, test_size=0.3
)

X_2024 = X.query("Season == 2024").fillna(0).drop(columns=NON_FEATURE_COLS)

# Reuse the encoder fitted on the training rows. Re-fitting on the 2024 rows
# would assign different codes whenever the set of positions differs between
# the two subsets. `transform` raises on a position never seen in training
# instead of silently mis-encoding it.
X_2024["pos"] = enc.transform(X_2024[["pos"]])

# Collect per-model frames and concatenate once after the loop.
score_frames = []
prediction_frames = []

# train models

for modname, mod in [
    ("ridge", Ridge(alpha=1.0)),
    ("lasso", Lasso(alpha=1.0)),
    ("ols", LinearRegression()),
    ("rf", rf_opt),
    # ("xgb", xgb.XGBClassifier(tree_method="hist")),
]:
    # Only reachable if the "xgb" entry above is re-enabled.
    # NOTE(review): this fits the label encoder separately on train and test
    # targets; revisit before re-enabling.
    if modname == "xgb":
        y_train = le.fit_transform(y_train)
        y_test = le.fit_transform(y_test)

    print(f"calculating for {modname}")
    clf = mod
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    err = evaluate(y_test, y_pred)
    err_df = pd.DataFrame([err], columns=["mae", "mse", "r2"])

    print(err)

    err_df["model_name"] = modname
    err_df["season"] = season
    score_frames.append(err_df)

    # Predictions keep a fresh 0..n-1 index per model; the pivot in the next
    # cell relies on that index to line the models up row by row.
    y_pred_2024 = pd.DataFrame(clf.predict(X_2024), columns=["fpts_pred"])
    y_pred_2024["model_name"] = modname
    prediction_frames.append(y_pred_2024)

res = pd.concat(score_frames)
est_pts = pd.concat(prediction_frames)


calculating for ridge
[0.26846496963748445, 1.4315824074022996, 0.9997501782694088]
calculating for lasso
[0.1031264212764204, 2.453208069185294, 0.9995718970265525]
calculating for ols
[0.2696902365249561, 1.4296945638024536, 0.999750507712096]
calculating for rf
[0.8635409522450462, 16.55665624887597, 0.9971107408867902]


In [53]:
# Reshape to one row per 2024 prediction (keyed by its positional index) and
# one column per model.
est_pts_long = est_pts.reset_index()
est_pts_ = est_pts_long.pivot_table(
    index="index",
    columns="model_name",
    values="fpts_pred",
)

In [66]:
# Per-model 2024 predictions, one row per 2024 input row.
est_pts_

model_name,lasso,ols,rf,ridge
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,289.526548,289.063789,288.354296,289.064135
1,60.614354,60.455730,60.354214,60.459152
2,-0.004305,-0.009750,-0.000034,-0.009788
3,117.657398,117.000441,117.747178,117.004988
4,0.805871,0.656989,0.912772,0.659277
...,...,...,...,...
1077,-0.003833,-0.007089,-0.000034,-0.007910
1078,-0.003833,-0.009822,-0.000028,-0.009086
1079,206.434345,206.921339,207.509555,206.919416
1080,78.123430,77.725333,80.440843,77.728227


In [68]:
# The 2024 rows with their recorded misc_fpts, for a side-by-side look at the
# predictions above. The "index" column is the row's position in `X`.
X.query("Season == 2024")[["player", "pid", "Season", "misc_fpts"]].reset_index()

Unnamed: 0,index,player,pid,Season,misc_fpts
0,5,A.J. Brown (PHI),18218,2024,289.6
1,11,A.T. Perry (NO),24353,2024,60.6
2,17,AJ Barner (SEA),26335,2024,0.0
3,23,AJ Dillon (GB),19358,2024,117.6
4,29,AJ McCarron (FA),12091,2024,0.8
...,...,...,...,...,...
1077,6472,Zavier Scott (IND),25748,2024,0.0
1078,6473,Zavier Scott (IND),25748,2024,0.0
1079,6479,Zay Flowers (BAL),22916,2024,206.4
1080,6485,Zay Jones (ARI),16431,2024,78.1


In [72]:
# Put the 2024 identifiers and recorded points next to the per-model
# predictions. Both sides are indexed 0..n-1 in the same row order, so the
# column-wise concat pairs them positionally.
actuals_2024 = X.query("Season == 2024")[["player", "pid", "Season", "misc_fpts"]]
finalpreds = pd.concat(
    [actuals_2024.reset_index(drop=True), est_pts_],
    axis=1,
)

In [75]:
# Ensemble estimate: row-wise median of the four model predictions.
finalpreds['median_est'] = finalpreds[['lasso', 'ols', 'rf', 'ridge']].median(axis=1)

In [77]:
# Positive delta: the ensemble estimate is above the recorded misc_fpts.
finalpreds['delta'] = finalpreds['median_est'] - finalpreds['misc_fpts']

In [78]:
# Distribution of (median estimate - recorded points) over the 2024 rows.
px.histogram(finalpreds['delta'])

In [83]:
# Distinct (pid, pos) pairs; used in the next cell to attach a position to
# each prediction row.
offstats[['pid', 'pos']].drop_duplicates()

Unnamed: 0,pid,pos
0,17298,qb
1,19275,qb
2,15600,qb
3,17233,qb
4,19246,qb
...,...,...
5257,15325,te
5258,15392,te
5259,15760,te
5260,14339,te


In [85]:
# Attach each player's position by pid. Any existing `pos` column is dropped
# first so that re-running this cell does not merge onto an already-merged
# frame and produce `pos_x` / `pos_y` columns.
# NOTE(review): a pid listed under more than one position in `offstats` yields
# one output row per position. Confirm each pid maps to a single position.
pid_positions = offstats[["pid", "pos"]].drop_duplicates()
finalpreds = finalpreds.drop(columns=["pos"], errors="ignore").merge(
    pid_positions, on="pid", how="inner"
)

In [86]:
from pathlib import Path

# Write the final table; create the output directory first so the export does
# not fail on a fresh checkout where "predictions/" does not exist yet.
output_path = Path("predictions/ppr_estimate.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
finalpreds.to_csv(output_path, index=False)