In [1]:
import pandas as pd

In [2]:
# reading in dataset and cleaning

In [3]:
stats = pd.read_csv("team_stats.csv")

In [4]:
del stats ["Unnamed: 0"]

In [5]:
# checking for any null values so ML works

In [6]:
pd.isnull(stats).sum()

Tm                 0
W                  0
L                  0
T                320
W-L%               0
PF                 0
PA                 0
PD                 0
MoV                0
SoS                0
SRS                0
OSRS               0
DSRS               0
Year               0
Total Yards        0
Ply                0
Y/P                0
TO                 0
FL                 0
1stD               0
Pen                0
Penalty Yards      0
1stPy              0
Sc%                0
TO%                0
Cmp                0
Passing Att        0
Cmp%               0
Passing Yards      0
Passing TD         0
Int                0
Passing Y/A        0
Passing Y/G        0
Sk                 0
Passing EXP        0
Rushing Att        0
Rushing Yards      0
Rushing TD         0
Rushing Y/A        0
Rushing Y/G        0
Fmb                0
Rushing EXP        0
dtype: int64

In [7]:
stats[pd.isnull(stats["T"])][["Tm", "T"]]

Unnamed: 0,Tm,T
32,New England Patriots,
33,Miami Dolphins,
34,Buffalo Bills,
35,New York Jets,
36,Baltimore Ravens,
...,...,...
507,Tampa Bay Buccaneers,
508,Los Angeles Rams,
509,Seattle Seahawks,
510,Arizona Cardinals,


In [8]:
# replacing null values with 0

In [9]:
stats = stats.fillna(0)

In [10]:
# double checking for any null values -- which there are none

In [11]:
pd.isnull(stats).sum()

Tm               0
W                0
L                0
T                0
W-L%             0
PF               0
PA               0
PD               0
MoV              0
SoS              0
SRS              0
OSRS             0
DSRS             0
Year             0
Total Yards      0
Ply              0
Y/P              0
TO               0
FL               0
1stD             0
Pen              0
Penalty Yards    0
1stPy            0
Sc%              0
TO%              0
Cmp              0
Passing Att      0
Cmp%             0
Passing Yards    0
Passing TD       0
Int              0
Passing Y/A      0
Passing Y/G      0
Sk               0
Passing EXP      0
Rushing Att      0
Rushing Yards    0
Rushing TD       0
Rushing Y/A      0
Rushing Y/G      0
Fmb              0
Rushing EXP      0
dtype: int64

In [12]:
# finding the correlation for "W", or "Win"'s

In [13]:
stats.corr()["W"]

  stats.corr()["W"]


W                1.000000
L               -0.994025
T               -0.076559
W-L%             0.998357
PF               0.747232
PA              -0.647818
PD               0.910204
MoV              0.910262
SoS             -0.195696
SRS              0.874671
OSRS             0.747535
DSRS             0.622293
Year             0.019203
Total Yards      0.538386
Ply              0.266504
Y/P              0.525372
TO              -0.504727
FL              -0.304235
1stD             0.516143
Pen             -0.119212
Penalty Yards   -0.076774
1stPy            0.133098
Sc%              0.679303
TO%             -0.519259
Cmp              0.160822
Passing Att     -0.060213
Cmp%             0.439522
Passing Yards    0.356921
Passing TD       0.529216
Int             -0.477649
Passing Y/A      0.544280
Passing Y/G      0.353950
Sk              -0.469332
Passing EXP      0.608464
Rushing Att      0.428639
Rushing Yards    0.341569
Rushing TD       0.480515
Rushing Y/A      0.118973
Rushing Y/G 

In [14]:
# there is a strong correlation for most of the stats

In [15]:
stats.columns

Index(['Tm', 'W', 'L', 'T', 'W-L%', 'PF', 'PA', 'PD', 'MoV', 'SoS', 'SRS',
       'OSRS', 'DSRS', 'Year', 'Total Yards', 'Ply', 'Y/P', 'TO', 'FL', '1stD',
       'Pen', 'Penalty Yards', '1stPy', 'Sc%', 'TO%', 'Cmp', 'Passing Att',
       'Cmp%', 'Passing Yards', 'Passing TD', 'Int', 'Passing Y/A',
       'Passing Y/G', 'Sk', 'Passing EXP', 'Rushing Att', 'Rushing Yards',
       'Rushing TD', 'Rushing Y/A', 'Rushing Y/G', 'Fmb', 'Rushing EXP'],
      dtype='object')

In [16]:
# assigning predictors

In [17]:
predict = ['PF', 'PA', 'PD', 'MoV', 'SoS', 'SRS',
       'OSRS', 'DSRS', 'Total Yards', 'Ply', 'Y/P', 'TO', 'FL', '1stD',
       'Pen', 'Penalty Yards', '1stPy', 'Sc%', 'TO%', 'Cmp', 'Passing Att',
       'Cmp%', 'Passing Yards', 'Passing TD', 'Int', 'Passing Y/A',
       'Passing Y/G', 'Sk', 'Passing EXP', 'Rushing Att', 'Rushing Yards',
       'Rushing TD', 'Rushing Y/A', 'Rushing Y/G', 'Fmb', 'Rushing EXP']

In [18]:
train = stats[stats["Year"] < 2022]

In [19]:
test = stats[stats["Year"] == 2022]

In [20]:
from sklearn.linear_model import Ridge

reg = Ridge(alpha=.1)

In [21]:
reg.fit(train[predict], train["W"])

In [22]:
predictions = reg.predict(test[predict])

In [23]:
predictions = pd.DataFrame(predictions, columns=["predict"], index=test.index)

In [24]:
predictions

Unnamed: 0,predict
640,11.603745
641,8.941453
642,8.918529
643,7.341014
644,10.350324
645,7.89776
646,7.46057
647,6.908686
648,9.785982
649,6.406431


In [25]:
comb = pd.concat([test[["Tm", "W"]], predictions], axis=1)

In [26]:
comb

Unnamed: 0,Tm,W,predict
640,Buffalo Bills,13,11.603745
641,Miami Dolphins,9,8.941453
642,New England Patriots,8,8.918529
643,New York Jets,7,7.341014
644,Cincinnati Bengals,12,10.350324
645,Baltimore Ravens,10,7.89776
646,Pittsburgh Steelers,9,7.46057
647,Cleveland Browns,7,6.908686
648,Jacksonville Jaguars,9,9.785982
649,Tennessee Titans,7,6.406431


In [27]:
from sklearn.metrics import mean_squared_error

In [28]:
mean_squared_error(comb["W"], comb["predict"])

2.1233178003803364

In [29]:
comb = comb.sort_values("W", ascending=False)
comb["Rk"] = list(range(1,comb.shape[0]+1))

In [30]:
comb = comb.sort_values("predict", ascending=False)
comb["Predicted_Rk"] = list(range(1,comb.shape[0]+1))

In [31]:
comb

Unnamed: 0,Tm,W,predict,Rk,Predicted_Rk
668,San Francisco 49ers,13,12.436656,3,1
652,Kansas City Chiefs,14,12.001776,2,2
640,Buffalo Bills,13,11.603745,5,3
657,Dallas Cowboys,12,11.194409,6,4
656,Philadelphia Eagles,14,10.532708,1,5
644,Cincinnati Bengals,12,10.350324,7,6
648,Jacksonville Jaguars,9,9.785982,15,7
661,Detroit Lions,9,9.538375,11,8
653,Los Angeles Chargers,10,9.399177,9,9
660,Minnesota Vikings,13,9.090638,4,10


In [32]:
# going to try a logistic model to try for better accuracy

In [33]:
from sklearn.linear_model import LogisticRegression

In [34]:
logreg = LogisticRegression()

In [39]:
logreg = LogisticRegression(max_iter=100)

In [40]:
# in this case, "train[predict] is my "X" train variable

In [41]:
# whereas train["W"] is my "y" train variable

In [42]:
logreg.fit(train[predict], train["W"])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [43]:
# finally, "test[predict]" is my "X" test variable

In [44]:
log_predictions = logreg.predict(test[predict])

In [45]:
log_predictions = pd.DataFrame(log_predictions, columns=["predict"], index=test.index)

In [46]:
log_predictions

Unnamed: 0,predict
640,13
641,8
642,8
643,7
644,12
645,10
646,7
647,8
648,10
649,6


In [47]:
log_comb = pd.concat([test[["Tm", "W"]], log_predictions], axis=1)

In [48]:
log_comb = log_comb.sort_values("W", ascending=False)
log_comb["Rk"] = list(range(1,log_comb.shape[0]+1))

In [49]:
log_comb = log_comb.sort_values("predict", ascending=False)
log_comb["Predicted_Rk"] = list(range(1,log_comb.shape[0]+1))

In [50]:
log_comb

Unnamed: 0,Tm,W,predict,Rk,Predicted_Rk
656,Philadelphia Eagles,14,13,1,1
652,Kansas City Chiefs,14,13,2,2
668,San Francisco 49ers,13,13,3,3
640,Buffalo Bills,13,13,5,4
657,Dallas Cowboys,12,12,6,5
644,Cincinnati Bengals,12,12,7,6
645,Baltimore Ravens,10,10,8,7
661,Detroit Lions,9,10,11,8
648,Jacksonville Jaguars,9,10,15,9
654,Las Vegas Raiders,6,8,26,10


In [51]:
mean_squared_error(log_comb["W"], log_comb["predict"])

3.5625

In [52]:
# I'm using mean squared error as opposed to other
# error metrics as I'm predicting where NFL franchises are going to finalize

In [53]:
# to wrap up this project, I'm going to implement
# backtesting to predict NFL franchise wins/year
# that way, if I want to place an early season playoff bet,
# I have an ML model to do so with relaively decent accuracy

In [54]:
years = list(range(2002,2022))

In [55]:
# this will return "total # of iterations reached"
# for every year. Probably could've done a Logistic Regression ML in 
# Google Collab - which may work better for this ML

In [56]:
aps = []
all_pred = []
all_log_comb = []
for year in years[5:]:
    train = stats[stats["Year"] < year]
    test = stats[stats["Year"] == year]
    logreg = LogisticRegression(max_iter=100)
    logreg.fit(train[predict], train["W"])
    log_predictions = logreg.predict(test[predict])
    log_predictions = pd.DataFrame(log_predictions, columns=["predict"], index=test.index)
    log_comb = pd.concat([test[["Tm", "W"]], log_predictions], axis=1)
    all_pred.append(logreg)
    all_log_comb.append(log_comb)
    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [57]:
def add_rank_win(log_comb):
    log_comb = log_comb.sort_values("W", ascending=False)
    log_comb["Rk"] = list(range(1, log_comb.shape[0] + 1))
    log_comb = log_comb.sort_values("predict", ascending=False)
    log_comb["Predicted_Rk"] = list(range(1, log_comb.shape[0] + 1))
    log_comb["Diff_Rk"] = log_comb["Rk"] - log_comb["Predicted_Rk"]
    
    log_comb = log_comb.sort_values("W", ascending=False)
    log_comb = log_comb.sort_values("predict", ascending=False)
    log_comb["Diff_Wins"] = log_comb["W"] - log_comb["predict"]
    return log_comb


In [58]:
add_rank_win(all_log_comb[8])

Unnamed: 0,Tm,W,predict,Rk,Predicted_Rk,Diff_Rk,Diff_Wins
444,Arizona Cardinals,13,13,2,1,1,0
421,Pittsburgh Steelers,10,13,9,3,6,-3
416,New England Patriots,12,13,3,2,1,-1
440,Carolina Panthers,15,12,1,4,-3,3
420,Cincinnati Bengals,12,12,5,5,0,0
445,Seattle Seahawks,10,12,8,6,2,-2
429,Kansas City Chiefs,11,11,7,7,0,0
436,Minnesota Vikings,11,10,6,8,-2,1
417,New York Jets,10,10,11,9,2,0
428,Denver Broncos,12,9,4,10,-6,3


In [63]:
def backtest(stats, years, predict):
    aps = []
    all_pred = []
    all_log_comb = []

    for year in years[5:]:
        train = stats[stats["Year"] < year]
        test = stats[stats["Year"] == year]

        logreg = LogisticRegression(max_iter=100)
        logreg.fit(train[predict], train["W"])
        log_predictions = logreg.predict(test[predict])
        log_predictions = pd.DataFrame(log_predictions, columns=["predict"], index=test.index)
        log_comb = pd.concat([test[["Tm", "W"]], log_predictions], axis=1)
        log_comb = add_rank_win(log_comb)
        all_pred.append(logreg)
        all_log_comb.append(log_comb)
        ap = find_ap(log_comb)
        aps.append(ap)

    mean_ap = sum(aps) / len(aps)

    return mean_ap, aps, all_pred


In [64]:
mean_ap, aps, all_pred = backtest(stats, years, predict)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


NameError: name 'find_ap' is not defined

In [65]:
# predicting 2023 season using the trained ML model

In [69]:
import random

In [70]:
test_predictions = logreg.predict(test[predict])

In [71]:
predictions_2022 = pd.DataFrame({"Team": test["Tm"], "Predicted Wins 2022": test_predictions})

In [72]:
predictions_2022 = predictions_2022.sort_values(by="Predicted Wins 2022", ascending=False)

In [73]:
n_iterations = 1000

In [74]:
all_random_predictions = []

for _ in range(n_iterations):
    random_test = test.sample(n=len(test), replace=True)
    random_test_predictions = logreg.predict(random_test[predict])

    random_predictions_2023 = pd.DataFrame({"Team": random_test["Tm"], "Predicted Wins 2023": random_test_predictions})

    random_predictions_2023 = random_predictions_2023.sort_values(by="Predicted Wins 2023", ascending=False)

    all_random_predictions.append(random_predictions_2023)

In [75]:
mean_predictions_2023 = pd.concat(all_random_predictions).groupby("Team").mean().reset_index()

In [76]:
mean_predictions_2023 = mean_predictions_2023.sort_values(by="Predicted Wins 2023", ascending=False)

In [77]:
print(mean_predictions_2023)

                     Team  Predicted Wins 2023
18       Los Angeles Rams                 13.0
11      Green Bay Packers                 13.0
29   Tampa Bay Buccaneers                 13.0
6      Cincinnati Bengals                 13.0
21   New England Patriots                 12.0
3           Buffalo Bills                 12.0
15     Kansas City Chiefs                 12.0
8          Dallas Cowboys                 12.0
25    Philadelphia Eagles                 11.0
13     Indianapolis Colts                 11.0
27    San Francisco 49ers                 11.0
0       Arizona Cardinals                 11.0
30       Tennessee Titans                 10.0
9          Denver Broncos                 10.0
22     New Orleans Saints                  9.0
28       Seattle Seahawks                  8.0
20      Minnesota Vikings                  8.0
7        Cleveland Browns                  8.0
2        Baltimore Ravens                  8.0
16      Las Vegas Raiders                  7.0
19         Mi