In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [14]:
pd.set_option('display.max_columns',50)

In [140]:
pd.set_option('display.max_rows',180)

In [92]:
stats = pd.read_csv("data/player_mvp_advanced_stats.csv")

In [93]:
stats.shape

(17959, 62)

In [96]:
stats.isnull().sum()

Player        0
Pos           0
Age           0
Tm            0
G             0
GS          280
MP_x          0
FG            0
FGA           0
FG%          63
3P            0
3PA           0
3P%        2680
2P            0
2PA           0
2P%         106
eFG%         63
FT            0
FTA           0
FT%         581
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
Year          0
Pts Won       0
Pts Max       0
Share         0
Team          0
W             0
L             0
W/L%          0
GB            0
PS/G          0
PA/G          0
SRS           0
MP_y          0
PER           3
TS%          55
3PAr         63
FTr          63
ORB%          3
DRB%          3
TRB%          3
AST%          3
STL%          3
BLK%          3
TOV%         46
USG%          3
OWS           0
DWS           0
WS            0
WS/48         3
OBPM          0
DBPM          0
BPM           0
VORP          0
dtype: int64

In [97]:
# Show the rows with a missing 3P% next to their three-point attempts.
stats.loc[stats["3P%"].isnull(), ["Player", "3PA", "3P%"]]

Unnamed: 0,Player,3PA,3P%
3,Jerome Henderson,0.0,
5,Kurt Rambis,0.0,
12,Pétur Guðmundsson,0.0,
20,Kurt Rambis,0.0,
23,Mike Smrek,0.0,
...,...,...,...
17932,Joe Meriweather,0.0,
17933,Kenny Dennard,0.0,
17936,Leon Douglas,0.0,
17940,Reggie King,0.0,


In [98]:
# Show the rows with a missing FT% next to their free-throw attempts.
stats.loc[stats["FT%"].isnull(), ["Player", "FTA", "FT%"]]

Unnamed: 0,Player,FTA,FT%
3,Jerome Henderson,0.0,
144,John Coker,0.0,
159,Jason Sasser,0.0,
170,Adrian Caldwell,0.0,
186,Bruno Šundov,0.0,
...,...,...,...
17778,Mark McNamara,0.0,
17806,Luke Zeller,0.0,
17859,Myron Brown,0.0,
17881,Malcolm Lee,0.0,


In [99]:
# Replace every missing value with 0. The null 3P% / FT% rows shown in the
# preceding cells all have zero attempts, so 0 is a sensible stand-in there.
# NOTE(review): the same fill is applied to every other column that has nulls
# (e.g. GS, PER, TOV%) — confirm that 0 is appropriate for those as well.
# Reassigning instead of using inplace=True keeps the cell free of in-place
# mutation and produces the same `stats` frame for the cells that follow.
stats = stats.fillna(0)

In [100]:
stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP_x', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS', 'MP_y', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%',
       'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS',
       'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP'],
      dtype='object')

In [106]:
# Model inputs, in the order the fitted coefficients will follow.
# Relative to the full column list this leaves out the text columns
# (Player, Pos, Tm, Team), the voting columns (Pts Won, Pts Max, Share)
# and MP_y.
features = [
    # player / box-score columns
    'Age', 'G', 'GS', 'MP_x',
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    '2P', '2PA', '2P%', 'eFG%',
    'FT', 'FTA', 'FT%',
    'ORB', 'DRB', 'TRB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    'Year',
    # team columns
    'W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS',
    # advanced columns
    'PER', 'TS%', '3PAr', 'FTr',
    'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%',
    'OWS', 'DWS', 'WS', 'WS/48',
    'OBPM', 'DBPM', 'BPM', 'VORP',
]

## Train/test split

In [107]:
# Hold out the 2022 season for evaluation; fit on every earlier season.
is_before_2022 = stats["Year"] < 2022
is_2022 = stats["Year"] == 2022
train = stats[is_before_2022]
test = stats[is_2022]

In [108]:
from sklearn.linear_model import Ridge, Lasso, LinearRegression

In [109]:
ridge = Ridge(alpha = .1)

In [110]:
ridge.fit(train[features], train["Share"])

Ridge(alpha=0.1)

In [111]:
preds = ridge.predict(test[features])

In [112]:
preds = pd.DataFrame(preds, columns = ["predictions"], index = test.index)

In [113]:
preds

Unnamed: 0,predictions
715,-0.011589
716,-0.042163
717,-0.000217
718,0.003795
719,-0.016701
...,...
15131,0.004756
15132,-0.001328
15133,-0.002516
15134,-0.005816


In [114]:
combine = pd.concat([test[["Player","Share"]], preds], axis =1)

In [115]:
combine

Unnamed: 0,Player,Share,predictions
715,Aaron Gordon,0.0,-0.011589
716,Austin Rivers,0.0,-0.042163
717,Bol Bol,0.0,-0.000217
718,Bones Hyland,0.0,0.003795
719,Bryn Forbes,0.0,-0.016701
...,...,...,...
15131,Micah Potter,0.0,0.004756
15132,Rodney McGruder,0.0,-0.001328
15133,Saben Lee,0.0,-0.002516
15134,Saddiq Bey,0.0,-0.005816


In [116]:
combine.sort_values("Share", ascending = False).head(10)

Unnamed: 0,Player,Share,predictions
730,Nikola Jokić,0.875,0.265256
904,Joel Embiid,0.706,0.18518
14225,Giannis Antetokounmpo,0.595,0.235504
974,Devin Booker,0.216,0.075031
14016,Luka Dončić,0.146,0.176381
1246,Jayson Tatum,0.043,0.112005
14849,Ja Morant,0.01,0.097849
7524,Stephen Curry,0.004,0.081643
972,Chris Paul,0.002,0.066076
9573,LeBron James,0.001,0.131443


In [117]:
from sklearn.metrics import mean_squared_error, r2_score

In [118]:
mean_squared_error(combine["Share"],combine["predictions"])

0.0018516051024500353

In [119]:
combine["Share"].value_counts()

0.000    593
0.001      3
0.875      1
0.706      1
0.002      1
0.216      1
0.043      1
0.004      1
0.146      1
0.595      1
0.010      1
Name: Share, dtype: int64

In [120]:
# Order by actual vote share and number the rows 1..n in that order.
combine = (
    combine
    .sort_values(by="Share", ascending=False)
    .assign(Rank=lambda frame: list(range(1, len(frame) + 1)))
)

In [121]:
combine.head(10)

Unnamed: 0,Player,Share,predictions,Rank
730,Nikola Jokić,0.875,0.265256,1
904,Joel Embiid,0.706,0.18518,2
14225,Giannis Antetokounmpo,0.595,0.235504,3
974,Devin Booker,0.216,0.075031,4
14016,Luka Dončić,0.146,0.176381,5
1246,Jayson Tatum,0.043,0.112005,6
14849,Ja Morant,0.01,0.097849,7
7524,Stephen Curry,0.004,0.081643,8
972,Chris Paul,0.002,0.066076,9
9573,LeBron James,0.001,0.131443,10


In [122]:
# Order by predicted share and number the rows 1..n in that order.
combine = (
    combine
    .sort_values(by="predictions", ascending=False)
    .assign(pred_rank=lambda frame: list(range(1, len(frame) + 1)))
)

In [123]:
combine.head(10)

Unnamed: 0,Player,Share,predictions,Rank,pred_rank
730,Nikola Jokić,0.875,0.265256,1,1
14225,Giannis Antetokounmpo,0.595,0.235504,3,2
904,Joel Embiid,0.706,0.18518,2,3
14016,Luka Dončić,0.146,0.176381,5,4
9573,LeBron James,0.001,0.131443,10,5
7311,Kevin Durant,0.001,0.115032,12,6
1246,Jayson Tatum,0.043,0.112005,6,7
14367,Trae Young,0.0,0.107129,289,8
14849,Ja Morant,0.01,0.097849,7,9
9608,Karl-Anthony Towns,0.0,0.092362,121,10


In [124]:
combine.sort_values("Share",ascending = False).head(10)

Unnamed: 0,Player,Share,predictions,Rank,pred_rank
730,Nikola Jokić,0.875,0.265256,1,1
904,Joel Embiid,0.706,0.18518,2,3
14225,Giannis Antetokounmpo,0.595,0.235504,3,2
974,Devin Booker,0.216,0.075031,4,17
14016,Luka Dončić,0.146,0.176381,5,4
1246,Jayson Tatum,0.043,0.112005,6,7
14849,Ja Morant,0.01,0.097849,7,9
7524,Stephen Curry,0.004,0.081643,8,14
972,Chris Paul,0.002,0.066076,9,20
7311,Kevin Durant,0.001,0.115032,12,6


In [125]:
def find_avg_precision(combine):
    """Average precision of the predicted ordering against the actual top five.

    The five rows with the highest ``Share`` are treated as the relevant set.
    Rows are then walked in descending ``predictions`` order; each time a
    relevant player is met, the precision at that depth (hits so far divided
    by rows seen so far) is recorded. The mean of those precisions is returned.

    Parameters
    ----------
    combine : pandas.DataFrame
        Must contain the columns ``Player``, ``Share`` and ``predictions``.

    Returns
    -------
    float
        Mean of the recorded precisions, or ``0.0`` when there is nothing to
        score (an empty frame) instead of raising ``ZeroDivisionError``.
    """
    top_actual = combine.sort_values("Share", ascending=False).head(5)
    ranked = combine.sort_values("predictions", ascending=False)

    # One vectorised membership test replaces a per-row scan of the top-five
    # names inside an iterrows() loop.
    is_hit = ranked["Player"].isin(top_actual["Player"]).tolist()

    precisions = []
    for seen, hit in enumerate(is_hit, start=1):
        if hit:
            # len(precisions) + 1 is the running count of relevant players found.
            precisions.append((len(precisions) + 1) / seen)

    if not precisions:
        return 0.0
    return sum(precisions) / len(precisions)

In [126]:
find_avg_precision(combine)

0.8588235294117647

### Backtesting

In [127]:
years = list(range(1981, 2023))

In [128]:
# Walk-forward evaluation: for every season from years[5] (1986) onward, fit the
# ridge model on all earlier seasons and score its predictions for that season.
aps = []  # one average-precision value per evaluated season
all_predictions = []  # one Player/Share/predictions frame per evaluated season

for year in years[5:]:
    # Only seasons strictly before `year` are used for fitting.
    train = stats[stats["Year"] < year]
    test = stats[stats["Year"] == year]
    ridge.fit(train[features], train["Share"])
    preds = ridge.predict(test[features])
    # Keep the test index so predictions line up with the player rows.
    preds = pd.DataFrame(preds, columns = ["predictions"], index = test.index)
    combine = pd.concat([test[["Player","Share"]], preds], axis =1)
    all_predictions.append(combine)
    aps.append(find_avg_precision(combine))
              
              

In [129]:
mean_avg_prec = sum(aps)/len(aps)
print(mean_avg_prec)

0.7959558527606921


In [130]:
def add_ranks(combine):
    """Return a copy of ``combine`` with actual and predicted rank columns.

    ``Rank`` numbers the rows 1..n by descending ``Share``; ``pred_rank``
    numbers them 1..n by descending ``predictions``; ``Diff_rank`` is
    ``Rank - pred_rank``. The result is ordered by descending ``predictions``
    and the input frame is not modified.
    """
    ordinals = list(range(1, len(combine) + 1))
    ranked = (
        combine
        .sort_values(by="Share", ascending=False)
        .assign(Rank=ordinals)
        .sort_values(by="predictions", ascending=False)
        .assign(pred_rank=ordinals)
    )
    return ranked.assign(Diff_rank=lambda frame: frame["Rank"] - frame["pred_rank"])
    
    

In [131]:
# Rank the second backtested season (years[5:][1], i.e. 1987) once and reuse
# the result, rather than calling add_ranks twice inside one expression.
ranked_season = add_ranks(all_predictions[1])
# Actual top five, ordered by how far the prediction under- or over-ranked them.
ranked_season[ranked_season["Rank"] < 6].sort_values("Diff_rank", ascending = False)

Unnamed: 0,Player,Share,predictions,Rank,pred_rank,Diff_rank
7855,Larry Bird,0.347,0.355819,3,1,2
12460,Michael Jordan,0.576,0.265287,2,2,0
10456,Dominique Wilkins,0.164,0.168627,5,5,0
21,Magic Johnson,0.94,0.255782,1,3,-2
7854,Kevin McHale,0.326,0.135568,4,9,-5


In [132]:
def backtest(stats, model, years, predictors):
    """Walk-forward backtest of ``model`` over ``years``.

    For each year, the model is fitted on every row whose ``Year`` is strictly
    earlier and used to predict ``Share`` for the rows of that year. The
    predictions are ranked with ``add_ranks`` and scored with
    ``find_avg_precision``.

    Parameters
    ----------
    stats : pandas.DataFrame
        Must contain ``Year``, ``Player``, ``Share`` and every column named in
        ``predictors``.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Refitted in place for each year.
    years : iterable
        Seasons to evaluate, each of which needs earlier seasons in ``stats``.
    predictors : list of str
        Column names used as model inputs.

    Returns
    -------
    tuple
        ``(mean average precision, list of per-year average precisions,
        DataFrame of all ranked per-year predictions concatenated)``.
    """
    aps = []
    all_predictions = []
    for year in years:
        train = stats[stats["Year"] < year]
        test = stats[stats["Year"] == year]
        model.fit(train[predictors], train["Share"])
        predictions = model.predict(test[predictors])
        # Keep the test index so predictions line up with the player rows.
        predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)
        combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)
        combination = add_ranks(combination)
        all_predictions.append(combination)
        # The scoring helper defined in this notebook is find_avg_precision;
        # the previous call to find_abs_precision named a function that is
        # never defined and raises NameError on a fresh kernel.
        aps.append(find_avg_precision(combination))
    return sum(aps) / len(aps), aps, pd.concat(all_predictions)


In [133]:
mean_ap, aps, all_predictions = backtest(stats, ridge, years[5:], features)

In [134]:
mean_ap

0.7959558527606921

In [135]:
# biggest differences in rank
all_predictions[all_predictions["Rank"]<6].sort_values("Diff_rank").head(10)

Unnamed: 0,Player,Share,predictions,Rank,pred_rank,Diff_rank
10006,Glen Rice,0.117,0.031859,5,53,-48
10274,Peja Stojaković,0.228,0.051443,4,34,-30
5508,Hakeem Olajuwon,0.247,0.062035,4,29,-25
6546,Steve Nash,0.839,0.061435,1,26,-25
6564,Steve Nash,0.739,0.073047,1,26,-25
16505,Patrick Ewing,0.235,0.082194,4,23,-19
14572,Tony Parker,0.274,0.056502,5,23,-18
5035,Stephen Curry,0.173,0.079657,5,21,-16
1566,Chris Paul,0.138,0.061817,5,20,-15
6370,Jason Kidd,0.135,0.04517,5,19,-14


In [141]:
all_predictions[all_predictions["Rank"]<6]

Unnamed: 0,Player,Share,predictions,Rank,pred_rank,Diff_rank
13215,Larry Bird,0.981,0.328058,1,1,0
7,Magic Johnson,0.263,0.195027,3,2,1
10443,Dominique Wilkins,0.522,0.152667,2,4,-2
4,Kareem Abdul-Jabbar,0.173,0.093040,5,10,-5
5508,Hakeem Olajuwon,0.247,0.062035,4,29,-25
...,...,...,...,...,...,...
730,Nikola Jokić,0.875,0.265256,1,1,0
14225,Giannis Antetokounmpo,0.595,0.235504,3,2,1
904,Joel Embiid,0.706,0.185180,2,3,-1
14016,Luka Dončić,0.146,0.176381,5,4,1


In [136]:
# Put the coefficients from the latest fit of `ridge` (column 0) beside the
# feature names (column 1), largest coefficient first.
coef_table = pd.concat([pd.Series(ridge.coef_), pd.Series(features)], axis=1)
coef_table.sort_values(by=0, ascending=False)

Unnamed: 0,0,1
13,0.159433,eFG%
29,0.065329,W/L%
49,0.029194,WS/48
18,0.026823,DRB
53,0.025355,VORP
16,0.019836,FT%
15,0.018634,FTA
5,0.017677,FGA
23,0.014241,TOV
10,0.013642,2P


In [142]:
# Add integer-coded versions of the text columns Pos and Tm as new columns
# (NPos, NTm) on the shared `stats` frame.
# NOTE(review): presumably these feed the Random Forest section that follows — confirm.
stats["NPos"] = stats["Pos"].astype("category").cat.codes
stats["NTm"] = stats["Tm"].astype("category").cat.codes

# Display how many rows fall under each team code.
stats["NTm"].value_counts()

In [None]:
## Random Forest