In [76]:
import pandas as pd

In [77]:
stats = pd.read_csv("player_mvp_stats.csv")

In [78]:
del stats["Unnamed: 0"]

In [79]:
pd.isnull(stats).sum()

Rk            0
Player        0
Pos           0
Age           0
Tm            0
G             0
GS            0
MP            0
FG            0
FGA           0
FG%          49
3P            0
3PA           0
3P%        2023
2P            0
2PA           0
2P%          79
eFG%         49
FT            0
FTA           0
FT%         444
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
Year          0
Pts Won       0
Pts Max       0
Share         0
Team          0
W             0
L             0
W/L%          0
GB            0
PS/G          0
PA/G          0
SRS           0
dtype: int64

In [80]:
stats = stats.fillna(0)

In [81]:
stats.columns

Index(['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [82]:
predictors = ['Age','G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS']

In [83]:
train = stats[stats["Year"] < 2020]

In [84]:
test = stats[stats["Year"] == 2020]

In [85]:
from sklearn.linear_model import Ridge

reg = Ridge(alpha=.1)

In [86]:
reg.fit(train[predictors], train["Share"])

Ridge(alpha=0.1)

In [87]:
predictions = reg.predict(test[predictors])

In [88]:
predictions = pd.DataFrame(predictions, columns=["predictions"], index=test.index)

In [89]:
predictions

Unnamed: 0,predictions
611,0.020689
612,-0.028052
613,0.000923
614,-0.014191
615,0.008199
...,...
12554,0.004809
12555,0.037983
12556,0.007636
12557,0.009623


In [90]:
combination = pd.concat([test[["Player", "Share"]], predictions],axis=1)

In [91]:
combination

Unnamed: 0,Player,Share,predictions
611,Aaron Gordon,0.0,0.020689
612,Al-Farouq Aminu,0.0,-0.028052
613,Amile Jefferson,0.0,0.000923
614,B.J. Johnson,0.0,-0.014191
615,D.J. Augustin,0.0,0.008199
...,...,...,...
12554,Rondae Hollis-Jefferson,0.0,0.004809
12555,Serge Ibaka,0.0,0.037983
12556,Shamorie Ponds,0.0,0.007636
12557,Stanley Johnson,0.0,0.009623


In [92]:
combination.sort_values("Share", ascending=False).head(10)

Unnamed: 0,Player,Share,predictions
11848,Giannis Antetokounmpo,0.952,0.239042
3581,LeBron James,0.746,0.16124
9307,James Harden,0.363,0.187201
8954,Luka Dončić,0.198,0.168907
5116,Kawhi Leonard,0.166,0.131559
3569,Anthony Davis,0.081,0.144882
1316,Chris Paul,0.026,0.079222
7195,Damian Lillard,0.023,0.13291
10938,Nikola Jokić,0.018,0.102292
12551,Pascal Siakam,0.017,0.077603


In [93]:
from sklearn.metrics import mean_squared_error

mean_squared_error(combination["Share"],combination["predictions"])

0.0025247880923921474

In [94]:
combination['Share'].value_counts()

0.000    517
0.026      1
0.081      1
0.746      1
0.166      1
0.009      1
0.023      1
0.198      1
0.363      1
0.018      1
0.001      1
0.952      1
0.017      1
Name: Share, dtype: int64

In [95]:
combination = combination.sort_values('Share',ascending=False)
combination['Rk'] = list(range(1,combination.shape[0]+1))

In [96]:
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk
11848,Giannis Antetokounmpo,0.952,0.239042,1
3581,LeBron James,0.746,0.16124,2
9307,James Harden,0.363,0.187201,3
8954,Luka Dončić,0.198,0.168907,4
5116,Kawhi Leonard,0.166,0.131559,5
3569,Anthony Davis,0.081,0.144882,6
1316,Chris Paul,0.026,0.079222,7
7195,Damian Lillard,0.023,0.13291,8
10938,Nikola Jokić,0.018,0.102292,9
12551,Pascal Siakam,0.017,0.077603,10


In [97]:
combination = combination.sort_values('predictions',ascending=False)
combination['Predicted_Rk'] = list(range(1,combination.shape[0]+1))

In [98]:
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk
11848,Giannis Antetokounmpo,0.952,0.239042,1,1
9307,James Harden,0.363,0.187201,3,2
8954,Luka Dončić,0.198,0.168907,4,3
3581,LeBron James,0.746,0.16124,2,4
3569,Anthony Davis,0.081,0.144882,6,5
7195,Damian Lillard,0.023,0.13291,8,6
5116,Kawhi Leonard,0.166,0.131559,5,7
2480,Joel Embiid,0.0,0.124611,300,8
4226,Karl-Anthony Towns,0.0,0.123287,508,9
11268,Trae Young,0.0,0.118689,265,10


In [99]:
def find_ap(combination):
    actual = combination.sort_values('Share', ascending=False).head(5)
    predicted= combination.sort_values("predictions", ascending=False)
    ps = []
    found = 0
    seen = 1
    for index, row in predicted.iterrows():
        if row["Player"] in actual["Player"].values:
            found += 1
            ps.append(found/seen)
        seen += 1
    return sum(ps) / len(ps)

In [100]:
find_ap(combination)

0.9428571428571428

In [101]:
years = list(range(1991,2021))

In [102]:
aps = []
all_predictions = []
for year in years[5:]:
    train = stats[stats["Year"] <year]
    test = stats[stats["Year"] == year]
    reg.fit(train[predictors], train["Share"])
    predictions = reg.predict(test[predictors])
    predictions = pd.DataFrame(predictions, columns=['predictions'], index=test.index)
    combination = pd.concat([test[['Player','Share']], predictions], axis=1)
    all_predictions.append(combination)
    aps.append(find_ap(combination))

In [103]:
sum(aps) / len(aps)

0.7091945189766616

In [104]:
def add_ranks(combination):
    combination = combination.sort_values('Share', ascending = False)
    combination['Rk'] = list(range(1,combination.shape[0]+1))
    combination = combination.sort_values('predictions', ascending = False)
    combination['Predicted_Rk'] = list(range(1,combination.shape[0]+1))
    combination["Diff"] = combination["Rk"] - combination["Predicted_Rk"]
    return combination

In [105]:
ranking = add_ranks(all_predictions[1])
ranking[ranking["Rk"] < 6].sort_values("Diff", ascending =False)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk,Diff
1530,Karl Malone,0.857,0.192318,1,2,-1
10161,Michael Jordan,0.832,0.167629,2,3,-1
871,Grant Hill,0.327,0.128646,3,6,-3
4487,Tim Hardaway,0.207,0.059984,4,20,-16
7943,Glen Rice,0.117,0.03311,5,53,-48


In [106]:
def backtest(stats, model, year, predictors):
    aps = []
    all_predictions = []
    for year in years[5:]:
        train = stats[stats["Year"] <year]
        test = stats[stats["Year"] == year]
        reg.fit(train[predictors], train["Share"])
        predictions = reg.predict(test[predictors])
        predictions = pd.DataFrame(predictions, columns=['predictions'], index=test.index)
        combination = pd.concat([test[['Player','Share']], predictions], axis=1)
        combination = add_ranks(combination)
        all_predictions.append(combination)
        aps.append(find_ap(combination))
    return sum(aps)/len(aps), aps, pd.concat(all_predictions)

In [107]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)

In [108]:
mean_ap

0.7091945189766616

In [109]:
all_predictions[all_predictions["Rk"] <=5].sort_values("Diff").head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk,Diff
1170,Jason Kidd,0.712,0.02821,2,52,-50
7943,Glen Rice,0.117,0.03311,5,53,-48
4980,Steve Nash,0.839,0.0341,1,45,-44
8211,Peja Stojaković,0.228,0.03627,4,38,-34
4998,Steve Nash,0.739,0.054129,1,34,-33
12241,Joakim Noah,0.258,0.046968,4,37,-33
3518,Chauncey Billups,0.344,0.052696,5,35,-30
5013,Steve Nash,0.785,0.074421,2,21,-19
4487,Tim Hardaway,0.207,0.059984,4,20,-16
10150,Scottie Pippen,0.2,0.066921,5,20,-15


In [110]:
pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)

Unnamed: 0,0,1
13,0.055322,eFG%
18,0.035209,DRB
29,0.025842,W/L%
17,0.022591,ORB
10,0.016478,2P
22,0.012048,BLK
21,0.011805,STL
15,0.009191,FTA
5,0.007608,FGA
20,0.00759,AST


In [111]:
 stat_ratios = stats[["PTS", "AST", "STL", "BLK", "3P", "Year"]].groupby("Year").apply(lambda x: x/x.mean())

In [112]:
stat_ratios

Unnamed: 0,PTS,AST,STL,BLK,3P,Year
0,1.013334,0.420714,0.961127,0.673469,0.508587,1.0
1,1.614653,1.028412,1.647646,0.673469,4.577279,1.0
2,0.311795,0.093492,0.274608,1.571429,0.000000,1.0
3,0.200440,0.186984,0.274608,0.000000,0.000000,1.0
4,2.383005,1.636110,1.784950,0.897959,1.525760,1.0
...,...,...,...,...,...,...
13547,0.735752,0.819562,0.479763,1.528302,0.650951,1.0
13548,0.071202,0.000000,0.000000,0.000000,0.130190,1.0
13549,1.281633,0.601012,1.119447,2.547170,0.520761,1.0
13550,0.474679,0.218550,0.319842,1.273585,0.650951,1.0


In [113]:
stats[["PTS_T", "AST_R", "STL_R", "BLK_R", "3P_R"]] = stat_ratios[['PTS', 'AST', 'STL', 'BLK', '3P']]

In [114]:
predictors += ["PTS_T", "AST_R", "STL_R", "BLK_R", "3P_R"]

In [115]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)

In [116]:
mean_ap

0.7180594999835173

In [117]:
stats["NPos"] = stats["Pos"].astype("category").cat.codes

In [118]:
stats["NTm"] = stats["Tm"].astype("category").cat.codes

In [119]:
stats["Pos"].unique()

array(['PF', 'SG', 'SF', 'PG', 'C', 'PG-SG', 'PF-SF', 'SG-PG', 'PF-C',
       'SG-SF', 'SF-PF', 'SF-SG', 'C-PF', 'SG-PF', 'PG-SF', 'SF-C'],
      dtype=object)

In [120]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=50, random_state=1, min_samples_split=5)

mean_ap, aps, all_predictions = backtest(stats, rf, years[28:], predictors)

In [121]:
mean_ap

0.7180594999835173

In [122]:
 mean_ap, aps, all_predictions = backtest(stats, reg, years[28:], predictors)

In [75]:
mean_ap

0.7180594999835173