In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [14]:
# Let wide frames show up to 50 columns before pandas elides the middle ones.
pd.options.display.max_columns = 50

In [3]:
# Load the player/MVP-voting table; the path is relative to the notebook's working directory.
stats = pd.read_csv(filepath_or_buffer="data/player_mvp_stats.csv")

In [5]:
# Size of the loaded table as (rows, columns).
stats.shape

(17960, 41)

In [11]:
# Count missing values per column to see which fields need cleaning.
stats.isna().sum()

Player        0
Pos           1
Age           1
Tm            1
G             1
GS          281
MP            1
FG            1
FGA           1
FG%          64
3P            1
3PA           1
3P%        2681
2P            1
2PA           1
2P%         107
eFG%         64
FT            1
FTA           1
FT%         582
ORB           1
DRB           1
TRB           1
AST           1
STL           1
BLK           1
TOV           1
PF            1
PTS           1
Year          0
Pts Won       0
Pts Max       0
Share         0
Team          1
W             1
L             1
W/L%          1
GB            1
PS/G          1
PA/G          1
SRS           1
dtype: int64

In [16]:
# Rows with no 3P% value, shown next to their attempt counts to explain the gap.
stats.loc[stats["3P%"].isna(), ["Player", "3PA", "3P%"]]

Unnamed: 0,Player,3PA,3P%
3,Jerome Henderson,0.0,
5,Kurt Rambis,0.0,
12,Pétur Guðmundsson,0.0,
20,Kurt Rambis,0.0,
23,Mike Smrek,0.0,
...,...,...,...
17933,Kenny Dennard,0.0,
17936,Leon Douglas,0.0,
17940,Reggie King,0.0,
17941,Steve Johnson,0.0,


In [17]:
# Rows with no FT% value, shown next to their attempt counts to explain the gap.
stats.loc[stats["FT%"].isna(), ["Player", "FTA", "FT%"]]

Unnamed: 0,Player,FTA,FT%
3,Jerome Henderson,0.0,
144,John Coker,0.0,
159,Jason Sasser,0.0,
170,Adrian Caldwell,0.0,
186,Bruno Šundov,0.0,
...,...,...,...
17806,Luke Zeller,0.0,
17859,Myron Brown,0.0,
17881,Malcolm Lee,0.0,
17955,Steve Novak,0.0,


In [18]:
# Replace every missing value with 0. The sampled rows shown above have a missing
# 3P% / FT% alongside 0 attempts, so 0 is a sensible stand-in for those percentages.
# NOTE(review): the null counts also show one row missing nearly every field
# (Pos, Tm, Team, W, ...); this blanket fill turns that row into all zeros,
# including in what are presumably text columns. Consider dropping it instead.
stats = stats.fillna(value=0)

In [19]:
# List every column label; the feature list chosen below is a subset of these.
stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [28]:
# Model inputs, kept in the same order as the columns of `stats`.
# Not included: Player, Pos, Tm, Team, and the voting columns
# (Pts Won, Pts Max, Share). Share is the target the model is fitted against.
features = (
    ["Age", "G", "GS", "MP"]
    + ["FG", "FGA", "FG%", "3P", "3PA", "3P%", "2P", "2PA", "2P%", "eFG%"]
    + ["FT", "FTA", "FT%"]
    + ["ORB", "DRB", "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS"]
    + ["Year"]
    + ["W", "L", "W/L%", "GB", "PS/G", "PA/G", "SRS"]
)

## Train/test split

In [40]:
# Hold out the 2022 season for evaluation and train on every earlier season.
# The cutoff lives in one constant so the two filters cannot drift apart.
TEST_YEAR = 2022

# .copy() gives each split its own data, so later column assignments on
# train/test neither touch `stats` nor trigger SettingWithCopyWarning.
train = stats[stats["Year"] < TEST_YEAR].copy()
test = stats[stats["Year"] == TEST_YEAR].copy()

# Fail here with a clear message rather than deep inside fit()/predict().
assert len(train) > 0, f"no training rows with Year < {TEST_YEAR}"
assert len(test) > 0, f"no test rows with Year == {TEST_YEAR}"

In [41]:
from sklearn.linear_model import Ridge, Lasso, LinearRegression

In [42]:
# Ridge regression with L2 penalty strength alpha = 0.1.
ridge = Ridge(alpha=0.1)

In [43]:
# Fit on the training seasons: feature columns as inputs, Share as the target.
ridge.fit(X=train[features], y=train["Share"])

Ridge(alpha=0.1)

In [44]:
# Predict Share for every held-out row, in the same row order as `test`.
preds = ridge.predict(X=test[features])

In [45]:
# Wrap the predictions in a one-column frame that carries test's index,
# so they can be lined up with the test rows by label.
preds = pd.DataFrame(
    data=preds,
    index=test.index,
    columns=["predictions"],
)

In [46]:
# Show the predicted shares, labelled with the index of the held-out rows.
preds

Unnamed: 0,predictions
715,0.010259
716,-0.032661
717,-0.005822
718,0.018560
719,-0.007688
...,...
15131,-0.017143
15132,-0.010045
15133,0.006152
15134,0.004272


In [47]:
# Put each held-out player's actual Share next to the model's prediction;
# the two pieces are aligned on their shared index.
combine = pd.concat(
    objs=[test.loc[:, ["Player", "Share"]], preds],
    axis="columns",
)

In [48]:
# Show actual vs. predicted Share side by side for the held-out rows.
combine

Unnamed: 0,Player,Share,predictions
715,Aaron Gordon,0.0,0.010259
716,Austin Rivers,0.0,-0.032661
717,Bol Bol,0.0,-0.005822
718,Bones Hyland,0.0,0.018560
719,Bryn Forbes,0.0,-0.007688
...,...,...,...
15131,Micah Potter,0.0,-0.017143
15132,Rodney McGruder,0.0,-0.010045
15133,Saben Lee,0.0,0.006152
15134,Saddiq Bey,0.0,0.004272


In [49]:
# Ten rows with the highest actual Share, to compare against their predictions.
(
    combine
    .sort_values(by="Share", ascending=False)
    .head(n=10)
)

Unnamed: 0,Player,Share,predictions
730,Nikola Jokić,0.875,0.20699
904,Joel Embiid,0.706,0.200762
14225,Giannis Antetokounmpo,0.595,0.220464
974,Devin Booker,0.216,0.093271
14016,Luka Dončić,0.146,0.16384
1246,Jayson Tatum,0.043,0.101878
14849,Ja Morant,0.01,0.119053
7524,Stephen Curry,0.004,0.101192
972,Chris Paul,0.002,0.085617
9573,LeBron James,0.001,0.159404


In [52]:
from sklearn.metrics import mean_squared_error, r2_score

In [51]:
# Mean squared error between actual and predicted Share on the held-out rows.
# NOTE(review): the value counts below show 593 of these rows have Share == 0,
# so this average mostly reflects them rather than the handful of vote-getters.
mean_squared_error(
    y_true=combine["Share"],
    y_pred=combine["predictions"],
)

0.0022517571502095454

In [53]:
# Frequency of each actual Share value among the held-out rows.
combine["Share"].value_counts()

0.000    593
0.001      3
0.875      1
0.706      1
0.002      1
0.216      1
0.043      1
0.004      1
0.146      1
0.595      1
0.010      1
Name: Share, dtype: int64