In [21]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [2]:
# Load the per-player season stats (with MVP voting columns) from the project's data folder
stats = pd.read_csv(filepath_or_buffer="../data/player_mvp_stats.csv")

In [3]:
# Preview the first five rows to sanity-check the load
stats.head(n=5)

Unnamed: 0.1,Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
1,1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
2,2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
3,3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
4,4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73


In [4]:
# Remove the leftover "Unnamed: 0" column (an index written out by an earlier CSV export).
# drop(..., errors="ignore") returns a new frame and is a no-op when the column is
# already gone, so this cell can be re-run safely; `del stats[...]` raised KeyError
# on a second run.
stats = stats.drop(columns=["Unnamed: 0"], errors="ignore")

In [5]:
# Count the missing values in each column

stats.isna().sum()

Player        0
Pos           0
Age           0
Tm            0
G             0
GS            0
MP            0
FG            0
FGA           0
FG%          59
3P            0
3PA           0
3P%        2086
2P            0
2PA           0
2P%         100
eFG%         59
FT            0
FTA           0
FT%         521
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
Year          0
Pts Won       0
Pts Max       0
Share         0
Team          0
W             0
L             0
W/L%          0
GB            0
PS/G          0
PA/G          0
SRS           0
dtype: int64

In [6]:
# Are the missing 3P% values explained by players with zero three-point attempts?
# Show the player name and 3PA for every row whose 3P% is null.

stats.loc[stats["3P%"].isna(), ["Player", "3PA"]]

Unnamed: 0,Player,3PA
2,Elden Campbell,0.0
3,Irving Thomas,0.0
18,Jack Haley,0.0
20,Keith Owens,0.0
30,Benoit Benjamin,0.0
...,...,...
14666,Evan Eschmeyer,0.0
14667,Gheorghe Mureșan,0.0
14669,Jim McIlvaine,0.0
14675,Mark Hendrickson,0.0


In [7]:
# Are the missing FT% values explained by players with zero free-throw attempts?
# Show the player name and FTA for every row whose FT% is null.

stats.loc[stats["FT%"].isna(), ["Player", "FTA"]]

Unnamed: 0,Player,FTA
77,John Coker,0.0
92,Jason Sasser,0.0
103,Adrian Caldwell,0.0
119,Bruno Šundov,0.0
158,Jamal Robinson,0.0
...,...,...
14556,Mark McNamara,0.0
14584,Luke Zeller,0.0
14637,Myron Brown,0.0
14659,Malcolm Lee,0.0


In [8]:
# Fill every remaining null with 0. Per the null counts above, only the
# percentage columns contain nulls, so this only touches those columns.

stats = stats.fillna(value=0)

In [9]:
# List every column name so we can pick which ones to feed the model as predictors

stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [10]:
# Numeric feature columns fed to the model. The voting columns
# ('Pts Won', 'Pts Max', 'Share') are left out because they are what we
# are trying to predict; text columns (Player, Pos, Tm, Team) are left out too.

predictors = [
    # player age and playing time
    'Age', 'G', 'GS', 'MP',
    # field goals
    'FG', 'FGA', 'FG%',
    '3P', '3PA', '3P%',
    '2P', '2PA', '2P%',
    'eFG%',
    # free throws
    'FT', 'FTA', 'FT%',
    # rebounds and other box-score stats
    'ORB', 'DRB', 'TRB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    # season
    'Year',
    # team record and scoring columns
    'W', 'L', 'W/L%', 'GB',
    'PS/G', 'PA/G', 'SRS',
]

In [11]:
# Training set: every season strictly before 2021

train = stats.loc[stats["Year"].lt(2021)]

In [12]:
# Test set: the 2021 season only

test = stats.loc[stats["Year"].eq(2021)]

In [13]:
# Create the ridge regression model with regularization strength alpha = 0.1

reg = Ridge(alpha=0.1)

In [14]:
# Fit on the training seasons: predictor columns as features, MVP vote share as target

reg.fit(X=train[predictors], y=train["Share"])

In [15]:
# Predict MVP vote share for every player in the held-out 2021 season

predictions = reg.predict(X=test[predictors])

In [16]:
# Wrap the raw predictions in a one-column DataFrame that shares the test set's
# index, so they can be lined up against the actual rows later
predictions = pd.DataFrame(
    data=predictions,
    index=test.index,
    columns=["predictions"],
)

In [17]:
# Display the predictions frame (a single "predictions" column indexed like `test`)
predictions

Unnamed: 0,predictions
630,0.013567
631,-0.013756
632,0.002414
633,-0.004421
634,0.010734
...,...
14502,-0.012571
14503,-0.011575
14504,0.016424
14505,-0.020434


In [18]:
# Put each test player's name and actual vote share next to the model's prediction
# (column-wise concat, aligned on the shared index)

combination = pd.concat(
    objs=[test.loc[:, ["Player", "Share"]], predictions],
    axis="columns",
)

In [19]:
# Display the combined frame: Player, actual Share, and the model's predictions
combination

Unnamed: 0,Player,Share,predictions
630,Aaron Gordon,0.0,0.013567
631,Austin Rivers,0.0,-0.013756
632,Bol Bol,0.0,0.002414
633,Facundo Campazzo,0.0,-0.004421
634,Greg Whittington,0.0,0.010734
...,...,...,...
14502,Patty Mills,0.0,-0.012571
14503,Quinndary Weatherspoon,0.0,-0.011575
14504,Rudy Gay,0.0,0.016424
14505,Tre Jones,0.0,-0.020434


In [20]:
# Ten players with the highest actual vote share, shown alongside their predictions
combination.sort_values(by="Share", ascending=False).head(n=10)

Unnamed: 0,Player,Share,predictions
641,Nikola Jokić,0.961,0.154306
9018,Joel Embiid,0.58,0.162713
3843,Stephen Curry,0.449,0.142386
10338,Giannis Antetokounmpo,0.345,0.207436
1499,Chris Paul,0.138,0.072293
11449,Luka Dončić,0.042,0.15143
7759,Damian Lillard,0.038,0.116303
3707,Julius Randle,0.02,0.088877
3702,Derrick Rose,0.01,0.033001
11871,Rudy Gobert,0.008,0.09535


In [22]:
# Mean squared error between actual and predicted vote share on the test season
mean_squared_error(
    y_true=combination["Share"],
    y_pred=combination["predictions"],
)

0.0026668960013828758

In [23]:
# Check how many players receive MVP votes: frequency of each distinct Share value

combination.loc[:, "Share"].value_counts()

0.000    525
0.001      3
0.961      1
0.138      1
0.010      1
0.020      1
0.449      1
0.005      1
0.038      1
0.003      1
0.580      1
0.345      1
0.042      1
0.008      1
Name: Share, dtype: int64