# Step 1: Read and prepare data

In [1]:
import pandas as pd

In [2]:
stats = pd.read_csv("player_mvp_stats.csv")

In [3]:
stats

Unnamed: 0.1,Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS,Seed
0,0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,...,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73,2
1,1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,...,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73,2
2,2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,...,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73,2
3,3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,...,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73,2
4,4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,...,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14087,14087,Spencer Hawes,PF,28,MIL,54,1,14.8,2.5,5.1,...,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45,6
14088,14088,Steve Novak,PF,33,MIL,8,0,2.8,0.3,0.9,...,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45,6
14089,14089,Terrence Jones,PF,25,MIL,54,12,23.5,4.3,9.1,...,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45,6
14090,14090,Thon Maker,C,19,MIL,57,34,9.9,1.5,3.2,...,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45,6


In [4]:
del stats["Unnamed: 0"]

In [5]:
stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14092 entries, 0 to 14091
Data columns (total 42 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Player   14092 non-null  object 
 1   Pos      14092 non-null  object 
 2   Age      14092 non-null  int64  
 3   Tm       14092 non-null  object 
 4   G        14092 non-null  int64  
 5   GS       14092 non-null  int64  
 6   MP       14092 non-null  float64
 7   FG       14092 non-null  float64
 8   FGA      14092 non-null  float64
 9   FG%      14042 non-null  float64
 10  3P       14092 non-null  float64
 11  3PA      14092 non-null  float64
 12  3P%      12050 non-null  float64
 13  2P       14092 non-null  float64
 14  2PA      14092 non-null  float64
 15  2P%      14008 non-null  float64
 16  eFG%     14042 non-null  float64
 17  FT       14092 non-null  float64
 18  FTA      14092 non-null  float64
 19  FT%      13630 non-null  float64
 20  ORB      14092 non-null  float64
 21  DRB      140

### Null values in FG%, 3P%, and FT%; division by 0 is likely explanation

In [6]:
stats[pd.isnull(stats["3P%"])][["Player", "3PA"]]

Unnamed: 0,Player,3PA
2,Elden Campbell,0.0
3,Irving Thomas,0.0
18,Jack Haley,0.0
20,Keith Owens,0.0
30,Benoit Benjamin,0.0
...,...,...
14061,Evan Eschmeyer,0.0
14062,Gheorghe Mureșan,0.0
14064,Jim McIlvaine,0.0
14070,Mark Hendrickson,0.0


In [7]:
stats[pd.isnull(stats["FT%"])][["Player", "FTA"]]

Unnamed: 0,Player,FTA
77,John Coker,0.0
92,Jason Sasser,0.0
103,Adrian Caldwell,0.0
119,Bruno Šundov,0.0
158,Jamal Robinson,0.0
...,...,...
13951,Mark McNamara,0.0
13979,Luke Zeller,0.0
14032,Myron Brown,0.0
14054,Malcolm Lee,0.0


### This confirms that those with null % values did not attempt any FG, 3P or FT; impute with 0's (consider using league average for that year instead)

In [8]:
stats = stats.fillna(0)

In [9]:
stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS', 'Seed'],
      dtype='object')

In [10]:
predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS', 'Seed']

In [11]:
train = stats[stats["Year"] < 2021]
test = stats[stats["Year"] == 2021]

# Model 1: Ridge Regression

In [12]:
from sklearn.linear_model import Ridge

ridge = Ridge(alpha = 1)

In [13]:
ridge.fit(train[predictors], train["Share"])

Ridge(alpha=1)

In [14]:
predictions = ridge.predict(test[predictors])

In [15]:
predictions = pd.DataFrame(predictions, columns = ["predictions"], index = test.index)

In [16]:
predictions

Unnamed: 0,predictions
630,0.015362
631,-0.012965
632,0.003106
633,-0.004091
634,0.012598
...,...
13897,-0.013308
13898,-0.011936
13899,0.016189
13900,-0.019683


In [17]:
combination = pd.concat([test[["Player", "Share"]], predictions], axis=1)

In [18]:
combination

Unnamed: 0,Player,Share,predictions
630,Aaron Gordon,0.0,0.015362
631,Austin Rivers,0.0,-0.012965
632,Bol Bol,0.0,0.003106
633,Facundo Campazzo,0.0,-0.004091
634,Greg Whittington,0.0,0.012598
...,...,...,...
13897,Patty Mills,0.0,-0.013308
13898,Quinndary Weatherspoon,0.0,-0.011936
13899,Rudy Gay,0.0,0.016189
13900,Tre Jones,0.0,-0.019683


In [19]:
combination = combination.sort_values("Share", ascending = False)

In [20]:
from sklearn.metrics import mean_squared_error

mean_squared_error(combination["Share"], combination["predictions"])

0.0026812734738800837

### MSE is not a great error metric since only the top few MVP candidates are significant and most players will have a share of 0. Create new metric based on the top 5 MVP finishes each season

In [21]:
actual = combination.sort_values("Share", ascending = False)
combination["Rk"] = list(range(1, combination.shape[0]+1))

In [22]:
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk
641,Nikola Jokić,0.961,0.155672,1
8624,Joel Embiid,0.58,0.164936,2
3651,Stephen Curry,0.449,0.143951,3
9907,Giannis Antetokounmpo,0.345,0.209555,4
1389,Chris Paul,0.138,0.073629,5
10997,Luka Dončić,0.042,0.152304,6
7464,Damian Lillard,0.038,0.118753,7
3536,Julius Randle,0.02,0.091399,8
3531,Derrick Rose,0.01,0.03541,9
11358,Rudy Gobert,0.008,0.097795,10


In [23]:
combination = combination.sort_values("predictions", ascending = False)
combination["Predicted_Rk"] = list(range(1, combination.shape[0] + 1))


In [24]:
combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk
9907,Giannis Antetokounmpo,0.345,0.209555,4,1
8624,Joel Embiid,0.58,0.164936,2,2
641,Nikola Jokić,0.961,0.155672,1,3
10997,Luka Dončić,0.042,0.152304,6,4
3736,LeBron James,0.001,0.149127,15,5
3651,Stephen Curry,0.449,0.143951,3,6
4177,Kevin Durant,0.0,0.143659,531,7
4174,James Harden,0.001,0.143004,13,8
11784,Zion Williamson,0.0,0.126547,251,9
3876,Russell Westbrook,0.005,0.120107,11,10


In [25]:
combination.sort_values("predictions", ascending = False).head(10)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk
9907,Giannis Antetokounmpo,0.345,0.209555,4,1
8624,Joel Embiid,0.58,0.164936,2,2
641,Nikola Jokić,0.961,0.155672,1,3
10997,Luka Dončić,0.042,0.152304,6,4
3736,LeBron James,0.001,0.149127,15,5
3651,Stephen Curry,0.449,0.143951,3,6
4177,Kevin Durant,0.0,0.143659,531,7
4174,James Harden,0.001,0.143004,13,8
11784,Zion Williamson,0.0,0.126547,251,9
3876,Russell Westbrook,0.005,0.120107,11,10


In [26]:
def find_ap(combination):
    actual = combination.sort_values("Share", ascending = False).head(5)
    predicted = combination.sort_values("predictions", ascending = False)
    ps = []
    found = 0
    seen = 0
    for index, row in predicted.iterrows():
        seen += 1
        if row["Player"] in actual["Player"].values:
            found += 1
            ps.append(found/seen)
    return sum(ps) / len(ps)

In [27]:
find_ap(combination)

0.7636363636363636

In [28]:
years = list(range(1991, 2022))

In [29]:
aps = []
all_predictions = []

for year in years[5:]:
    train = stats[stats["Year"] < year]
    test = stats[stats["Year"] == year]
    ridge.fit(train[predictors], train["Share"])
    predictions = ridge.predict(test[predictors])
    predictions = pd.DataFrame(predictions, columns = ["predictions"], index = test.index)
    combination = pd.concat([test[["Player", "Share"]], predictions], axis = 1)
    all_predictions.append(combination)
    aps.append(find_ap(combination))

In [30]:
sum(aps) / len(aps)

0.7145582020749968

### For the years 1996 - 2022, our model was able to predict the top 5 MVP finalists with about 71% accuracy using our player-searching metric.

In [31]:
def add_ranks(combination):
    combination = combination.sort_values("Share", ascending = False)
    combination["Rk"] = list(range(1, combination.shape[0]+1))
    combination = combination.sort_values("predictions", ascending = False)
    combination["Predicted_Rk"] = list(range(1, combination.shape[0] + 1))
    combination["Diff"] = combination["Rk"] - combination["Predicted_Rk"]
    return combination

In [32]:
ranking = add_ranks(all_predictions[1])
ranking[ranking["Rk"] <= 5].sort_values("Diff", ascending = False)

Unnamed: 0,Player,Share,predictions,Rk,Predicted_Rk,Diff
1600,Karl Malone,0.857,0.191039,1,2,-1
10524,Michael Jordan,0.832,0.166353,2,3,-1
908,Grant Hill,0.327,0.12861,3,6,-3
4682,Tim Hardaway,0.207,0.061123,4,20,-16
8248,Glen Rice,0.117,0.032864,5,53,-48


In [33]:
def backtest(stats, model, year, predictors):
    aps = []
    all_predictions = []

    for year in years[5:]:
        train = stats[stats["Year"] < year]
        test = stats[stats["Year"] == year]
        model.fit(train[predictors], train["Share"])
        predictions = model.predict(test[predictors])
        predictions = pd.DataFrame(predictions, columns = ["predictions"], index = test.index)
        combination = pd.concat([test[["Player", "Share"]], predictions], axis = 1)
        combination = add_ranks(combination)
        all_predictions.append(combination)
        aps.append(find_ap(combination))
    
    return sum(aps) / len(aps), aps, pd.concat(all_predictions)

# Model 2: Lasso Regression
### more likely to zero out some variables

In [34]:
from sklearn.linear_model import Lasso

lasso = Lasso(alpha = 0.05)

In [35]:
mean_ap, aps, all_predictions = backtest(stats, lasso, years[5:], predictors)

In [36]:
lasso.coef_

array([ 0.00000000e+00, -0.00000000e+00,  1.23148855e-04,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00,  9.87685758e-04, -0.00000000e+00,  8.02709487e-05,
       -1.81525210e-04,  0.00000000e+00, -0.00000000e+00,  0.00000000e+00,
       -0.00000000e+00,  0.00000000e+00, -0.00000000e+00])

In [37]:
mean_ap

0.7550729679709028

In [38]:
ridge.coef_

array([ 3.22137656e-04,  9.04184092e-05, -7.78482795e-06, -4.12493672e-03,
        5.94566776e-03,  4.39614907e-03, -9.77494101e-02,  3.52681965e-03,
       -8.75304552e-03, -7.80624144e-03,  1.64912427e-02, -1.64045425e-02,
       -2.97726734e-03,  4.06461719e-02, -6.98560306e-03,  1.14611452e-02,
       -4.33506841e-03,  1.95027206e-02,  3.30920241e-02, -2.55254211e-02,
        7.41954848e-03,  1.15929317e-02,  1.12177893e-02, -9.71777866e-03,
       -2.68766544e-03,  6.08901841e-03, -1.59605744e-04,  9.41484642e-06,
       -2.10655092e-04,  1.73418853e-02,  2.63273749e-04, -1.59651565e-03,
        8.26349878e-04,  6.93245545e-04, -7.74596678e-04])

# Model 3: Random Forest

In [39]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=50, random_state=1, min_samples_split=5)

In [40]:
mean_ap, aps, all_predictions = backtest(stats, rf, years[5:], predictors)

In [41]:
mean_ap

0.7036509556305937

In [43]:
mean_ap, aps, all_predictions = backtest(stats, lasso, years[5:], predictors)

In [44]:
mean_ap

0.7550729679709028

# Model 4: Neural Network

In [50]:
from sklearn.neural_network import MLPRegressor

In [53]:
nn_params = {
    'hidden_layer_sizes': [25, 50, 100],
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
}

nn_gs = GridSearchCV(estimator=MLPRegressor(),
                  param_grid = nn_params,)

mean_ap, aps, all_predictions = backtest(stats, nn_gs, years[28:], predictors)











In [54]:
mean_ap

0.7710961683436015

In [46]:
from sklearn.model_selection import GridSearchCV

In [47]:
from sklearn.linear_model import LinearRegression

linear_params = {
    'normalize': [True, False],
    'fit_intercept': [True, False],
    'copy_X': [True, False]
}

lin_gs = GridSearchCV(estimator=LinearRegression(),
                     param_grid = linear_params)

mean_ap, aps, all_predictions = backtest(stats, lin_gs, years[5:], predictors)

In [48]:
mean_ap

0.7125370555080472