In [2]:
import pandas as pd
import tensorflow as tf
import sklearn
import matplotlib
import seaborn as sns
import pandas as pd

In [3]:
stats=pd.read_csv('data/combined_stats.csv')

In [4]:
stats.columns

Index(['Player', 'Age', 'Abbreviation', 'Pos', 'G', 'GS', 'MP', 'FG', 'FGA',
       'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA',
       'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
       'Awards', 'Year', 'voting_first', 'voting_pts_won', 'voting_pts_max',
       'voting_share', 'advanced_ws', 'advanced_ws_per_48', 'Team', 'W', 'L',
       'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS'],
      dtype='object')

In [5]:
pd.isnull(stats).sum()

Player                    0
Age                       0
Abbreviation              0
Pos                       0
G                         0
GS                        0
MP                        0
FG                        0
FGA                       0
FG%                      62
3P                        0
3PA                       0
3P%                    2162
2P                        0
2PA                       0
2P%                     106
eFG%                     62
FT                        0
FTA                       0
FT%                     551
ORB                       0
DRB                       0
TRB                       0
AST                       0
STL                       0
BLK                       0
TOV                       0
PF                        0
PTS                       0
Awards                13759
Year                      0
voting_first              0
voting_pts_won            0
voting_pts_max            0
voting_share              0
advanced_ws         

### resolving for players with no attempts

In [6]:
stats['3PA'][pd.isna(stats['3P%'])]
stats['2PA'][pd.isna(stats['2P%'])]
stats['FGA'][pd.isna(stats['FG%'])]


127      0.0
156      0.0
319      0.0
382      0.0
686      0.0
        ... 
14714    0.0
14736    0.0
14819    0.0
14841    0.0
14845    0.0
Name: FGA, Length: 62, dtype: float64

In [7]:
stats=stats.fillna(0)

In [8]:
pred_features=['Age','G', 'GS', 'MP', 'FG', 'FGA',
       'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA',
       'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
        'Year', 'W', 'L',
       'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS']

### defining train and test data

In [9]:
train=stats[stats['Year'] < 2023].copy()
test=stats[stats['Year'] == 2023].copy()

### base model with rigde

In [10]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

ridge_model = make_pipeline(StandardScaler(), Ridge(alpha=1.0))
ridge_model.fit(train[pred_features], train['voting_share'])
predics=ridge_model.predict(test[pred_features])



In [11]:
print(len(test), len(predics))


539 539


In [12]:
review = pd.concat([test[['Player', 'voting_share']], pd.Series(predics,name='prediction',index=test.index)], axis=1)
review.sort_values(by='voting_share', ascending=False, inplace=True)
review

Unnamed: 0,Player,voting_share,prediction
7584,Joel Embiid,0.915,0.203551
11303,Nikola Jokić,0.674,0.175564
5527,Giannis Antetokounmpo,0.606,0.225594
7049,Jayson Tatum,0.280,0.138577
13298,Shai Gilgeous-Alexander,0.046,0.148613
...,...,...,...
5862,Harrison Barnes,0.000,0.009086
5839,Hamidou Diallo,0.000,0.006169
5696,Greg Brown III,0.000,-0.003847
5675,Grayson Allen,0.000,-0.008647


### defining error metric

In [13]:
from sklearn.metrics import mean_squared_error
mean_squared_error(test['voting_share'], predics)

0.0026588123167627605

A traditional measure like the mean square error may not be appropriate even if it shows good results, since most players are not candidates. We want to focus on properly evaluating those most likely to win the MVP.

In [14]:
review['voting_share'].value_counts()

voting_share
0.000    526
0.001      2
0.915      1
0.674      1
0.606      1
0.280      1
0.046      1
0.030      1
0.027      1
0.010      1
0.005      1
0.003      1
0.002      1
Name: count, dtype: int64

In [None]:
review = review.sort_values(by='voting_share', ascending=False)  
review['rk'] = list(range(1, review.shape[0] + 1))

review = review.sort_values(by='prediction', ascending=False)  
review['Pred_rk'] = list(range(1, review.shape[0] + 1))


In [16]:
review.sort_values(by='voting_share', ascending=False).head(10)

Unnamed: 0,Player,voting_share,prediction,rk,Pred_rk
7584,Joel Embiid,0.915,0.203551,1,2
11303,Nikola Jokić,0.674,0.175564,2,4
5527,Giannis Antetokounmpo,0.606,0.225594,3,1
7049,Jayson Tatum,0.28,0.138577,4,9
13298,Shai Gilgeous-Alexander,0.046,0.148613,5,5
4351,Donovan Mitchell,0.03,0.087734,6,22
4270,Domantas Sabonis,0.027,0.094451,7,18
9631,Luka Dončić,0.01,0.196798,8,3
13664,Stephen Curry,0.005,0.107884,9,14
7422,Jimmy Butler,0.003,0.111564,10,13


A metric based on cumulative precision is proposed, where it is evaluated how quickly the model found the 5 best candidates.

In [17]:
def find_ap(df):
    actual_5=df.sort_values(by='voting_share', ascending=False).head(5)
    predics=df.sort_values(by='prediction', ascending=False)
    ps=[]
    found=0
    seen=1
    for index,row in predics.iterrows():
        if row['Player'] in actual_5['Player'].values:
            found+=1
            ps.append(found/seen)
        seen+=1
    return sum(ps)/len(ps)

In [18]:
find_ap(review)

0.821111111111111

A back test is performed to evaluate the accuracy with more data, taking into account that in this problem the chronological order is important.

In [19]:
def backtest(stats,model,pred_features,years_range=2024):
    years=list(range(1990, years_range))
    ap_list=[]
    all_predicss=[]
    for year in years[5:]:
        train=stats[stats['Year'] < year]
        test=stats[stats['Year'] == year]
        model.fit(train[pred_features], train['voting_share'])
        predics=model.predict(test[pred_features])
        review = pd.concat([test[['Player', 'voting_share']], pd.Series(predics,name='prediction',index=test.index)], axis=1)
        review.sort_values(by='voting_share', ascending=False, inplace=True)
        all_predicss.append(review)
        ap_list.append(find_ap(review))
    return sum(ap_list)/len(ap_list),ap_list,all_predicss




In [20]:
mean_ap,ap_list,all_predicss=backtest(stats,ridge_model,pred_features,2024)

In [21]:
print(mean_ap)
all_predicss[-1].head(10)

0.7347933944208411


Unnamed: 0,Player,voting_share,prediction
7584,Joel Embiid,0.915,0.203551
11303,Nikola Jokić,0.674,0.175564
5527,Giannis Antetokounmpo,0.606,0.225594
7049,Jayson Tatum,0.28,0.138577
13298,Shai Gilgeous-Alexander,0.046,0.148613
4351,Donovan Mitchell,0.03,0.087734
4270,Domantas Sabonis,0.027,0.094451
9631,Luka Dončić,0.01,0.196798
13664,Stephen Curry,0.005,0.107884
7422,Jimmy Butler,0.003,0.111564


### first NN proposed

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasRegressor


def create_nn_model():
    model = Sequential()
    model.add(Dense(64, input_dim=len(pred_features), activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1))  
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])
    return model


nn_model = KerasRegressor(model=create_nn_model, epochs=100, batch_size=32, verbose=0)




In [23]:
nn_ap,nn_ap_list,nn_all_predicss=backtest(stats,make_pipeline(StandardScaler(),nn_model),pred_features,2024)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **

### some other models

In [24]:
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor



models_dict = {}

models_dict['ridge_regression'] = make_pipeline(StandardScaler(), Ridge(alpha=1.0))
models_dict['lasso_regression'] = make_pipeline(StandardScaler(), Lasso(alpha=0.1))
models_dict['linear_regression'] = make_pipeline(StandardScaler(), LinearRegression())
models_dict['svr'] = make_pipeline(StandardScaler(), SVR(kernel='linear'))
models_dict['random_forest'] =  RandomForestRegressor(n_estimators=100, random_state=42,min_samples_split=5, max_depth=10)

### model evaluation

In [None]:
aps=[]
std_ap=[]
for name, model in models_dict.items():
    try:
        mean_ap,ap_list,all_predicss=backtest(stats,model,pred_features,2024)
        aps.append(mean_ap)
        std_ap.append(np.std(ap_list))
    
    except Exception as e:
        print(f"Error with model {name}: {e}")


### model performance

In [31]:
perfmornace=pd.DataFrame({'model':models_dict.keys(),'mean_ap':aps,'std_ap':std_ap})
perfmornace.loc[len(perfmornace)] = ['nn', nn_ap, np.std(nn_ap_list)]
perfmornace

Unnamed: 0,model,mean_ap,std_ap
0,ridge_regression,0.734793,0.168011
1,lasso_regression,0.219314,0.035886
2,linear_regression,0.728973,0.174458
3,svr,0.794756,0.167586
4,random_forest,0.740563,0.148961
5,nn,0.728139,0.201166


### second NN model

In [None]:
def create_nn_model():
    model = Sequential()
    model.add(Dense(128, input_dim=len(pred_features), activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1))  
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])
    return model

nn2_model = KerasRegressor(model=create_nn_model, epochs=100, batch_size=32, verbose=0)
train=stats[stats['Year'] < 2023].copy()
test=stats[stats['Year'] == 2023].copy()
nn2_model.fit(train[pred_features], train['voting_share'])
predics=nn2_model.predict(test[pred_features])
review = pd.concat([test[['Player', 'voting_share']], pd.Series(predics,name='prediction',index=test.index)], axis=1)
review = review.sort_values(by='voting_share', ascending=False)  
review['rk'] = list(range(1, review.shape[0] + 1))

review = review.sort_values(by='prediction', ascending=False)  
review['Pred_rk'] = list(range(1, review.shape[0] + 1))
review.sort_values(by='voting_share', ascending=False).head(10)
find_ap(review)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


0.5149286987522281

In [34]:
best_model=models_dict['svr']
second_model=models_dict['random_forest']

### saving models

In [35]:
import joblib


joblib.dump(best_model, 'models/best_model_pipeline.pkl')
joblib.dump(second_model, 'models/second_model_pipeline.pkl')


['models/second_model_pipeline.pkl']