In [None]:
# import packages

%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import sklearn.metrics as metrics
from operator import itemgetter

In [None]:
# seed NumPy's global random number generator so repeated runs draw the same numbers

np.random.seed(seed = 0)

In [None]:
# load the historical data set and replace raw all-star votes with each player's
# within-season vote rank (dense, most votes = rank 1), because raw vote totals increase over time

df_hist = pd.read_csv('historical-data.csv').assign(
    all_star_votes = lambda df: df.groupby('season_start')['all_star_votes'].rank(method = 'dense', ascending = False)
)

# Visualize data

In [None]:
# scatter of overall seed against pre-season title odds rank, with MVP winners highlighted

plt.style.use('fivethirtyeight')

exp_seed, ax = plt.subplots()

# split the rows on whether won_mvp equals 1; both frames are reused by a later plotting cell
mvp = df_hist.loc[df_hist['won_mvp'].eq(1)]
non_mvp = df_hist.loc[df_hist['won_mvp'].ne(1)]

ax.scatter(x = mvp['overall_seed'], y = mvp['preseason_odds_rank'],
           s = 100, marker = '^', label = "MVP winners")
ax.scatter(x = non_mvp['overall_seed'], y = non_mvp['preseason_odds_rank'],
           alpha = .2, label = "The rest")

ax.legend(prop = {'size': 9, "family": "Rockwell"}, loc = 'best')

ax.set(xlabel = 'Overall seed', ylabel = 'Pre-season title odds')

# reverse both axes so that rank 1 ends up at the top-right corner
ax.set_xlim(*ax.get_xlim()[::-1])
ax.set_ylim(*ax.get_ylim()[::-1])

exp_seed.suptitle("Wins and the MVP", size = 18, weight = 'bold')

# footer: a faint rule with the blog URL underneath, placed below the axes in figure coordinates
exp_seed.text(-0.02, -0.08,
    '____________________________________________________________',
    horizontalalignment = 'left', color = 'grey', alpha = .3, fontsize = 14)

exp_seed.text(-0.02, -.14,
    'https://dribbleanalytics.blog                     ',
    horizontalalignment = 'left', color = 'grey', fontname = 'Rockwell', fontsize = 14)

exp_seed.savefig('exp_seed.png', bbox_inches = 'tight', dpi = 400)

In [None]:
# list the MVP winners whose team's overall seed number was greater than 5 (expected: 2 players)

df_hist.loc[df_hist['won_mvp'].eq(1) & df_hist['overall_seed'].gt(5)]

In [None]:
# plot (pre-season title odds rank - overall seed) against yearly all-star vote rank;
# a positive x value means the team finished with a better seed than its pre-season odds rank

plt.style.use('fivethirtyeight')

exp_diff_pop, ax = plt.subplots()

ax.scatter(mvp['preseason_odds_rank'] - mvp['overall_seed'], mvp['all_star_votes'],
           label = "MVP winners", marker = '^', s = 100)
ax.scatter(non_mvp['preseason_odds_rank'] - non_mvp['overall_seed'], non_mvp['all_star_votes'],
           label = "The rest", alpha = .2)

ax.legend(loc='best', prop={'size': 9, "family": "Rockwell"})

# the label states the subtraction in the same order as the plotted expression above
ax.set_xlabel('Pre-season title odds rank - overall seed')
ax.set_ylabel('Yearly All-Star vote rank')

# reverse the y axis so that vote rank 1 is at the top
ax.set_ylim(ax.get_ylim()[::-1])

exp_diff_pop.suptitle("Wins, popularity and the MVP", weight = 'bold', size = 18)

# footer: a faint rule with the blog URL underneath, placed below the axes in figure coordinates
exp_diff_pop.text(x = -0.02, y = -0.08,
    s = '____________________________________________________________',
    fontsize = 14, color = 'grey', horizontalalignment='left', alpha = .3)

exp_diff_pop.text(x = -0.02, y = -.14,
    s = 'https://dribbleanalytics.blog                     ',
    fontsize = 14, fontname = 'Rockwell', color = 'grey', horizontalalignment='left')

exp_diff_pop.savefig('exp_diff_pop.png', dpi = 400, bbox_inches = 'tight')

In [None]:
# list the MVP winners whose yearly all-star vote rank was greater than 5 (expected: 3 players)

df_hist.loc[df_hist['won_mvp'].eq(1) & df_hist['all_star_votes'].gt(5)]

In [None]:
# list the MVP winners whose pre-season odds rank minus overall seed is negative,
# i.e. the team's seed number ended up larger than its pre-season odds rank (expected: 5 players)

df_hist.loc[
    df_hist['won_mvp'].eq(1)
    & (df_hist['preseason_odds_rank'] - df_hist['overall_seed']).lt(0)
]

# Create MVP models

In [None]:
# columns fed to the models; the order is kept exactly as originally listed
# NOTE(review): 'tov_perv' looks like a typo for 'tov_perc' - confirm against the CSV header before renaming
features = [
    'age', 'pos_number', 'all_star_votes', 'g',
    'team_wins', 'overall_seed', 'preseason_odds_rank',
    'mp', 'fgm', 'fga', 'fg_perc',
    '3pm', '3pa', '3p_perc',
    '2pm', '2pa', '2p_perc',
    'efg', 'ftm', 'fta', 'ft_perc',
    'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts',
    'per', 'ts', '3par', 'ftr',
    'orb_perc', 'drb_perc', 'trb_perc', 'ast_perc', 'stl_perc', 'blk_perc', 'tov_perv', 'usg_perc',
    'ws', 'ws_per_48', 'vorp', 'bpm',
    'offensive_raptor', 'defensive_raptor',
]

# target column, kept as a one-element list
output = ['won_mvp']

In [None]:
# the two classifiers whose predicted probabilities are averaged further down:
# a logistic regression using the liblinear solver and a linear discriminant analysis with default settings
log = LogisticRegression(solver = 'liblinear')
lda = LinearDiscriminantAnalysis()

In [None]:
def fit_pred_proba(model, x, y, pred):
    """Fit ``model`` on ``(x, y)`` and return column 1 of its ``predict_proba`` output for ``pred``.

    Parameters
    ----------
    model : object exposing ``fit(x, y)`` and ``predict_proba(pred)``; it is fitted in place.
    x, y : training inputs and targets, passed straight to ``model.fit``.
    pred : rows to score, passed straight to ``model.predict_proba``.

    Returns
    -------
    The second column (index 1) of the probability matrix. The notebook uses it as the
    probability of winning MVP (assumes the model orders its classes as 0, 1 - TODO confirm).
    """
    model.fit(x, y)
    class_probabilities = model.predict_proba(pred)
    return class_probabilities[:, 1]

In [None]:
# leave-one-season-out evaluation: hold out each season in turn, train both models on all
# other seasons, and keep the averaged predicted probability for the held-out season's players

pred_df = []

for season in range(1984, 2019):
    df_pred = df_hist[df_hist['season_start'] == season]
    df_train = df_hist[df_hist['season_start'] != season]

    # a season with no rows has nothing to score; skip it rather than handing the models an empty frame
    if df_pred.empty:
        continue

    x = df_train[features]
    y = df_train[output].values.ravel()
    pred = df_pred[features]

    pred_log = fit_pred_proba(log, x, y, pred)
    pred_lda = fit_pred_proba(lda, x, y, pred)

    # use a name of its own here: `df_curr` is the name the notebook gives the 2019-20 data set,
    # and rebinding it inside this loop would replace that data if this cell is re-run later
    season_pred = df_pred.copy()
    season_pred['mvp_pred'] = (pred_log + pred_lda) / 2

    pred_df.append(season_pred)

In [None]:
# stack the per-season hold-out predictions, keep the columns needed for evaluation, and rank
# players within each season by predicted probability (dense rank, highest probability = 1)
pred_mvp = (
    pd.concat(pred_df)
    [['rank', 'won_mvp', 'player', 'season_start', 'mvp_pred']]
    .assign(
        mvp_pred_rank = lambda df: df.groupby('season_start')['mvp_pred'].rank(method = 'dense', ascending = False)
    )
)

In [None]:
# evaluate accuracy: of the players who actually won MVP, what fraction did the model rank first
# in their season? (the cells below repeat this for rank <= 3 and rank <= 5)

int((pred_mvp['won_mvp'].eq(1) & pred_mvp['mvp_pred_rank'].eq(1)).sum()) / int(pred_mvp['won_mvp'].eq(1).sum())

In [None]:
# of the rows with rank <= 3, the fraction whose predicted rank is also <= 3
int((pred_mvp['rank'].le(3) & pred_mvp['mvp_pred_rank'].le(3)).sum()) / int(pred_mvp['rank'].le(3).sum())

In [None]:
# of the rows with rank <= 5, the fraction whose predicted rank is also <= 5
int((pred_mvp['rank'].le(5) & pred_mvp['mvp_pred_rank'].le(5)).sum()) / int(pred_mvp['rank'].le(5).sum())

In [None]:
# show the actual MVP winners that the model did not rank first in their season

pred_mvp.loc[pred_mvp['won_mvp'].eq(1) & pred_mvp['mvp_pred_rank'].ne(1)]

In [None]:
# convert rank to classes: the player(s) with predicted rank 1 in a season get a 1, everyone else a 0

# Build the column in a single assignment. Writing through `pred_mvp['pred_winner'].loc[...] = 0`
# is chained assignment: it targets an intermediate Series, triggers SettingWithCopyWarning, and
# does not modify the frame at all when pandas Copy-on-Write is active.
pred_mvp['pred_winner'] = (pred_mvp['mvp_pred_rank'] == 1).astype(int)

In [None]:
# show the players the model picked as winner (pred_winner == 1) who did not actually win MVP
pred_mvp.loc[pred_mvp['won_mvp'].ne(1) & pred_mvp['pred_winner'].eq(1)]

In [None]:
# calculate model accuracy metrics

# true labels and the 0/1 predicted-winner labels
ytest = pred_mvp['won_mvp']
y_pred = pred_mvp['pred_winner']

# metrics computed from the hard 0/1 predictions
print("Accuracy score: %.3f" % metrics.accuracy_score(ytest, y_pred))
print("Recall: %.3f" % metrics.recall_score(ytest, y_pred))
print("Precision: %.3f" % metrics.precision_score(ytest, y_pred))
print("F1: %.3f" % metrics.f1_score(ytest, y_pred))

# averaged predicted probability of the positive class; also reused by the ROC plot cell
pos_prob = pred_mvp['mvp_pred'].values

# two-column array [1 - p, p] per row, built in one vectorized step instead of a Python loop
proba_list = np.column_stack([1 - pos_prob, pos_prob])

# metrics computed from the predicted probabilities
print("Log loss: %.3f" % metrics.log_loss(ytest, proba_list))
print("Brier score: %.3f" % metrics.brier_score_loss(ytest, pos_prob))
print("Area under ROC curve: %.3f" % metrics.roc_auc_score(ytest, pos_prob))

In [None]:
# plot roc curve and calculate auc-roc

plt.style.use('fivethirtyeight')

roc, ax = plt.subplots()

fpr, tpr, _ = metrics.roc_curve(ytest, pos_prob)
ax.plot(fpr, tpr)
# dashed diagonal from (0, 0) to (1, 1) as a visual reference line
ax.plot([0, 1], [0, 1], linestyle = '--')

ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
# compute the AUC from the same inputs as the curve so the title cannot go stale when data or models change
ax.set_title("AUC-ROC: %.3f" % metrics.roc_auc_score(ytest, pos_prob), fontname = 'Rockwell', fontsize = 14)

roc.suptitle("MVP Model ROC Curve", weight = 'bold', y = 1.007, size = 18)

# footer: a faint rule with the blog URL underneath, placed below the axes in figure coordinates
roc.text(x = -0.03, y = -0.08,
        s = '______________________________________________________________',
        fontsize = 14, color = 'grey', horizontalalignment='left', alpha = .3)

roc.text(x = -0.03, y = -.14,
    s = 'https://dribbleanalytics.blog                     ',
    fontsize = 14, fontname = 'Rockwell', color = 'grey', horizontalalignment='left')

roc.savefig('roc.png', dpi = 400, bbox_inches = 'tight')

# Create 2020 MVP predictions

In [None]:
# load the 2019-20 data and replace raw all-star votes with the within-season dense vote rank
# (most votes = rank 1), the same transformation applied to the historical data

df_curr = pd.read_csv('current-data.csv').assign(
    all_star_votes = lambda df: df.groupby('season_start')['all_star_votes'].rank(method = 'dense', ascending = False)
)

In [None]:
# fit each model on the full historical data, score the current-season players,
# and average the two models' predicted probabilities

log_pred, lda_pred = (
    fit_pred_proba(model, df_hist[features], df_hist[output].values.ravel(), df_curr[features])
    for model in (log, lda)
)

avg_pred = 0.5 * (log_pred + lda_pred)

In [None]:
# pair each averaged probability with its player label, then order the pairs from the highest
# probability to the lowest so that the values and labels stay aligned for the bar chart

pred_unsorted = list(map(list, zip(avg_pred.tolist(), df_curr['label'].values.tolist())))
pred_sorted = sorted(pred_unsorted, key = lambda pair: pair[0], reverse = True)

pred_data = list(map(itemgetter(0), pred_sorted))
pred_labels = list(map(itemgetter(1), pred_sorted))

In [None]:
# bar chart of the averaged MVP probabilities, one bar per player in descending order

plt.style.use('fivethirtyeight')

pred, ax = plt.subplots()

ax.bar(np.arange(len(pred_data)), pred_data, color = 'skyblue', edgecolor = 'white', linewidth = 4, width = .7)

rects = ax.patches
for rect, label in zip(rects, pred_labels):
    # bars whose left edge is at x <= 2 get their label at a fixed height of .03;
    # every other bar gets its label just above the top of the bar
    label_y = rect.get_height() + .02 if rect.get_x() > 2 else .03
    ax.text(rect.get_x() + rect.get_width() / 1.75, label_y, label,
            rotation = 'vertical', color = 'black', ha = 'center', va = 'bottom')

pred.suptitle("Predicted MVP probability", y = 1.005, size = 18, weight = 'bold')
ax.set_title("NBA.com MVP ladder rank listed in parentheses", fontname = 'Rockwell', size = 14)
ax.xaxis.set_visible(False)
ax.set_ylabel("MVP probability")
ax.set_ylim(0, 1)

# footer: a faint rule with the blog URL underneath, placed in figure coordinates
pred.text(-0.02, 0.03,
    '_______________________________________________________________',
    horizontalalignment = 'left', color = 'grey', alpha = .3, fontsize = 14)

pred.text(-0.02, -.03,
    'https://dribbleanalytics.blog                     ',
    horizontalalignment = 'left', color = 'grey', fontname = 'Rockwell', fontsize = 14)

pred.savefig('pred.png', bbox_inches = 'tight', dpi = 400)