**Author(s)**: Daniel Tsan

## Introduction
This is my attempt at using Data 100 techniques to predict this season's (2022-2023) NBA MVP. All data is taken from Basketball Reference.


In [1]:
# Run this cell to set up your notebook.
import numpy as np
import pandas as pd

import piplite
await piplite.install('seaborn')
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# advanced stats of previous MVP's 
# Basketball Reference: https://www.basketball-reference.com/awards/mvp.html#mvp_NBA
mvp_adv = pd.read_csv('mvp.csv', header=1)
mvp_adv_simple = mvp_adv.drop(labels=['season', 'Lg', 'Voting', 'Age', 'Tm', 'id'], axis=1)

In [3]:
# get averages of all the stats
mvp_adv_descr = mvp_adv_simple.describe()
mvp_adv_mean = mvp_adv_descr.loc['mean']
mvp_adv_mean

games       77.462687
minutes     38.940299
points      26.085075
rebounds    12.211940
assists      5.635821
steals       1.522449
blocks       1.353061
FG%          0.507746
3P%          0.285651
FT%          0.756015
WS          15.937313
WS/48        0.253507
Name: mean, dtype: float64

In [4]:
# replace NaN values in the table with averages
has_NaN = ['steals', 'blocks', '3P%']

for col in has_NaN:
    mvp_adv_simple[col] = mvp_adv_simple[col].fillna(mvp_adv_mean[col])
    
mvp_adv_simple

Unnamed: 0,player,games,minutes,points,rebounds,assists,steals,blocks,FG%,3P%,FT%,WS,WS/48
0,Nikola Jokić,74,33.5,27.1,13.8,7.9,1.500000,0.900000,0.583,0.337000,0.810,15.2,0.296
1,Nikola Jokić,72,34.6,26.4,10.8,8.3,1.300000,0.700000,0.566,0.388000,0.868,15.6,0.301
2,Giannis Antetokounmpo,63,30.4,29.5,13.6,5.6,1.000000,1.000000,0.553,0.304000,0.633,11.1,0.279
3,Giannis Antetokounmpo,72,32.8,27.7,12.5,5.9,1.300000,1.500000,0.578,0.256000,0.729,14.4,0.292
4,James Harden,72,35.4,30.4,5.4,8.8,1.800000,0.700000,0.449,0.367000,0.858,15.4,0.289
...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,Wilt Chamberlain,72,46.4,37.6,27.0,2.3,1.522449,1.353061,0.461,0.285651,0.582,17.0,0.245
63,Bob Pettit,72,39.9,29.2,16.4,3.1,1.522449,1.353061,0.438,0.285651,0.759,14.8,0.246
64,Bill Russell,69,38.3,16.6,22.7,2.9,1.522449,1.353061,0.442,0.285651,0.519,11.3,0.206
65,Bob Cousy,64,36.9,20.6,4.8,7.5,1.522449,1.353061,0.378,0.285651,0.821,8.8,0.178


In [5]:
# advanced stats for 2021-2022 season (all players) 
# Basketball Reference: https://www.basketball-reference.com/leagues/NBA_2022_advanced.html#advanced_stats
s_22_adv = pd.read_csv('s_22_adv.csv', header=0)
s_22_adv_simple = s_22_adv.drop(labels=['Rk', 'Pos', 'Age', 'Tm', 'Unnamed: 19', 'Unnamed: 24'], axis=1)
s_22_adv_simple

Unnamed: 0,player,games,minutes,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,id
0,Precious Achiuwa,73,1725,12.7,0.503,0.259,0.217,8.7,21.7,14.9,...,18.5,0.4,2.1,2.5,0.070,-2.0,-0.6,-2.6,-0.2,achiupr01
1,Steven Adams,76,1999,17.6,0.560,0.003,0.518,17.9,22.0,19.9,...,12.0,3.8,3.0,6.8,0.163,1.0,1.0,2.0,2.0,adamsst01
2,Bam Adebayo,56,1825,21.8,0.608,0.008,0.466,8.7,26.1,17.5,...,25.0,3.6,3.5,7.2,0.188,1.7,2.1,3.8,2.7,adebaba01
3,Santi Aldama,32,360,10.2,0.452,0.364,0.242,9.4,16.1,12.6,...,18.4,-0.1,0.4,0.3,0.044,-4.2,-1.5,-5.7,-0.3,aldamsa01
4,LaMarcus Aldridge,47,1050,19.6,0.604,0.100,0.223,7.8,18.7,13.4,...,22.4,2.1,1.0,3.1,0.141,1.3,-0.6,0.7,0.7,aldrila01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
807,Thaddeus Young,26,475,15.8,0.526,0.299,0.188,8.7,18.1,13.1,...,16.0,0.5,0.8,1.3,0.127,-0.2,2.2,2.0,0.5,youngth01
808,Trae Young,76,2652,25.4,0.603,0.395,0.358,2.1,9.6,5.9,...,34.4,9.0,1.0,10.0,0.181,7.1,-2.0,5.2,4.8,youngtr01
809,Omer Yurtseven,56,706,17.4,0.546,0.045,0.247,13.9,33.0,23.6,...,19.9,0.8,1.4,2.1,0.145,-1.4,0.4,-1.0,0.2,yurtsom01
810,Cody Zeller,27,355,17.2,0.627,0.044,0.544,14.9,24.1,19.3,...,15.9,0.9,0.2,1.1,0.143,-1.2,-1.0,-2.1,0.0,zelleco01


In [6]:
# get averages of all the stats
s_22_adv_descr = s_22_adv_simple.describe()
s_22_adv_mean = s_22_adv_descr.loc['mean']
s_22_adv_mean

games       36.705665
minutes    825.188424
PER         12.218719
TS%          0.527232
3PAr         0.400491
FTr          0.248739
ORB%         5.402709
DRB%        14.759360
TRB%        10.085345
AST%        12.868473
STL%         1.591872
BLK%         1.753695
TOV%        12.219274
USG%        18.014409
OWS          0.887685
DWS          0.821305
WS           1.712808
WS/48        0.065845
OBPM        -1.998030
DBPM        -0.403325
BPM         -2.399877
VORP         0.397044
Name: mean, dtype: float64

In [7]:
# replace NaN values in the table with averages
has_NaN = ['TS%', '3PAr', 'FTr', 'TOV%']

for col in has_NaN:
    s_22_adv_simple[col] = s_22_adv_simple[col].fillna(s_22_adv_mean[col])
    
s_22_adv_simple

Unnamed: 0,player,games,minutes,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,...,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP,id
0,Precious Achiuwa,73,1725,12.7,0.503,0.259,0.217,8.7,21.7,14.9,...,18.5,0.4,2.1,2.5,0.070,-2.0,-0.6,-2.6,-0.2,achiupr01
1,Steven Adams,76,1999,17.6,0.560,0.003,0.518,17.9,22.0,19.9,...,12.0,3.8,3.0,6.8,0.163,1.0,1.0,2.0,2.0,adamsst01
2,Bam Adebayo,56,1825,21.8,0.608,0.008,0.466,8.7,26.1,17.5,...,25.0,3.6,3.5,7.2,0.188,1.7,2.1,3.8,2.7,adebaba01
3,Santi Aldama,32,360,10.2,0.452,0.364,0.242,9.4,16.1,12.6,...,18.4,-0.1,0.4,0.3,0.044,-4.2,-1.5,-5.7,-0.3,aldamsa01
4,LaMarcus Aldridge,47,1050,19.6,0.604,0.100,0.223,7.8,18.7,13.4,...,22.4,2.1,1.0,3.1,0.141,1.3,-0.6,0.7,0.7,aldrila01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
807,Thaddeus Young,26,475,15.8,0.526,0.299,0.188,8.7,18.1,13.1,...,16.0,0.5,0.8,1.3,0.127,-0.2,2.2,2.0,0.5,youngth01
808,Trae Young,76,2652,25.4,0.603,0.395,0.358,2.1,9.6,5.9,...,34.4,9.0,1.0,10.0,0.181,7.1,-2.0,5.2,4.8,youngtr01
809,Omer Yurtseven,56,706,17.4,0.546,0.045,0.247,13.9,33.0,23.6,...,19.9,0.8,1.4,2.1,0.145,-1.4,0.4,-1.0,0.2,yurtsom01
810,Cody Zeller,27,355,17.2,0.627,0.044,0.544,14.9,24.1,19.3,...,15.9,0.9,0.2,1.1,0.143,-1.2,-1.0,-2.1,0.0,zelleco01


In [8]:
#extract just the winshare columns
s_22_ws = s_22_adv_simple[['player', 'WS', 'WS/48']]
s_22_ws

Unnamed: 0,player,WS,WS/48
0,Precious Achiuwa,2.5,0.070
1,Steven Adams,6.8,0.163
2,Bam Adebayo,7.2,0.188
3,Santi Aldama,0.3,0.044
4,LaMarcus Aldridge,3.1,0.141
...,...,...,...
807,Thaddeus Young,1.3,0.127
808,Trae Young,10.0,0.181
809,Omer Yurtseven,2.1,0.145
810,Cody Zeller,1.1,0.143


In [9]:
# per game stats from 2021-2022 season
# Basketball Reference: https://www.basketball-reference.com/leagues/NBA_2022_per_game.html#per_game_stats
s_22 = pd.read_csv('s_22.csv', header=0)
s_22_simple = s_22.drop(labels=['Rk', 'Pos', 'Age', 'Tm'], axis=1)
s_22_simple

Unnamed: 0,player,games,starter,minutes,FG,FGA,FG%,3P,3PA,3P%,...,ORB,DRB,rebounds,assists,steals,blocks,TOV,fouls,points,id
0,Precious Achiuwa,73,28,23.6,3.6,8.3,0.439,0.8,2.1,0.359,...,2.0,4.5,6.5,1.1,0.5,0.6,1.2,2.1,9.1,achiupr01
1,Steven Adams,76,75,26.3,2.8,5.1,0.547,0.0,0.0,0.000,...,4.6,5.4,10.0,3.4,0.9,0.8,1.5,2.0,6.9,adamsst01
2,Bam Adebayo,56,56,32.6,7.3,13.0,0.557,0.0,0.1,0.000,...,2.4,7.6,10.1,3.4,1.4,0.8,2.6,3.1,19.1,adebaba01
3,Santi Aldama,32,0,11.3,1.7,4.1,0.402,0.2,1.5,0.125,...,1.0,1.7,2.7,0.7,0.2,0.3,0.5,1.1,4.1,aldamsa01
4,LaMarcus Aldridge,47,12,22.3,5.4,9.7,0.550,0.3,1.0,0.304,...,1.6,3.9,5.5,0.9,0.3,1.0,0.9,1.7,12.9,aldrila01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
807,Thaddeus Young,26,0,18.3,2.6,5.5,0.465,0.7,1.7,0.395,...,1.5,2.9,4.4,1.7,1.2,0.4,0.8,1.7,6.3,youngth01
808,Trae Young,76,76,34.9,9.4,20.3,0.460,3.1,8.0,0.382,...,0.7,3.1,3.7,9.7,0.9,0.1,4.0,1.7,28.4,youngtr01
809,Omer Yurtseven,56,12,12.6,2.3,4.4,0.526,0.0,0.2,0.091,...,1.5,3.7,5.3,0.9,0.3,0.4,0.7,1.5,5.3,yurtsom01
810,Cody Zeller,27,0,13.1,1.9,3.3,0.567,0.0,0.1,0.000,...,1.9,2.8,4.6,0.8,0.3,0.2,0.7,2.1,5.2,zelleco01


In [10]:
# get averages of all the stats
s_22_descr = s_22_simple.describe()
s_22_mean = s_22_descr.loc['mean']
s_22_mean

games       36.705665
starter     16.672414
minutes     18.264655
FG           2.869828
FGA          6.386330
FG%          0.434257
3P           0.871305
3PA          2.560591
3P%          0.303445
2P           2.000000
2PA          3.828448
2P%          0.505523
eFG%         0.497483
FT           1.204310
FTA          1.575123
FT%          0.747571
ORB          0.812808
DRB          2.519704
rebounds     3.331527
assists      1.808251
steals       0.582759
blocks       0.353448
TOV          0.978695
fouls        1.564532
points       7.812192
Name: mean, dtype: float64

In [11]:
# replace NaN values in the table with averages
has_NaN = ['FG%', '3P%', '2P%', 'eFG%', 'FT%']

for col in has_NaN:
    s_22_simple[col] = s_22_simple[col].fillna(s_22_mean[col])
    
s_22_simple

Unnamed: 0,player,games,starter,minutes,FG,FGA,FG%,3P,3PA,3P%,...,ORB,DRB,rebounds,assists,steals,blocks,TOV,fouls,points,id
0,Precious Achiuwa,73,28,23.6,3.6,8.3,0.439,0.8,2.1,0.359000,...,2.0,4.5,6.5,1.1,0.5,0.6,1.2,2.1,9.1,achiupr01
1,Steven Adams,76,75,26.3,2.8,5.1,0.547,0.0,0.0,0.000000,...,4.6,5.4,10.0,3.4,0.9,0.8,1.5,2.0,6.9,adamsst01
2,Bam Adebayo,56,56,32.6,7.3,13.0,0.557,0.0,0.1,0.000000,...,2.4,7.6,10.1,3.4,1.4,0.8,2.6,3.1,19.1,adebaba01
3,Santi Aldama,32,0,11.3,1.7,4.1,0.402,0.2,1.5,0.125000,...,1.0,1.7,2.7,0.7,0.2,0.3,0.5,1.1,4.1,aldamsa01
4,LaMarcus Aldridge,47,12,22.3,5.4,9.7,0.550,0.3,1.0,0.304000,...,1.6,3.9,5.5,0.9,0.3,1.0,0.9,1.7,12.9,aldrila01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
807,Thaddeus Young,26,0,18.3,2.6,5.5,0.465,0.7,1.7,0.395000,...,1.5,2.9,4.4,1.7,1.2,0.4,0.8,1.7,6.3,youngth01
808,Trae Young,76,76,34.9,9.4,20.3,0.460,3.1,8.0,0.382000,...,0.7,3.1,3.7,9.7,0.9,0.1,4.0,1.7,28.4,youngtr01
809,Omer Yurtseven,56,12,12.6,2.3,4.4,0.526,0.0,0.2,0.091000,...,1.5,3.7,5.3,0.9,0.3,0.4,0.7,1.5,5.3,yurtsom01
810,Cody Zeller,27,0,13.1,1.9,3.3,0.567,0.0,0.1,0.000000,...,1.9,2.8,4.6,0.8,0.3,0.2,0.7,2.1,5.2,zelleco01


In [12]:
# extract games, minutes, points, rebounds, assists, steals, blocks, FG%, 3P%, FT%
s_22_extract = s_22_simple[['player', 'games', 'minutes', 'points', 'rebounds', 'assists', 'steals', 'blocks', 'FG%', '3P%', 'FT%']]
s_22_extract

Unnamed: 0,player,games,minutes,points,rebounds,assists,steals,blocks,FG%,3P%,FT%
0,Precious Achiuwa,73,23.6,9.1,6.5,1.1,0.5,0.6,0.439,0.359000,0.595
1,Steven Adams,76,26.3,6.9,10.0,3.4,0.9,0.8,0.547,0.000000,0.543
2,Bam Adebayo,56,32.6,19.1,10.1,3.4,1.4,0.8,0.557,0.000000,0.753
3,Santi Aldama,32,11.3,4.1,2.7,0.7,0.2,0.3,0.402,0.125000,0.625
4,LaMarcus Aldridge,47,22.3,12.9,5.5,0.9,0.3,1.0,0.550,0.304000,0.873
...,...,...,...,...,...,...,...,...,...,...,...
807,Thaddeus Young,26,18.3,6.3,4.4,1.7,1.2,0.4,0.465,0.395000,0.481
808,Trae Young,76,34.9,28.4,3.7,9.7,0.9,0.1,0.460,0.382000,0.904
809,Omer Yurtseven,56,12.6,5.3,5.3,0.9,0.3,0.4,0.526,0.091000,0.623
810,Cody Zeller,27,13.1,5.2,4.6,0.8,0.3,0.2,0.567,0.000000,0.776


In [13]:
# join the extracted tables together
s_22_join = s_22_extract.set_index('player').join(s_22_ws.set_index('player')).reset_index()
s_22_join

Unnamed: 0,player,games,minutes,points,rebounds,assists,steals,blocks,FG%,3P%,FT%,WS,WS/48
0,Aaron Gordon,75,31.7,15.0,5.9,2.5,0.6,0.6,0.520,0.335,0.743000,5.2,0.105
1,Aaron Henry,6,2.8,0.3,0.2,0.0,0.0,0.3,0.200,0.000,0.747571,-0.1,-0.306
2,Aaron Holiday,63,16.2,6.3,1.9,2.4,0.7,0.1,0.447,0.379,0.868000,1.5,0.068
3,Aaron Holiday,63,16.2,6.3,1.9,2.4,0.7,0.1,0.447,0.379,0.868000,0.5,0.038
4,Aaron Holiday,63,16.2,6.3,1.9,2.4,0.7,0.1,0.447,0.379,0.868000,0.9,0.125
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1469,Zach LaVine,67,34.7,24.4,4.6,4.5,0.6,0.3,0.476,0.389,0.853000,5.8,0.120
1470,Zavier Simpson,4,43.5,11.0,5.3,7.5,1.3,1.0,0.365,0.125,1.000000,-0.1,-0.018
1471,Zeke Nnaji,41,17.0,6.6,3.6,0.4,0.4,0.3,0.516,0.463,0.631000,1.8,0.123
1472,Ziaire Williams,62,21.7,8.1,2.1,1.0,0.6,0.2,0.450,0.314,0.782000,2.2,0.080


In [14]:
mvp_adv_simple

Unnamed: 0,player,games,minutes,points,rebounds,assists,steals,blocks,FG%,3P%,FT%,WS,WS/48
0,Nikola Jokić,74,33.5,27.1,13.8,7.9,1.500000,0.900000,0.583,0.337000,0.810,15.2,0.296
1,Nikola Jokić,72,34.6,26.4,10.8,8.3,1.300000,0.700000,0.566,0.388000,0.868,15.6,0.301
2,Giannis Antetokounmpo,63,30.4,29.5,13.6,5.6,1.000000,1.000000,0.553,0.304000,0.633,11.1,0.279
3,Giannis Antetokounmpo,72,32.8,27.7,12.5,5.9,1.300000,1.500000,0.578,0.256000,0.729,14.4,0.292
4,James Harden,72,35.4,30.4,5.4,8.8,1.800000,0.700000,0.449,0.367000,0.858,15.4,0.289
...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,Wilt Chamberlain,72,46.4,37.6,27.0,2.3,1.522449,1.353061,0.461,0.285651,0.582,17.0,0.245
63,Bob Pettit,72,39.9,29.2,16.4,3.1,1.522449,1.353061,0.438,0.285651,0.759,14.8,0.246
64,Bill Russell,69,38.3,16.6,22.7,2.9,1.522449,1.353061,0.442,0.285651,0.519,11.3,0.206
65,Bob Cousy,64,36.9,20.6,4.8,7.5,1.522449,1.353061,0.378,0.285651,0.821,8.8,0.178


In [None]:
# create an indicator table where 1 is they are at least the mvp average and 0 if they are under.
