# Data Analysis and Machine Learning [3/3]

In [1]:
import pandas as pd 
import numpy as np

In [2]:
stats = pd.read_csv("player_mvp_stats.csv", index_col=0)
stats

# we are gonna use the stats to predic thte votes everyone would get in the mvp voting

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,Pts Max,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,0.477,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,0.455,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,0.340,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,0.492,...,0.0,0.0,Los Angeles Lakers,58,24,0.707,5.0,106.3,99.6,6.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14087,Spencer Hawes,PF,28,MIL,54,1,14.8,2.5,5.1,0.484,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
14088,Steve Novak,PF,33,MIL,8,0,2.8,0.3,0.9,0.286,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
14089,Terrence Jones,PF,25,MIL,54,12,23.5,4.3,9.1,0.470,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45
14090,Thon Maker,C,19,MIL,57,34,9.9,1.5,3.2,0.459,...,0.0,0.0,Milwaukee Bucks,42,40,0.512,9.0,103.6,103.8,-0.45


## 1. Looking at the Data / Cleaning

In [3]:
# checking null values 
pd.isnull(stats).sum()

Player        0
Pos           0
Age           0
Tm            0
G             0
GS            0
MP            0
FG            0
FGA           0
FG%          50
3P            0
3PA           0
3P%        2042
2P            0
2PA           0
2P%          84
eFG%         50
FT            0
FTA           0
FT%         462
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
Year          0
Pts Won       0
Pts Max       0
Share         0
Team          0
W             0
L             0
W/L%          0
GB            0
PS/G          0
PA/G          0
SRS           0
dtype: int64

In [4]:
stats[pd.isnull(stats['3P%'])][['Player', '3PA']]

Unnamed: 0,Player,3PA
2,Elden Campbell,0.0
3,Irving Thomas,0.0
18,Jack Haley,0.0
20,Keith Owens,0.0
30,Benoit Benjamin,0.0
...,...,...
14061,Evan Eschmeyer,0.0
14062,Gheorghe Mureșan,0.0
14064,Jim McIlvaine,0.0
14070,Mark Hendrickson,0.0


In [5]:
stats[pd.isnull(stats['FT%'])][['Player', 'FTA']]

Unnamed: 0,Player,FTA
77,John Coker,0.0
92,Jason Sasser,0.0
103,Adrian Caldwell,0.0
119,Bruno Šundov,0.0
158,Jamal Robinson,0.0
...,...,...
13951,Mark McNamara,0.0
13979,Luke Zeller,0.0
14032,Myron Brown,0.0
14054,Malcolm Lee,0.0


In [6]:
# so we can replace the missing percentages with 0
# technically not correct but we are doing so for the moment
stats = stats.fillna(0)

## 2. Training a model

In [7]:
stats.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year',
       'Pts Won', 'Pts Max', 'Share', 'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G',
       'PA/G', 'SRS'],
      dtype='object')

In [8]:
#numeric columns
predictors = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
              '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
              'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year','W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS']

# we exclude ['Pts Won', 'Pts Max', 'Share'] because it's basically what we want to predict

In [9]:
train = stats[stats['Year']<2021]
test = stats[stats['Year']==2021]

In [10]:
from sklearn.linear_model import Ridge
# ridge is designed by Tikhonov to prevent overfitting
reg = Ridge(alpha=0.1) # alpha controls the shrinking of coefficients to prevent overfitting

In [11]:
reg.fit(train[predictors], train['Share'])

In [12]:
predictions = reg.predict(test[predictors])

In [13]:
predictions = pd.DataFrame(predictions, columns=['predictions'], index=test.index)

In [14]:
predictions

Unnamed: 0,predictions
630,0.013567
631,-0.013756
632,0.002414
633,-0.004421
634,0.010734
...,...
13897,-0.012571
13898,-0.011575
13899,0.016424
13900,-0.020434


In [15]:
combination = pd.concat([test[['Player', 'Share']], predictions], axis=1)

In [16]:
combination #for every player has a share which is a percentage of the mvp votes

Unnamed: 0,Player,Share,predictions
630,Aaron Gordon,0.0,0.013567
631,Austin Rivers,0.0,-0.013756
632,Bol Bol,0.0,0.002414
633,Facundo Campazzo,0.0,-0.004421
634,Greg Whittington,0.0,0.010734
...,...,...,...
13897,Patty Mills,0.0,-0.012571
13898,Quinndary Weatherspoon,0.0,-0.011575
13899,Rudy Gay,0.0,0.016424
13900,Tre Jones,0.0,-0.020434


In [17]:
combination.sort_values('Share', ascending=False).head(10)

Unnamed: 0,Player,Share,predictions
641,Nikola Jokić,0.961,0.154307
8624,Joel Embiid,0.58,0.162713
3651,Stephen Curry,0.449,0.142386
9907,Giannis Antetokounmpo,0.345,0.207436
1389,Chris Paul,0.138,0.072294
10997,Luka Dončić,0.042,0.15143
7464,Damian Lillard,0.038,0.116303
3536,Julius Randle,0.02,0.088878
3531,Derrick Rose,0.01,0.033
11358,Rudy Gobert,0.008,0.095349


## 3. Identifying an error metric

In [18]:
from sklearn.metrics import mean_squared_error

mean_squared_error(combination['Share'], combination['predictions'])

0.002666895456710413

In [19]:
combination['Share'].value_counts() #majority is zero so mse is not super insightful
# we just care about the top5 for example

0.000    525
0.001      3
0.961      1
0.138      1
0.010      1
0.020      1
0.449      1
0.005      1
0.038      1
0.003      1
0.580      1
0.345      1
0.042      1
0.008      1
Name: Share, dtype: int64

In [20]:
#introduce rank 
combination = combination.sort_values('Share', ascending=False)
combination['Rk'] = list(range(1, combination.shape[0]+1))

combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk
641,Nikola Jokić,0.961,0.154307,1
8624,Joel Embiid,0.58,0.162713,2
3651,Stephen Curry,0.449,0.142386,3
9907,Giannis Antetokounmpo,0.345,0.207436,4
1389,Chris Paul,0.138,0.072294,5
10997,Luka Dončić,0.042,0.15143,6
7464,Damian Lillard,0.038,0.116303,7
3536,Julius Randle,0.02,0.088878,8
3531,Derrick Rose,0.01,0.033,9
11358,Rudy Gobert,0.008,0.095349,10


In [21]:
# same for our predictions
combination = combination.sort_values('predictions', ascending=False)
combination['Pred_Rk'] = list(range(1, combination.shape[0]+1))

combination.head(10)

Unnamed: 0,Player,Share,predictions,Rk,Pred_Rk
9907,Giannis Antetokounmpo,0.345,0.207436,4,1
8624,Joel Embiid,0.58,0.162713,2,2
641,Nikola Jokić,0.961,0.154307,1,3
10997,Luka Dončić,0.042,0.15143,6,4
3736,LeBron James,0.001,0.147511,15,5
3651,Stephen Curry,0.449,0.142386,3,6
4177,Kevin Durant,0.0,0.14135,531,7
4174,James Harden,0.001,0.140598,13,8
11784,Zion Williamson,0.0,0.127926,251,9
3876,Russell Westbrook,0.005,0.120228,11,10


In [22]:
# so question is : Out of the top5 how many correct predictions did we get
# let's create one metric
# heavy penalties for deviations
combination.sort_values('Share', ascending=False).head(10)

Unnamed: 0,Player,Share,predictions,Rk,Pred_Rk
641,Nikola Jokić,0.961,0.154307,1,3
8624,Joel Embiid,0.58,0.162713,2,2
3651,Stephen Curry,0.449,0.142386,3,6
9907,Giannis Antetokounmpo,0.345,0.207436,4,1
1389,Chris Paul,0.138,0.072294,5,33
10997,Luka Dončić,0.042,0.15143,6,4
7464,Damian Lillard,0.038,0.116303,7,12
3536,Julius Randle,0.02,0.088878,8,24
3531,Derrick Rose,0.01,0.033,9,76
11358,Rudy Gobert,0.008,0.095349,10,19


In [23]:
def find_ap(combination):
    actual = combination.sort_values('Share', ascending=False).head(5) # get top5
    predicted = combination.sort_values('predictions', ascending=False)
    ps = []
    found = 0 
    seen = 1
    
    for index, row in predicted.iterrows():
        if row['Player'] in actual['Player'].values:
            found += 1
            ps.append(found/seen)
        seen += 1 
    return sum(ps) / len(ps)

In [24]:
find_ap(combination) # high values are better

0.7636363636363636

## 4. Implementing Backtesting to predict each year

In [25]:
years = list(range(1991, 2022))

In [26]:
# this makes our metric more robust, fights overfitting
aps = []
all_predictions = []

for year in years[5:]:
    train = stats[stats['Year'] < year]
    test = stats[stats['Year'] == year]
    
    reg.fit(train[predictors], train['Share'])
    predictions = reg.predict(test[predictors])
    predictions = pd.DataFrame(predictions, columns=['predictions'], index=test.index)
    combination = pd.concat([test[['Player', 'Share']], predictions], axis=1)
    
    all_predictions.append(combination)
    aps.append(find_ap(combination))

In [27]:
#mean average precision
sum(aps)/len(aps)

0.7112884360789578

In [28]:
def add_ranks(combination):
    combination = combination.sort_values('Share', ascending=False)
    combination['Rk'] = list(range(1, combination.shape[0]+1))

    combination = combination.sort_values('predictions', ascending=False)
    combination['Pred_Rk'] = list(range(1, combination.shape[0]+1))
    
    combination['Diff'] = combination['Rk'] - combination['Pred_Rk']
    return combination

In [29]:
ranking = add_ranks(all_predictions[1])
ranking[ranking['Rk']<5].sort_values('Diff', ascending=False)

Unnamed: 0,Player,Share,predictions,Rk,Pred_Rk,Diff
1600,Karl Malone,0.857,0.192318,1,2,-1
10524,Michael Jordan,0.832,0.167629,2,3,-1
908,Grant Hill,0.327,0.128646,3,6,-3
4682,Tim Hardaway,0.207,0.059984,4,20,-16


In [30]:
#so let's make a function that does the same as before
def backtest(stats, model, year, predictors):
    aps = []
    all_predictions = []

    for year in years[5:]:
        train = stats[stats['Year'] < year]
        test = stats[stats['Year'] == year]

        model.fit(train[predictors], train['Share'])
        predictions = model.predict(test[predictors])
        predictions = pd.DataFrame(predictions, columns=['predictions'], index=test.index)
        combination = pd.concat([test[['Player', 'Share']], predictions], axis=1)
        combination = add_ranks(combination)
        all_predictions.append(combination)
        aps.append(find_ap(combination))    
    return sum(aps)/len(aps), aps, pd.concat(all_predictions)

In [31]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)
mean_ap

0.7112884360789578

## 5. Diagnosing model performance

In [35]:
all_predictions[all_predictions['Rk']<=5].sort_values('Diff').head(10)  # what is the differnece with other mvp, why would he be missranked

Unnamed: 0,Player,Share,predictions,Rk,Pred_Rk,Diff
1224,Jason Kidd,0.712,0.028209,2,52,-50
8248,Glen Rice,0.117,0.03311,5,53,-48
5175,Steve Nash,0.839,0.0341,1,45,-44
8516,Peja Stojaković,0.228,0.036269,4,38,-34
5193,Steve Nash,0.739,0.054128,1,34,-33
12726,Joakim Noah,0.258,0.046968,4,37,-33
3657,Chauncey Billups,0.344,0.052698,5,35,-30
1389,Chris Paul,0.138,0.072294,5,33,-28
5208,Steve Nash,0.785,0.074421,2,21,-19
4682,Tim Hardaway,0.207,0.059984,4,20,-16


In [41]:
# look at the coefficients of the regression

pd.concat([pd.Series(reg.coef_), pd.Series(predictors)], axis=1).sort_values(0, ascending=False)

Unnamed: 0,0,1
13,0.070002,eFG%
18,0.035041,DRB
29,0.027126,W/L%
17,0.02161,ORB
10,0.016945,2P
21,0.011635,STL
15,0.011351,FTA
22,0.011234,BLK
20,0.007456,AST
25,0.005893,PTS


## 6. Adding more predictors

In [47]:
stat_ratios = stats[['PTS', 'AST', 'STL', 'BLK', '3P', 'Year']].groupby('Year').apply(lambda x : x/x.mean())

In [48]:
stat_ratios

Unnamed: 0,PTS,AST,STL,BLK,3P,Year
0,1.013334,0.420714,0.961127,0.673469,0.508587,1.0
1,1.614653,1.028412,1.647646,0.673469,4.577279,1.0
2,0.311795,0.093492,0.274608,1.571429,0.000000,1.0
3,0.200440,0.186984,0.274608,0.000000,0.000000,1.0
4,2.383005,1.636110,1.784950,0.897959,1.525760,1.0
...,...,...,...,...,...,...
14087,0.735752,0.819562,0.479763,1.528302,0.650951,1.0
14088,0.071202,0.000000,0.000000,0.000000,0.130190,1.0
14089,1.281633,0.601012,1.119447,2.547170,0.520761,1.0
14090,0.474679,0.218550,0.319842,1.273585,0.650951,1.0


In [52]:
stats[['PTS_R', 'AST_R', 'STL_R', 'BLK_R', '3P_R']] = stat_ratios[['PTS','AST','STL','BLK','3P']]
stats.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,W/L%,GB,PS/G,PA/G,SRS,PTS_R,AST_R,STL_R,BLK_R,3P_R
0,A.C. Green,PF,27,LAL,82,21,26.4,3.1,6.6,0.476,...,0.707,5.0,106.3,99.6,6.73,1.013334,0.420714,0.961127,0.673469,0.508587
1,Byron Scott,SG,29,LAL,82,82,32.1,6.1,12.8,0.477,...,0.707,5.0,106.3,99.6,6.73,1.614653,1.028412,1.647646,0.673469,4.577279
2,Elden Campbell,PF,22,LAL,52,0,7.3,1.1,2.4,0.455,...,0.707,5.0,106.3,99.6,6.73,0.311795,0.093492,0.274608,1.571429,0.0
3,Irving Thomas,PF,25,LAL,26,0,4.2,0.7,1.9,0.34,...,0.707,5.0,106.3,99.6,6.73,0.20044,0.186984,0.274608,0.0,0.0
4,James Worthy,SF,29,LAL,78,74,38.6,9.2,18.7,0.492,...,0.707,5.0,106.3,99.6,6.73,2.383005,1.63611,1.78495,0.897959,1.52576


In [53]:
predictors += ['PTS_R', 'AST_R', 'STL_R', 'BLK_R', '3P_R']

In [54]:
mean_ap, aps, all_predictions = backtest(stats, reg, years[5:], predictors)

In [56]:
mean_ap # there was a slight improvement

0.7208380973034985

In [62]:
stats['NPos']=stats['Pos'].astype('category').cat.codes

In [63]:
stats['NTm']=stats['Tm'].astype('category').cat.codes

In [66]:
stats['NTm'].value_counts()

7     499
27    495
14    493
8     493
11    492
17    491
12    490
0     485
9     480
31    479
1     479
26    476
15    475
18    475
13    474
5     473
24    468
19    466
30    465
34    464
29    463
28    463
10    461
33    421
36    387
20    343
16    331
32    263
25    214
4     177
2     165
3     157
23    145
21    143
37    114
6     113
35     88
22     32
Name: NTm, dtype: int64

## 7. Using a Random Forest Model

In [77]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=50, random_state=42, min_samples_split=5)

mean_ap, aps, all_predictions = backtest(stats, rf, years[28:], predictors)

In [78]:
mean_ap

0.7123369948657577

In [79]:
#compare to previous regressor 
mean_ap, aps, all_predictions = backtest(stats, reg, years[28:], predictors)
mean_ap

0.7208380973034985

## 8. Discussion