In [1]:
import numpy as np
import pandas as pd

In [56]:
stats_df = pd.read_csv('all_stats.csv')
stats_df.head()

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,...,Year,Share,Team,W,L,W/L%,GB,PS/G,PA/G,SRS
0,A.C. Green,PF,26.0,LAL,82.0,82.0,33.0,4.7,9.8,0.478,...,1990,0.0,Los Angeles Lakers,63.0,19.0,0.768,0.0,110.7,103.9,6.74
1,Byron Scott,SG,28.0,LAL,77.0,77.0,33.7,6.1,13.1,0.47,...,1990,0.0,Los Angeles Lakers,63.0,19.0,0.768,0.0,110.7,103.9,6.74
2,James Worthy,SF,28.0,LAL,80.0,80.0,37.0,8.9,16.2,0.548,...,1990,0.0,Los Angeles Lakers,63.0,19.0,0.768,0.0,110.7,103.9,6.74
3,Jawann Oldham,C,32.0,LAL,6.0,0.0,7.5,0.5,1.0,0.5,...,1990,0.0,Los Angeles Lakers,63.0,19.0,0.768,0.0,110.7,103.9,6.74
4,Jay Vincent,SF,30.0,LAL,41.0,6.0,11.2,2.1,4.5,0.47,...,1990,0.0,Los Angeles Lakers,63.0,19.0,0.768,0.0,110.7,103.9,6.74


In [57]:
pd.isnull(stats_df).sum()

Player       0
Pos          0
Age          0
Tm           0
G            0
GS           0
MP           0
FG           0
FGA          0
FG%         51
3P           0
3PA          0
3P%       2102
2P           0
2PA          0
2P%         85
eFG%        51
FT           0
FTA          0
FT%        468
ORB          0
DRB          0
TRB          0
AST          0
STL          0
BLK          0
TOV          0
PF           0
PTS          0
Year         0
Share        0
Team       226
W          383
L          383
W/L%       383
GB         383
PS/G       383
PA/G       383
SRS        383
dtype: int64

A lot of players did not attempt any 3-pointers or free throws, so the percentages will be NAN.

In [58]:
stats_df.fillna(0, inplace=True) # make percentages 0

In [59]:
stats_df.columns

Index(['Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'Year', 'Share',
       'Team', 'W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS'],
      dtype='object')

In [60]:
# remove strings and Share since we are trying to predict Share
predictors = ['G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P',
       '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
       'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'W', 'L', 'W/L%', 'GB', 'PS/G', 'PA/G', 'SRS']

In [61]:
from sklearn.linear_model import Ridge

In [62]:
train = stats_df[stats_df['Year'] < 2021]
test = stats_df[stats_df['Year'] == 2021]

In [63]:
reg = Ridge(alpha=0.1)
reg.fit(train[predictors], train['Share'])

Ridge(alpha=0.1)

In [64]:
pred = reg.predict(test[predictors])
pred_df = pd.DataFrame(pred, columns=['Pred'], index=test.index)

In [65]:
result = pd.concat([test[['Player', 'Share']], pred_df], axis=1)
result.head()

Unnamed: 0,Player,Share,Pred
644,Aaron Gordon,0.0,0.017909
645,Austin Rivers,0.0,-0.010505
646,Bol Bol,0.0,0.007476
647,Facundo Campazzo,0.0,0.000411
648,Greg Whittington,0.0,0.0156


In [66]:
result.sort_values('Share', ascending=False, inplace=True)
result.head(10)

Unnamed: 0,Player,Share,Pred
655,Nikola Jokić,0.961,0.158815
8791,Joel Embiid,0.58,0.167851
3707,Stephen Curry,0.449,0.142324
10102,Giannis Antetokounmpo,0.345,0.207624
1403,Chris Paul,0.138,0.074964
11288,Luka Dončić,0.042,0.152718
7613,Damian Lillard,0.038,0.118737
3592,Julius Randle,0.02,0.091343
3587,Derrick Rose,0.01,0.033955
11649,Rudy Gobert,0.008,0.097218


The predictions aren't lining up with the actual shares.

In [67]:
from sklearn.metrics import mean_squared_error

mean_squared_error(result['Share'], result['Pred'])

0.002664676008690577

MSE is not accurate since majority of the shares/predicted shares are 0.

In [68]:
# Ranks for actual shares
result['Rk'] = np.arange(0, result.shape[0]) + 1
result.head(10)

Unnamed: 0,Player,Share,Pred,Rk
655,Nikola Jokić,0.961,0.158815,1
8791,Joel Embiid,0.58,0.167851,2
3707,Stephen Curry,0.449,0.142324,3
10102,Giannis Antetokounmpo,0.345,0.207624,4
1403,Chris Paul,0.138,0.074964,5
11288,Luka Dončić,0.042,0.152718,6
7613,Damian Lillard,0.038,0.118737,7
3592,Julius Randle,0.02,0.091343,8
3587,Derrick Rose,0.01,0.033955,9
11649,Rudy Gobert,0.008,0.097218,10


In [69]:
# Ranks for predicted shares
result.sort_values('Pred', ascending=False, inplace=True)
result['Pred_Rk'] = np.arange(0, result.shape[0]) + 1
result.head(10)

Unnamed: 0,Player,Share,Pred,Rk,Pred_Rk
10102,Giannis Antetokounmpo,0.345,0.207624,4,1
8791,Joel Embiid,0.58,0.167851,2,2
655,Nikola Jokić,0.961,0.158815,1,3
11288,Luka Dončić,0.042,0.152718,6,4
3792,LeBron James,0.001,0.144385,15,5
4260,James Harden,0.001,0.14361,13,6
4263,Kevin Durant,0.0,0.142657,531,7
3707,Stephen Curry,0.449,0.142324,3,8
12088,Zion Williamson,0.0,0.129777,251,9
6211,Jimmy Butler,0.0,0.121273,442,10


In [70]:
result.sort_values('Share', ascending=False, inplace=True)
result.head(10)

Unnamed: 0,Player,Share,Pred,Rk,Pred_Rk
655,Nikola Jokić,0.961,0.158815,1,3
8791,Joel Embiid,0.58,0.167851,2,2
3707,Stephen Curry,0.449,0.142324,3,8
10102,Giannis Antetokounmpo,0.345,0.207624,4,1
1403,Chris Paul,0.138,0.074964,5,33
11288,Luka Dončić,0.042,0.152718,6,4
7613,Damian Lillard,0.038,0.118737,7,11
3592,Julius Randle,0.02,0.091343,8,22
3587,Derrick Rose,0.01,0.033955,9,73
11649,Rudy Gobert,0.008,0.097218,10,19


In [71]:
# average precision
def AP(df):
    actual = df.sort_values('Share', ascending=False).head(5)
    pred = df.sort_values('Pred', ascending=False)
    predictions = []
    found, total = 0, 1
    for i, row in pred.iterrows():
        if row['Player'] in actual['Player'].values:
            found += 1
            predictions.append(found / total)
        total += 1
    
    return sum(predictions) / len(predictions)

In [72]:
AP(result)

0.7303030303030302

In [73]:
years = range(1990, 2022)

In [74]:
def add_ranks(df):
    df.sort_values('Share', ascending=False, inplace=True)
    df['Rk'] = np.arange(0, df.shape[0]) + 1
    df.sort_values('Pred', ascending=False, inplace=True)
    df['Pred_Rk'] = np.arange(0, df.shape[0]) + 1
    df['Rk_Diff'] = df['Rk'] - df['Pred_Rk']
    return df

In [88]:
def backtest(df, model, year, predictors):
    avg_precisions = []
    preds = []
    for year in years[5:]:
        train = stats_df[stats_df['Year'] < year]
        test = stats_df[stats_df['Year'] == year]
        model.fit(train[predictors], train['Share'])
        pred = model.predict(test[predictors])
        pred_df = pd.DataFrame(pred, columns=['Pred'], index=test.index)
        result = pd.concat([test[['Player', 'Share']], pred_df], axis=1)
        add_ranks(result)
        preds.append(result)
        avg_precisions.append(AP(result))
    return sum(avg_precisions) / len(avg_precisions), avg_precisions, pd.concat(preds)

In [89]:
mAP, aps, preds = backtest(stats_df, reg, years[5:], predictors)
mAP

0.7313239552346306

In [90]:
preds[preds['Rk'] <= 5].sort_values('Rk_Diff').head(10)

Unnamed: 0,Player,Share,Pred,Rk,Pred_Rk,Rk_Diff
8415,Glen Rice,0.117,0.029045,5,72,-67
1238,Jason Kidd,0.712,0.028396,2,55,-53
13081,Joakim Noah,0.258,0.040672,4,45,-41
5293,Steve Nash,0.839,0.039205,1,40,-39
1403,Chris Paul,0.138,0.062293,5,33,-28
4768,Tim Hardaway,0.207,0.051444,4,28,-24
8683,Peja Stojaković,0.228,0.049517,4,27,-23
3713,Chauncey Billups,0.344,0.060018,5,28,-23
5311,Steve Nash,0.739,0.063241,1,24,-23
10764,Scottie Pippen,0.2,0.058597,5,24,-19


In [91]:
# ratios to average
ratios = stats_df[['PTS', 'AST', 'STL', 'BLK', '3P', 'Year']].groupby('Year').apply(lambda x: x/x.mean())
stats_df[['PTS_R', 'AST_R', 'STL_R', 'BLK_R', '3P_R']] = ratios[['PTS', 'AST', 'STL', 'BLK', '3P']]
predictors += ['PTS_R', 'AST_R', 'STL_R', 'BLK_R', '3P_R']

In [92]:
mAP, aps, preds = backtest(stats_df, reg, years[5:], predictors)
mAP

0.7312287937974025

In [93]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=350, random_state=1, min_samples_split=5)

In [94]:
mAP, aps, preds = backtest(stats_df, rf, years[29:], predictors)
mAP

0.7348395900928582

In [96]:
mAP, aps, preds = backtest(stats_df, reg, years[29:], predictors)
mAP

0.7312287937974025