In [44]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection
from sklearn.metrics import SCORERS
from sklearn.metrics import mean_squared_error
#from lightgbm import LGBMRegressor
#import xgboost as xgb


#next: try boosting algorithms, and gridsearch aiming to improve SVC or even better reduces forests overfitting
#identify injury seasons (less than some hisorical treshold) and replace their values with median, should improve Curry 2021 32pts
#download 2022 squads to make predictions

In [125]:
#loading data and retaining only significant columns
df=pd.read_csv('NBA_Pts.csv', sep=';', decimal=',')
df=df[df['Team2']!='nok'][['Season_endyear', 'Player', 'Pos', 'Age', 'G', 'PTS', 'Team2']]
df.head(5)

Unnamed: 0,Season_endyear,Player,Pos,Age,G,PTS,Team2
0,2010,Arron Afflalo,SG,24,82,8.829268,DEN
1,2010,Alexis Ajinça,C,21,6,1.666667,CHO
2,2010,LaMarcus Aldridge,PF,24,78,17.858974,POR
3,2010,Joe Alexander,SF,23,8,0.5,CHI
4,2010,Malik Allen,PF,31,51,2.058824,DEN


In [149]:
#searching for serious injuries through game tresholds

df_games=df.copy()
for year in range(5):
    df_games['G-'+str(year+1)]=(df_games.sort_values(by=['Season_endyear'], ascending=True).groupby(['Player'])['G'].shift(year+1))

df_games['pts_injury_adjusted']=np.where(df_games['G']/df_games['G-1']<0.5, 1, 0)
#df_games['std']=df_games[['G-1','G-2','G-3','G-4','G-5']].std(axis=1)
#df_games['hist']=((df_games['G']-df_games['mean'])/df_games['mean'])
#df_games['low_tres']=df_games['mean']-df_games['std']
#df_games[df_games['Player']=='Stephen Curry']
#df_games[df_games['Player']=='Kawhi Leonard']
#df_games[df_games['hist']<1]['hist'].hist(bins=50)
df_games

Unnamed: 0,Season_endyear,Player,Pos,Age,G,PTS,Team2,G-1,G-2,G-3,G-4,G-5,pts_injury_adjusted
0,2010,Arron Afflalo,SG,24,82,8.829268,DEN,74.0,,,,,0
1,2010,Alexis Ajinça,C,21,6,1.666667,CHO,31.0,,,,,1
2,2010,LaMarcus Aldridge,PF,24,78,17.858974,POR,81.0,,,,,0
3,2010,Joe Alexander,SF,23,8,0.500000,CHI,59.0,,,,,1
4,2010,Malik Allen,PF,31,51,2.058824,DEN,49.0,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8067,2011,Dorell Wright,SF,25,82,16.390244,GSW,72.0,6.0,,,,0
8068,2011,Julian Wright,SF,23,52,3.615385,TOR,68.0,54.0,,,,0
8069,2011,Nick Young,SG,25,64,17.421875,WAS,74.0,82.0,,,,0
8070,2011,Sam Young,SF,25,78,7.333333,MEM,80.0,,,,,0


In [99]:
#creating last 5 years data

for year in range(5):
    df['PTS_year-'+str(year+1)]=(df.sort_values(by=['Season_endyear'], ascending=True).groupby(['Player'])['PTS'].shift(year+1))


In [101]:
#creating the yearly current team past total ppg

df_team=df.groupby(['Season_endyear', 'Team2'])[['PTS_year-1','PTS_year-2','PTS_year-3','PTS_year-4','PTS_year-5']].sum().reset_index()


#creating the yearly team-pos total ppg
df_pos=df.groupby(['Season_endyear', 'Team2', 'Pos'])[['PTS_year-1','PTS_year-2','PTS_year-3','PTS_year-4','PTS_year-5']].sum().reset_index()

#creating player bins
df_bins=df[['Season_endyear', 'Team2', 'PTS']].copy()
for pts in [5, 10, 15, 20, 25, 30]:
    df_bins['bin_'+str(pts)]=np.where(df_bins['PTS']>=pts,1,0)
df_bins=df_bins.groupby(['Season_endyear', 'Team2']).sum().reset_index().drop('PTS', axis=1)

for year in range(5):
    for bins in [5, 10, 15, 20, 25, 30]:
        df_bins['bin_'+str(bins)+'-'+str(year+1)]=(df_bins.sort_values(by=['Season_endyear'], ascending=True).groupby(['Team2'])['bin_'+str(bins)].shift(year+1))


#merging in the final dataframe
df_stats=df.merge(df_team, on=['Season_endyear', 'Team2'], suffixes=('','_team'))
df_stats=df_stats.merge(df_pos, on=['Season_endyear', 'Team2', 'Pos'], suffixes=('','_pos'))
df_stats=df_stats.merge(df_bins, on=['Season_endyear', 'Team2'])
df_stats.head(5)

#drop season prior to 2013 for lack of data
df_stats=df_stats[df_stats['Season_endyear']>2013]

#replacing NA with 0. Usually it will mean a new player, sometimes can account for long-term injuries. But let's see how it goes

df_stats=df_stats.fillna(0)

In [121]:
#separating train and test sets

X=df_stats[df_stats['Season_endyear']<2021].drop(['Season_endyear', 'Player', 'G', 'Team2', 'Pos', 'PTS'], axis=1)
X_oot=df_stats[df_stats['Season_endyear']==2021].drop(['Season_endyear', 'Player', 'G', 'Team2','Pos', 'PTS'], axis=1)
y=df_stats[df_stats['Season_endyear']<2021]['PTS']
y_oot=df_stats[df_stats['Season_endyear']==2021]['PTS']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=8)



In [122]:
#training and evaluating the models
lr=LinearRegression()
sv=SVR()
rf=RandomForestRegressor()
#lgbm=lightgbm.LGBMRegressor()

r2=[]
mse=[]
models=[]
dataset=[]

for model in (lr, sv, rf):
    model.fit(X_train, y_train)
    r2.append(r2_score(y_train, model.predict(X_train)))
    r2.append(r2_score(y_test, model.predict(X_test)))
    r2.append(r2_score(y_oot, model.predict(X_oot)))
    mse.append(mean_squared_error(y_train, model.predict(X_train)))
    mse.append(mean_squared_error(y_test, model.predict(X_test)))
    mse.append(mean_squared_error(y_oot, model.predict(X_oot)))
    dataset.append(['train', 'test', 'oot'])
    models.append([str(model),str(model),str(model)])


In [117]:
#evaluating results
r2

[0.6736901371020818,
 0.6605036970300868,
 0.7067732195171135,
 0.4816965155888525,
 0.4736913023497801,
 0.45838516318786726,
 0.9478484236010751,
 0.6963274084472602,
 0.7501610142459837]

In [118]:
#evaluating results
mse

[11.49072358166648,
 12.010657711405104,
 12.304642195246908,
 18.251615252727543,
 18.619683226925737,
 22.7277220847135,
 1.8364732939000303,
 10.74329093297525,
 10.483965076668627]

In [119]:
#checking 2021 predictions vs. actual

df_check=df_stats[df_stats['Season_endyear']==2021][['Player', 'PTS']].copy()
df_check['prediction']=rf.predict(X_oot)
df_check.sort_values(by=['prediction'], ascending=False).head(10)

Unnamed: 0,Player,PTS,prediction
1117,Russell Westbrook,22.2,28.39112
1103,Bradley Beal,31.3,28.318944
1062,Damian Lillard,28.8,26.4397
1135,Donovan Mitchell,26.4,25.996574
1386,James Harden,24.6,25.98315
1090,Zach LaVine,27.4,25.913895
907,Giannis Antetokounmpo,28.1,25.875939
1381,Kevin Durant,26.9,25.791349
1036,Anthony Davis,21.8,25.721507
1045,LeBron James,25.0,25.686122


In [120]:
df_check[(df_check['Player'] == 'Stephen Curry') | (df_check['Player'] == 'Bradley Beal') | (df_check['Player'] == 'Russell Westbrook')]


Unnamed: 0,Player,PTS,prediction
1103,Bradley Beal,31.3,28.318944
1117,Russell Westbrook,22.2,28.39112
1228,Stephen Curry,32.0,21.232945


In [114]:
df_stats[(df_stats['Season_endyear'] == 2021) & ((df_stats['Player'] == 'Stephen Curry') | (df_stats['Player'] == 'Bradley Beal'))]

Unnamed: 0,Season_endyear,Player,Pos,Age,G,PTS,Team2,PTS_year-1,PTS_year-2,PTS_year-3,...,bin_15-4,bin_20-4,bin_25-4,bin_30-4,bin_5-5,bin_10-5,bin_15-5,bin_20-5,bin_25-5,bin_30-5
1103,2021,Bradley Beal,SG,27,60,31.3,WAS,30.5,25.6,22.6,...,2.0,2.0,0.0,0.0,11.0,4.0,2.0,0.0,0.0,0.0
1228,2021,Stephen Curry,PG,32,63,32.0,GSW,20.8,27.3,26.4,...,3.0,3.0,2.0,0.0,10.0,4.0,2.0,2.0,1.0,1.0
