In [331]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsRegressor
import warnings
warnings.filterwarnings('ignore')

In [332]:
nba=pd.read_csv('C:Desktop/nba_2013.csv')

In [333]:
#Viewing the dataset for the first 5 rows

In [334]:
nba.head()

Unnamed: 0,player,pos,age,bref_team_id,g,gs,mp,fg,fga,fg.,...,drb,trb,ast,stl,blk,tov,pf,pts,season,season_end
0,Quincy Acy,SF,23,TOT,63,0,847,66,141,0.468,...,144,216,28,23,26,30,122,171,2013-2014,2013
1,Steven Adams,C,20,OKC,81,20,1197,93,185,0.503,...,190,332,43,40,57,71,203,265,2013-2014,2013
2,Jeff Adrien,PF,27,TOT,53,12,961,143,275,0.52,...,204,306,38,24,36,39,108,362,2013-2014,2013
3,Arron Afflalo,SG,28,ORL,73,73,2552,464,1011,0.459,...,230,262,248,35,3,146,136,1330,2013-2014,2013
4,Alexis Ajinca,C,25,NOP,56,30,951,136,249,0.546,...,183,277,40,23,46,63,187,328,2013-2014,2013


In [335]:
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481 entries, 0 to 480
Data columns (total 31 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   player        481 non-null    object 
 1   pos           481 non-null    object 
 2   age           481 non-null    int64  
 3   bref_team_id  481 non-null    object 
 4   g             481 non-null    int64  
 5   gs            481 non-null    int64  
 6   mp            481 non-null    int64  
 7   fg            481 non-null    int64  
 8   fga           481 non-null    int64  
 9   fg.           479 non-null    float64
 10  x3p           481 non-null    int64  
 11  x3pa          481 non-null    int64  
 12  x3p.          414 non-null    float64
 13  x2p           481 non-null    int64  
 14  x2pa          481 non-null    int64  
 15  x2p.          478 non-null    float64
 16  efg.          479 non-null    float64
 17  ft            481 non-null    int64  
 18  fta           481 non-null    

In [336]:
#Checking null values

In [337]:
nba.isnull().sum()

player           0
pos              0
age              0
bref_team_id     0
g                0
gs               0
mp               0
fg               0
fga              0
fg.              2
x3p              0
x3pa             0
x3p.            67
x2p              0
x2pa             0
x2p.             3
efg.             2
ft               0
fta              0
ft.             20
orb              0
drb              0
trb              0
ast              0
stl              0
blk              0
tov              0
pf               0
pts              0
season           0
season_end       0
dtype: int64

In [338]:
# We will drop the columns - player,season,season_end & bref_team_id

In [339]:
nba.drop(columns=['player','season','season_end','bref_team_id'],axis=1,inplace=True)

In [340]:
#viewing the dataset

In [341]:
nba.head()

Unnamed: 0,pos,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,...,ft.,orb,drb,trb,ast,stl,blk,tov,pf,pts
0,SF,23,63,0,847,66,141,0.468,4,15,...,0.66,72,144,216,28,23,26,30,122,171
1,C,20,81,20,1197,93,185,0.503,0,0,...,0.581,142,190,332,43,40,57,71,203,265
2,PF,27,53,12,961,143,275,0.52,0,0,...,0.639,102,204,306,38,24,36,39,108,362
3,SG,28,73,73,2552,464,1011,0.459,128,300,...,0.815,32,230,262,248,35,3,146,136,1330
4,C,25,56,30,951,136,249,0.546,0,1,...,0.836,94,183,277,40,23,46,63,187,328


In [342]:
nba.pos.value_counts()

SG    109
SF     99
PF     96
C      90
PG     85
G       1
F       1
Name: pos, dtype: int64

In [343]:
#We will convert the pos into numerical values using dummies

In [344]:
cat=pd.get_dummies(nba.pos,drop_first=True)

In [345]:
#Combining Numerical & Categorical data(dummies)

In [346]:
nba=pd.concat([nba,cat],axis=1)

In [347]:
#Now dropping the pos varable from the dataset

In [348]:
nba.drop(columns=['pos'],axis=1,inplace=True)

In [349]:
#Filling the missing values by median

In [350]:
#Writing a function to compute median for the complete dataset

In [351]:
def miss(x):
    
    return x.fillna(x.median())

In [352]:
nba=nba.apply(miss)

In [353]:
nba.isnull().sum()


age     0
g       0
gs      0
mp      0
fg      0
fga     0
fg.     0
x3p     0
x3pa    0
x3p.    0
x2p     0
x2pa    0
x2p.    0
efg.    0
ft      0
fta     0
ft.     0
orb     0
drb     0
trb     0
ast     0
stl     0
blk     0
tov     0
pf      0
pts     0
F       0
G       0
PF      0
PG      0
SF      0
SG      0
dtype: int64

In [354]:
#Now, we need to take care of Outliers in the dataset

In [355]:
#We will write a function to the complete dataset as below

In [356]:
def outliers(x):
    
    x=x.clip(upper=x.quantile(0.99))
    x=x.clip(lower=x.quantile(0.01))
    
    return x

In [357]:
nba=nba.apply(outliers)

In [358]:
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 481 entries, 0 to 480
Data columns (total 32 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     481 non-null    int64  
 1   g       481 non-null    int64  
 2   gs      481 non-null    int64  
 3   mp      481 non-null    float64
 4   fg      481 non-null    float64
 5   fga     481 non-null    float64
 6   fg.     481 non-null    float64
 7   x3p     481 non-null    float64
 8   x3pa    481 non-null    int64  
 9   x3p.    481 non-null    float64
 10  x2p     481 non-null    float64
 11  x2pa    481 non-null    int64  
 12  x2p.    481 non-null    float64
 13  efg.    481 non-null    float64
 14  ft      481 non-null    float64
 15  fta     481 non-null    float64
 16  ft.     481 non-null    float64
 17  orb     481 non-null    float64
 18  drb     481 non-null    float64
 19  trb     481 non-null    float64
 20  ast     481 non-null    float64
 21  stl     481 non-null    float64
 22  bl

In [359]:
nba.head()

Unnamed: 0,age,g,gs,mp,fg,fga,fg.,x3p,x3pa,x3p.,...,blk,tov,pf,pts,F,G,PF,PG,SF,SG
0,23,63,0,847.0,66.0,141.0,0.468,4.0,15,0.266667,...,26,30.0,122.0,171.0,0,0,0,0,1,0
1,20,81,20,1197.0,93.0,185.0,0.503,0.0,0,0.330976,...,57,71.0,203.0,265.0,0,0,0,0,0,0
2,27,53,12,961.0,143.0,275.0,0.52,0.0,0,0.330976,...,36,39.0,108.0,362.0,0,0,1,0,0,0
3,28,73,73,2552.0,464.0,1011.0,0.459,128.0,300,0.426667,...,3,146.0,136.0,1330.0,0,0,0,0,0,1
4,25,56,30,951.0,136.0,249.0,0.546,0.0,1,0.0,...,46,63.0,187.0,328.0,0,0,0,0,0,0


In [360]:
#We will divide the data into X &Y

In [361]:
X=nba.drop(columns=['pts'],axis=1)

In [362]:
Y=nba['pts']

In [363]:
#Normalise the Dataset using Standard Scalar

In [364]:
from sklearn.preprocessing import StandardScaler

In [365]:
SC=StandardScaler()

In [366]:
X_scaled=SC.fit_transform(X)

In [367]:
X_scaled=pd.DataFrame(X_scaled)

In [368]:
X_scaled.columns=X.columns

In [369]:
DF=pd.concat([X_scaled,Y],axis=1)

In [370]:
DF.rename(columns={'fg.':'fg1','x3p.':'x3p1','x2p.':'x2p1','efg.':'efg1','ft.':'ft1'},inplace=True)

In [371]:
DF.columns

Index(['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg1', 'x3p', 'x3pa', 'x3p1',
       'x2p', 'x2pa', 'x2p1', 'efg1', 'ft', 'fta', 'ft1', 'orb', 'drb', 'trb',
       'ast', 'stl', 'blk', 'tov', 'pf', 'F', 'G', 'PF', 'PG', 'SF', 'SG',
       'pts'],
      dtype='object')

In [372]:
#Dividing the dataset into X& Y 

In [373]:
X=DF.drop(columns=['pts'])

In [374]:
Y=DF['pts']

In [375]:
features='+'.join(X.columns)

In [376]:
features

'age+g+gs+mp+fg+fga+fg1+x3p+x3pa+x3p1+x2p+x2pa+x2p1+efg1+ft+fta+ft1+orb+drb+trb+ast+stl+blk+tov+pf+F+G+PF+PG+SF+SG'

In [377]:
y,x=dmatrices('pts~age+g+gs+mp+fg+fga+fg1+x3p+x3pa+x3p1+x2p+x2pa+x2p1+efg1+ft+fta+ft1+orb+drb+trb+ast+stl+blk+tov+pf+F+G+PF+PG+SF+SG',DF,return_type='dataframe')

In [378]:
#Checking the multi-collinearity 

In [379]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [380]:
VIF=pd.DataFrame()

VIF['VIF Factor']=[variance_inflation_factor(x.values,i)for i in range(x.shape[1])]

VIF['Features']=x.columns



In [381]:
VIF

Unnamed: 0,VIF Factor,Features
0,1.0,Intercept
1,1.139269,age
2,8.186064,g
3,6.327575,gs
4,47.874766,mp
5,3016.085483,fg
6,2564.925289,fga
7,22.671458,fg1
8,357.390231,x3p
9,429.302247,x3pa


In [382]:
#As we have high multicollinearity , we will remove some variables & check the VIF factor again. 

In [383]:
#We will remove fg as it has high value & check the multicollinearity again

In [384]:
c,d=dmatrices('pts~age+g+gs+mp+fga+fg1+x3p+x3pa+x3p1+x2p+x2pa+x2p1+efg1+ft+fta+ft1+orb+drb+trb+ast+stl+blk+tov+pf+F+G+PF+PG+SF+SG',DF,return_type='dataframe')

In [385]:
VIF=pd.DataFrame()

VIF['VIF Factor']=[variance_inflation_factor(d.values,i)for i in range(d.shape[1])]

VIF['Features']=d.columns



In [386]:
VIF

Unnamed: 0,VIF Factor,Features
0,1.0,Intercept
1,1.139267,age
2,8.167307,g
3,6.3012,gs
4,47.862274,mp
5,311.392855,fga
6,22.610276,fg1
7,92.691904,x3p
8,147.100277,x3pa
9,1.960658,x3p1


In [387]:
#Again, we will remove the variables on a one by one basis - trb, x2pa,x3pa,fta,x2p,mp,tov,fg1,drb,fga,pf,orb as it has high value of VIF & check the VIF again

In [388]:
e,f=dmatrices('pts~age+g+gs+x3p+x3p1+x2p1+efg1+ft+ft1+ast+stl+blk+F+G+PF+PG+SF+SG',DF,return_type='dataframe')

In [389]:
VIF=pd.DataFrame()

VIF['VIF Factor']=[variance_inflation_factor(f.values,i)for i in range(f.shape[1])]

VIF['Features']=f.columns



In [390]:
VIF

Unnamed: 0,VIF Factor,Features
0,1.0,Intercept
1,1.049526,age
2,2.993809,g
3,3.399768,gs
4,3.050795,x3p
5,1.786006,x3p1
6,3.856679,x2p1
7,4.402464,efg1
8,3.380216,ft
9,1.255211,ft1


In [391]:
#Now, looks like we will consider these variables basedd on the above VIF factor( all of them are well below 5)
#'age', 'g', 'gs', 'x3p', 'x3p1', 'x2p1', 'efg1', 'ft','ft1', 'ast', 'stl', 'blk', 'F', 'G', 'PF', 'PG', 'SF', 'SG'

In [392]:
short_list=['age', 'g', 'gs', 'x3p', 'x3p1', 'x2p1', 'efg1', 'ft','ft1', 'ast', 'stl', 'blk', 'F', 'G', 'PF', 'PG', 'SF', 'SG']

In [393]:
X_shortlist=X[short_list]

In [394]:
#Now we will divide the data into train & test

In [395]:
from sklearn.model_selection import train_test_split

In [396]:
Xtrain,Xtest,Ytrain,Ytest=train_test_split(X_shortlist,Y,test_size=0.3,random_state=345)

In [397]:
from sklearn.neighbors import KNeighborsRegressor

In [398]:
KNN=KNeighborsRegressor(n_neighbors=5)

In [399]:
KNN.fit(Xtrain,Ytrain)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [400]:
from sklearn.metrics import r2_score

In [401]:
#Training Accuracy for R2 Score

In [402]:
r2_score(Ytrain,KNN.predict(Xtrain))

0.9273004189304167

In [403]:
#Testing Accuracy for R2 Score

In [404]:
r2_score(Ytest,KNN.predict(Xtest))

0.8804882179937712

In [405]:
# Seems like we have a great accuracy wrt to Training & Testing accuracy. So we will go ahead with this model for Pickle

In [406]:
import pickle

In [407]:
with open('KNNNBA.pickle','wb') as f:
    pickle.dump(KNN,f)
    

In [408]:
with open('ScaleR.pickle','wb')as f:
    pickle.dump(SC,f)

In [409]:
#

In [410]:
with open('KNNNBA.pickle','rb') as f:
    KN=pickle.load(f)

In [411]:
with open('ScaleR.pickle','rb')as f:
    Standard=pickle.load(f)

In [412]:
#We will use this pickle model for modelling

In [None]:
#The End##