# <p style="text-align: center;"> Can we predict an NBA player's position based on their stats? </p>
### In this notebook we will scrape the web for NBA player stats and attempt to train machine learning algorithms to predict a player's traditional position to the best of our ability

In [1]:
import os
from urllib.request import urlopen

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from xgboost import plot_importance

In [2]:
#Seasons to extract data for: 2000 through 2019, as the strings the URL expects
years = [str(season) for season in range(2000, 2020)]

#Download each season's per-game table and collect the frames in a list.
#DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and calling
#it inside a loop re-copies every accumulated row on each iteration.
season_tables = []
for year in years:
    url = 'https://www.basketball-reference.com/leagues/NBA_'+year+'_per_game.html'
    #Close the HTTP response as soon as the page has been read
    with urlopen(url) as response:
        html = response.read()
    #The first table on the page is the one used here
    season_tables.append(pd.read_html(html)[0])

#Combine the seasons in a single pass; each table keeps its own row labels,
#so the combined index contains repeats until it is reset
df = pd.concat(season_tables)

In [3]:
#Remove headers that came in as rows
df = df[df.Player != 'Player']

#Remove the literal '*' character from player's names.
#'*' is a regex metacharacter and the default of `regex` differs between
#pandas versions (True before 2.0, False from 2.0), so the literal intent
#is stated explicitly instead of relying on the default.
df['Player'] = df['Player'].str.replace('*', '', regex=False)

#Remove people who did not play any minutes
#(MP is compared as text here: the value tested against is the string '0.0')
df = df[df.MP != '0.0']

In [4]:
#Reset the index so rows are numbered 0..n-1 again after the filtering above;
#drop=True discards the old labels instead of keeping them as a new column
df = df.reset_index(drop=True)

In [5]:
#Preview the first three rows after cleaning
df.head(3)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Tariq Abdul-Wahad,SG,25,TOT,61,56,25.9,4.5,10.6,...,0.756,1.7,3.1,4.8,1.6,1.0,0.5,1.7,2.4,11.4
1,1,Tariq Abdul-Wahad,SG,25,ORL,46,46,26.2,4.8,11.2,...,0.762,1.7,3.5,5.2,1.6,1.2,0.3,1.9,2.5,12.2
2,1,Tariq Abdul-Wahad,SG,25,DEN,15,10,24.9,3.4,8.7,...,0.738,1.6,1.9,3.5,1.7,0.4,0.8,1.3,2.1,8.9


In [6]:
#Inspect the column dtypes before any numeric conversion
df.dtypes

Rk        object
Player    object
Pos       object
Age       object
Tm        object
G         object
GS        object
MP        object
FG        object
FGA       object
FG%       object
3P        object
3PA       object
3P%       object
2P        object
2PA       object
2P%       object
eFG%      object
FT        object
FTA       object
FT%       object
ORB       object
DRB       object
TRB       object
AST       object
STL       object
BLK       object
TOV       object
PF        object
PTS       object
dtype: object

In [7]:
#Everything was scraped as Object dtype, so cast each stat column to float.
#Rk, Player, Pos, Age and Tm are not in the list and are left as they are.
stat_columns = ['G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%',
                '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'DRB', 'ORB',
                'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']

for stat in stat_columns:
    #Go through str first, then parse the text as a float
    df[stat] = df[stat].astype(str).astype(float)

In [8]:
#Preview the first five rows after the float conversion
df.head(5)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Tariq Abdul-Wahad,SG,25,TOT,61.0,56.0,25.9,4.5,10.6,...,0.756,1.7,3.1,4.8,1.6,1.0,0.5,1.7,2.4,11.4
1,1,Tariq Abdul-Wahad,SG,25,ORL,46.0,46.0,26.2,4.8,11.2,...,0.762,1.7,3.5,5.2,1.6,1.2,0.3,1.9,2.5,12.2
2,1,Tariq Abdul-Wahad,SG,25,DEN,15.0,10.0,24.9,3.4,8.7,...,0.738,1.6,1.9,3.5,1.7,0.4,0.8,1.3,2.1,8.9
3,2,Shareef Abdur-Rahim,SF,23,VAN,82.0,82.0,39.3,7.2,15.6,...,0.809,2.7,7.4,10.1,3.3,1.1,1.1,3.0,3.0,20.3
4,3,Cory Alexander,PG,26,DEN,29.0,2.0,11.3,1.0,3.4,...,0.773,0.3,1.2,1.4,2.0,0.8,0.1,1.0,1.3,2.8


In [9]:
#Check the column dtypes again after the float conversion
df.dtypes

Rk         object
Player     object
Pos        object
Age        object
Tm         object
G         float64
GS        float64
MP        float64
FG        float64
FGA       float64
FG%       float64
3P        float64
3PA       float64
3P%       float64
2P        float64
2PA       float64
2P%       float64
eFG%      float64
FT        float64
FTA       float64
FT%       float64
ORB       float64
DRB       float64
TRB       float64
AST       float64
STL       float64
BLK       float64
TOV       float64
PF        float64
PTS       float64
dtype: object

In [10]:
#We are missing some key metrics that can add some valuable insights:
#per-minute rates derived from the per-game stats.
#Each entry is (new column, per-game stat that is divided by MP).
per_minute_features = [
    ('PPM', 'PTS'),      # Points Per Minute
    ('FGAPM', 'FGA'),    # Field Goal Attempts Per Minute
    ('ORBPM', 'ORB'),    # Offensive Rebounds Per Minute
    ('DRBPM', 'DRB'),    # Defensive Rebounds Per Minute
    ('APM', 'AST'),      # Assists Per Minute
    ('PFPM', 'PF'),      # Personal Fouls Per Minute
    ('FTAPM', 'FTA'),    # Free Throws Attempted Per Minute
    ('TOVPM', 'TOV'),    # Turnovers Per Minute
]

#Divide whole columns at once rather than calling a Python lambda for every
#row with df.apply(..., axis=1); the resulting values are the same.
for rate_col, stat_col in per_minute_features:
    df[rate_col] = df[stat_col] / df['MP']

In [11]:
#Let's see how many rows of data and features we have remaining:
#shape is (number of rows, number of columns), including the new per-minute columns
df.shape

(11571, 38)

In [12]:
#Create feature set: the label column (Pos) first, followed by every stat
target_column = ['Pos']
per_game_stats = ['MP','FG','FGA','FG%','3P','3PA','3P%','2P','2PA','2P%','eFG%',
                  'FT','FTA','FT%','ORB','DRB','TRB','AST','STL','BLK','TOV','PF','PTS']
per_minute_stats = ['PPM','FGAPM','ORBPM','DRBPM','APM','PFPM','FTAPM','TOVPM']

feature_set = df[target_column + per_game_stats + per_minute_stats]

In [13]:
#Confirm the row count is unchanged and only the selected columns remain
feature_set.shape

(11571, 32)

# Machine Learning
### Classification Model to Predict Player Positions

In [14]:
# split data into X (Our Features to determine the classification) and y (The classification)
# Column 0 is the label (Pos); everything after it is a feature. The open-ended
# slice avoids a hard-coded stop position that has to be kept in sync with the
# number of columns (the previous stop of 33 was past the end of the frame and
# only worked because slices are clamped).
X = feature_set.iloc[:, 1:]
Y = feature_set.iloc[:, 0]

In [15]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
seed = 3
test_size = 0.30
split_parts = train_test_split(X, Y, test_size=test_size, random_state=seed)
X_train, X_test, y_train, y_test = split_parts

### XGBoost

In [16]:
# Classifier settings that differ from the library defaults
xgb_params = {'seed': seed, 'max_depth': 6, 'min_child_weight': 8}
model = XGBClassifier(**xgb_params)

# Train on the training split; the call is left as the last expression so the
# fitted estimator is shown as the cell output
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=6, min_child_weight=8, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=3, silent=True,
       subsample=1)

In [17]:
# Predict a position for every held-out row and keep the labels as a plain list
y_pred = model.predict(X_test)
predictions = list(y_pred)

In [18]:
# Fraction of held-out rows whose predicted position equals the true one,
# reported as a percentage
accuracy = accuracy_score(y_test, predictions)
accuracy_pct = accuracy * 100.0
print("Accuracy: %.2f%%" % accuracy_pct)

Accuracy: 65.47%


## 65.47% Accuracy...that's a start! Perhaps we can improve.

## What if we pull in more data?

In [19]:
#Additional data source: player data from Kaggle
#(https://www.kaggle.com/drgilermo/nba-players-stats) to enhance our predictions.
#Only the name (used later as the join key), height and weight are kept.
player_columns = ['name', 'height', 'weight']
data = pd.read_csv('player_data.csv')[player_columns]

data.head(3)

Unnamed: 0,name,height,weight
0,Alaa Abdelnaby,6-10,240.0
1,Zaid Abdul-Aziz,6-9,235.0
2,Kareem Abdul-Jabbar,7-2,225.0


In [20]:
#Convert height from a 'feet-inches' string (e.g. 6-10) into total inches for easier use
feet_inches = data['height'].str.split('-')
feet = feet_inches.str[0].astype(float)
inches = feet_inches.str[1].astype(float)
data['height'] = feet * 12 + inches

In [21]:
#Preview the last three rows to check the converted heights
data.tail(3)

Unnamed: 0,name,height,weight
4547,Bill Zopf,73.0,170.0
4548,Ivica Zubac,85.0,265.0
4549,Matt Zunic,75.0,195.0


In [22]:
#Join height and weight onto the stats using the player's name as the key.
#A left join keeps every stats row, whether or not a match is found.
#NOTE(review): this assumes each name appears only once in `data`; that is not
#checked here, and a repeated name would duplicate the matching rows of df.
df = df.merge(data, how='left', left_on='Player', right_on='name')

df.head(3)

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FGAPM,ORBPM,DRBPM,APM,PFPM,FTAPM,TOVPM,name,height,weight
0,1,Tariq Abdul-Wahad,SG,25,TOT,61.0,56.0,25.9,4.5,10.6,...,0.409266,0.065637,0.119691,0.061776,0.092664,0.123552,0.065637,Tariq Abdul-Wahad,78.0,223.0
1,1,Tariq Abdul-Wahad,SG,25,ORL,46.0,46.0,26.2,4.8,11.2,...,0.427481,0.064885,0.133588,0.061069,0.09542,0.125954,0.072519,Tariq Abdul-Wahad,78.0,223.0
2,1,Tariq Abdul-Wahad,SG,25,DEN,15.0,10.0,24.9,3.4,8.7,...,0.349398,0.064257,0.076305,0.068273,0.084337,0.11245,0.052209,Tariq Abdul-Wahad,78.0,223.0


In [23]:
#The join key 'name' is redundant next to 'Player', so drop it before saving
#(columns= already identifies the axis, so no axis argument is needed)
df = df.drop(columns=['name'])

#Save as a csv for future analysis to reduce need for webscraping.
#Create the target folder first so to_csv does not fail when it is missing.
os.makedirs('output_files', exist_ok=True)
df.to_csv('output_files/NBA_player_data.csv', index=False)

In [24]:
#Create our improved Feature Set: the label (Pos), every stat used before,
#plus the height and weight columns brought in by the merge
target_column = ['Pos']
per_game_stats = ['MP','FG','FGA','FG%','3P','3PA','3P%','2P','2PA','2P%','eFG%',
                  'FT','FTA','FT%','ORB','DRB','TRB','AST','STL','BLK','TOV','PF','PTS']
per_minute_stats = ['PPM','FGAPM','ORBPM','DRBPM','APM','PFPM','FTAPM','TOVPM']
body_measurements = ['height','weight']

improved_feature_set = df[target_column + per_game_stats + per_minute_stats + body_measurements]

In [25]:
# Column 0 is the label (Pos); every remaining column is a feature. The
# open-ended slice replaces a hard-coded stop of 35, which was past the end of
# the frame and only worked because slices are clamped.
X = improved_feature_set.iloc[:, 1:]
Y = improved_feature_set.iloc[:, 0]

In [26]:
# Same 70/30 split settings and seed as the first experiment, applied to the
# improved feature set
seed = 3
test_size = 0.30
split_parts = train_test_split(X, Y, test_size=test_size, random_state=seed)
X_train, X_test, y_train, y_test = split_parts

In [27]:
# Same classifier settings as the first experiment
xgb_params = {'seed': seed, 'max_depth': 6, 'min_child_weight': 8}
model = XGBClassifier(**xgb_params)

# Train on the new training split; the call is left as the last expression so
# the fitted estimator is shown as the cell output
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=6, min_child_weight=8, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=3, silent=True,
       subsample=1)

In [28]:
# Predict a position for every held-out row and keep the labels as a plain list
y_pred = model.predict(X_test)
predictions = list(y_pred)

In [29]:
# Fraction of held-out rows whose predicted position equals the true one,
# reported as a percentage
accuracy = accuracy_score(y_test, predictions)
accuracy_pct = accuracy * 100.0
print("Accuracy: %.2f%%" % accuracy_pct)

Accuracy: 76.04%


## 76.04% That is a big improvement!

In [30]:
# Print the fitted model's raw importance scores (no feature names attached).
# Presumably one score per column of X, in column order — confirm against the
# xgboost documentation before reading individual values.
print(model.feature_importances_)

[ 0.01963553  0.00618654  0.01203685  0.02696523  0.00800215  0.03187412
  0.04027974  0.00813664  0.0160043   0.03227759  0.02259431  0.00793491
  0.00914532  0.0527873   0.00927981  0.01318001  0.01876135  0.01291103
  0.01701298  0.02111492  0.01237307  0.01977002  0.01049022  0.0281084
  0.03658127  0.06031874  0.07867662  0.06798467  0.05117343  0.03940555
  0.04155739  0.08277857  0.08466142]


# Conclusion

### What these findings show us is that the statistics alone do not tell you with 100% certainty what position an NBA player plays

### In the NBA today you see a shift from referring to players by these traditional roles, and perhaps that is for good reason

### You can run into examples like the one below that show a lot of similarities between those playing different traditional positions

In [31]:
# Rows with a height of exactly 78 inches, a field-goal percentage above .45
# and a listed position of SG or SF
is_78_inches = improved_feature_set['height'] == 78.0
shoots_above_45 = improved_feature_set['FG%'] > .45
is_sg_or_sf = improved_feature_set['Pos'].isin(['SG','SF'])

example = improved_feature_set[is_78_inches & shoots_above_45 & is_sg_or_sf]

example.head()

Unnamed: 0,Pos,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,...,PPM,FGAPM,ORBPM,DRBPM,APM,PFPM,FTAPM,TOVPM,height,weight
11,SF,32.9,4.5,9.5,0.473,1.0,2.7,0.351,3.5,6.7,...,0.37386,0.288754,0.033435,0.109422,0.088146,0.066869,0.094225,0.072948,78.0,208.0
25,SG,34.1,4.1,8.8,0.463,2.1,5.0,0.411,2.0,3.9,...,0.346041,0.258065,0.017595,0.117302,0.105572,0.085044,0.058651,0.052786,78.0,185.0
68,SG,38.2,8.4,17.9,0.468,0.7,2.2,0.319,7.7,15.7,...,0.589005,0.468586,0.041885,0.123037,0.128272,0.086387,0.159686,0.073298,78.0,212.0
84,SF,38.1,9.6,20.7,0.465,1.2,2.9,0.403,8.5,17.8,...,0.674541,0.543307,0.047244,0.104987,0.102362,0.08399,0.175853,0.057743,78.0,220.0
127,SG,11.9,2.0,3.9,0.503,0.0,0.1,0.0,2.0,3.8,...,0.394958,0.327731,0.05042,0.092437,0.109244,0.067227,0.092437,0.084034,78.0,195.0
