In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
import os 

### Data  
1. All-NBA Selections from 1984 - 2018
2. Season Statistics from 1950 - 2018

### Cleaning / Preparing Data for Model
Main issues
- Null values 
- Dealing with players that were traded multiple times in one season 
- Flagging players in my season statistics data who were selected for the ALL-NBA Team that year 

In [2]:
all_nba_data = pd.read_csv("All.NBA.1984-2018.csv")
stat_data = pd.read_csv("Seasons_Stats.csv")

In [3]:
all_nba_data.columns=all_nba_data.iloc[0]

In [4]:
all_nba_data = all_nba_data.drop(all_nba_data.index[0])

In [5]:
all_nba_data = all_nba_data.reset_index(drop = True)

In [6]:
stat_data = stat_data.drop(['Unnamed: 0'], axis=1)

In [7]:
stat_data.head()

Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1950.0,Curly Armstrong,G-F,31.0,FTW,63.0,,,,0.368,...,0.705,,,,176.0,,,,217.0,458.0
1,1950.0,Cliff Barker,SG,29.0,INO,49.0,,,,0.435,...,0.708,,,,109.0,,,,99.0,279.0
2,1950.0,Leo Barnhorst,SF,25.0,CHS,67.0,,,,0.394,...,0.698,,,,140.0,,,,192.0,438.0
3,1950.0,Ed Bartels,F,24.0,TOT,15.0,,,,0.312,...,0.559,,,,20.0,,,,29.0,63.0
4,1950.0,Ed Bartels,F,24.0,DNN,13.0,,,,0.308,...,0.548,,,,20.0,,,,27.0,59.0


In [8]:
all_nba_data.head()

Unnamed: 0,Rk,Player,Season,Age,Tm,Lg,WS,G,GS,MP,...,BLK,TOV,PF,PTS,FG%,2P%,3P%,eFG%,FT%,TS%
0,1,Michael Jordan,1987-88,24,CHI,NBA,21.2,82,82,3311,...,131,252,270,2868,0.535,0.546,0.132,0.537,0.841,0.603
1,2,Michael Jordan,1995-96,32,CHI,NBA,20.4,82,82,3090,...,42,197,195,2491,0.495,0.506,0.427,0.525,0.834,0.582
2,3,LeBron James,2008-09,24,CLE,NBA,20.3,81,81,3054,...,93,241,139,2304,0.489,0.535,0.344,0.53,0.78,0.591
3,4,Michael Jordan,1990-91,27,CHI,NBA,20.3,82,82,3034,...,83,202,229,2580,0.539,0.551,0.312,0.547,0.851,0.605
4,5,David Robinson,1993-94,28,SAS,NBA,20.0,80,80,3241,...,265,253,228,2383,0.507,0.51,0.345,0.51,0.749,0.577


In [9]:
all_nba_data.columns

Index(['Rk', 'Player', 'Season', 'Age', 'Tm', 'Lg', 'WS', 'G', 'GS', 'MP',
       'FG', 'FGA', '2P', '2PA', '3P', '3PA', 'FT', 'FTA', 'ORB', 'DRB', 'TRB',
       'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'FG%', '2P%', '3P%', 'eFG%',
       'FT%', 'TS%'],
      dtype='object', name=0)

In [10]:
stat_data.columns

Index(['Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'PER', 'TS%',
       '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
       'USG%', 'blanl', 'OWS', 'DWS', 'WS', 'WS/48', 'blank2', 'OBPM', 'DBPM',
       'BPM', 'VORP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA',
       '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'PTS'],
      dtype='object')

In [11]:
len(all_nba_data)

475

In [12]:
# Changing season column of all_nba data to match year column in stats_data
for i in range(len(all_nba_data)):
    a = all_nba_data['Season'][i]
    b = a[:2]
    c = a.split('-')[1]
    all_nba_data['Season'][i] = b + c

In [13]:
#Converting all_nba dtypes to (int) in order to filter 
all_nba_data['Season'] = all_nba_data['Season'].astype(int)

# filtering stats_data and all_nba data by year. Only want data from 1995 on. 
updated_stats = stat_data.loc[stat_data["Year"] >= 1995, :].reset_index(drop = True)

updated_anba_stats = all_nba_data.loc[all_nba_data["Season"] >= 1995, :].reset_index(drop = True)

In [14]:
# Checking for null data 
updated_stats.isnull().sum()

Year          0
Player        0
Pos           0
Age           0
Tm            0
G             0
GS            0
MP            0
PER           5
TS%          63
3PAr         68
FTr          68
ORB%          5
DRB%          5
TRB%          5
AST%          5
STL%          5
BLK%          5
TOV%         50
USG%          5
blanl     12829
OWS           0
DWS           0
WS            0
WS/48         5
blank2    12829
OBPM          0
DBPM          0
BPM           0
VORP          0
FG            0
FGA           0
FG%          68
3P            0
3PA           0
3P%        2254
2P            0
2PA           0
2P%          93
eFG%         68
FT            0
FTA           0
FT%         562
ORB           0
DRB           0
TRB           0
AST           0
STL           0
BLK           0
TOV           0
PF            0
PTS           0
dtype: int64

In [15]:
updated_stats.columns

Index(['Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'PER', 'TS%',
       '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
       'USG%', 'blanl', 'OWS', 'DWS', 'WS', 'WS/48', 'blank2', 'OBPM', 'DBPM',
       'BPM', 'VORP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA',
       '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'PTS'],
      dtype='object')

In [16]:
# getting rid of two empty columns
updated_stats = updated_stats[['Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'PER', 'TS%',
       '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
       'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM',
       'BPM', 'VORP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA',
       '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL',
       'BLK', 'TOV', 'PF', 'PTS']]

In [17]:
updated_stats.head()

Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1995.0,Alaa Abdelnaby,PF,26.0,TOT,54.0,0.0,506.0,12.6,0.519,...,0.571,37.0,77.0,114.0,13.0,15.0,12.0,45.0,104.0,256.0
1,1995.0,Alaa Abdelnaby,PF,26.0,SAC,51.0,0.0,476.0,14.1,0.54,...,0.571,34.0,72.0,106.0,13.0,15.0,12.0,40.0,102.0,254.0
2,1995.0,Alaa Abdelnaby,PF,26.0,PHI,3.0,0.0,30.0,-12.5,0.091,...,,3.0,5.0,8.0,0.0,0.0,0.0,5.0,2.0,2.0
3,1995.0,Mahmoud Abdul-Rauf,PG,25.0,DEN,73.0,43.0,2082.0,17.8,0.543,...,0.885,32.0,105.0,137.0,263.0,77.0,9.0,119.0,126.0,1165.0
4,1995.0,Michael Adams,PG,32.0,CHH,29.0,0.0,443.0,17.7,0.583,...,0.833,6.0,23.0,29.0,95.0,23.0,1.0,26.0,41.0,188.0


In [18]:
# Created a list of indexes in order to remove the corresponding row. 
# Indexes were appened if player was on multiple teams in one year. 
# kept stats with the best PER

index_list = []

for i in range(len(updated_stats)-1):
    #try:
        if updated_stats['Year'][i] == updated_stats['Year'][i+1] and updated_stats['Player'][i] == updated_stats['Player'][i+1]:
            x = updated_stats["PER"][i]
            y = updated_stats["PER"][i+1]
    #except (KeyError, IndexError):
            if x > y:
                index_list.append(i+1)
            else:
                index_list.append(i)
        

unique_index_list = list(set(index_list))

updated_stats1 = updated_stats.drop(updated_stats.index[[unique_index_list]]).reset_index(drop = True)

  result = getitem(key)


In [19]:
# Ran same process again on updated data because some players had more than two teams in one season
index_list1 = []

for i in range(len(updated_stats1)-1):
    #try:
        if updated_stats1['Year'][i] == updated_stats1['Year'][i+1] and updated_stats1['Player'][i] == updated_stats1['Player'][i+1]:
            x = updated_stats1["PER"][i]
            y = updated_stats1["PER"][i+1]
    #except (KeyError, IndexError):
            if x > y:
                index_list1.append(i+1)
            else:
                index_list1.append(i)     

updated_stats2 = updated_stats1.drop(updated_stats1.index[[index_list1]]).reset_index(drop = True)

In [20]:
updated_stats2['Year'] = updated_stats2['Year'].astype(int)

In [21]:
print(len(updated_stats2))
print(len(updated_anba_stats))

10393
330


In [22]:
# Filled in NULL values with the average across all players 

updated_stats2["3P%"].fillna(updated_stats2["3P%"].mean(), inplace=True)
updated_stats2["PER"].fillna(updated_stats2["3P%"].mean(), inplace=True)
updated_stats2["TS%"].fillna(updated_stats2["3P%"].mean(), inplace=True)
updated_stats2["3PAr"].fillna(updated_stats2["3P%"].mean(), inplace=True)
updated_stats2["FTr"].fillna(updated_stats2["3P%"].mean(), inplace=True)
updated_stats2["ORB%"].fillna(updated_stats2["3P%"].mean(), inplace=True)
updated_stats2["DRB%"].fillna(updated_stats2["3P%"].mean(), inplace=True)
updated_stats2["TRB%"].fillna(updated_stats2["3P%"].mean(), inplace=True)
updated_stats2["AST%"].fillna(updated_stats2["3P%"].mean(), inplace=True)
updated_stats2["STL%"].fillna(updated_stats2["3P%"].mean(), inplace=True)
updated_stats2["BLK%"].fillna(updated_stats2["3P%"].mean(), inplace=True)
updated_stats2["USG%"].fillna(updated_stats2["3P%"].mean(), inplace=True)
updated_stats2["WS/48"].fillna(updated_stats2["3P%"].mean(), inplace=True)
updated_stats2["FG%"].fillna(updated_stats2["3P%"].mean(), inplace=True)
updated_stats2["2P%"].fillna(updated_stats2["3P%"].mean(), inplace=True)
updated_stats2["eFG%"].fillna(updated_stats2["3P%"].mean(), inplace=True)
updated_stats2["FT%"].fillna(updated_stats2["3P%"].mean(), inplace=True)
updated_stats2["TOV%"].fillna(updated_stats2["TOV%"].mean(), inplace=True)

In [23]:
updated_anba_stats.head()

Unnamed: 0,Rk,Player,Season,Age,Tm,Lg,WS,G,GS,MP,...,BLK,TOV,PF,PTS,FG%,2P%,3P%,eFG%,FT%,TS%
0,2,Michael Jordan,1996,32,CHI,NBA,20.4,82,82,3090,...,42,197,195,2491,0.495,0.506,0.427,0.525,0.834,0.582
1,3,LeBron James,2009,24,CLE,NBA,20.3,81,81,3054,...,93,241,139,2304,0.489,0.535,0.344,0.53,0.78,0.591
2,7,LeBron James,2013,28,MIA,NBA,19.3,76,76,2877,...,67,226,110,2036,0.565,0.602,0.406,0.603,0.753,0.64
3,8,Kevin Durant,2014,25,OKC,NBA,19.2,81,81,3122,...,59,285,174,2593,0.503,0.549,0.391,0.56,0.873,0.635
4,10,Kevin Durant,2013,24,OKC,NBA,18.9,81,81,3119,...,105,280,143,2280,0.51,0.539,0.416,0.559,0.905,0.647


In [24]:
updated_stats2.head()

Unnamed: 0,Year,Player,Pos,Age,Tm,G,GS,MP,PER,TS%,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1995,Alaa Abdelnaby,PF,26.0,SAC,51.0,0.0,476.0,14.1,0.54,...,0.571,34.0,72.0,106.0,13.0,15.0,12.0,40.0,102.0,254.0
1,1995,Mahmoud Abdul-Rauf,PG,25.0,DEN,73.0,43.0,2082.0,17.8,0.543,...,0.885,32.0,105.0,137.0,263.0,77.0,9.0,119.0,126.0,1165.0
2,1995,Michael Adams,PG,32.0,CHH,29.0,0.0,443.0,17.7,0.583,...,0.833,6.0,23.0,29.0,95.0,23.0,1.0,26.0,41.0,188.0
3,1995,Rafael Addison,SF,30.0,DET,79.0,16.0,1776.0,11.3,0.521,...,0.747,67.0,175.0,242.0,109.0,53.0,25.0,76.0,236.0,656.0
4,1995,Danny Ainge,SG,35.0,PHO,74.0,1.0,1374.0,14.3,0.596,...,0.808,25.0,84.0,109.0,210.0,46.0,7.0,79.0,155.0,571.0


In [25]:
# Creating an all_nba column 
updated_stats2['All_NBA'] = 0

In [26]:
# Created nested loop that inserts a 1 in the all_nba column in my stat data 
# only if the name and year matches with my all nba data
for i in tqdm_notebook(range(len(updated_stats2))):
    for y in range(len(updated_anba_stats)):
        if updated_stats2['Player'][i] == updated_anba_stats['Player'][y]:
            if updated_stats2['Year'][i] == updated_anba_stats['Season'][y]:
                updated_stats2['All_NBA'][i] = 1

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(IntProgress(value=0, max=10393), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys





In [27]:
# ran the loop again becuase some names in my stat data have an astericks 
for i in tqdm_notebook(range(len(updated_stats2))):
    for y in range(len(updated_anba_stats)):
        if updated_stats2['Player'][i] == updated_anba_stats['Player'][y] + '*':
            if updated_stats2['Year'][i] == updated_anba_stats['Season'][y]:
                updated_stats2['All_NBA'][i] = 1

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(IntProgress(value=0, max=10393), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  





In [45]:
#df.to_csv('nba_stats.csv')

### Stage 1: Testing the Waters
At this point my data was cleaned and ready to go, I began the preliminary stages of building my model.
I started by using most columns(49 to be exact). I chose to stick with most of the original columns because I wanted to run the _feature_importance function embeded in the Random Forest Classifier and gain some insight on which features might be the most valuable moving forward. 

After running the _feature_importance function, I found that these 26 features are the most valuable. I choose those in bold to be my parameters moving forward. The model in section 1 is run with the bold features as input, the classification report and confusion matrix results are nearly identical to the ones mentioned above. 
   - **WS**, **PER**, VORP, BPM, FTA, OWS, **PTS**, WS/48, **FG**, **2P**, DWS, **FT**, **FGA**, USG%, 2PA, **TOV**, OBPM, **MP**, **DRB**, **AST**, PF, **GS**, TRB, **2P%**, AST%, **BLK**

# Stage 1

In [29]:
updated_stats2.columns

Index(['Year', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'PER', 'TS%',
       '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
       'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP',
       'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%',
       'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV',
       'PF', 'PTS', 'All_NBA'],
      dtype='object')

In [30]:
updated_stats3 = updated_stats2[['GS', 'MP', 'PER','WS','FG', 'FGA','2P', '2PA', 
                                 '2P%', 'DRB', 'AST', 'BLK', 'TOV','PTS', 'All_NBA']]

In [31]:
updated_stats3 = pd.get_dummies(updated_stats3)

In [32]:
updated_stats3.head()

Unnamed: 0,GS,MP,PER,WS,FG,FGA,2P,2PA,2P%,DRB,AST,BLK,TOV,PTS,All_NBA
0,0.0,476.0,14.1,0.6,117.0,220.0,117.0,218.0,0.537,72.0,13.0,12.0,40.0,254.0,0
1,43.0,2082.0,17.8,5.0,472.0,1005.0,389.0,790.0,0.492,105.0,263.0,9.0,119.0,1165.0,0
2,0.0,443.0,17.7,1.6,67.0,148.0,38.0,67.0,0.567,23.0,95.0,1.0,26.0,188.0,0
3,16.0,1776.0,11.3,1.5,279.0,586.0,255.0,503.0,0.507,175.0,109.0,25.0,76.0,656.0,0
4,1.0,1374.0,14.3,3.4,194.0,422.0,116.0,208.0,0.558,84.0,210.0,7.0,79.0,571.0,0


In [33]:
from sklearn import tree

target = updated_stats3['All_NBA'].values.reshape(-1, 1)
data = updated_stats3.drop("All_NBA", axis=1)

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42, stratify=target)

In [35]:
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)
#y_scaler = StandardScaler().fit(y_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
#y_train_scaled = y_scaler.transform(y_train)
#y_test_scaled = y_scaler.transform(y_test)

In [36]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=200)

rf = rf.fit(X_train_scaled, y_train)
rf.score(X_test_scaled, y_test)

  after removing the cwd from sys.path.


0.9823008849557522

In [37]:
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

In [38]:
predictions = rf.predict(X_test_scaled)

In [39]:
cm_results = confusion_matrix(y_test, predictions)
print ('Confusion Matrix :')
print(cm_results) 
print('  ')
print ('Accuracy Score :',accuracy_score(y_test, predictions))
print('  ')
print ('Report :')

print (classification_report(y_test, predictions))

Confusion Matrix :
[[2500   17]
 [  29   53]]
  
Accuracy Score : 0.9823008849557522
  
Report :
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2517
           1       0.76      0.65      0.70        82

    accuracy                           0.98      2599
   macro avg       0.87      0.82      0.84      2599
weighted avg       0.98      0.98      0.98      2599



In [40]:
feature_names = data.columns

In [41]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

[(0.22488850582197387, 'WS'),
 (0.16576531848794687, 'PER'),
 (0.09869692972159974, 'PTS'),
 (0.06943112788386621, '2P'),
 (0.06406666915934109, 'FG'),
 (0.059704299621330434, 'FGA'),
 (0.049603517558618734, '2PA'),
 (0.0473365906311531, 'MP'),
 (0.044609932164029836, 'DRB'),
 (0.041836014422889374, 'AST'),
 (0.037337271085146985, 'TOV'),
 (0.03467071631593167, '2P%'),
 (0.032192380600870776, 'BLK'),
 (0.029860726525301223, 'GS')]