Imported Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.plotly as py
import warnings
import xgboost as xgb
warnings.filterwarnings('ignore')

Read 5 years worth of NBA statistics into individual dataframes. Dataframes are labeled according to the year range of the corresponding data. In this case, we wanted the 2006 draft class and got the first 5 years of NBA data after they had signed with a team. We ended the data collection at the 2013 draft class, making the 2017-2018 season the last year of data that we used. For other draft classes, we would just change the range of years for the NBA data.

In [2]:
data1011 = pd.read_csv('Downloads/results/input_files/NBA Player Stats - Regular Stats 2010-2011.csv')
data1011.drop(['Tm', 'Rk'], axis=1, inplace=True)
data0910 = pd.read_csv('Downloads/results/input_files/NBA Player Stats - Regular Stats 2009-2010.csv')
data0910.drop(['Tm', 'Rk'], axis=1, inplace=True)
data0809 = pd.read_csv('Downloads/results/input_files/NBA Player Stats - Regular Stats 2008-2009.csv')
data0809.drop(['Tm', 'Rk'], axis=1, inplace=True)
data0708 = pd.read_csv('Downloads/results/input_files/NBA Player Stats - Regular Stats 2007-2008.csv')
data0708.drop(['Tm', 'Rk'], axis=1, inplace=True)
data0607 = pd.read_csv('Downloads/results/input_files/NBA Player Stats - Regular Stats 2006-2007.csv')
data0607.drop(['Tm', 'Rk'], axis=1, inplace=True)
data0607.head()

Unnamed: 0,Player,Pos,Age,G,GS,MP,FG,FGA,FG%,3P,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PS/G
0,Shareef Abdur-Rahim\abdursh01,C,30.0,80.0,45.0,25.2,3.9,8.2,0.474,0.0,...,0.726,1.5,3.5,5.0,1.4,0.7,0.5,1.5,3.0,9.9
1,Hassan Adams\adamsha01,SG,22.0,61.0,8.0,8.1,1.2,2.2,0.556,0.0,...,0.667,0.6,0.7,1.3,0.2,0.3,0.1,0.4,0.8,2.9
2,Maurice Ager\agerma01,SG,22.0,32.0,1.0,6.7,0.7,2.2,0.314,0.2,...,0.606,0.0,0.6,0.7,0.2,0.1,0.1,0.5,0.8,2.2
3,LaMarcus Aldridge\aldrila01,C,21.0,63.0,22.0,22.1,3.8,7.6,0.503,0.0,...,0.722,2.3,2.7,5.0,0.4,0.3,1.2,0.7,3.0,9.0
4,Malik Allen\allenma01,PF,28.0,60.0,1.0,10.6,1.8,4.3,0.415,0.0,...,0.824,0.7,1.3,2.0,0.3,0.3,0.3,0.4,1.4,4.0


Here we read the 2006 draft class's final season data into a dataframe. The final season is defined as the last season they played before declaring for the draft. For other draft classes, we would just change the year of the draft class file. 

In [3]:
draft2006 = pd.read_csv('Downloads/results/input_files/2006_class_urls.csv')
draft2006.columns
college = draft2006[['Name', ' Years in College']]
draft2006.head()

Unnamed: 0,Name,Draft Class,Years in College,Season,Age,College,G,MP,FG,FGA,...,BLK,TOV,PTS,FG%,3p%,FT%,PG_MP,PG_PTS,PG_TRB,PG_AST
0,Ronnie Brewer,2006,3,2005-06,20,ARKANSAS,32,1112,197,447,...,15,,589,0.441,0.338,0.75,34.8,18.4,4.8,3.3
1,Hassan Adams,2006,4,2005-06,21,ARIZONA,31,1031,234,490,...,14,,543,0.478,0.262,0.611,33.3,17.5,5.0,2.9
2,Shelden Williams,2006,4,2005-06,22,DUKE,36,1198,237,410,...,137,,677,0.578,0.333,0.744,33.3,18.8,10.7,1.1
3,Rodney Carney,2006,4,2005-06,21,MEMPHIS,37,1027,220,505,...,25,,636,0.436,0.391,0.712,27.8,17.2,4.3,1.3
4,Josh Boone,2006,3,2005-06,21,UCONN,34,922,137,243,...,69,,350,0.564,,0.547,27.1,10.3,7.0,0.6


Here we created a final dataframe with the average values of the data provided by the 5 succeeding years after a player had signed with the team.

The process was relatively difficult but since some players do not make it to play 5 years in the NBA, we had to calculate the 5 year average values for those players who did play atleast 5 years after they got drafted. To do this we compared all the names of the players in each of the 5 succeeding seasons to the names in the draft class. From there we filtered for those names that were the same and eliminated those that were not. Once we had all the appropriate names, we created a final dataframe and grouped the values by the player name, and calculated the mean for each metric. 

In [4]:
names = list(draft2006['Name'][1:])
namenyear = draft2006[['Name', ' Years in College']][1:] 
namenyear = namenyear.sort_values('Name')
namenyear = namenyear.reset_index()
namenyear = namenyear.drop('index', 1)

#------------------------------------------------------------

n1011 = []
for i in range(len(data1011['Player'])):
    for j in range(len(names)):
        if data1011['Player'][i] == names[j]:
            n1011.append(names[j])       
n1011 = list(set(n1011)) 

f1011 = pd.DataFrame(index=range(len(n1011)),columns=data1011.columns)

for i in range(len(n1011)):

    for word in f1011.columns:

         f1011[word][i] = data1011.loc[data1011['Player'] == n1011[i]][word].values[0] 
            
#------------------------------------------------------------

n0910 = []
for i in range(len(data0910['Player'])):
    for j in range(len(names)):
        if data0910['Player'][i] == names[j]:
            n0910.append(names[j])       
n0910 = list(set(n0910))

f0910 = pd.DataFrame(index=range(len(n0910)),columns=data0910.columns)

for i in range(len(n0910)):

    for word in f0910.columns:

         f0910[word][i] = data0910.loc[data0910['Player'] == n0910[i]][word].values[0]
            
#------------------------------------------------------------

n0809 = []
for i in range(len(data0809['Player'])):
    for j in range(len(names)):
        if data0809['Player'][i] == names[j]:
            n0809.append(names[j])       
n0809 = list(set(n0809))

f0809 = pd.DataFrame(index=range(len(n0809)),columns=data0809.columns)

for i in range(len(n0809)):

    for word in f0809.columns:

         f0809[word][i] = data0809.loc[data0809['Player'] == n0809[i]][word].values[0]
            
#------------------------------------------------------------

n0708 = []
for i in range(len(data0708['Player'])):
    for j in range(len(names)):
        if data0708['Player'][i] == names[j]:
            n0708.append(names[j])       
n0708 = list(set(n0708))

f0708 = pd.DataFrame(index=range(len(n0708)),columns=data0708.columns)

for i in range(len(n0708)):

    for word in f0708.columns:

         f0708[word][i] = data0708.loc[data0708['Player'] == n0708[i]][word].values[0]
            
#------------------------------------------------------------

n0607 = []
for i in range(len(data0607['Player'])):
    for j in range(len(names)):
        if data0607['Player'][i] == names[j]:
            n0607.append(names[j])       
n0607 = list(set(n0708))

f0607 = pd.DataFrame(index=range(len(n0607)),columns=data0607.columns)

for i in range(len(n0607)):

    for word in f0607.columns:

         f0607[word][i] = data0607.loc[data0607['Player'] == n0607[i]][word].values[0]

final = f0607.append([f0910, f0809, f0708, f1011])
final = final.reset_index()
final.drop(['index'], axis=1, inplace=True)

final['Age']= final['Age'].astype('float64')
final['G']= final['G'].astype('float64')
final['GS']= final['GS'].astype('float64')
final['MP']= final['MP'].astype('float64')
final['FG']= final['FG'].astype('float64')
final['FGA']= final['FGA'].astype('float64')
final['FG%']= final['FG%'].astype('float64')
final['3P']= final['3P'].astype('float64')
final['3PA']= final['3PA'].astype('float64')
final['3P%']= final['3P%'].astype('float64')
final['2P']= final['2P'].astype('float64')
final['2PA']= final['2PA'].astype('float64')
final['2P%']= final['2P%'].astype('float64')
final['eFG%']= final['eFG%'].astype('float64')
final['FT']= final['FT'].astype('float64')
final['FTA']= final['FTA'].astype('float64')
final['FT%']= final['FT%'].astype('float64')
final['ORB']= final['ORB'].astype('float64')
final['DRB']= final['DRB'].astype('float64')
final['TRB']= final['TRB'].astype('float64')
final['AST']= final['AST'].astype('float64')
final['STL']= final['STL'].astype('float64')
final['BLK']= final['BLK'].astype('float64')
final['TOV']= final['TOV'].astype('float64')
final['PF']= final['PF'].astype('float64')
final['PS/G']= final['PS/G'].astype('float64')

finaldata = final.groupby('Player', as_index=False).mean()
finaldata.insert(1, "Years in College", namenyear[' Years in College']) 
finaldata['Years in College'] = 0

finaldata = finaldata[['Player', 'Years in College', 'Age', 'G', 'GS', 'MP', 'FG', 'FGA',
       'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA',
       'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PS/G']]

finaldata.head()

Unnamed: 0,Player,Years in College,Age,G,GS,MP,FG,FGA,FG%,3P,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PS/G
0,Brandon Roy,0,26.0,47.0,23.0,27.9,4.4,11.1,0.4,0.8,...,0.848,0.6,2.0,2.6,2.7,0.8,0.3,1.2,1.6,12.2
1,Craig Smith,0,27.0,48.0,0.0,12.2,2.2,3.9,0.553,0.0,...,0.735,0.9,1.5,2.4,0.6,0.3,0.1,0.7,1.8,5.4
2,Daniel Gibson,0,24.0,67.0,15.0,27.8,3.9,9.9,0.4,1.8,...,0.822,0.4,2.3,2.6,3.0,0.7,0.3,1.2,2.3,11.6
3,Hilton Armstrong,0,26.0,53.0,2.0,9.2,0.7,1.4,0.487,0.0,...,0.485,0.8,1.7,2.5,0.3,0.3,0.4,0.5,1.6,1.7
4,J.J. Redick,0,26.0,59.0,5.0,25.6,3.3,7.6,0.441,1.5,...,0.875,0.1,1.8,1.9,1.7,0.5,0.1,0.9,1.4,10.1


The above dataframe did not have the # of years each player spent in college because the previous cell code would have average those years and not given us an accurate value. To solve this we took the # of years played in college from the original college dataframe and compared it against the player in the finaldata dataframe. If the names were the same, in the college dataframe and finaldata dataframe, we updated the value. The test dataframe has the final 5 year average values, with the appropriate # of years played in college for each player. 

In [5]:
x = []
for i in range(len(finaldata['Player'])):
    for j in range(len(n1011)):
        if finaldata['Player'][i] == n1011[j]:
            x.append(n1011[j])       
f = list(set(x))

test = finaldata

test = test[test['Player'].isin(f)].reset_index().drop('index', 1)

combined = college
combined = combined.sort_values('Name').reset_index().drop('index', 1)
combined['Player'] = test['Player']
combined['check'] = combined['Name'].isin(combined['Player'])
combined = combined[combined.check == True]
combined = combined[['Name', ' Years in College']]
years = list(combined[' Years in College'])
test['Years in College'] = years

test

Unnamed: 0,Player,Years in College,Age,G,GS,MP,FG,FGA,FG%,3P,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PS/G
0,Brandon Roy,4,26.0,47.0,23.0,27.9,4.4,11.1,0.4,0.8,...,0.848,0.6,2.0,2.6,2.7,0.8,0.3,1.2,1.6,12.2
1,Craig Smith,4,27.0,48.0,0.0,12.2,2.2,3.9,0.553,0.0,...,0.735,0.9,1.5,2.4,0.6,0.3,0.1,0.7,1.8,5.4
2,Daniel Gibson,2,24.0,67.0,15.0,27.8,3.9,9.9,0.4,1.8,...,0.822,0.4,2.3,2.6,3.0,0.7,0.3,1.2,2.3,11.6
3,Hilton Armstrong,4,26.0,53.0,2.0,9.2,0.7,1.4,0.487,0.0,...,0.485,0.8,1.7,2.5,0.3,0.3,0.4,0.5,1.6,1.7
4,J.J. Redick,4,26.0,59.0,5.0,25.6,3.3,7.6,0.441,1.5,...,0.875,0.1,1.8,1.9,1.7,0.5,0.1,0.9,1.4,10.1
5,Jordan Farmar,2,24.0,73.0,18.0,24.6,3.5,8.9,0.392,1.5,...,0.82,0.3,2.0,2.4,5.0,0.8,0.1,1.9,1.8,9.6
6,Kyle Lowry,2,24.0,75.0,71.0,34.2,4.6,10.8,0.426,1.7,...,0.765,1.2,2.9,4.1,6.7,1.4,0.3,2.1,2.8,13.5
7,LaMarcus Aldridge,2,25.0,81.0,81.0,39.6,8.7,17.5,0.5,0.0,...,0.791,3.4,5.3,8.8,2.1,1.0,1.2,1.9,2.7,21.8
8,Leon Powe,2,27.0,30.0,3.0,10.9,2.0,4.0,0.496,0.0,...,0.556,0.8,1.3,2.1,0.2,0.3,0.1,0.4,1.5,5.3
9,Maurice Ager,4,26.0,4.0,0.0,7.3,1.5,2.8,0.545,0.8,...,,0.0,0.5,0.5,0.3,0.3,0.0,1.0,1.0,3.8
