In [None]:
# Build the list of columns to convert to numeric dtypes: every column of
# df_all except the identifier columns named below.
# NOTE(review): 'Age' is converted here, but clean_per_poss / clean_advanced
# leave it out of their conversion lists -- confirm which is intended.
cols_to_numeric = df_all.columns.drop(['Player', 'Pos', 'Tm', 'Year'])

In [None]:
# Preview only: convert the selected columns and show the first 10 rows.
# The result is not assigned, so df_all is left unchanged by this cell.
# errors='coerce' turns values that cannot be parsed into NaN instead of raising.
df_all[cols_to_numeric].apply(pd.to_numeric, errors = 'coerce').head(10)

In [None]:
# Apply the conversion for real: overwrite the selected columns of df_all with
# their numeric versions (unparseable values become NaN).
df_all[cols_to_numeric] = df_all[cols_to_numeric].apply(pd.to_numeric, errors = 'coerce')

In [None]:
def clean_per_poss(df):
    '''Clean raw per-possession data into a usable DataFrame.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw per-possession table. It must contain the columns 'Player',
        'Pos', 'Age', 'Tm' and 'Year' plus a column whose label is the empty
        string; a KeyError is raised otherwise.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object: rows with missing values removed, the blank
        column dropped, statistic columns converted to numeric dtypes and the
        index reset to 0..n-1.
    '''
    # 1: drop duplicate players. The helper is defined elsewhere; its return
    #    value is ignored here, so it presumably edits df in place -- TODO confirm.
    drop_duplicate_players(df)
    # 2: replace None with NaN. float('nan') is used instead of `pd.np.nan`:
    #    the `pandas.np` alias was deprecated in pandas 1.0 and removed in 2.0.
    df.fillna(value=float('nan'), inplace=True)
    # 3: drop every row that contains a missing value
    df.dropna(axis=0, inplace=True)
    # 4: drop the blank column (the one labelled with the empty string)
    df.drop(labels='', axis=1, inplace=True)
    # 5: columns to convert to numerics = everything except the identifiers.
    #    NOTE(review): 'Age' is excluded and therefore keeps its original
    #    dtype -- confirm this is intended.
    cols_to_numeric = df.columns.drop(['Player', 'Pos', 'Age', 'Tm', 'Year'])
    # 6: convert selected columns to numeric dtypes (unparseable values -> NaN)
    df[cols_to_numeric] = df[cols_to_numeric].apply(pd.to_numeric, errors='coerce')
    # 7: reset index so the row labels are contiguous again after the drops
    df.reset_index(drop=True, inplace=True)
    return df

In [None]:
def clean_advanced(df):
    '''Clean raw advanced-stats data into a usable DataFrame.

    Steps 1-3 modify the passed-in frame in place; the returned frame is a
    new object without the two blank spacer columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw advanced table. It must contain the columns 'Player', 'Pos',
        'Age', 'Tm' and 'Year'; the columns at positions 18 and 23 are
        treated as blank spacers and removed.

    Returns
    -------
    pandas.DataFrame
        New frame with rows containing missing values removed, the spacer
        columns dropped, statistic columns converted to numeric dtypes and
        the index reset to 0..n-1.
    '''
    #1: drop duplicate players. The helper is defined elsewhere; its return
    #   value is ignored here, so it presumably edits df in place -- TODO confirm.
    drop_duplicate_players(df)
    #2: replace None with NaN. float('nan') is used instead of `pd.np.nan`:
    #   the `pandas.np` alias was deprecated in pandas 1.0 and removed in 2.0.
    df.fillna(value=float('nan'), inplace=True)
    #3: drop every row that contains a missing value
    df.dropna(axis=0, inplace=True)
    #4: keep every column label except the two blank columns at positions 18 and 23
    adv_cols = [label for position, label in enumerate(df.columns)
                if position not in (18, 23)]
    #5: take an independent copy of the selection, so the assignment below
    #   writes to a real frame rather than to a slice of the original
    #   (this is what avoids pandas' SettingWithCopyWarning)
    df = df[adv_cols].copy()
    #6: columns to convert to numerics = everything except the identifiers.
    #   NOTE(review): 'Age' is excluded and therefore keeps its original
    #   dtype -- confirm this is intended.
    cols_to_numeric = df.columns.drop(['Player', 'Pos', 'Age', 'Tm', 'Year'])
    #7: convert selected columns to numeric dtypes (unparseable values -> NaN)
    df[cols_to_numeric] = df[cols_to_numeric].apply(pd.to_numeric, errors='coerce')
    #8: reset the index, as clean_per_poss does; merge_df joins the two
    #   cleaned frames on their index, so both need the same 0..n-1 labels
    df = df.reset_index(drop=True)
    #9: return updated df
    return df

In [1]:
def merge_df(df1, df2):
    '''Merge the cleaned per-possession and advanced DataFrames row by row.

    The frames are joined on their index with an outer join, so both are
    expected to carry matching row labels.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Cleaned per-possession frame.
    df2 : pandas.DataFrame
        Cleaned advanced frame. Both frames must share the columns 'Player',
        'Pos', 'Age', 'Tm', 'G', 'MP' and 'Year'; a KeyError is raised
        otherwise because the suffixed duplicates would not exist.

    Returns
    -------
    pandas.DataFrame
        Merged frame with one copy of each shared column and no '_x' / '_y'
        suffixes left on the column names.
    '''
    #1: merge df1 and df2 on their index
    merged = pd.merge(df1, df2, left_index=True, right_index=True, how='outer')
    #2: drop the duplicated copies of the shared columns
    #   (df1's copy is kept for all of them except 'Year', where df2's is kept)
    merged = merged.drop(
        columns=['Year_x', 'Player_y', 'Pos_y', 'Age_y', 'Tm_y', 'G_y', 'MP_y'])
    #3: strip the merge suffix from the surviving columns. The pattern is
    #   anchored to the end of the name and regex=True is explicit, so a column
    #   that merely contains '_x' or '_y' somewhere in its name is left alone.
    merged.columns = merged.columns.str.replace(r'_[xy]$', '', regex=True)
    #4: print to confirm that all three df's have same number of rows
    print('Do all three DataFrames have the same number of rows? ',
          df1.shape[0] == df2.shape[0] == merged.shape[0])
    #5: return merged df
    return merged

In [None]:
# show advanced2017 without the columns found at positions 18 and 23;
# the result is only displayed, not assigned
advanced2017.drop(columns=advanced2017.columns[[18, 23]])

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

from urllib.request import urlopen
from bs4 import BeautifulSoup

# let pandas use up to 500 characters per line and show up to 100 columns
# when a frame is displayed
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)

In [3]:
# NBA season we will be analyzing
years = 2018

# URL page we will be scraping
# (the original formatted the URL with `year`, a name this cell never defines,
# which raises NameError on a fresh kernel; `years` is the value set above)
url = 'https://www.basketball-reference.com/leagues/NBA_{}_advanced.html'.format(years)

# download and parse the page; the `with` block closes the HTTP response
# once BeautifulSoup has read it
# NOTE(review): no parser is named, so BeautifulSoup picks whichever one is
# installed -- consider passing one explicitly for reproducible parsing
with urlopen(url) as html:
    soup = BeautifulSoup(html)

In [6]:
# column titles: the text of every <th> cell in the first <tr> of the page
# (a bare `soup.findAll('tr', limit=2)` used to sit here; it was not the last
# expression of the cell, so its value was computed and then thrown away)
headers = [th.get_text() for th in soup.find_all('tr', limit=2)[0].find_all('th')]

# exclude the first column as we will not need the ranking order
headers = headers[1:]
headers

['Player',
 'Pos',
 'Age',
 'Tm',
 'G',
 'MP',
 'PER',
 'TS%',
 '3PAr',
 'FTr',
 'ORB%',
 'DRB%',
 'TRB%',
 'AST%',
 'STL%',
 'BLK%',
 'TOV%',
 'USG%',
 '\xa0',
 'OWS',
 'DWS',
 'WS',
 'WS/48',
 '\xa0',
 'OBPM',
 'DBPM',
 'BPM',
 'VORP']

In [7]:
# avoid the first header row
# skip the first <tr> (the header row), then collect the text of every <td>
# in each remaining row; a row without any <td> yields an empty list
rows = soup.find_all('tr')[1:]
player_stats = [[cell.get_text() for cell in row.find_all('td')] for row in rows]

In [8]:
# assemble the scraped rows into a DataFrame labelled with the scraped headers;
# every value is still the raw cell text at this point
stats = pd.DataFrame(player_stats, columns = headers)

In [9]:
# preview the first 10 scraped rows
stats.head(10)

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,Unnamed: 19,OWS,DWS,WS,WS/48,Unnamed: 24,OBPM,DBPM,BPM,VORP
0,Alex Abrines,SG,24,OKC,75,1134,9.0,0.567,0.759,0.158,2.5,8.9,5.6,3.4,1.7,0.6,7.4,12.7,,1.3,1.0,2.2,0.094,,-0.5,-1.7,-2.2,-0.1
1,Quincy Acy,PF,27,BRK,70,1359,8.2,0.525,0.8,0.164,3.1,17.1,10.0,6.0,1.2,1.6,13.3,14.4,,-0.1,1.1,1.0,0.036,,-2.0,-0.2,-2.2,-0.1
2,Steven Adams,C,24,OKC,76,2487,20.6,0.63,0.003,0.402,16.6,13.9,15.3,5.5,1.8,2.8,13.3,16.7,,6.7,3.0,9.7,0.187,,2.2,1.1,3.3,3.3
3,Bam Adebayo,C,20,MIA,69,1368,15.7,0.57,0.021,0.526,9.7,21.6,15.6,11.0,1.2,2.5,13.6,15.9,,2.3,1.9,4.2,0.148,,-1.6,1.8,0.2,0.8
4,Arron Afflalo,SG,32,ORL,53,682,5.8,0.516,0.432,0.16,0.6,10.1,5.3,6.2,0.3,1.1,10.8,12.5,,-0.1,0.2,0.1,0.009,,-4.1,-1.8,-5.8,-0.7
5,Cole Aldrich,C,29,MIN,21,49,6.0,0.34,0.0,0.4,7.0,28.6,17.6,8.2,2.0,1.8,5.4,16.8,,-0.1,0.1,0.0,-0.013,,-7.0,0.1,-6.9,-0.1
6,LaMarcus Aldridge,C,32,SAS,75,2509,25.0,0.57,0.068,0.296,10.8,17.3,14.0,11.3,0.9,3.0,6.8,29.1,,7.4,3.5,10.9,0.209,,3.0,0.3,3.3,3.3
7,Jarrett Allen,C,19,BRK,72,1441,17.5,0.636,0.038,0.37,10.5,18.1,14.3,5.4,0.9,4.6,15.1,16.3,,2.7,1.5,4.2,0.141,,-1.3,1.4,0.2,0.8
8,Kadeem Allen,PG,25,BOS,18,107,2.6,0.366,0.5,0.409,4.1,7.1,5.6,15.2,1.4,1.6,25.7,14.6,,-0.2,0.1,-0.1,-0.038,,-6.7,0.3,-6.4,-0.1
9,Tony Allen,SF,36,NOP,22,273,8.7,0.514,0.132,0.231,8.2,10.4,9.3,4.6,1.9,0.9,15.9,18.9,,-0.2,0.2,0.1,0.017,,-4.0,-1.3,-5.2,-0.2


In [10]:
# (number of rows, number of columns) of the scraped table
stats.shape

(690, 28)

In [24]:
# NOTE(review): a bare frame asks the notebook to render the whole table;
# the stats.head(10) cell earlier already gives a preview -- consider removing
stats

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,Unnamed: 19,OWS,DWS,WS,WS/48,Unnamed: 24,OBPM,DBPM,BPM,VORP
0,Alex Abrines,SG,24,OKC,75,1134,9.0,.567,.759,.158,2.5,8.9,5.6,3.4,1.7,0.6,7.4,12.7,,1.3,1.0,2.2,.094,,-0.5,-1.7,-2.2,-0.1
1,Quincy Acy,PF,27,BRK,70,1359,8.2,.525,.800,.164,3.1,17.1,10.0,6.0,1.2,1.6,13.3,14.4,,-0.1,1.1,1.0,.036,,-2.0,-0.2,-2.2,-0.1
2,Steven Adams,C,24,OKC,76,2487,20.6,.630,.003,.402,16.6,13.9,15.3,5.5,1.8,2.8,13.3,16.7,,6.7,3.0,9.7,.187,,2.2,1.1,3.3,3.3
3,Bam Adebayo,C,20,MIA,69,1368,15.7,.570,.021,.526,9.7,21.6,15.6,11.0,1.2,2.5,13.6,15.9,,2.3,1.9,4.2,.148,,-1.6,1.8,0.2,0.8
4,Arron Afflalo,SG,32,ORL,53,682,5.8,.516,.432,.160,0.6,10.1,5.3,6.2,0.3,1.1,10.8,12.5,,-0.1,0.2,0.1,.009,,-4.1,-1.8,-5.8,-0.7
5,Cole Aldrich,C,29,MIN,21,49,6.0,.340,.000,.400,7.0,28.6,17.6,8.2,2.0,1.8,5.4,16.8,,-0.1,0.1,0.0,-0.013,,-7.0,0.1,-6.9,-0.1
6,LaMarcus Aldridge,C,32,SAS,75,2509,25.0,.570,.068,.296,10.8,17.3,14.0,11.3,0.9,3.0,6.8,29.1,,7.4,3.5,10.9,.209,,3.0,0.3,3.3,3.3
7,Jarrett Allen,C,19,BRK,72,1441,17.5,.636,.038,.370,10.5,18.1,14.3,5.4,0.9,4.6,15.1,16.3,,2.7,1.5,4.2,.141,,-1.3,1.4,0.2,0.8
8,Kadeem Allen,PG,25,BOS,18,107,2.6,.366,.500,.409,4.1,7.1,5.6,15.2,1.4,1.6,25.7,14.6,,-0.2,0.1,-0.1,-0.038,,-6.7,0.3,-6.4,-0.1
9,Tony Allen,SF,36,NOP,22,273,8.7,.514,.132,.231,8.2,10.4,9.3,4.6,1.9,0.9,15.9,18.9,,-0.2,0.2,0.1,.017,,-4.0,-1.3,-5.2,-0.2


In [16]:
def basketball_reference_advanced_scrap(year):
    '''Scrape the league-wide advanced stats table for one NBA season.

    Parameters
    ----------
    year : int or str
        Season identifier substituted into the basketball-reference URL
        (e.g. 2017).

    Returns
    -------
    pandas.DataFrame
        One row per <tr> after the first one on the page, one column per
        header cell except the first (the ranking column). All values are
        the raw cell text; nothing is converted to a numeric dtype here.
    '''
    # URL page we will be scraping
    url = 'https://www.basketball-reference.com/leagues/NBA_{}_advanced.html'.format(year)
    # download and parse; the `with` block closes the HTTP response afterwards
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever one
    # is installed -- consider passing one explicitly
    with urlopen(url) as html:
        soup = BeautifulSoup(html)
    # column titles: the text of every <th> cell in the first <tr>
    headers = [th.get_text() for th in soup.find_all('tr', limit=2)[0].find_all('th')]
    # exclude the first column as we will not need the ranking order
    headers = headers[1:]
    # every <tr> after the first becomes one record; a row without any <td>
    # yields an empty list
    rows = soup.find_all('tr')[1:]
    player_stats = [[td.get_text() for td in row.find_all('td')] for row in rows]
    return pd.DataFrame(player_stats, columns=headers)

In [25]:
# scrape the advanced stats table for the 2017 season
advanced_2017 = basketball_reference_advanced_scrap(2017)

In [26]:
# preview the first 5 rows; note that one player can appear on several rows
# (one per team plus a 'TOT' row in the output below)
advanced_2017.head(5)

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,Unnamed: 19,OWS,DWS,WS,WS/48,Unnamed: 24,OBPM,DBPM,BPM,VORP
0,Alex Abrines,SG,23,OKC,68,1055,10.1,0.56,0.724,0.144,1.9,7.1,4.5,5.5,1.7,0.6,8.3,15.9,,1.2,0.9,2.1,0.096,,-0.3,-2.2,-2.5,-0.1
1,Quincy Acy,PF,26,TOT,38,558,11.8,0.565,0.529,0.353,3.9,18.0,11.0,4.9,1.2,2.0,9.7,16.8,,0.5,0.5,0.9,0.082,,-1.8,-1.2,-3.0,-0.1
2,Quincy Acy,PF,26,DAL,6,48,-1.4,0.355,0.412,0.176,4.6,15.2,9.7,0.0,0.0,0.0,9.8,20.0,,-0.2,0.0,-0.1,-0.133,,-10.1,-6.0,-16.2,-0.2
3,Quincy Acy,PF,26,BRK,32,510,13.1,0.587,0.542,0.373,3.8,18.2,11.1,5.4,1.3,2.2,9.6,16.5,,0.6,0.5,1.1,0.102,,-1.1,-0.7,-1.8,0.0
4,Steven Adams,C,23,OKC,80,2389,16.5,0.589,0.002,0.392,13.0,15.4,14.2,5.4,1.8,2.6,16.0,16.2,,3.3,3.1,6.5,0.13,,-0.7,1.2,0.6,1.5


In [42]:
# show one row per player (the first occurrence of each name is kept);
# the result is only displayed, advanced_2017 itself is not changed
# NOTE(review): the output below already contains 'is_duplicate', a column that
# is only added in a later cell -- the cells were executed out of order
advanced_2017.drop_duplicates(subset = 'Player')

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,Unnamed: 19,OWS,DWS,WS,WS/48,Unnamed: 24,OBPM,DBPM,BPM,VORP,is_duplicate
0,Alex Abrines,SG,23,OKC,68,1055,10.1,.560,.724,.144,1.9,7.1,4.5,5.5,1.7,0.6,8.3,15.9,,1.2,0.9,2.1,.096,,-0.3,-2.2,-2.5,-0.1,False
1,Quincy Acy,PF,26,TOT,38,558,11.8,.565,.529,.353,3.9,18.0,11.0,4.9,1.2,2.0,9.7,16.8,,0.5,0.5,0.9,.082,,-1.8,-1.2,-3.0,-0.1,False
4,Steven Adams,C,23,OKC,80,2389,16.5,.589,.002,.392,13.0,15.4,14.2,5.4,1.8,2.6,16.0,16.2,,3.3,3.1,6.5,.130,,-0.7,1.2,0.6,1.5,False
5,Arron Afflalo,SG,31,SAC,61,1580,8.9,.559,.360,.221,0.7,8.4,4.6,7.4,0.7,0.3,8.4,14.4,,1.2,0.2,1.4,.043,,-1.4,-2.1,-3.5,-0.6,False
6,Alexis Ajinca,C,28,NOP,39,584,12.9,.529,.022,.225,8.3,23.8,16.0,3.1,1.7,3.1,13.7,17.2,,0.0,0.9,1.0,.080,,-5.1,1.0,-4.1,-0.3,False
7,Cole Aldrich,C,28,MIN,62,531,12.7,.549,.000,.256,11.0,23.9,17.4,6.4,2.4,3.7,15.1,9.4,,0.6,0.7,1.3,.116,,-2.0,2.6,0.6,0.4,False
8,LaMarcus Aldridge,PF,31,SAS,72,2335,18.6,.532,.053,.258,8.5,16.6,12.7,9.9,1.0,3.0,7.7,24.5,,3.5,3.7,7.2,.149,,-0.3,1.3,1.0,1.8,False
9,Lavoy Allen,PF,27,IND,61,871,11.6,.485,.006,.196,13.7,14.5,14.1,9.1,1.0,2.4,13.7,10.9,,0.9,0.8,1.7,.093,,-1.5,1.3,-0.3,0.4,False
10,Tony Allen,SG,35,MEM,71,1914,13.3,.493,.091,.218,9.6,13.8,11.7,8.4,3.1,1.4,13.3,17.9,,0.2,2.9,3.1,.077,,-1.8,2.4,0.6,1.3,False
11,Al-Farouq Aminu,PF,26,POR,61,1773,11.3,.506,.455,.292,4.8,23.5,14.1,7.9,1.7,2.0,15.2,15.4,,-0.1,2.0,1.9,.051,,-2.3,1.2,-1.1,0.4,False


In [29]:
# flag rows that repeat an earlier row in every column (no subset is given,
# so a traded player's per-team rows are not flagged by this)
advanced_2017['is_duplicate'] = advanced_2017.duplicated()

In [30]:
# show only the rows flagged as exact duplicates
advanced_2017[advanced_2017['is_duplicate']]

Unnamed: 0,Player,Pos,Age,Tm,G,MP,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,Unnamed: 19,OWS,DWS,WS,WS/48,Unnamed: 24,OBPM,DBPM,BPM,VORP,is_duplicate
47,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True
76,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True
102,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True
127,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True
150,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True
177,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True
204,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True
227,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True
254,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True
280,,,,,,,,,,,,,,,,,,,,,,,,,,,,,True


In [46]:
def basketball_reference_scrap(year, statistic):
    '''Scrape one league-wide stats table for one NBA season.

    NOTE(review): a later cell in this notebook defines a function with the
    same name (it also adds a 'Year' column); whichever cell ran last wins.

    Parameters
    ----------
    year : int or str
        Season identifier substituted into the basketball-reference URL.
    statistic : str
        Table name substituted into the URL, e.g. 'per_game' or 'advanced'.

    Returns
    -------
    pandas.DataFrame
        One row per <tr> after the first one on the page, one column per
        header cell except the first (the ranking column). All values are
        the raw cell text.
    '''
    # URL page we will be scraping
    url = 'https://www.basketball-reference.com/leagues/NBA_{}_{}.html'.format(year, statistic)
    # download and parse; the `with` block closes the HTTP response afterwards
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever one
    # is installed -- consider passing one explicitly
    with urlopen(url) as html:
        soup = BeautifulSoup(html)
    # column titles: the text of every <th> cell in the first <tr>
    headers = [th.get_text() for th in soup.find_all('tr', limit=2)[0].find_all('th')]
    # exclude the first column as we will not need the ranking order
    headers = headers[1:]
    # every <tr> after the first becomes one record; a row without any <td>
    # yields an empty list
    rows = soup.find_all('tr')[1:]
    player_stats = [[td.get_text() for td in row.find_all('td')] for row in rows]
    return pd.DataFrame(player_stats, columns=headers)

In [49]:
# scrape the 2018 per-game table
per_game18 = basketball_reference_scrap(2018, 'per_game')

In [50]:
# scrape the 2018 advanced table
advanced18 = basketball_reference_scrap(2018, 'advanced')

In [53]:
# experiment: concat stacks the two frames row-wise, and join='inner' keeps
# only the columns they have in common (Player, Pos, Age, Tm, G, MP in the
# output below) -- so this does not place the two stat sets side by side
pd.concat([per_game18, advanced18], join='inner').head(10)

Unnamed: 0,Player,Pos,Age,Tm,G,MP
0,Alex Abrines,SG,24,OKC,75,15.1
1,Quincy Acy,PF,27,BRK,70,19.4
2,Steven Adams,C,24,OKC,76,32.7
3,Bam Adebayo,C,20,MIA,69,19.8
4,Arron Afflalo,SG,32,ORL,53,12.9
5,Cole Aldrich,C,29,MIN,21,2.3
6,LaMarcus Aldridge,C,32,SAS,75,33.5
7,Jarrett Allen,C,19,BRK,72,20.0
8,Kadeem Allen,PG,25,BOS,18,5.9
9,Tony Allen,SF,36,NOP,22,12.4


In [55]:
# experiment: join the two tables on the player name; the other shared
# columns come back twice with '_x' / '_y' suffixes (see output below)
# NOTE(review): 'Player' is presumably not unique (traded players have several
# rows in the 2017 data), which would multiply rows in this join -- verify
pd.merge(left=per_game18, right=advanced18, on='Player')

Unnamed: 0,Player,Pos_x,Age_x,Tm_x,G_x,GS,MP_x,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Pos_y,Age_y,Tm_y,G_y,MP_y,PER,TS%,3PAr,FTr,ORB%,DRB%,TRB%,AST%,STL%,BLK%,TOV%,USG%,Unnamed: 47,OWS,DWS,WS,WS/48,Unnamed: 52,OBPM,DBPM,BPM,VORP
0,Alex Abrines,SG,24,OKC,75,8,15.1,1.5,3.9,.395,1.1,2.9,.380,0.4,0.9,.443,.540,0.5,0.6,.848,0.3,1.2,1.5,0.4,0.5,0.1,0.3,1.7,4.7,SG,24,OKC,75,1134,9.0,.567,.759,.158,2.5,8.9,5.6,3.4,1.7,0.6,7.4,12.7,,1.3,1.0,2.2,.094,,-0.5,-1.7,-2.2,-0.1
1,Quincy Acy,PF,27,BRK,70,8,19.4,1.9,5.2,.356,1.5,4.2,.349,0.4,1.0,.384,.496,0.7,0.9,.817,0.6,3.1,3.7,0.8,0.5,0.4,0.9,2.1,5.9,PF,27,BRK,70,1359,8.2,.525,.800,.164,3.1,17.1,10.0,6.0,1.2,1.6,13.3,14.4,,-0.1,1.1,1.0,.036,,-2.0,-0.2,-2.2,-0.1
2,Steven Adams,C,24,OKC,76,76,32.7,5.9,9.4,.629,0.0,0.0,.000,5.9,9.3,.631,.629,2.1,3.8,.559,5.1,4.0,9.0,1.2,1.2,1.0,1.7,2.8,13.9,C,24,OKC,76,2487,20.6,.630,.003,.402,16.6,13.9,15.3,5.5,1.8,2.8,13.3,16.7,,6.7,3.0,9.7,.187,,2.2,1.1,3.3,3.3
3,Bam Adebayo,C,20,MIA,69,19,19.8,2.5,4.9,.512,0.0,0.1,.000,2.5,4.8,.523,.512,1.9,2.6,.721,1.7,3.8,5.5,1.5,0.5,0.6,1.0,2.0,6.9,C,20,MIA,69,1368,15.7,.570,.021,.526,9.7,21.6,15.6,11.0,1.2,2.5,13.6,15.9,,2.3,1.9,4.2,.148,,-1.6,1.8,0.2,0.8
4,Arron Afflalo,SG,32,ORL,53,3,12.9,1.2,3.1,.401,0.5,1.3,.386,0.7,1.7,.413,.485,0.4,0.5,.846,0.1,1.2,1.2,0.6,0.1,0.2,0.4,1.1,3.4,SG,32,ORL,53,682,5.8,.516,.432,.160,0.6,10.1,5.3,6.2,0.3,1.1,10.8,12.5,,-0.1,0.2,0.1,.009,,-4.1,-1.8,-5.8,-0.7
5,Cole Aldrich,C,29,MIN,21,0,2.3,0.2,0.7,.333,0.0,0.0,,0.2,0.7,.333,.333,0.1,0.3,.333,0.1,0.6,0.7,0.1,0.1,0.0,0.0,0.5,0.6,C,29,MIN,21,49,6.0,.340,.000,.400,7.0,28.6,17.6,8.2,2.0,1.8,5.4,16.8,,-0.1,0.1,0.0,-0.013,,-7.0,0.1,-6.9,-0.1
6,LaMarcus Aldridge,C,32,SAS,75,75,33.5,9.2,18.0,.510,0.4,1.2,.293,8.8,16.7,.526,.520,4.5,5.3,.837,3.3,5.2,8.5,2.0,0.6,1.2,1.5,2.1,23.1,C,32,SAS,75,2509,25.0,.570,.068,.296,10.8,17.3,14.0,11.3,0.9,3.0,6.8,29.1,,7.4,3.5,10.9,.209,,3.0,0.3,3.3,3.3
7,Jarrett Allen,C,19,BRK,72,31,20.0,3.3,5.5,.589,0.1,0.2,.333,3.2,5.3,.599,.596,1.6,2.0,.776,2.0,3.4,5.4,0.7,0.4,1.2,1.1,2.0,8.2,C,19,BRK,72,1441,17.5,.636,.038,.370,10.5,18.1,14.3,5.4,0.9,4.6,15.1,16.3,,2.7,1.5,4.2,.141,,-1.3,1.4,0.2,0.8
8,Kadeem Allen,PG,25,BOS,18,1,5.9,0.3,1.2,.273,0.0,0.6,.000,0.3,0.6,.545,.273,0.4,0.5,.778,0.2,0.4,0.6,0.7,0.2,0.1,0.5,0.8,1.1,PG,25,BOS,18,107,2.6,.366,.500,.409,4.1,7.1,5.6,15.2,1.4,1.6,25.7,14.6,,-0.2,0.1,-0.1,-0.038,,-6.7,0.3,-6.4,-0.1
9,Tony Allen,SF,36,NOP,22,0,12.4,2.0,4.1,.484,0.2,0.5,.333,1.8,3.6,.506,.505,0.5,1.0,.524,0.9,1.2,2.1,0.4,0.5,0.1,0.9,2.2,4.7,SF,36,NOP,22,273,8.7,.514,.132,.231,8.2,10.4,9.3,4.6,1.9,0.9,15.9,18.9,,-0.2,0.2,0.1,.017,,-4.0,-1.3,-5.2,-0.2


In [57]:
# columns that exist in the advanced table but not in the per-game table
# NOTE(review): the name `cols_to_use` is also used for a function defined in
# another cell of this notebook; whichever cell ran last decides what it holds
cols_to_use = advanced18.columns.difference(per_game18.columns)

In [75]:
def cols_to_use(df1, df2):
    '''Return the column labels of df2 that do not appear in df1.'''
    return df2.columns.difference(df1.columns)

In [61]:
# columns that exist only in the advanced table. They are computed here rather
# than read from the name `cols_to_use`, because that name is also bound to a
# function in this notebook and indexing a frame with the function fails.
advanced_only_cols = advanced18.columns.difference(per_game18.columns)

# put the advanced-only columns next to the per-game columns, matching rows
# by index (both tables come from the same scraper, so rows are assumed to be
# in the same order -- TODO confirm)
dfnew2018 = pd.merge(per_game18, advanced18[advanced_only_cols],
                     left_index=True, right_index=True, how='outer')
dfnew2018

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,3PAr,AST%,BLK%,BPM,DBPM,DRB%,DWS,FTr,OBPM,ORB%,OWS,PER,STL%,TOV%,TRB%,TS%,USG%,VORP,WS,WS/48,Unnamed: 50,Unnamed: 51
0,Alex Abrines,SG,24,OKC,75,8,15.1,1.5,3.9,.395,1.1,2.9,.380,0.4,0.9,.443,.540,0.5,0.6,.848,0.3,1.2,1.5,0.4,0.5,0.1,0.3,1.7,4.7,.759,3.4,0.6,-2.2,-1.7,8.9,1.0,.158,-0.5,2.5,1.3,9.0,1.7,7.4,5.6,.567,12.7,-0.1,2.2,.094,,
1,Quincy Acy,PF,27,BRK,70,8,19.4,1.9,5.2,.356,1.5,4.2,.349,0.4,1.0,.384,.496,0.7,0.9,.817,0.6,3.1,3.7,0.8,0.5,0.4,0.9,2.1,5.9,.800,6.0,1.6,-2.2,-0.2,17.1,1.1,.164,-2.0,3.1,-0.1,8.2,1.2,13.3,10.0,.525,14.4,-0.1,1.0,.036,,
2,Steven Adams,C,24,OKC,76,76,32.7,5.9,9.4,.629,0.0,0.0,.000,5.9,9.3,.631,.629,2.1,3.8,.559,5.1,4.0,9.0,1.2,1.2,1.0,1.7,2.8,13.9,.003,5.5,2.8,3.3,1.1,13.9,3.0,.402,2.2,16.6,6.7,20.6,1.8,13.3,15.3,.630,16.7,3.3,9.7,.187,,
3,Bam Adebayo,C,20,MIA,69,19,19.8,2.5,4.9,.512,0.0,0.1,.000,2.5,4.8,.523,.512,1.9,2.6,.721,1.7,3.8,5.5,1.5,0.5,0.6,1.0,2.0,6.9,.021,11.0,2.5,0.2,1.8,21.6,1.9,.526,-1.6,9.7,2.3,15.7,1.2,13.6,15.6,.570,15.9,0.8,4.2,.148,,
4,Arron Afflalo,SG,32,ORL,53,3,12.9,1.2,3.1,.401,0.5,1.3,.386,0.7,1.7,.413,.485,0.4,0.5,.846,0.1,1.2,1.2,0.6,0.1,0.2,0.4,1.1,3.4,.432,6.2,1.1,-5.8,-1.8,10.1,0.2,.160,-4.1,0.6,-0.1,5.8,0.3,10.8,5.3,.516,12.5,-0.7,0.1,.009,,
5,Cole Aldrich,C,29,MIN,21,0,2.3,0.2,0.7,.333,0.0,0.0,,0.2,0.7,.333,.333,0.1,0.3,.333,0.1,0.6,0.7,0.1,0.1,0.0,0.0,0.5,0.6,.000,8.2,1.8,-6.9,0.1,28.6,0.1,.400,-7.0,7.0,-0.1,6.0,2.0,5.4,17.6,.340,16.8,-0.1,0.0,-0.013,,
6,LaMarcus Aldridge,C,32,SAS,75,75,33.5,9.2,18.0,.510,0.4,1.2,.293,8.8,16.7,.526,.520,4.5,5.3,.837,3.3,5.2,8.5,2.0,0.6,1.2,1.5,2.1,23.1,.068,11.3,3.0,3.3,0.3,17.3,3.5,.296,3.0,10.8,7.4,25.0,0.9,6.8,14.0,.570,29.1,3.3,10.9,.209,,
7,Jarrett Allen,C,19,BRK,72,31,20.0,3.3,5.5,.589,0.1,0.2,.333,3.2,5.3,.599,.596,1.6,2.0,.776,2.0,3.4,5.4,0.7,0.4,1.2,1.1,2.0,8.2,.038,5.4,4.6,0.2,1.4,18.1,1.5,.370,-1.3,10.5,2.7,17.5,0.9,15.1,14.3,.636,16.3,0.8,4.2,.141,,
8,Kadeem Allen,PG,25,BOS,18,1,5.9,0.3,1.2,.273,0.0,0.6,.000,0.3,0.6,.545,.273,0.4,0.5,.778,0.2,0.4,0.6,0.7,0.2,0.1,0.5,0.8,1.1,.500,15.2,1.6,-6.4,0.3,7.1,0.1,.409,-6.7,4.1,-0.2,2.6,1.4,25.7,5.6,.366,14.6,-0.1,-0.1,-0.038,,
9,Tony Allen,SF,36,NOP,22,0,12.4,2.0,4.1,.484,0.2,0.5,.333,1.8,3.6,.506,.505,0.5,1.0,.524,0.9,1.2,2.1,0.4,0.5,0.1,0.9,2.2,4.7,.132,4.6,0.9,-5.2,-1.3,10.4,0.2,.231,-4.0,8.2,-0.2,8.7,1.9,15.9,9.3,.514,18.9,-0.2,0.1,.017,,


In [63]:
# (number of rows, number of columns) of the combined table
dfnew2018.shape

(690, 51)

In [65]:
# number of rows that repeat an earlier row in every column
dfnew2018.duplicated().sum()

25

In [69]:
# show one row per player (the first occurrence of each name is kept);
# the result is only displayed, dfnew2018 itself is not changed
dfnew2018.drop_duplicates(subset = 'Player')

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,3PAr,AST%,BLK%,BPM,DBPM,DRB%,DWS,FTr,OBPM,ORB%,OWS,PER,STL%,TOV%,TRB%,TS%,USG%,VORP,WS,WS/48,Unnamed: 50,Unnamed: 51
0,Alex Abrines,SG,24,OKC,75,8,15.1,1.5,3.9,.395,1.1,2.9,.380,0.4,0.9,.443,.540,0.5,0.6,.848,0.3,1.2,1.5,0.4,0.5,0.1,0.3,1.7,4.7,.759,3.4,0.6,-2.2,-1.7,8.9,1.0,.158,-0.5,2.5,1.3,9.0,1.7,7.4,5.6,.567,12.7,-0.1,2.2,.094,,
1,Quincy Acy,PF,27,BRK,70,8,19.4,1.9,5.2,.356,1.5,4.2,.349,0.4,1.0,.384,.496,0.7,0.9,.817,0.6,3.1,3.7,0.8,0.5,0.4,0.9,2.1,5.9,.800,6.0,1.6,-2.2,-0.2,17.1,1.1,.164,-2.0,3.1,-0.1,8.2,1.2,13.3,10.0,.525,14.4,-0.1,1.0,.036,,
2,Steven Adams,C,24,OKC,76,76,32.7,5.9,9.4,.629,0.0,0.0,.000,5.9,9.3,.631,.629,2.1,3.8,.559,5.1,4.0,9.0,1.2,1.2,1.0,1.7,2.8,13.9,.003,5.5,2.8,3.3,1.1,13.9,3.0,.402,2.2,16.6,6.7,20.6,1.8,13.3,15.3,.630,16.7,3.3,9.7,.187,,
3,Bam Adebayo,C,20,MIA,69,19,19.8,2.5,4.9,.512,0.0,0.1,.000,2.5,4.8,.523,.512,1.9,2.6,.721,1.7,3.8,5.5,1.5,0.5,0.6,1.0,2.0,6.9,.021,11.0,2.5,0.2,1.8,21.6,1.9,.526,-1.6,9.7,2.3,15.7,1.2,13.6,15.6,.570,15.9,0.8,4.2,.148,,
4,Arron Afflalo,SG,32,ORL,53,3,12.9,1.2,3.1,.401,0.5,1.3,.386,0.7,1.7,.413,.485,0.4,0.5,.846,0.1,1.2,1.2,0.6,0.1,0.2,0.4,1.1,3.4,.432,6.2,1.1,-5.8,-1.8,10.1,0.2,.160,-4.1,0.6,-0.1,5.8,0.3,10.8,5.3,.516,12.5,-0.7,0.1,.009,,
5,Cole Aldrich,C,29,MIN,21,0,2.3,0.2,0.7,.333,0.0,0.0,,0.2,0.7,.333,.333,0.1,0.3,.333,0.1,0.6,0.7,0.1,0.1,0.0,0.0,0.5,0.6,.000,8.2,1.8,-6.9,0.1,28.6,0.1,.400,-7.0,7.0,-0.1,6.0,2.0,5.4,17.6,.340,16.8,-0.1,0.0,-0.013,,
6,LaMarcus Aldridge,C,32,SAS,75,75,33.5,9.2,18.0,.510,0.4,1.2,.293,8.8,16.7,.526,.520,4.5,5.3,.837,3.3,5.2,8.5,2.0,0.6,1.2,1.5,2.1,23.1,.068,11.3,3.0,3.3,0.3,17.3,3.5,.296,3.0,10.8,7.4,25.0,0.9,6.8,14.0,.570,29.1,3.3,10.9,.209,,
7,Jarrett Allen,C,19,BRK,72,31,20.0,3.3,5.5,.589,0.1,0.2,.333,3.2,5.3,.599,.596,1.6,2.0,.776,2.0,3.4,5.4,0.7,0.4,1.2,1.1,2.0,8.2,.038,5.4,4.6,0.2,1.4,18.1,1.5,.370,-1.3,10.5,2.7,17.5,0.9,15.1,14.3,.636,16.3,0.8,4.2,.141,,
8,Kadeem Allen,PG,25,BOS,18,1,5.9,0.3,1.2,.273,0.0,0.6,.000,0.3,0.6,.545,.273,0.4,0.5,.778,0.2,0.4,0.6,0.7,0.2,0.1,0.5,0.8,1.1,.500,15.2,1.6,-6.4,0.3,7.1,0.1,.409,-6.7,4.1,-0.2,2.6,1.4,25.7,5.6,.366,14.6,-0.1,-0.1,-0.038,,
9,Tony Allen,SF,36,NOP,22,0,12.4,2.0,4.1,.484,0.2,0.5,.333,1.8,3.6,.506,.505,0.5,1.0,.524,0.9,1.2,2.1,0.4,0.5,0.1,0.9,2.2,4.7,.132,4.6,0.9,-5.2,-1.3,10.4,0.2,.231,-4.0,8.2,-0.2,8.7,1.9,15.9,9.3,.514,18.9,-0.2,0.1,.017,,


In [70]:
def basketball_reference_scrap(year, statistic):
    '''Scrape one league-wide stats table for one NBA season.

    Parameters
    ----------
    year : int or str
        Season identifier substituted into the basketball-reference URL; it
        is also stored in the 'Year' column of the result.
    statistic : str
        Table name substituted into the URL, e.g. 'per_game' or 'advanced'.

    Returns
    -------
    pandas.DataFrame
        One row per <tr> after the first one on the page, one column per
        header cell except the first (the ranking column), plus a trailing
        'Year' column holding ``year`` on every row. Scraped values are the
        raw cell text.
    '''
    # URL page we will be scraping
    url = 'https://www.basketball-reference.com/leagues/NBA_{}_{}.html'.format(year, statistic)
    # download and parse; the `with` block closes the HTTP response afterwards
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever one
    # is installed -- consider passing one explicitly
    with urlopen(url) as html:
        soup = BeautifulSoup(html)
    # column titles: the text of every <th> cell in the first <tr>
    headers = [th.get_text() for th in soup.find_all('tr', limit=2)[0].find_all('th')]
    # exclude the first column as we will not need the ranking order
    headers = headers[1:]
    # every <tr> after the first becomes one record; a row without any <td>
    # yields an empty list
    rows = soup.find_all('tr')[1:]
    player_stats = [[td.get_text() for td in row.find_all('td')] for row in rows]
    df = pd.DataFrame(player_stats, columns=headers)
    # tag every row with the season so tables from several years can be told apart
    df['Year'] = year
    return df

In [72]:
# check the scraper: the 2017 per-game table should now end with a 'Year' column
# NOTE(review): the result is displayed in full and not stored
basketball_reference_scrap(2017, 'per_game')

Unnamed: 0,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,eFG%,FT,FTA,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year
0,Alex Abrines,SG,23,OKC,68,6,15.5,2.0,5.0,.393,1.4,3.6,.381,0.6,1.4,.426,.531,0.6,0.7,.898,0.3,1.0,1.3,0.6,0.5,0.1,0.5,1.7,6.0,2017
1,Quincy Acy,PF,26,TOT,38,1,14.7,1.8,4.5,.412,1.0,2.4,.411,0.9,2.1,.413,.521,1.2,1.6,.750,0.5,2.5,3.0,0.5,0.4,0.4,0.6,1.8,5.8,2017
2,Quincy Acy,PF,26,DAL,6,0,8.0,0.8,2.8,.294,0.2,1.2,.143,0.7,1.7,.400,.324,0.3,0.5,.667,0.3,1.0,1.3,0.0,0.0,0.0,0.3,1.5,2.2,2017
3,Quincy Acy,PF,26,BRK,32,1,15.9,2.0,4.8,.425,1.1,2.6,.434,0.9,2.2,.414,.542,1.3,1.8,.754,0.6,2.8,3.3,0.6,0.4,0.5,0.6,1.8,6.5,2017
4,Steven Adams,C,23,OKC,80,80,29.9,4.7,8.2,.571,0.0,0.0,.000,4.7,8.2,.572,.571,2.0,3.2,.611,3.5,4.2,7.7,1.1,1.1,1.0,1.8,2.4,11.3,2017
5,Arron Afflalo,SG,31,SAC,61,45,25.9,3.0,6.9,.440,1.0,2.5,.411,2.0,4.4,.457,.514,1.4,1.5,.892,0.1,1.9,2.0,1.3,0.3,0.1,0.7,1.7,8.4,2017
6,Alexis Ajinca,C,28,NOP,39,15,15.0,2.3,4.6,.500,0.0,0.1,.000,2.3,4.5,.511,.500,0.7,1.0,.725,1.2,3.4,4.5,0.3,0.5,0.6,0.8,2.0,5.3,2017
7,Cole Aldrich,C,28,MIN,62,0,8.6,0.7,1.4,.523,0.0,0.0,,0.7,1.4,.523,.523,0.2,0.4,.682,0.8,1.7,2.5,0.4,0.4,0.4,0.3,1.4,1.7,2017
8,LaMarcus Aldridge,PF,31,SAS,72,72,32.4,6.9,14.6,.477,0.3,0.8,.411,6.6,13.8,.480,.488,3.1,3.8,.812,2.4,4.9,7.3,1.9,0.6,1.2,1.4,2.2,17.3,2017
9,Lavoy Allen,PF,27,IND,61,5,14.3,1.3,2.8,.458,0.0,0.0,.000,1.3,2.7,.461,.458,0.4,0.5,.697,1.7,1.9,3.6,0.9,0.3,0.4,0.5,1.3,2.9,2017


In [74]:
# scratch check of looping over a list of values: prints 1 to 5, one per line
year_test = list(range(1, 6))

for i in year_test:
    print(i)

1
2
3
4
5


In [None]:
# scatter plot of TS% and Win Shares
# scatter plot of TS% and Win Shares
# (eda_scatterplot and df_per_qualify are defined elsewhere in the notebook;
# the last argument is presumably the plot title -- verify against the helper)
eda_scatterplot(df_per_qualify, 'TS%', 'WS', 'True Shooting % and Win Shares')

The Player Efficiency Rating (PER) is a statistic developed by John Hollinger, who is now the Vice President of Basketball Operations for the Memphis Grizzlies [(1)](https://en.wikipedia.org/wiki/John_Hollinger). In the 1990s, he set out to find the ultimate basketball statistic: one that could take into account all aspects of a player's game, both good and bad, to derive that player's overall contribution. The result was the PER.

In essence, it is a statistic that "sums up all a player's positive accomplishments, subtracts the negative accomplishments, and returns a per-minute rating of a player's performance" [(2)](https://www.basketball-reference.com/about/per.html). It takes into account good things -- field goals, free throws, 3-pointers, assists, rebounds, blocks and steals -- and bad things -- missed shots, turnovers, fouls -- and returns a value adjusted to a per-minute basis, so that it can be compared across all players, whether they are starters or substitutes [(3)](https://www.washingtonpost.com/what-is-player-efficiency-rating/37939879-1c08-4cfa-aff3-51c2a2ae060e_note.html?utm_term=.916e0fa414eb).