[Basketball Reference](https://www.basketball-reference.com "Basketball Reference")

In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
from datetime import date
from tqdm import tqdm
import numpy as np

In [3]:
# Candidate starting points for the "modern era" of the NBA; which one is
# correct is debated, so several options are kept here.
merger = 1977
three_point = 1980
unrestricted_fa = 1989
bulls_breakup = 1999
def_rule_change = 2001

# Calendar year and month at the time the notebook runs
today = date.today()
c_year, c_month = today.year, today.month

# First season to scrape. The default is the season that introduced the
# 3-point line (1979-1980 season).
begin = three_point

# Last season to scrape. After October the following calendar year is used;
# otherwise the current calendar year is used (2019 at time of creation).
end = c_year + 1 if c_month > 10 else c_year

# Every season from `begin` through `end`, inclusive
years = list(range(begin, end + 1))

In [4]:
# One DataFrame of per-100-possession stats per season, concatenated later
dfs = []

for year in tqdm(years):
    # URL of the page we will be scraping for this season
    url = "https://www.basketball-reference.com/leagues/NBA_{}_per_poss.html".format(year)

    # Fetch and parse the page. The `with` block closes the HTTP response once
    # parsing is done, and the parser is named explicitly so the result does
    # not depend on which optional parsers happen to be installed.
    with urlopen(url) as html:
        soup = BeautifulSoup(html, "html.parser")

    rows = soup.findAll('tr')

    # Column names are the <th> cells of the first table row; the first one
    # (Basketball Reference's ranking order) is not needed for the analysis
    headers = [th.getText() for th in rows[0].findAll('th')][1:]

    # Every row after the first contributes the text of its <td> cells;
    # a row without <td> cells contributes an empty list
    player_stats = [[td.getText() for td in row.findAll('td')]
                    for row in rows[1:]]

    # Build the season table, drop the rows that ended up entirely empty, and
    # tag each row with the calendar year before the one used in the URL
    year_stats = (
        pd.DataFrame(player_stats, columns=headers)
        .dropna(how='all')
        .assign(Year=year - 1)
    )

    # collect the season frames so they can be concatenated afterwards
    dfs.append(year_stats)



100%|██████████████████████████████████████████████████████████████████████████████████| 40/40 [02:49<00:00,  3.65s/it]


In [5]:
# Stack the per-season frames into one stats table. Every season frame carries
# its own row labels, so renumber the rows to keep the combined index unique;
# otherwise the column assignments below would rely on duplicate labels.
stats = pd.concat(dfs, ignore_index=True)

# Split "Player" at the first space: the first piece is the first name and the
# remainder is the last name (missing when the name contains no space)
name_parts = stats.Player.str.split(" ", n=1, expand=True)
stats['First Name'] = name_parts[0]
stats['Last Name'] = name_parts[1]
stats = stats.rename(columns={'Player': 'Full Name'})

# Reorder the columns so that the name columns come first, before the stats
stats = stats[['Last Name', 'First Name', 'Full Name', 'Year', 'Pos', 'Age', 'Tm',
               'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA',
               '2P%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK',
               'TOV', 'PF', 'PTS', '', 'ORtg', 'DRtg']]

In [6]:
# Most recent year first; within a year, alphabetical by last name
stats = stats.sort_values(
    by=['Year', 'Last Name'],
    ascending=[False, True],
)

In [7]:
# Now we repeat the process to retrieve the advanced stats

# One DataFrame of advanced stats per season, concatenated later
adv_dfs = []

for year in tqdm(years):
    # URL of the page we will be scraping for this season
    url = "https://www.basketball-reference.com/leagues/NBA_{}_advanced.html".format(year)

    # Fetch and parse the page. The `with` block closes the HTTP response once
    # parsing is done, and the parser is named explicitly so the result does
    # not depend on which optional parsers happen to be installed.
    with urlopen(url) as html:
        soup = BeautifulSoup(html, "html.parser")

    rows = soup.findAll('tr')

    # Column names are the <th> cells of the first table row; the first one
    # (Basketball Reference's ranking order) is not needed for the analysis
    headers = [th.getText() for th in rows[0].findAll('th')][1:]

    # Every row after the first contributes the text of its <td> cells;
    # a row without <td> cells contributes an empty list
    player_stats = [[td.getText() for td in row.findAll('td')]
                    for row in rows[1:]]

    # Build the season table, drop the rows that ended up entirely empty, and
    # tag each row with the calendar year before the one used in the URL
    year_stats = (
        pd.DataFrame(player_stats, columns=headers)
        .dropna(how='all')
        .assign(Year=year - 1)
    )

    # collect the season frames so they can be concatenated afterwards
    adv_dfs.append(year_stats)

100%|██████████████████████████████████████████████████████████████████████████████████| 40/40 [02:21<00:00,  3.35s/it]


In [8]:
# Stack the per-season frames into one advanced stats table. Every season
# frame carries its own row labels, so renumber the rows to keep the combined
# index unique; otherwise the column assignments below would rely on
# duplicate labels.
adv_stats = pd.concat(adv_dfs, ignore_index=True)

# Split "Player" at the first space: the first piece is the first name and the
# remainder is the last name (missing when the name contains no space)
name_parts = adv_stats.Player.str.split(" ", n=1, expand=True)
adv_stats['First Name'] = name_parts[0]
adv_stats['Last Name'] = name_parts[1]
adv_stats = adv_stats.rename(columns={'Player': 'Full Name'})

In [9]:
# Column order for the advanced table: name columns first, then the season
# and player details, then the advanced metrics. Columns not listed here
# are left out of the result.
adv_column_order = [
    'Last Name', 'First Name', 'Full Name', 'Year', 'Pos', 'Age', 'Tm', 'G', 'MP',
    'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%',
    'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM',
    'DBPM', 'BPM', 'VORP',
]
adv_stats = adv_stats[adv_column_order]

In [10]:
# Most recent year first; within a year, alphabetical by last name
adv_stats = adv_stats.sort_values(
    by=['Year', 'Last Name'],
    ascending=[False, True],
)

In [11]:
# Columns present in both tables that together identify a player-season row
join_keys = ['Last Name', 'First Name', 'Full Name', 'Year', 'Pos', 'Tm', 'Age', 'G', 'MP']

# Combine the per-100-possession stats with the advanced stats; only rows
# whose key values appear in both tables are kept (default inner join)
per_100 = stats.merge(adv_stats, on=join_keys)

per_100.shape

(20299, 54)

In [12]:
# Cells with 'no information' arrive as empty or whitespace-only strings, which
# cannot be coerced to floats. Turn them into NaN first so that the numeric
# columns can be converted (and later sorted) as numbers.
per_100 = per_100.replace(to_replace=r'^\s*$', value=np.nan, regex=True)

In [13]:
# Display the column labels of the merged frame, in order
per_100.columns

Index(['Last Name', 'First Name', 'Full Name', 'Year', 'Pos', 'Age', 'Tm', 'G',
       'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%',
       'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV',
       'PF', 'PTS', '', 'ORtg', 'DRtg', 'PER', 'TS%', '3PAr', 'FTr', 'ORB%',
       'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 'OWS', 'DWS',
       'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP'],
      dtype='object')

In [14]:
# Columns that hold text and must stay as strings; every other column is numeric
text_cols = ['Last Name', 'First Name', 'Full Name', 'Pos', 'Tm']

# Positions of the numeric columns, derived from the column names rather than
# hard-coded so they stay correct if the column order changes
float_cols = [pos for pos, name in enumerate(per_100.columns) if name not in text_cols]

# Convert by rebinding the frame with astype(). Assigning through
# per_100.iloc[:, col] sets values in place on recent pandas versions, which
# leaves object-dtype columns as object and silently skips the conversion.
per_100 = per_100.astype({per_100.columns[pos]: 'float' for pos in float_cols})

In [15]:
# Preview the first rows of the merged frame after the dtype conversion
per_100.head()

Unnamed: 0,Last Name,First Name,Full Name,Year,Pos,Age,Tm,G,GS,MP,...,TOV%,USG%,OWS,DWS,WS,WS/48,OBPM,DBPM,BPM,VORP
0,Abrines,Álex,Álex Abrines,2018.0,SG,25.0,OKC,31.0,2.0,588.0,...,7.9,12.2,0.1,0.6,0.6,0.053,-2.4,-0.9,-3.4,-0.2
1,Acy,Quincy,Quincy Acy,2018.0,PF,28.0,PHO,10.0,0.0,123.0,...,15.2,9.2,-0.1,0.0,-0.1,-0.022,-5.7,-0.3,-5.9,-0.1
2,Adams,Jaylen,Jaylen Adams,2018.0,PG,22.0,ATL,34.0,1.0,428.0,...,19.7,13.5,-0.1,0.2,0.1,0.011,-3.1,-1.3,-4.4,-0.3
3,Adams,Steven,Steven Adams,2018.0,C,25.0,OKC,80.0,80.0,2669.0,...,12.6,16.4,5.1,4.0,9.1,0.163,0.6,2.1,2.7,3.2
4,Adebayo,Bam,Bam Adebayo,2018.0,C,21.0,MIA,82.0,28.0,1913.0,...,17.1,15.8,3.4,3.4,6.8,0.171,-0.6,3.6,3.0,2.4


In [16]:
# Destination of the interim dataset, relative to the notebook's directory
file_loc = "../data/interim/per_100.csv"

# Write the merged frame (row index included) as CSV
per_100.to_csv(path_or_buf=file_loc)