# 1 Purpose

Use [Basketball Reference](https://www.basketball-reference.com) to pull team-level data: both W/L records and more advanced statistics (scraped in two separate loops).

### 1.0.1 Imports

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup, Comment
import pandas as pd
from datetime import date
import numpy as np
import requests
import re
from tqdm import tqdm


# 2 Webscrape Loops

### 2.0.1 Set Start/End Dates for Loops

In [2]:
# There is some debate as to when the modern era of basketball began. NBA modern era options
# (candidate starting years, kept here for reference):
merger = 1977
three_point = 1980
unrestricted_fa = 1989
bulls_breakup = 1999
def_rule_change = 2001

# current year and month, both read from a single date.today() call so the two values
# always describe the same day (two separate calls could straddle a month/year boundary)
_today = date.today()
c_year = _today.year
c_month = _today.month

In [3]:
# first URL year handled by the first scrape loop
begin = 2016

# Last URL year to scrape, derived from today's date so the data stays up to date:
# from November onwards the following calendar year is included, otherwise the
# current calendar year is the last one.
end = c_year + 1 if c_month > 10 else c_year

# every URL year from `begin` to `end`, both inclusive
years = list(range(begin, end + 1))

### 2.0.2 Initial Wrangling Prep

In [4]:
# Player-level table; only its team-abbreviation column ('Tm') is used in this cell
players = pd.read_csv('../data/interim/per_100.csv')

# team abbreviations, in order of first appearance in the file
short_name = players.Tm.unique()

# Full team names, written by hand to line up with `short_name` position by position.
# NOTE(review): the pairing is purely positional, so it silently depends on the row order of
# per_100.csv; if that file is regenerated in a different order the names will be mismatched
# without any error. Verify the resulting `name_df` after any change to the input file.
full_name = ['Oklahoma City Thunder', 'Phoenix Suns', 'Atlanta Hawks', 'Miami Heat',
            'Cleveland Cavaliers', 'Denver Nuggets', 'San Antonio Spurs', 'Chicago Bulls',
            'Utah Jazz', 'Brooklyn Nets', 'New York Knicks', 'Portland Trail Blazers', 
            'Memphis Grizzlies', 'Total', 'Indiana Pacers', 'Milwaukee Bucks', 'Dallas Mavericks',
            'Houston Rockets', 'Toronto Raptors', 'Washington Wizards', 'Orlando Magic', 
            'Charlotte Hornets', 'Sacramento Kings', 'Los Angeles Lakers', 'Minnesota Timberwolves',
            'Boston Celtics', 'Golden State Warriors', 'New Orleans Pelicans', 'Los Angeles Clippers',
            'Philadelphia 76ers', 'Detroit Pistons', 'Charlotte Bobcats', 'New Orleans Hornets',
            'New Jersey Nets', 'Seattle SuperSonics', 'New Orleans/Oklahoma City Hornets',
            'Charlotte Hornets', 'Vancouver Grizzlies', 'Washington Bullets', 'Kansas City Kings',
            'San Diego Clippers']

# Fail early, with a message that names the real problem, when the two lists cannot be paired
if len(short_name) != len(full_name):
    raise ValueError(
        "per_100.csv contains {} distinct 'Tm' values but {} full team names are listed; "
        "the positional abbreviation-to-name mapping cannot be built".format(
            len(short_name), len(full_name)))

# The scraped standings mark some teams with a trailing '*' (later counted into the
# 'Playoffs' column), so every name is registered both without and with the star.
playoff_star = [full + '*' for full in full_name]

full_name = full_name + playoff_star
short_name = list(short_name) + list(short_name)

# lookup table: full team name (plain or starred) -> abbreviation
name_df = pd.DataFrame({'Tm' : short_name,
                        'Team' : full_name})


## 2.1 First Loop (2016 - 2019)

In [5]:
# One merged DataFrame (standings + per-game team stats) is collected per URL year
dfs = []

for year in tqdm(years):

    # URL of the season page we will be scraping
    url = "https://www.basketball-reference.com/leagues/NBA_{}.html".format(year)

    # Download the page ONCE and parse it ONCE with an explicit parser; the same soup is
    # used for the standings tables and for the per-game table further down.
    r = requests.get(url)
    r.raise_for_status()  # stop on an HTTP error instead of parsing an error page
    soup = BeautifulSoup(r.content, 'html.parser')

    # Both lookups are loop-invariant for this page, so they are done once here
    tables = soup.findAll('table')
    standing_rows = soup.findAll('tr', limit = 33)

    # use getText() to extract the first 8 header labels of the two standings tables
    headers_e = [th.getText() for th in tables[0].findAll('th')][:8]
    headers_w = [th.getText() for th in tables[1].findAll('th')][:8]

    teams_east = []
    ranks_east = []

    for i in range(2, 17):
        team = [th.getText() for th in standing_rows[i].findAll('th')]
        # repr() writes escaped characters with a backslash, so cutting at the first
        # backslash keeps only the leading text of the cell.
        # NOTE(review): presumably the name is followed by a non-breaking space and the
        # seed in parentheses - confirm against the live page.
        team = repr(str(team[0]))
        rank = re.findall(r'\d+', team)  # every run of digits found in the escaped text
        team = team.split('\\', 1)[0].strip("'")
        teams_east.append(team)
        ranks_east.append(rank)

    # discard rows in which no digits were found
    ranks_east = list(filter(None, ranks_east))

    teams_west = []
    ranks_west = []

    for j in range(18, 33):
        # same extraction as above, applied to the rows used for the second table
        team = [th.getText() for th in standing_rows[j].findAll('th')]
        team = repr(str(team[0]))
        rank = re.findall(r'\d+', team)
        team = team.split('\\', 1)[0].strip("'")
        teams_west.append(team)
        ranks_west.append(rank)

    ranks_west = list(filter(None, ranks_west))

    # flat, row-major list of <td> texts for each of the two standings tables
    wl_e = [td.getText() for td in tables[0].findAll('td')]
    wl_w = [td.getText() for td in tables[1].findAll('td')]

    # number of data columns per row (all headers except the team-name one)
    elements = len(headers_e) - 1

    # slice the flat lists into one list per column
    data_e = [wl_e[i::elements] for i in range(elements)]
    data_w = [wl_w[i::elements] for i in range(elements)]

    east = [teams_east] + data_e
    west = [teams_west] + data_w

    # create dfs (built column-wise, hence the transpose)
    year_stats_east = pd.DataFrame(east).transpose()
    year_stats_east.columns = headers_e
    year_stats_east['Rank'] = ranks_east

    year_stats_west = pd.DataFrame(west).transpose()
    year_stats_west.columns = headers_w
    year_stats_west['Rank'] = ranks_west

    # add year columns (stored as the URL year minus one)
    year_stats_east['Year'] = year - 1
    year_stats_west['Year'] = year - 1

    east_df = year_stats_east.rename(columns = {'Eastern Conference':'Team'})
    west_df = year_stats_west.rename(columns = {'Western Conference':'Team'})

    df = pd.concat([east_df, west_df]).reset_index(drop = True)

    # keep the second-to-last digit group of every row as the rank
    # NOTE(review): presumably the last group comes from the trailing escape sequence - confirm
    df['Rank'] = [num[-2] for num in df.Rank]

    # ---- per-game team stats -------------------------------------------------------------
    # The table is read from an HTML comment that follows the placeholder element, so the
    # comment text is parsed as its own document.
    placeholder = soup.select_one('#all_team-stats-per_game .placeholder')
    comment = next(elem for elem in placeholder.next_siblings if isinstance(elem, Comment))
    table_soup = BeautifulSoup(comment, 'html.parser')

    headers = [th.getText() for th in table_soup.findAll('th')]
    headers = headers[1:25] # only these headers correlate with our data

    # one list of cell texts per row that has <td> cells
    data = []
    for row in table_soup.find_all('tr'):
        cells = row.find_all('td')
        if cells:
            data.append([cell.text for cell in cells])

    data = data[:-1] #remove last row 'league average'

    stats_df = pd.DataFrame(data, columns = headers)
    stats_df = stats_df.drop(columns = ['G','MP','PTS'])

    # ---- combine ---------------------------------------------------------------------------
    # Both merges are inner joins on 'Team': rows whose name has no match are dropped
    final = df.merge(name_df, on = 'Team')
    final = final.merge(stats_df, on = 'Team')

    dfs.append(final)

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:10<00:00,  3.07s/it]


In [6]:
# stack the per-year frames collected by the first loop into a single table
wl_stats = pd.concat(dfs)

## 2.2 Second Loop (1979 - 2015)

After this first for loop (which you'll notice only covered years 2016 to 2019 at time of creation), webscraping got a bit more involved. Going back to the 1979 season (for which the url includes year number 1980) means going back through quite a bit of league shuffling and restructuring. As a result, table sizes varied, and the webscraping code that worked for one year might throw an index length error when used on the year right before it. We were at least able to bite this off in chunks, but before adding some efficiency with the dictionary below, we used seven additional, only slightly different 'for loops' of the same size as the one above to scrape all this team data.

With a little creativity and the help of an 'if' statement embedded inside an already embedded 'for loop', we were able to automate our larger loop and webscrape each chunk of year ranges and save ourselves (and you the reader) from a LOT of code.

In [7]:
# 'd' is the lookup used to find which table lines to read for a given year: each key is a
# range of years and each value lists, in order,
# [east line range, east line limit, west line range, west line limit]

total_range = range(1980, 2016)

d = dict([
    (range(2005, 2016), [range(2, 20), 20, range(21, 39), 39]),
    (range(1996, 2005), [range(2, 19), 19, range(21, 36), 36]),
    (range(1992, 1996), [range(2, 18), 18, range(20, 34), 34]),
    (range(1990, 1992), [range(2, 17), 17, range(18, 34), 34]),
    (range(1989, 1990), [range(2, 16), 16, range(18, 32), 32]),
    (range(1981, 1989), [range(2, 15), 15, range(16, 30), 30]),
    (range(1980, 1981), [range(2, 15), 15, range(16, 29), 29]),
])

# the year ranges, in insertion order
range_keys = [*d]

# example of accessing 'd': the four entries for 1981-1988, then the sixth key
a = d[range(1981, 1989)]
print(*a)
print(range_keys[5])

range(2, 15) 15 range(16, 30) 30
range(1981, 1989)


In [8]:
# One merged DataFrame (standings + per-game team stats) is collected per URL year
dfs2 = []

# header rows that are picked up together with the team rows and must be removed again
div_list = ['Atlantic Division', 'Central Division', 'Southeast Division', 'Northwest Division',
            'Pacific Division', 'Southwest Division', 'Midwest Division']

for year in tqdm(total_range):

    url = "https://www.basketball-reference.com/leagues/NBA_{}.html".format(year)

    # Download the page ONCE and parse it ONCE with an explicit parser; the same soup is
    # used for the standings rows and for the per-game table further down.
    page = requests.get(url)
    page.raise_for_status()  # stop on an HTTP error instead of parsing an error page
    soup = BeautifulSoup(page.content, 'html.parser')

    # standings column labels, without the first (team-name) header
    wl_headers = [th.getText() for th in soup.findAll('table')[1].findAll('th')][1:8]

    # THIS IS THAT BEAUTIFUL FOR LOOP! ...yes it's probably simpler than you expected, but it was a life saver.
    # Pick the line layout whose year range contains this year. The else branch fires only
    # when no range matched, so a layout left over from another year is never reused silently.
    for key in range_keys:
        if year in key:
            east_lines, east_limit, west_lines, west_limit = d[key]
            break
    else:
        raise KeyError("no table layout defined in 'd' for year {}".format(year))

    teams_east = []
    ranks_east = []

    # fetch the needed rows once instead of once per team
    east_rows = soup.findAll('tr', limit = east_limit)
    for i in east_lines:
        team = [th.getText() for th in east_rows[i].findAll('th')]
        # repr() writes escaped characters with a backslash, so cutting at the first
        # backslash keeps only the leading text of the cell.
        # NOTE(review): presumably the name is followed by a non-breaking space and the
        # seed in parentheses - confirm against the live page.
        team = repr(str(team[0]))
        rank = re.findall(r'\d+', team)  # every run of digits found in the escaped text
        team = team.split('\\', 1)[0].strip("'")
        teams_east.append(team)
        ranks_east.append(rank)

    # rows without any digits (e.g. the division header rows) contribute no rank
    ranks_east = list(filter(None, ranks_east))

    for div in div_list:
        if div in teams_east:
            teams_east.remove(div)

    teams_west = []
    ranks_west = []

    west_rows = soup.findAll('tr', limit = west_limit)
    for j in west_lines:
        team = [th.getText() for th in west_rows[j].findAll('th')]
        team = repr(str(team[0]))
        rank = re.findall(r'\d+', team)
        team = team.split('\\', 1)[0].strip("'")
        teams_west.append(team)
        ranks_west.append(rank)

    ranks_west = list(filter(None, ranks_west))

    for div in div_list:
        if div in teams_west:
            teams_west.remove(div)

    # one list of <td> texts per row carrying the 'full_table' class
    full_rows = soup.findAll('tr',{'class' : 'full_table'})
    win_loss = [[td.getText() for td in row.findAll('td')] for row in full_rows]

    # the first len(teams_east) rows are treated as east, the remainder as west
    wl_e = win_loss[:len(teams_east)]
    wl_w = win_loss[len(teams_east):]

    # create dfs
    year_stats_east = pd.DataFrame(wl_e)
    year_stats_east.columns = wl_headers
    year_stats_east['Team'] = teams_east
    year_stats_east['Rank'] = ranks_east

    year_stats_west = pd.DataFrame(wl_w)
    year_stats_west.columns = wl_headers
    year_stats_west['Team'] = teams_west
    year_stats_west['Rank'] = ranks_west

    # add year columns (stored as the URL year minus one)
    year_stats_east['Year'] = year - 1
    year_stats_west['Year'] = year - 1

    df = pd.concat([year_stats_east, year_stats_west]).reset_index(drop = True)

    # keep the second-to-last digit group of every row as the rank
    # NOTE(review): presumably the last group comes from the trailing escape sequence - confirm
    df['Rank'] = [num[-2] for num in df.Rank]

    # ---- per-game team stats -------------------------------------------------------------
    # The table is read from an HTML comment that follows the placeholder element, so the
    # comment text is parsed as its own document.
    placeholder = soup.select_one('#all_team-stats-per_game .placeholder')
    comment = next(elem for elem in placeholder.next_siblings if isinstance(elem, Comment))
    table_soup = BeautifulSoup(comment, 'html.parser')

    stats_headers = [th.getText() for th in table_soup.findAll('th')]
    stats_headers = stats_headers[1:25] # only these headers correlate with our data

    # one list of cell texts per row that has <td> cells
    data = []
    for row in table_soup.find_all('tr'):
        cells = row.find_all('td')
        if cells:
            data.append([cell.text for cell in cells])

    data = data[:-1] #remove last row 'league average'

    stats_df = pd.DataFrame(data, columns = stats_headers)
    stats_df = stats_df.drop(columns = ['G','MP','PTS'])

    # ---- combine ---------------------------------------------------------------------------
    # Both merges are inner joins on 'Team': rows whose name has no match are dropped
    final = df.merge(name_df, on = 'Team')
    final = final.merge(stats_df, on = 'Team')

    dfs2.append(final)


100%|██████████████████████████████████████████████████████████████████████████████████| 36/36 [01:30<00:00,  2.47s/it]


In [9]:
# stack the per-year frames collected by the second loop into a single table
wl_stats2 = pd.concat(dfs2)

# 3 Wrangling and Cleaning

In [10]:
# results of the first and second scrape loops, in that order
wl_list = [wl_stats, wl_stats2]

In [11]:
# Combine both scrapes into one table with a fresh 0..n-1 index
teams = pd.concat(wl_list, sort = True, ignore_index = True)

# Count the '*' characters in each scraped team name into 'Playoffs', then remove the
# stars from the name. The pattern is a raw string so that the regex escape `\*` is not
# interpreted as an (invalid) Python string escape.
teams['Playoffs'] = teams['Team'].str.count(r"\*")
teams['Team'] = teams['Team'].str.strip('*')

print(teams.columns)
teams.head()


Index(['2P', '2P%', '2PA', '3P', '3P%', '3PA', 'AST', 'BLK', 'DRB', 'FG',
       'FG%', 'FGA', 'FT', 'FT%', 'FTA', 'GB', 'L', 'ORB', 'PA/G', 'PF',
       'PS/G', 'Rank', 'SRS', 'STL', 'TOV', 'TRB', 'Team', 'Tm', 'W', 'W/L%',
       'Year', 'Playoffs'],
      dtype='object')


Unnamed: 0,2P,2P%,2PA,3P,3P%,3PA,AST,BLK,DRB,FG,...,SRS,STL,TOV,TRB,Team,Tm,W,W/L%,Year,Playoffs
0,27.9,0.514,54.4,10.7,0.362,29.6,22.7,3.9,33.9,38.7,...,5.45,6.7,13.6,44.5,Cleveland Cavaliers,CLE,57,0.695,2015,1
1,28.0,0.483,58.0,8.6,0.37,23.4,18.7,5.5,33.2,36.7,...,4.08,7.8,13.1,43.4,Toronto Raptors,TOR,56,0.683,2015,1
2,32.3,0.508,63.6,6.1,0.336,18.0,20.8,6.5,34.3,38.4,...,1.5,6.7,14.1,44.1,Miami Heat,MIA,48,0.585,2015,1
3,28.7,0.512,56.1,9.9,0.35,28.4,25.6,5.9,33.8,38.6,...,3.49,9.1,15.0,42.1,Atlanta Hawks,ATL,48,0.585,2015,1
4,30.5,0.483,63.1,8.7,0.335,26.1,24.2,4.2,33.3,39.2,...,2.84,9.2,13.7,44.9,Boston Celtics,BOS,48,0.585,2015,1


In [13]:
# The scraped values are text, so they must be converted to numbers BEFORE any arithmetic:
# adding two text columns concatenates them ('57' + '25' -> '5725') instead of summing.
# Columns are addressed by name rather than by position so the casts cannot drift if the
# column order changes.
int_cols = ['Year', 'W', 'L', 'Rank', 'Playoffs']
float_cols = ['W/L%', 'PS/G', 'PA/G',
              'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT', 'FTA', 'FT%',
              'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF']

teams = teams.astype({**dict.fromkeys(int_cols, 'int'),
                      **dict.fromkeys(float_cols, 'float')})

# derived columns, computed from the numeric values
teams['Games'] = teams['W'] + teams['L']            # games played
teams['PD/G'] = teams['PS/G'] - teams['PA/G']       # per-game difference of PS/G and PA/G
teams['PS'] = teams['PS/G'] * teams['Games']        # PS/G scaled to the whole season
teams['PA'] = teams['PA/G'] * teams['Games']        # PA/G scaled to the whole season
teams['PD'] = teams['PS'] - teams['PA']             # season difference of PS and PA

# final column selection and order (columns not listed here are dropped)
teams = teams[['Team','Tm','Year','W','L','W/L%','Rank','Playoffs','PS/G','PA/G','PD/G','PS','PA','PD',
               'Games','FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%','FT', 'FTA', 'FT%', 
               'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV','PF']]

In [15]:
# interim output path for the cleaned team table
file_loc = '../data/interim/teams.csv'

# NOTE(review): no `index` argument is passed, so the row index is presumably written as an
# unnamed first column - the next cell drops an 'Unnamed: 0' column after reading the file back.
teams.to_csv(file_loc)

In [16]:
# column name that is dropped from both files after loading
blank = 'Unnamed: 0'

# team data: reload the interim file and discard the `blank` column
teams = pd.read_csv("../data/interim/teams.csv").drop(columns = blank)

# spending data: discard the `blank` and 'Team' columns, and rename 'year' to 'Year'
spending = (
    pd.read_csv('../data/interim/spend_history.csv')
    .drop(columns = [blank, 'Team'])
    .rename(columns = {'year':'Year'})
)


In [19]:
# Left join on abbreviation and year: every row of `teams` is kept, with spending
# columns attached where a matching (Tm, Year) row exists
fulldf = teams.merge(spending, how = 'left', on = ['Tm','Year'])

# remove the rows whose Year is 1998 or 2011
# NOTE(review): presumably these are excluded as atypical seasons - confirm the reason
fulldf = fulldf[~fulldf['Year'].isin([1998, 2011])]

fulldf.to_csv('../data/processed/teams_final.csv')