In [55]:
from urllib.request import urlopen
from IPython.display import display, HTML
from bs4 import BeautifulSoup, Comment
import pdb
import requests
import re
import pandas as pd
import os
import pickle


# Site root; relative hrefs scraped from pages are appended to this string.
NBA_URL = 'https://www.basketball-reference.com'
# Home page, fetched and parsed once when this cell runs (network request).
# BeautifulSoup reads the whole response in its constructor, so the
# connection can be closed as soon as the soup has been built.
with urlopen(NBA_URL) as _home_response:
    NBA_SOUP = BeautifulSoup(_home_response, 'html.parser')


def find_in_page_text(page, class_=None, text_=None):
    """Return whatever ``page.find`` yields for a name filter plus a text filter.

    ``class_`` is forwarded as the first positional argument of ``page.find``
    and ``text_`` as its ``text`` keyword argument.
    """
    text_filter = {'text': text_}
    return page.find(class_, **text_filter)

def find_in_page_id(page, class_=None, id_=None):
    """Return whatever ``page.find`` yields for a name filter plus an ``id`` filter.

    ``class_`` is forwarded as the first positional argument of ``page.find``
    and ``id_`` as its ``id`` keyword argument.
    """
    match = page.find(class_, id=id_)
    return match

def go_to_page(url, page, class_=None, text_=None):
    """Follow the first matching link on ``page`` and parse the page it points to.

    Args:
        url: Prefix that the link's ``href`` is appended to.
        page: Parsed page to search (anything exposing ``find``).
        class_: Forwarded as the first positional argument of ``page.find``.
        text_: Forwarded as the ``text`` keyword of ``page.find``.

    Returns:
        Tuple ``(new_url, soup)``: the concatenated URL and the parsed document
        fetched from it.

    Raises:
        ValueError: if no element matches the filters, or the matched element
            has no ``href`` attribute.
    """
    link = page.find(class_, text=text_)
    href = link.get('href') if link is not None else None
    if href is None:
        # Fail before any request is made instead of fetching "<url>None".
        raise ValueError(
            "no link with an href found for %r / %r" % (class_, text_)
        )
    new_url = url + str(href)
    # The soup is fully built inside the block, so the response can be closed.
    with urlopen(new_url) as response:
        soup = BeautifulSoup(response, 'html.parser')
    return new_url, soup

def get_nba_soup():
    """Return the module-level ``NBA_SOUP`` object (no new request is made)."""
    cached_soup = NBA_SOUP
    return cached_soup

def get_league_soup(nba_soup=None):
    """Fetch and parse the page linked from the ``header_leagues`` nav entry.

    Args:
        nba_soup: Parsed page containing the ``div#nav`` navigation bar.
            Defaults to the soup returned by ``get_nba_soup()``.

    Returns:
        The parsed document found at ``NBA_URL`` + the nav entry's first
        link ``href``.
    """
    # Identity test: the argument is only ever "absent" when it is None itself.
    if nba_soup is None:
        nba_soup = get_nba_soup()
    nav_bar = nba_soup.find('div', attrs={'id': 'nav'})
    league = nav_bar.find('li', attrs={'id': 'header_leagues'})
    league_url = NBA_URL + str(league.find('a').get('href'))

    # Close the HTTP response once the document has been parsed.
    with urlopen(league_url) as response:
        league_soup = BeautifulSoup(response, 'html.parser')
    return league_soup


#Input: parsed league index page (basketball-reference.com/leagues)
#Output: dictionary mapping each NBA season year to its page URL,
#        basketball-reference.com/leagues/NBA_YEAR.html
#These season pages are the ones the team statistics are later scraped from.
def get_seasons_dict(league_soup=None):
    """Map each NBA season year to the URL of its season page.

    Args:
        league_soup: Parsed league index page. When omitted it is obtained
            from ``get_league_soup()``.

    Returns:
        dict of ``int`` year -> ``NBA_URL`` + the season link's ``href``.
        Only list items whose markup contains "NBA" are kept.
    """
    if league_soup is None:
        # Reuse the shared helper rather than repeating the nav-bar lookup here.
        league_soup = get_league_soup()

    seasons_label = find_in_page_text(league_soup, 'span', "Seasons")
    season_items = seasons_label.parent.find_all('li')

    url_by_season = dict()
    for item in season_items:
        # We only consider NBA. Not ABA/BBA
        if "NBA" not in str(item):
            continue
        anchor = item.find("a")
        href = anchor.get("href") if anchor is not None else None
        if not href:
            # Entry without a link: nothing to record.
            continue
        # The year is taken to be every digit appearing in the href.
        digits = ''.join(ch for ch in str(href) if ch.isdigit())
        if not digits:
            continue
        url_by_season[int(digits)] = NBA_URL + str(href)
    return url_by_season

def _split_stats_table(commented_page, category):
    """Return ``(headers, rows)`` for a stats table parsed out of an HTML comment."""
    all_rows = commented_page.find_all('tr')
    if category == "all_misc_stats":
        # Column names are read from the second <tr> here (presumably the first
        # one is a grouping header row -- confirm against the live page).
        headers = [th.getText() for th in all_rows[1].find_all('th')][1:]
        # "eFG%" occurs twice; the second occurrence and the three columns after
        # it get an "O" prefix so the column names stay unique.
        efg_positions = [i for i, name in enumerate(headers) if name == "eFG%"]
        if len(efg_positions) > 1:
            start = efg_positions[1]
            for i in range(start, min(start + 4, len(headers))):
                headers[i] = "O" + headers[i]
        # Skip both header rows and drop the final row of the table.
        rows = all_rows[2:-1]
    else:
        headers = [th.getText() for th in all_rows[0].find_all('th')][1:]
        rows = all_rows[1:]
    return headers, rows


def get_team_stats_in_year(year, year_soup):
    """Pickle the team-level stat tables found on one season page.

    For each known category the ``div`` with that id is looked up in
    ``year_soup``; every HTML comment inside it is parsed as markup, turned
    into a DataFrame of its ``<td>`` cell text, and written to
    ``Data/<year>/<category>.pkl``.

    Args:
        year: Season year, used for the output directory name.
        year_soup: Parsed season page. Comments are extracted from it, so the
            soup is modified by this call.

    Returns:
        None. Categories whose ``div`` is absent from the page are skipped.
    """
    categories = [
        "all_team-stats-base",
        "all_opponent-stats-base",
        "all_team-stats-per_poss",
        "all_opponent-stats-per_poss",
        "all_misc_stats",
    ]
    directory = "Data/" + str(year)

    for category in categories:
        container = year_soup.find("div", id=category)
        if container is None:
            # This table is not on the page; nothing to save for the category.
            continue
        comments = container.find_all(text=lambda text: isinstance(text, Comment))
        for comment in comments:
            # Name the parser explicitly so the result does not depend on which
            # optional parsers happen to be installed.
            commented_page = BeautifulSoup(comment.extract(), 'html.parser')
            if commented_page.find('tr') is None:
                # Comment does not hold a table.
                continue
            headers, rows = _split_stats_table(commented_page, category)

            table = [[td.getText() for td in row.find_all('td')] for row in rows]
            stats = pd.DataFrame(table, columns=headers)
            # makedirs also creates the parent "Data" directory on first use.
            os.makedirs(directory, exist_ok=True)
            stats.to_pickle(directory + "/" + category + ".pkl")
def get_team_stats_in_all_years(min_year=2019):
    """Scrape and pickle team stats for every season from ``min_year`` onward.

    Args:
        min_year: Seasons with a year below this value are skipped. Defaults
            to 2019. Per-possession statistics are only available for
            year > 1974, so lower values may hit seasons lacking those tables.

    Returns:
        None. Output files are written by ``get_team_stats_in_year``.
    """
    league_soup = get_league_soup()
    seasons = get_seasons_dict(league_soup)
    for year, season_url in seasons.items():
        if year < min_year:
            continue
        # Close each season's HTTP response as soon as it has been parsed.
        with urlopen(season_url) as response:
            year_soup = BeautifulSoup(response, 'html.parser')
        get_team_stats_in_year(year, year_soup)
# Run the scrape: fetches the season pages over the network and pickles
# their tables under Data/<year>/.
get_team_stats_in_all_years()


In [58]:
# Column names selected from the per-possession tables (presumably the
# "all_team-stats-per_poss" pickles -- confirm against their headers).
features_in_per_poss = [
    "FG", "FGA", "FG%",
    "3P", "3PA", "3P%",
    "2P", "2PA", "2P%",
    "FT", "FTA", "FT%",
    "ORB", "DRB",
    "AST", "STL", "BLK", "TOV", "PF",
    "PTS",
]

# Column names selected from the misc-stats table.
features_in_misc = [
    "Age", "W", "L", "PW", "PL",
    "SOS",  # Strength of Schedule
    "Pace", "FTr", "3PAr", "TS%",
    "eFG%", "TOV%", "ORB%", "FT/FGA",
    # "O"-prefixed names as produced by the header renaming in get_team_stats_in_year.
    "OeFG%", "OTOV%", "ODRB%", "OFT/FGA",
]

len(features_in_misc)

18

In [56]:

with (open("Data/2019/all_misc_stats.pkl", "rb")) as openfile:
    pd = pickle.load(openfile)
    
#with (open("Data/2019/all_team-stats-per_poss.pkl", "rb")) as openfile:
#    pd = pickle.load(openfile)

In [57]:
pd


Unnamed: 0,Team,Age,W,L,PW,PL,MOV,SOS,SRS,ORtg,...,TOV%,ORB%,FT/FGA,OeFG%,OTOV%,ODRB%,OFT/FGA,Arena,Attend.,Attend./G
0,Milwaukee Bucks*,26.9,60,22,61,21,8.87,-0.82,8.04,113.8,...,12.0,20.8,0.197,0.503,11.5,80.3,0.162,Fiserv Forum,721692,17602
1,Golden State Warriors*,28.4,57,25,56,26,6.46,-0.04,6.42,115.9,...,12.6,22.5,0.182,0.508,11.7,77.1,0.205,Oracle Arena,803436,19596
2,Toronto Raptors*,27.3,58,24,56,26,6.09,-0.6,5.49,113.1,...,12.4,21.9,0.198,0.509,13.1,77.1,0.19,Scotiabank Arena,812822,19825
3,Utah Jazz*,27.3,50,32,54,28,5.26,0.03,5.28,110.9,...,13.4,22.9,0.217,0.507,12.4,80.3,0.189,Vivint Smart Home Arena,750546,18306
4,Houston Rockets*,29.2,53,29,53,29,4.77,0.19,4.96,115.5,...,12.0,22.8,0.221,0.525,13.4,74.4,0.21,Toyota Center,740392,18058
5,Portland Trail Blazers*,26.2,53,29,51,31,4.2,0.24,4.43,114.7,...,12.1,26.6,0.21,0.516,11.0,77.9,0.195,Moda Center,799345,19496
6,Denver Nuggets*,24.9,54,28,51,31,3.95,0.24,4.19,113.0,...,11.9,26.6,0.175,0.521,12.3,78.0,0.194,Pepsi Center,756457,18450
7,Boston Celtics*,25.7,49,33,52,30,4.44,-0.54,3.9,112.2,...,11.5,21.6,0.173,0.514,13.4,77.0,0.198,TD Garden,763584,18624
8,Oklahoma City Thunder*,25.7,49,33,50,32,3.4,0.15,3.56,110.3,...,11.7,26.0,0.19,0.523,14.4,78.2,0.206,Chesapeake Energy Arena,746323,18203
9,Indiana Pacers*,27.0,48,34,50,32,3.33,-0.57,2.76,109.9,...,12.4,21.9,0.182,0.516,14.1,76.2,0.184,Bankers Life Fieldhouse,689310,16812
