In [None]:
from urllib.request import urlopen
from IPython.display import display, HTML
from bs4 import BeautifulSoup, Comment
import pdb
import requests
import re
import pandas as pd
import os
import pickle


# Root URL of the scraped site; relative hrefs found in pages are appended to it.
NBA_URL = 'https://www.basketball-reference.com'
# Parsed home page. NOTE: this performs a network request as soon as the cell runs.
NBA_SOUP = BeautifulSoup(urlopen(NBA_URL), 'html.parser')
# Three-letter team abbreviations (30 entries), in the order originally listed.
TEAM_INITIALS = [
    'ATL', 'BRK', 'BOS', 'CHO', 'CHI', 'CLE',
    'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND',
    'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN',
    'NOP', 'NYK', 'OKC', 'ORL', 'PHI', 'PHO',
    'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS',
]
# Stat column names of interest; presumably columns of the per-possession
# tables -- confirm against wherever this list is consumed.
features_in_per_poss = [
    "FG", "FGA",
    "3P", "3PA",
    "2P", "2PA",
    "FT", "FTA",
    "ORB", "DRB",
    "AST", "STL", "BLK", "TOV", "PF", "PTS",
]
# Stat column names of interest; presumably columns of the miscellaneous
# stats table -- confirm against wherever this list is consumed.
# The trailing "O"-prefixed names match the renamed headers produced when the
# misc table is parsed.
features_in_misc = [
    "Age", "W", "L", "PW", "PL",
    "SOS",  # Strength of Schedule
    "Pace", "FTr", "3PAr", "TS%",
    "eFG%", "TOV%", "ORB%", "FT/FGA",
    "OeFG%", "OTOV%", "ODRB%", "OFT/FGA",
]
# Box score page whose table header is read to obtain the stat column names.
SAMPLE_BOX_SCORE_URL = "https://www.basketball-reference.com/boxscores/201810160BOS.html"

# key: team initials, value: [game basic stat, cumulative game basic stat]
TEAM_DATAFRAMES = {}

# Positions inside each per-team [single-game, cumulative] pair.
SINGLE_STAT, CUMULATIVE_STAT = 0, 1

# Positions inside each [team_name, stats] pair produced for a box score.
TEAM_NAME, STAT = 0, 1


def find_in_page_text(page, class_=None, text_=None):
    """Return the first match of ``page.find`` filtered by text.

    Despite its name, ``class_`` is forwarded as the first positional argument
    of ``page.find`` (callers in this notebook pass a tag name such as
    ``'span'``); ``text_`` is forwarded as the ``text`` keyword.
    """
    first_match = page.find(class_, text=text_)
    return first_match

def find_in_page_id(page, class_=None, id_=None):
    """Return the first match of ``page.find`` filtered by ``id``.

    ``class_`` is forwarded as the first positional argument of ``page.find``;
    ``id_`` is forwarded as the ``id`` keyword.
    """
    first_match = page.find(class_, id=id_)
    return first_match

def go_to_page(url, page, class_=None, text_=None):
    """Follow a link found in ``page`` and return the target URL and its soup.

    The link is the first match of ``page.find(class_, text=text_)``; its
    ``href`` is appended to ``url`` to build the address that is then fetched
    and parsed with ``html.parser``.

    Returns:
        (new_url, parsed_page) tuple.
    """
    anchor = page.find(class_, text=text_)
    href = anchor.get('href')
    target_url = url + str(href)
    target_soup = BeautifulSoup(urlopen(target_url), 'html.parser')
    return target_url, target_soup

def get_nba_soup():
    """Return the module-level NBA_SOUP (the already-parsed home page)."""
    return NBA_SOUP

def get_league_soup(nba_soup = None):
    """Fetch and parse the leagues index page linked from the site navigation.

    Args:
        nba_soup: Parsed home page. When omitted, the cached soup returned by
            get_nba_soup() is used.

    Returns:
        BeautifulSoup of the page referenced by the first link inside the
        ``header_leagues`` item of the ``nav`` div.
    """
    # Identity check: `== None` would dispatch to the soup object's own __eq__.
    if nba_soup is None:
        nba_soup = get_nba_soup()
    nav_bar = nba_soup.find('div', attrs={'id': 'nav'})
    league = nav_bar.find('li', attrs={'id': 'header_leagues'})
    league_url = NBA_URL + str(league.find('a').get('href'))

    # The response is read in full while the soup is built, so it can be
    # closed immediately afterwards instead of being left open.
    with urlopen(league_url) as response:
        league_soup = BeautifulSoup(response, 'html.parser')
    return league_soup


def get_seasons_dict(league_soup=None):
    """Map each NBA season year to the URL of its season page.

    Args:
        league_soup: Parsed league index page (basketball-reference.com/leagues).
            When omitted, the page is fetched through get_league_soup().

    Returns:
        dict mapping int year -> absolute season URL (NBA_URL + the link's
        href, e.g. basketball-reference.com/leagues/NBA_YEAR.html).
    """
    if league_soup is None:
        # Reuse the shared helper so the fallback follows the same navigation
        # path (and the same NBA_URL constant) as every other caller.
        league_soup = get_league_soup()

    seasons_heading = find_in_page_text(league_soup, 'span', "Seasons")
    url_by_season = dict()

    for item in seasons_heading.parent.find_all('li'):
        # Only NBA seasons are kept; list items for other leagues are skipped.
        if "NBA" not in str(item):
            continue
        href = str(item.find("a").get("href"))
        # The year is formed from every digit that appears in the href.
        year = int(''.join(ch for ch in href if ch.isdigit()))
        url_by_season[year] = NBA_URL + href
    return url_by_season

def get_team_stats_in_year(year, year_soup):
    """Extract a season's team stat tables and pickle each one as a DataFrame.

    The tables live inside HTML comments of the season page, one per container
    div. Each comment is re-parsed, turned into a DataFrame and written to
    ``Data/<year>/<category>.pkl``.

    Args:
        year: Season identifier; used as the sub-folder name under "Data/".
        year_soup: Parsed season page. Note that the table comments are
            extracted (removed) from this soup as a side effect.
    """
    categories = [
        "all_team-stats-base",
        "all_opponent-stats-base",
        "all_team-stats-per_poss",
        "all_opponent-stats-per_poss",
        "all_misc_stats",
    ]

    directory = "Data/" + str(year)
    # makedirs also creates the parent "Data" folder and tolerates an existing
    # directory, where a bare mkdir would raise.
    os.makedirs(directory, exist_ok=True)

    for category in categories:
        container = year_soup.find("div", id=category)
        for comment in container.findAll(text=lambda text: isinstance(text, Comment)):
            extracted_comment = comment.extract()
            # Name the parser explicitly, like every other soup in this file,
            # so the result does not depend on which parsers are installed.
            commented_page = BeautifulSoup(extracted_comment, 'html.parser')
            all_rows = commented_page.findAll('tr')

            if category == "all_misc_stats":
                # The misc table has two header rows; column names are in the second.
                header_cells = commented_page.findAll('tr', limit=2)[1].findAll('th')
                headers = [th.getText() for th in header_cells][1:]
                # "eFG%" appears twice. The four headers starting at its second
                # occurrence are prefixed with "O" to keep column names unique
                # (presumably the opponent-side figures -- confirm).
                indices = [i for i, x in enumerate(headers) if x == "eFG%"]
                for i in range(indices[1], indices[1] + 4):
                    headers[i] = "O" + headers[i]
                # Skip both header rows and the final row.
                rows = all_rows[2:-1]
            else:
                header_cells = commented_page.findAll('tr', limit=2)[0].findAll('th')
                headers = [th.getText() for th in header_cells][1:]
                rows = all_rows[1:]

            # Only <td> cells are read, matching the headers with the first
            # <th> label dropped.
            team_rows = [[td.getText() for td in row.findAll('td')] for row in rows]
            stats = pd.DataFrame(team_rows, columns=headers)
            stats.to_pickle(directory + "/" + category + ".pkl")
            
def get_team_stats_in_all_years():
    """Scrape and pickle the team stat tables of every listed season >= 2019.

    Seasons come from get_seasons_dict(); each retained season page is
    downloaded and handed to get_team_stats_in_year().
    """
    season_urls = get_seasons_dict(get_league_soup())
    for year, season_url in season_urls.items():
        # Per-possession statistics are only available from year > 1974;
        # the cutoff actually applied here keeps 2019 and later only.
        if year >= 2019:
            season_page = BeautifulSoup(urlopen(season_url), 'html.parser')
            get_team_stats_in_year(year, season_page)

def get_game_results_in_year(year, year_soup):
    """Scrape every box score of a season and pickle per-team and per-game stats.

    Every monthly schedule page linked from the season page is visited, and
    every game row that carries a box-score link is downloaded and parsed.

    Args:
        year: Season identifier; used as the sub-folder name under "Data/".
        year_soup: Parsed season schedule page; the links inside its "filter"
            div are followed as the monthly schedule pages.

    Side effects:
        Prints progress and each box-score URL, and writes under Data/<year>/:
          * <team>_season_game_basic_stats.pkl            (one row per game)
          * <team>_season_game_basic_cumulative_stats.pkl (running column sums)
          * features_and_results.pkl                      (away + home stats + Win)
    """
    month_links = year_soup.find("div", attrs={'class': 'filter'}).findAll("a")
    month_urls = [NBA_URL + link['href'] for link in month_links]
    team_dataframes = dict()  # team name -> [single-game rows, cumulative rows]
    features_and_results_in_year = []
    cnt = 0
    for month_url in month_urls:
        with urlopen(month_url) as response:
            month_soup = BeautifulSoup(response, 'html.parser')

        games = month_soup.find("table", attrs={'id': 'schedule'}).find('tbody').findAll("tr")
        for game in games:
            cnt += 1
            if cnt % 100 == 0:
                print("game cnt: " + str(cnt))
            # Rows without a box-score cell or link are skipped.
            box_score_cell = game.find('td', attrs={'data-stat': 'box_score_text'})
            if box_score_cell is None:
                continue
            box_score_link = box_score_cell.find('a')
            if box_score_link is None:
                continue
            game_link = NBA_URL + box_score_link['href']
            print(game_link)
            with urlopen(game_link) as response:
                game_soup = BeautifulSoup(response, 'html.parser')
            two_team_stats, combined = get_box_score_in_game(game_soup)

            for team_stat in two_team_stats:
                team_name = team_stat[TEAM_NAME]
                stat_line = team_stat[STAT]
                per_team = team_dataframes.setdefault(team_name, [[], []])
                per_team[SINGLE_STAT].append(stat_line)
                running = per_team[CUMULATIVE_STAT]
                if running:
                    # Element-wise sum of this game with the previous running total.
                    running.append([x + y for x, y in zip(stat_line, running[-1])])
                else:
                    # First game: the running total is the game itself (copied so
                    # the two tables never share a row object).
                    running.append(list(stat_line))
            features_and_results_in_year.append(combined)
        # No early exit here: every month of the season is processed.

    directory = "Data/" + str(year)
    # makedirs also creates the parent "Data" folder and tolerates an existing one.
    os.makedirs(directory, exist_ok=True)

    categories = get_basic_stat_categories_from_box_score()
    for team, (single_rows, cumulative_rows) in team_dataframes.items():
        single_stat = pd.DataFrame(single_rows, columns=categories)
        cumulative_stat = pd.DataFrame(cumulative_rows, columns=categories)
        single_stat.to_pickle(directory + "/" + str(team) + "_season_game_basic_stats.pkl")
        cumulative_stat.to_pickle(directory + "/" + str(team) + "_season_game_basic_cumulative_stats.pkl")

    features_and_results_in_year = pd.DataFrame(
        features_and_results_in_year,
        columns=get_features_categories_from_box_score(),
    )
    features_and_results_in_year.to_pickle(directory + "/features_and_results.pkl")
    
def get_features_categories_from_box_score():
    """Build the column names of the combined per-game feature row.

    Returns:
        The basic stat names prefixed with "Away_", then the same names
        prefixed with "Home_", followed by the single label "Win".
    """
    base_names = get_basic_stat_categories_from_box_score()
    away_names = ["Away_" + name for name in base_names]
    home_names = ["Home_" + name for name in base_names]
    return away_names + home_names + ["Win"]

def get_basic_stat_categories_from_box_score():
    """Return the basic stat column names read from the sample box score page.

    Downloads SAMPLE_BOX_SCORE_URL, reads the first team name from the
    commented-out line-score table, and takes the column labels from that
    team's ``box-<team>-game-basic`` table header.

    Returns:
        list of str column labels, as produced by get_basic_game_stat_categories().
    """
    with urlopen(SAMPLE_BOX_SCORE_URL) as response:
        game_soup = BeautifulSoup(response, 'html.parser')
    comment = game_soup.find("div", attrs={'id': 'all_line_score'}).find(text=lambda text: isinstance(text, Comment))
    # Name the parser explicitly, like every other soup in this file, so the
    # result does not depend on which parsers are installed.
    commented_page = BeautifulSoup(comment.extract(), 'html.parser')
    teams = commented_page.find("table", attrs={'id': 'line_score'}).findAll("a")
    assert (len(teams) == 2)

    # Only the first listed team's table header is read for the labels.
    team_name = str(teams[0].getText())
    team_table_soup = game_soup.find("table", attrs={'id': 'box-' + team_name + '-game-basic'})
    return get_basic_game_stat_categories(team_table_soup)

def get_box_score_in_game(game_soup):
    """Extract both teams' total stat lines and the result from a box score page.

    Team names come from the links of the commented-out ``line_score`` table;
    the first is treated as the away team and the second as the home team.

    Args:
        game_soup: Parsed box score page. The line-score comment is extracted
            (removed) from this soup as a side effect.

    Returns:
        ``[[[away_name, away_stats], [home_name, home_stats]], combined]`` where
        each ``*_stats`` is a list of floats and ``combined`` is
        ``away_stats + home_stats + [win]``; ``win`` is 1 when the home team's
        last stat value exceeds the away team's, otherwise 0.

    Raises:
        ValueError: if a totals cell cannot be converted to float.
    """
    comment = game_soup.find("div", attrs={'id': 'all_line_score'}).find(text=lambda text: isinstance(text, Comment))
    # Name the parser explicitly, like every other soup in this file, so the
    # result does not depend on which parsers are installed.
    commented_page = BeautifulSoup(comment.extract(), 'html.parser')
    teams = commented_page.find("table", attrs={'id': 'line_score'}).findAll("a")
    assert (len(teams) == 2)

    two_team_stats = []  # [away, home], each entry [team_name, team_stat]
    game_features_and_result = []
    scores = []
    for team in teams:
        team_name = str(team.getText())
        team_table_soup = game_soup.find("table", attrs={'id': 'box-' + team_name + '-game-basic'})
        totals = team_table_soup.find("tfoot").find("tr").findAll("td")
        # Drop the first and last cells of the totals row so the values line up
        # with the column labels taken from the table header.
        team_stat = [float(cell.getText()) for cell in totals[1:-1]]
        # The last remaining value is used as the team's score.
        scores.append(team_stat[-1])
        game_features_and_result.extend(team_stat)
        two_team_stats.append([team_name, team_stat])

    # 1 if the home team won, 0 otherwise.
    home_won = 1 if scores[1] > scores[0] else 0
    game_features_and_result.append(home_won)
    return [two_team_stats, game_features_and_result]
        
        

def get_basic_game_stat_categories(team_table_soup):
    """Return the stat column labels from a team's box-score table header.

    All ``th`` cells under ``thead`` are collected; the first four and the
    last one are discarded and the text of the remaining cells is returned.
    """
    header_cells = team_table_soup.find("thead").findAll("th")
    labels = []
    for cell in header_cells[4:-1]:
        labels.append(cell.getText())
    return labels
    
def get_game_results_in_all_years():
    """Scrape game results for every listed season from 2000 through 2019.

    Each season URL gets "_games" inserted before its last five characters to
    form the schedule page URL. Every year is printed; only years in
    [2000, 2020) are downloaded and passed to get_game_results_in_year().
    """
    season_urls = get_seasons_dict(get_league_soup())

    schedule_urls = {}
    for year, season_url in season_urls.items():
        split_at = len(season_url) - 5
        schedule_urls[year] = season_url[:split_at] + "_games" + season_url[split_at:]

    for year, schedule_url in schedule_urls.items():
        print(year)
        # Only the 2000..2019 seasons are scraped; all others are skipped.
        if 2000 <= year < 2020:
            schedule_page = BeautifulSoup(urlopen(schedule_url), 'html.parser')
            get_game_results_in_year(year, schedule_page)


In [None]:
# Run the game-results scrape; this issues a network request per schedule page
# and per box score, and writes pickles under Data/<year>/.
get_game_results_in_all_years()


In [None]:
# Fetch and parse the site home page; as the cell's last expression, the
# parsed soup is displayed as the cell output.
BeautifulSoup(urlopen(NBA_URL), 'html.parser')

In [3]:

# Load the per-game features/results table pickled by get_game_results_in_year.
# NOTE(review): this rebinds `pd`, shadowing the pandas module imported at the
# top of the notebook; any later `pd.DataFrame(...)` call in this kernel will
# hit the loaded object instead. Consider a dedicated variable name.
# NOTE(review): pickle.load executes arbitrary code from the file -- only load
# pickles produced locally by this notebook.
with (open("Data/2019/features_and_results.pkl", "rb")) as openfile:
    pd = pickle.load(openfile)
    
#with (open("Data/2019/all_team-stats-per_poss.pkl", "rb")) as openfile:
#    pd = pickle.load(openfile)

In [4]:
# Display the object loaded in the previous cell (here `pd` is the unpickled
# table, not the pandas module).
pd


Unnamed: 0,Away_FG,Away_FGA,Away_FG%,Away_3P,Away_3PA,Away_3P%,Away_FT,Away_FTA,Away_FT%,Away_ORB,...,Home_ORB,Home_DRB,Home_TRB,Home_AST,Home_STL,Home_BLK,Home_TOV,Home_PF,Home_PTS,Win
0,34.0,87.0,0.391,5.0,26.0,0.192,14.0,23.0,0.609,6.0,...,12.0,43.0,55.0,21.0,7.0,5.0,14.0,20.0,105.0,1
1,33.0,91.0,0.363,10.0,37.0,0.270,24.0,37.0,0.649,16.0,...,17.0,41.0,58.0,28.0,7.0,7.0,21.0,29.0,108.0,1
2,42.0,85.0,0.494,14.0,34.0,0.412,15.0,20.0,0.750,11.0,...,9.0,32.0,41.0,21.0,8.0,9.0,11.0,19.0,112.0,0
3,40.0,82.0,0.488,5.0,27.0,0.185,15.0,22.0,0.682,5.0,...,14.0,32.0,46.0,21.0,5.0,5.0,14.0,20.0,103.0,1
4,52.0,98.0,0.531,10.0,25.0,0.400,17.0,22.0,0.773,14.0,...,8.0,29.0,37.0,21.0,8.0,7.0,11.0,22.0,112.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1273,47.0,87.0,0.540,13.0,31.0,0.419,5.0,8.0,0.625,3.0,...,7.0,37.0,44.0,19.0,5.0,2.0,9.0,11.0,90.0,0
1274,41.0,79.0,0.519,11.0,29.0,0.379,20.0,27.0,0.741,9.0,...,7.0,27.0,34.0,22.0,12.0,7.0,12.0,22.0,121.0,1
1275,30.0,76.0,0.395,10.0,35.0,0.286,24.0,26.0,0.923,9.0,...,9.0,27.0,36.0,20.0,10.0,4.0,9.0,22.0,89.0,0
1276,36.0,77.0,0.468,17.0,40.0,0.425,20.0,25.0,0.800,10.0,...,18.0,27.0,45.0,24.0,10.0,5.0,12.0,23.0,115.0,1


In [5]:
import numpy as np
from sklearn.linear_model import LinearRegression