In [19]:
import os
import pandas as pd

from bs4 import BeautifulSoup

SCORE_DIR = "data/scores"

In [20]:
box_scores = os.listdir(SCORE_DIR)
box_scores = [os.path.join(SCORE_DIR, f) for f in box_scores if f.endswith(".html")]

In [11]:
def read_season_info(soup):
    nav = soup.select("#bottom_nav_container")[0]
    hrefs = [a["href"] for a in nav.find_all('a')]
    season = os.path.basename(hrefs[1]).split("_")[0]
    return season

In [12]:
def read_line_score(soup):
    line_score = pd.read_html(str(soup), attrs = {'id': 'line_score'})[0]
    cols = list(line_score.columns)
    cols[0] = "team"
    cols[-1] = "total"
    line_score.columns = cols
    
    line_score = line_score[["team", "total"]]
    
    return line_score

In [13]:
def read_stats(soup, team, stat):
    df = pd.read_html(str(soup), attrs = {'id': f'box-{team}-game-{stat}'}, index_col=0)[0]
    df = df.apply(pd.to_numeric, errors="coerce")
    return df

In [62]:
games = []
base_cols = None
for box_score in box_scores:
    with open(box_score) as f:
        html = f.read().replace("<!--", "").replace("-->", "")

    soup = BeautifulSoup(html)

    line_score = read_line_score(soup)
    teams = list(line_score["team"])

    summaries = []
    for team in teams:
        basic = read_stats(soup, team, "basic")
        advanced = read_stats(soup, team, "advanced")

        totals = pd.concat([basic.iloc[-1,:], advanced.iloc[-1,:]])
        
        maxes = pd.concat([basic.iloc[:-1].max(), advanced.iloc[:-1].max()])

        summary = pd.concat([totals, maxes])
        
        if base_cols is None:
            base_cols = list(summary.index.drop_duplicates(keep="first"))
            base_cols = [b for b in base_cols if "bpm" not in b]
        
        summary = summary[base_cols]
        
        summaries.append(summary)
    
    summary = pd.concat(summaries, axis=1).T

    game = pd.concat([summary, line_score], axis=1)

    game["home"] = [0,1]

    game_opp = game.iloc[::-1].reset_index()
    game_opp.columns += "_opp"

    full_game = pd.concat([game, game_opp], axis=1)
    full_game["season"] = read_season_info(soup)
    
    full_game["date"] = os.path.basename(box_score)[:8]
    full_game["date"] = pd.to_datetime(full_game["date"], format="%Y%m%d")
    
    full_game["won"] = full_game["total"] > full_game["total_opp"]
    games.append(full_game)
    
    if len(games) % 100 == 0:
        print(f"{len(games)} / {len(box_scores)}")

  line_score = pd.read_html(str(soup), attrs = {'id': 'line_score'})[0]
  df = pd.read_html(str(soup), attrs = {'id': f'box-{team}-game-{stat}'}, index_col=0)[0]
  df = pd.read_html(str(soup), attrs = {'id': f'box-{team}-game-{stat}'}, index_col=0)[0]
  df = pd.read_html(str(soup), attrs = {'id': f'box-{team}-game-{stat}'}, index_col=0)[0]
  df = pd.read_html(str(soup), attrs = {'id': f'box-{team}-game-{stat}'}, index_col=0)[0]


TypeError: can only concatenate tuple (not "str") to tuple

In [None]:
games_df = pd.concat(games, ignore_index=True)

In [None]:
games_df

In [None]:
games_df.to_csv("nba_games.csv")

In [47]:
totals.reset_index(levels=0)

TypeError: Series.reset_index() got an unexpected keyword argument 'levels'

In [63]:
game_opp

Unnamed: 0,index,"(Basic Box Score Stats, MP)","(Basic Box Score Stats, MP).1","(Basic Box Score Stats, FG)","(Basic Box Score Stats, FG).1","(Basic Box Score Stats, FGA)","(Basic Box Score Stats, FGA).1","(Basic Box Score Stats, FG%)","(Basic Box Score Stats, FG%).1","(Basic Box Score Stats, 3P)",...,"(Advanced Box Score Stats, USG%)","(Advanced Box Score Stats, ORtg)","(Advanced Box Score Stats, ORtg).1","(Advanced Box Score Stats, DRtg)","(Advanced Box Score Stats, DRtg).1","(Advanced Box Score Stats, BPM)","(Advanced Box Score Stats, BPM).1",team,total,home
0,1,240.0,,40.0,12.0,93.0,21.0,0.43,0.6,8.0,...,30.0,110.0,129.0,107.8,115.0,,10.1,POR,102,1
1,0,240.0,,35.0,9.0,75.0,18.0,0.467,0.667,7.0,...,43.4,107.8,168.0,110.0,119.0,,12.9,CHO,100,0


In [59]:
read_season_info(soup)

'2015'

In [64]:
game_opp.columns

Index([                             'index',
            ('Basic Box Score Stats', 'MP'),
            ('Basic Box Score Stats', 'MP'),
            ('Basic Box Score Stats', 'FG'),
            ('Basic Box Score Stats', 'FG'),
           ('Basic Box Score Stats', 'FGA'),
           ('Basic Box Score Stats', 'FGA'),
           ('Basic Box Score Stats', 'FG%'),
           ('Basic Box Score Stats', 'FG%'),
            ('Basic Box Score Stats', '3P'),
            ('Basic Box Score Stats', '3P'),
           ('Basic Box Score Stats', '3PA'),
           ('Basic Box Score Stats', '3PA'),
           ('Basic Box Score Stats', '3P%'),
           ('Basic Box Score Stats', '3P%'),
            ('Basic Box Score Stats', 'FT'),
            ('Basic Box Score Stats', 'FT'),
           ('Basic Box Score Stats', 'FTA'),
           ('Basic Box Score Stats', 'FTA'),
           ('Basic Box Score Stats', 'FT%'),
           ('Basic Box Score Stats', 'FT%'),
           ('Basic Box Score Stats', 'ORB'),
          