In [139]:
from dataclasses import dataclass
from datetime import date, datetime
from enum import Enum
import time
from typing import Dict, List, Optional, Tuple

from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import requests

In [90]:
class Conference(Enum):
    EAST = "East"
    WEST = "West"

    
@dataclass
class Team:
    symbol: str
    city: str
    name: str
    conference: Conference


@dataclass
class Game:
    home: str
    away: str
    winner: Optional[str]
    date: datetime.date
    link: str
    
    def teams() -> Tuple[str, str]:
        return (self.home, self.away)

In [107]:
TEAMS = [
    Team("BOS", "Boston", "Celtics", Conference.EAST),
    Team("BRK", "Brooklyn", "Nets", Conference.EAST),
    Team("IND", "Indiana", "Pacers", Conference.EAST),
    Team("MIA", "Miami", "Heat", Conference.EAST),
    Team("MIL", "Milwaukee", "Bucks", Conference.EAST),
    Team("ORL", "Orlando", "Magic", Conference.EAST),
    Team("PHI", "Philadelphia", "76ers", Conference.EAST),
    Team("TOR", "Toronto", "Raptors", Conference.EAST),
    Team("WAS", "Washington", "Wizards", Conference.EAST),
    Team("DAL", "Dallas", "Mavericks", Conference.WEST),
    Team("DEN", "Denver", "Nuggets", Conference.WEST),
    Team("HOU", "Houston", "Rockets", Conference.WEST),
    Team("LAC", "Los Angeles", "Clippers", Conference.WEST),
    Team("LAL", "Los Angeles", "Lakers", Conference.WEST),
    Team("MEM", "Memphis", "Grizzlies", Conference.WEST),
    Team("NOP", "New Orleans", "Pelicans", Conference.WEST),
    Team("OKC", "Oklahoma City", "Thunder", Conference.WEST),
    Team("PHO", "Phoenix", "Suns", Conference.WEST),
    Team("POR", "Portland", "Trail Blazers", Conference.WEST),
    Team("SAC", "Sacramento", "Kings", Conference.WEST),
    Team("SAS", "San Antonio", "Spurs", Conference.WEST),
    Team("UTA", "Utah", "Jazz", Conference.WEST),
]

In [99]:
def get_games(symbol: str) -> List[Game]:
    response = requests.get(f"https://www.basketball-reference.com/teams/{symbol}/2020_games.html")
    soup = BeautifulSoup(response.text, "html.parser")

    table = soup.find(id="games")
    rows = table.tbody.find_all("tr")
    games = []

    for row in rows:
        if row.get("class") and "thead" in row.get("class"):
            continue
        date_string = row.find(attrs={"data-stat": "date_game"}).get("csk")
        game_date = datetime.strptime(date_string, "%Y-%m-%d").date()
        link = date_string = row.find(attrs={"data-stat": "box_score_text"}).a.get("href")

        opponent = row.find(attrs={"data-stat": "opp_name"}).get("csk")[:3]
        at_home = row.find(attrs={"data-stat": "game_location"}).text == "@"
        if game_date > datetime.now().date():
            winner = None
        else:
            won = row.find(attrs={"data-stat": "game_result"}).text == "W"
            winner = symbol if won else opponent

        if at_home:
            games.append(Game(symbol, opponent, winner, game_date, link))
        else:
            games.append(Game(opponent, symbol, winner, game_date, link))

    return games

In [108]:
team_games = {} 
for t in TEAMS:
    symbol = t.symbol
    print(symbol)
    team_games[symbol] = get_games(symbol)
    time.sleep(1)

BOS
BRK
IND
MIA
MIL
ORL
PHI
TOR
WAS
DAL
DEN
HOU
LAC
LAL
MEM
NOP
OKC
PHO
POR
SAC
SAS
UTA


In [111]:
all_games = []
teams_done = []
for t, games in team_games.items():
    for game in games:
        if game.home in teams_done or game.away in teams_done:
            continue
        all_games.append(game)
    teams_done.append(t)

In [118]:
games_df = pd.DataFrame(
    {
        "home": g.home,
        "away": g.away,
        "winner": g.winner,
        "date": g.date,
        "link": g.link,
    }
    for g in all_games
)

In [119]:
games_df

Unnamed: 0,home,away,winner,date,link
0,BOS,PHI,PHI,2019-10-23,/boxscores/201910230PHI.html
1,TOR,BOS,BOS,2019-10-25,/boxscores/201910250BOS.html
2,BOS,NYK,BOS,2019-10-26,/boxscores/201910260NYK.html
3,MIL,BOS,BOS,2019-10-30,/boxscores/201910300BOS.html
4,NYK,BOS,BOS,2019-11-01,/boxscores/201911010BOS.html
...,...,...,...,...,...
986,CHO,UTA,UTA,2020-01-10,/boxscores/202001100UTA.html
987,UTA,GSW,UTA,2020-01-22,/boxscores/202001220GSW.html
988,UTA,CLE,UTA,2020-03-02,/boxscores/202003020CLE.html
989,UTA,NYK,UTA,2020-03-04,/boxscores/202003040NYK.html


In [120]:
games_df.to_csv("all_games.csv")

In [147]:
def parse_box_score(table, team: str, opponent: str, game_date: date) -> List[Dict]:
    rows = table.tbody.find_all("tr", class_=None)
    box_stats = []
    for row in rows:
        name = row.th.a.text
        stats = {"player": name}
        stats["team"] = team
        stats["opponent"] = opponent
        stats["date"] = game_date
        cols = row.find_all("td")

        if len(cols) > 1:
            for c in row.find_all("td"):
                stats[c["data-stat"]] = c.text

        box_stats.append(stats)
    return box_stats

In [148]:
def get_game_stats(game) -> List[Dict]:
    response = requests.get(f"https://www.basketball-reference.com/{game.link}")
    soup = BeautifulSoup(response.text, "html.parser")

    home = game.home
    home_table = soup.find(id=f"box-{home}-game-basic")
    
    away = game.away
    away_table = soup.find(id=f"box-{away}-game-basic")

    return (
        parse_box_score(home_table, home, away, game.date) + 
        parse_box_score(away_table, away, home, game.date)
    )

In [154]:
all_stats = []
num_games_processed = 0
for game in all_games:
    if game.date > datetime.now().date():
        continue
    all_stats += get_game_stats(game)
    num_games_processed += 1
    if num_games_processed % 50 == 0:
        print(f"Processed {num_games_processed} games")
    time.sleep(0.05)

Processed 50 games
Processed 100 games
Processed 150 games
Processed 200 games
Processed 250 games
Processed 300 games
Processed 350 games
Processed 400 games
Processed 450 games
Processed 500 games
Processed 550 games
Processed 600 games
Processed 650 games
Processed 700 games
Processed 750 games
Processed 800 games
Processed 850 games
Processed 900 games


In [156]:
stats_df = pd.DataFrame(all_stats)
stats_df

Unnamed: 0,player,team,opponent,date,mp,fg,fga,fg_pct,fg3,fg3a,...,orb,drb,trb,ast,stl,blk,tov,pf,pts,plus_minus
0,Jayson Tatum,BOS,PHI,2019-10-23,37:13,8,22,.364,4,8,...,1,9,10,2,2,0,4,2,21,-4
1,Gordon Hayward,BOS,PHI,2019-10-23,35:11,8,15,.533,0,0,...,0,5,5,2,0,0,1,4,25,-11
2,Kemba Walker,BOS,PHI,2019-10-23,34:15,4,18,.222,1,6,...,2,0,2,2,0,0,3,4,12,-16
3,Enes Kanter,BOS,PHI,2019-10-23,24:58,5,8,.625,0,1,...,3,3,6,2,0,0,0,3,12,-12
4,Jaylen Brown,BOS,PHI,2019-10-23,20:52,3,6,.500,1,2,...,0,7,7,0,1,0,1,5,8,-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22604,Sekou Doumbouya,DET,UTA,2020-03-07,19:48,4,6,.667,0,1,...,1,2,3,1,0,0,1,1,9,-7
22605,Thon Maker,DET,UTA,2020-03-07,9:45,0,3,.000,0,0,...,0,0,0,1,0,0,0,3,0,-7
22606,Donta Hall,DET,UTA,2020-03-07,,,,,,,...,,,,,,,,,,
22607,Jordan Bone,DET,UTA,2020-03-07,,,,,,,...,,,,,,,,,,


In [157]:
stats_df.to_csv("all_box_scores.csv")