In [2]:
from datetime import datetime
import time

import pandas as pd

from nba import (
    Conference,
    Game,
    OTHER_TEAMS,
    Team,
    TEAMS,
)
from scraping import (
    get_games,
    get_game_stats,
    parse_box_score,
)

In [108]:
# Build a mapping of team symbol -> whatever get_games() returns for it,
# covering every entry in TEAMS.  Each symbol is printed before its request
# so progress is visible, and the loop sleeps one second after every call
# (presumably to throttle requests to the scraped site -- confirm).
team_games = {}
for symbol in (team.symbol for team in TEAMS):
    print(symbol)
    team_games[symbol] = get_games(symbol)
    time.sleep(1)

BOS
BRK
IND
MIA
MIL
ORL
PHI
TOR
WAS
DAL
DEN
HOU
LAC
LAL
MEM
NOP
OKC
PHO
POR
SAC
SAS
UTA


In [111]:
# Flatten the per-team game lists into one list without repeats.
#
# Teams are handled one at a time; once a team's list has been consumed its
# symbol is recorded in teams_done.  Any later game whose home or away side
# is already in teams_done is dropped, because it was collected when that
# earlier team was processed (assumes a game shows up in the list of both
# participants -- verify against get_games).
all_games = []
teams_done = []
for team_symbol, schedule in team_games.items():
    all_games.extend(
        g
        for g in schedule
        if g.home not in teams_done and g.away not in teams_done
    )
    teams_done.append(team_symbol)

In [119]:
# One record per collected game, carrying the five attributes kept for the
# CSV export; the dict key order fixes the column order of the frame.
game_rows = [
    {
        "home": g.home,
        "away": g.away,
        "winner": g.winner,
        "date": g.date,
        "link": g.link,
    }
    for g in all_games
]
games_df = pd.DataFrame(data=game_rows)
games_df

Unnamed: 0,home,away,winner,date,link
0,BOS,PHI,PHI,2019-10-23,/boxscores/201910230PHI.html
1,TOR,BOS,BOS,2019-10-25,/boxscores/201910250BOS.html
2,BOS,NYK,BOS,2019-10-26,/boxscores/201910260NYK.html
3,MIL,BOS,BOS,2019-10-30,/boxscores/201910300BOS.html
4,NYK,BOS,BOS,2019-11-01,/boxscores/201911010BOS.html
...,...,...,...,...,...
986,CHO,UTA,UTA,2020-01-10,/boxscores/202001100UTA.html
987,UTA,GSW,UTA,2020-01-22,/boxscores/202001220GSW.html
988,UTA,CLE,UTA,2020-03-02,/boxscores/202003020CLE.html
989,UTA,NYK,UTA,2020-03-04,/boxscores/202003040NYK.html


In [120]:
# Persist the game table next to the notebook.  No options are overridden,
# so pandas' defaults apply (the row index is written as the first column).
games_df.to_csv(path_or_buf="all_games.csv")

In [154]:
# Collect the stat rows returned by get_game_stats() for every game in
# all_games that is dated before today.
#
# * The cutoff date is taken once, before the loop, so every game is judged
#   against the same day even if the scrape runs past midnight.
# * Games dated today are skipped along with future ones (">=" rather than
#   ">"), which matches the filter used by the other_games scrape later in
#   this notebook.  NOTE(review): presumably a game dated today has no final
#   box score yet -- confirm.
# * A progress line is printed every 50 processed games, and the loop pauses
#   briefly after each request.
today = datetime.now().date()
all_stats = []
num_games_processed = 0
for game in all_games:
    if game.date >= today:
        continue
    all_stats.extend(get_game_stats(game))
    num_games_processed += 1
    if num_games_processed % 50 == 0:
        print(f"Processed {num_games_processed} games")
    time.sleep(0.05)

Processed 50 games
Processed 100 games
Processed 150 games
Processed 200 games
Processed 250 games
Processed 300 games
Processed 350 games
Processed 400 games
Processed 450 games
Processed 500 games
Processed 550 games
Processed 600 games
Processed 650 games
Processed 700 games
Processed 750 games
Processed 800 games
Processed 850 games
Processed 900 games


In [156]:
# Turn the accumulated stat rows into a single frame and display it as the
# cell's output.
stats_df = pd.DataFrame(data=all_stats)
stats_df

Unnamed: 0,player,team,opponent,date,mp,fg,fga,fg_pct,fg3,fg3a,...,orb,drb,trb,ast,stl,blk,tov,pf,pts,plus_minus
0,Jayson Tatum,BOS,PHI,2019-10-23,37:13,8,22,.364,4,8,...,1,9,10,2,2,0,4,2,21,-4
1,Gordon Hayward,BOS,PHI,2019-10-23,35:11,8,15,.533,0,0,...,0,5,5,2,0,0,1,4,25,-11
2,Kemba Walker,BOS,PHI,2019-10-23,34:15,4,18,.222,1,6,...,2,0,2,2,0,0,3,4,12,-16
3,Enes Kanter,BOS,PHI,2019-10-23,24:58,5,8,.625,0,1,...,3,3,6,2,0,0,0,3,12,-12
4,Jaylen Brown,BOS,PHI,2019-10-23,20:52,3,6,.500,1,2,...,0,7,7,0,1,0,1,5,8,-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22604,Sekou Doumbouya,DET,UTA,2020-03-07,19:48,4,6,.667,0,1,...,1,2,3,1,0,0,1,1,9,-7
22605,Thon Maker,DET,UTA,2020-03-07,9:45,0,3,.000,0,0,...,0,0,0,1,0,0,0,3,0,-7
22606,Donta Hall,DET,UTA,2020-03-07,,,,,,,...,,,,,,,,,,
22607,Jordan Bone,DET,UTA,2020-03-07,,,,,,,...,,,,,,,,,,


In [157]:
# Persist the box-score table next to the notebook using pandas' default
# CSV options (the row index is written as the first column).
stats_df.to_csv(path_or_buf="all_box_scores.csv")

In [8]:
# Step 1: fetch the game list for every entry in OTHER_TEAMS, keyed by team
# symbol.  Each symbol is printed before its request and the loop pauses
# briefly after every call.
other_team_games = {}
for symbol in (team.symbol for team in OTHER_TEAMS):
    print(symbol)
    other_team_games[symbol] = get_games(symbol)
    time.sleep(0.05)

# Step 2: flatten those lists into other_games.  teams_done starts out
# holding every TEAMS symbol, so any game with a TEAMS side is dropped here;
# each OTHER_TEAMS symbol is appended once its list has been consumed, which
# also drops the repeat of a game already collected from an earlier team.
other_games = []
teams_done = [team.symbol for team in TEAMS]
for team_symbol, schedule in other_team_games.items():
    other_games.extend(
        g
        for g in schedule
        if g.home not in teams_done and g.away not in teams_done
    )
    teams_done.append(team_symbol)

CHO
CHI
NYK
DET
ATL
CLE
MIN
GSW


In [11]:
# One record per game in other_games, carrying the five attributes kept for
# the CSV export; the dict key order fixes the column order of the frame.
other_game_rows = [
    {
        "home": g.home,
        "away": g.away,
        "winner": g.winner,
        "date": g.date,
        "link": g.link,
    }
    for g in other_games
]
other_games_df = pd.DataFrame(data=other_game_rows)
other_games_df

Unnamed: 0,home,away,winner,date,link
0,CHI,CHO,CHO,2019-10-23,/boxscores/201910230CHO.html
1,MIN,CHO,MIN,2019-10-25,/boxscores/201910250CHO.html
2,CHO,GSW,CHO,2019-11-02,/boxscores/201911020GSW.html
3,DET,CHO,CHO,2019-11-15,/boxscores/201911150CHO.html
4,CHO,NYK,CHO,2019-11-16,/boxscores/201911160NYK.html
...,...,...,...,...,...
63,MIN,CLE,MIN,2020-01-05,/boxscores/202001050CLE.html
64,GSW,CLE,GSW,2020-02-01,/boxscores/202002010CLE.html
65,GSW,MIN,MIN,2019-11-08,/boxscores/201911080MIN.html
66,MIN,GSW,GSW,2019-12-23,/boxscores/201912230GSW.html


In [12]:
# Persist the other-teams game table next to the notebook using pandas'
# default CSV options (the row index is written as the first column).
other_games_df.to_csv(path_or_buf="other_games.csv")

In [13]:
# Collect the stat rows returned by get_game_stats() for every game in
# other_games dated strictly before today; games dated today or later are
# skipped.  A progress line is printed every 50 processed games and the loop
# pauses briefly after each request.
other_stats = []
num_games_processed = 0
for fixture in other_games:
    if fixture.date >= datetime.now().date():
        continue
    other_stats.extend(get_game_stats(fixture))
    num_games_processed += 1
    if not num_games_processed % 50:
        print(f"Processed {num_games_processed} games")
    time.sleep(0.05)

Processed 50 games


In [14]:
# Turn the accumulated other-teams stat rows into a single frame and display
# it as the cell's output.
other_stats_df = pd.DataFrame(data=other_stats)
other_stats_df

Unnamed: 0,player,team,opponent,date,mp,fg,fga,fg_pct,fg3,fg3a,...,orb,drb,trb,ast,stl,blk,tov,pf,pts,plus_minus
0,Lauri Markkanen,CHI,CHO,2019-10-23,34:22,13,25,.520,1,7,...,5,12,17,2,1,0,1,1,35,-8
1,Zach LaVine,CHI,CHO,2019-10-23,30:00,7,17,.412,1,4,...,3,3,6,7,2,1,5,3,16,-8
2,Wendell Carter,CHI,CHO,2019-10-23,29:25,5,11,.455,0,0,...,5,4,9,0,0,1,1,4,12,+2
3,Otto Porter,CHI,CHO,2019-10-23,27:43,3,10,.300,1,5,...,0,2,2,1,2,1,0,2,9,+14
4,Tomáš Satoranský,CHI,CHO,2019-10-23,24:22,1,5,.200,0,0,...,0,0,0,3,0,1,0,2,2,+4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1662,Jordan McLaughlin,MIN,GSW,2020-01-02,14:55,0,4,.000,0,1,...,0,1,1,4,1,0,1,1,0,-8
1663,Jaylen Nowell,MIN,GSW,2020-01-02,5:45,0,4,.000,0,1,...,1,0,1,1,0,0,0,0,0,+1
1664,Jordan Bell,MIN,GSW,2020-01-02,5:19,1,5,.200,0,1,...,3,1,4,0,0,0,1,1,2,-9
1665,Jeff Teague,MIN,GSW,2020-01-02,,,,,,,...,,,,,,,,,,


In [15]:
# Persist the other-teams box-score table next to the notebook using pandas'
# default CSV options (the row index is written as the first column).
other_stats_df.to_csv(path_or_buf="other_box_scores.csv")