In [84]:
import os
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
import requests
import pandas as pd

## Web Scraping Historical Team Data

In [125]:
seasons = list(range(1996, 2023))
data_url = "https://www.basketball-reference.com/teams/{}/{}.html"
teams = ["ATL", "BOS", "BRK", "NJN", "CHO", "CHA", "CHH", "CHI", "CLE", "DAL", "DEN", "DET", "GSW", "HOU", "IND",
         "LAC", "LAL", "MEM", "VAN", "MIA", "MIL", "MIN", "NOP", "NOH", "NYK", "OKC", "SEA", "ORL",
         "PHI", "PHO", "POR", "SAC", "SAS", "TOR", "UTA", "WAS", "WSB"]

In [95]:
# instantiating (start) the webdriver
driver = webdriver.Chrome(executable_path="/Users/caleb/Desktop/github/NBA/chromedriver_mac64")

  driver = webdriver.Chrome(executable_path="/Users/caleb/Desktop/github/NBA/chromedriver_mac64")


In [96]:
# scrape_season(team) will use the webdriver to scrape the seasons from 1996 to 2022 for each team
def scrape_season(team):
    for season in seasons:
        url = data_url.format(team, season)

        response = requests.get(url)

        # response.status_code returns the HTTP status code received from a server after sending an HTTP request
        # 200:OK, 404: Not Found, 500: Internal Server Error, 429: Too many requests
        if response.status_code == 200:

            # launch a new broswer and opens the given URL in the broswer instance
            driver.get(url)

            # pauses for 3 seconds
            time.sleep(3)
            
            # gets the HTML source code of the page
            page_source = driver.page_source

            # write the contents of the page (page_source) to a file in the DATA directory
            with open("DATA/{}_{}.html".format(team, season), "w+") as file:
                file.write(page_source)
        else:
            # pauses for 3 seconds
            time.sleep(3)
            continue

In [97]:
# scrape every team in the list of teams
for team in teams:
    print(team)
    scrape_season(team)

driver.quit()

LAC
LAL
MEM
VAN
MIA
MIL
MIN
NOP
NOH
NYK
OKC
SEA
ORL
PHI
PHO
POR
SAC
SAS
TOR
UTA
WAS
WSB
CHH


**We have 801 seasons from 30 teams across 27 seasons, 9 missing seasons are 7 from the New Orleans Pelicans/Hornets who didn't exist**<br>
**prior to 2003 and 2 from the Charlotte Hornets/Bobcats who didn't play between 2002-2004**

## Parsing the Data

In [153]:
# parse_team_stats will create a dataframe containing the team's stats for every season
def parse_team_stats(team):
    df = []
    
    for season in seasons:
        file_path = "DATA/{}_{}.html".format(team, season)

        # checking if file exists
        if os.path.exists(file_path):

            # get the contents of the file containing the team's season data
            with open(file_path) as file:
                team_data = file.read()

            # creates a new object using the BeautifulSoup class and allows for parsing of the team_data
            soup = BeautifulSoup(team_data, "html.parser")

            # finds the table with the specific id
            team_stats_table = soup.find(id="div_team_and_opponent")

            # turns the HTML table to a pandas dataframe
            team_stats = pd.read_html(str(team_stats_table))[0]


            # extract the first row from the dataframe
            team_stats = team_stats.drop(team_stats.index.difference([1]))

            # delete specific columns
            del team_stats["Unnamed: 0"]
            del team_stats["G"]

            # adding some columns
            team_stats["Year"] = season
            team_stats["Team"] = team

            df.append(team_stats)

        else:
            continue

    df = pd.concat(df)
    
    return df

In [154]:
# loop through each team and call parse_team_stats
all_teams = []
for team in teams:
    team_stats_df = parse_team_stats(team)
    all_teams.append(team_stats_df)

In [155]:
# combine the elements in the all_teams_df list into one dataframe
df = pd.concat(all_teams)

# resetting the dataframe index
df = df.reset_index(drop=True)

In [156]:
df

Unnamed: 0,MP,FG,FGA,FG%,3P,3PA,3P%,2P,2PA,2P%,...,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Year,Team
0,240.3,41.5,88.3,0.470,12.9,34.4,0.374,28.6,53.9,0.531,...,33.9,44.0,24.6,7.2,4.2,11.9,18.7,113.9,1996,ATL
1,240.3,41.5,88.3,0.470,12.9,34.4,0.374,28.6,53.9,0.531,...,33.9,44.0,24.6,7.2,4.2,11.9,18.7,113.9,1997,ATL
2,240.3,41.5,88.3,0.470,12.9,34.4,0.374,28.6,53.9,0.531,...,33.9,44.0,24.6,7.2,4.2,11.9,18.7,113.9,1998,ATL
3,240.3,41.5,88.3,0.470,12.9,34.4,0.374,28.6,53.9,0.531,...,33.9,44.0,24.6,7.2,4.2,11.9,18.7,113.9,1999,ATL
4,240.3,41.5,88.3,0.470,12.9,34.4,0.374,28.6,53.9,0.531,...,33.9,44.0,24.6,7.2,4.2,11.9,18.7,113.9,2000,ATL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
794,241.0,41.5,90.9,0.457,12.0,32.6,0.368,29.5,58.3,0.506,...,31.9,42.0,25.0,8.0,4.3,14.2,22.7,114.4,2020,WAS
795,241.7,43.2,90.9,0.475,10.2,29.0,0.351,33.0,61.9,0.533,...,35.5,45.2,25.5,7.3,4.1,14.4,21.6,116.6,2021,WAS
796,241.8,40.6,86.0,0.472,10.5,30.6,0.342,30.1,55.4,0.543,...,34.1,43.1,25.0,6.4,5.0,13.1,18.8,108.6,2022,WAS
797,241.5,39.0,80.7,0.484,6.0,14.8,0.407,33.0,65.9,0.501,...,28.4,39.7,22.1,7.2,6.2,16.2,24.2,102.5,1996,WSB


In [3]:
df.to_csv("CSV/team_stats.csv")