In [11]:
import csv
import json
import time


import random


from selenium import webdriver


from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.support.ui import Select
from time import sleep

# Teams

In [12]:
# Start the browser first so the `finally` clause below always has a driver to close.
driver = webdriver.Chrome()

# Teams landing page to scrape, and the CSV that receives one row per team.
nba_url = "https://www.nba.com/teams"
csv_file_name = "teams_NBA.csv"

try:
    with open(csv_file_name, mode="w", newline="", encoding="utf-8") as out_file:
        team_writer = csv.writer(out_file)
        team_writer.writerow(
            ["Division", "Team Name", "Team Profile", "Team Stats", "Team Schedule"]
        )

        driver.get(nba_url)

        # Block (up to 10 s) until the divisions wrapper is present in the DOM.
        wrapper_present = EC.presence_of_element_located(
            (By.CLASS_NAME, "TeamDivisions_wrapper__5_SVo")
        )
        WebDriverWait(driver, 10).until(wrapper_present)

        division_elements = driver.find_elements(
            By.CLASS_NAME, "TeamDivisions_division__u3KUS"
        )
        for division_el in division_elements:
            division_name = division_el.find_element(
                By.CLASS_NAME, "TeamDivisions_divisionName__KFlSk"
            ).text
            print(f"Division: {division_name}")

            team_elements = division_el.find_elements(
                By.CLASS_NAME, "TeamFigure_tf__jA5HW"
            )
            for team_el in team_elements:
                team_name = team_el.find_element(
                    By.CLASS_NAME, "TeamFigure_tfMainLink__OPLFu"
                ).text

                # The first three links are read as Profile, Stats and Schedule
                # (in that order); any that are missing are recorded as "N/A".
                link_elements = team_el.find_elements(
                    By.CLASS_NAME, "TeamFigureLink_teamFigureLink__uqnNO"
                )
                hrefs = [link.get_attribute("href") for link in link_elements[:3]]
                hrefs += ["N/A"] * (3 - len(hrefs))
                profile_url, stats_url, schedule_url = hrefs

                print(f"  Team: {team_name}")
                print(f"    Profile: {profile_url}")
                print(f"    Stats: {stats_url}")
                print(f"    Schedule: {schedule_url}")

                team_writer.writerow(
                    [division_name, team_name, profile_url, stats_url, schedule_url]
                )

finally:
    # Close the browser whether or not the scrape succeeded.
    driver.quit()

Division: ATLANTIC
  Team: Boston Celtics
    Profile: https://www.nba.com/team/1610612738/celtics/
    Stats: https://www.nba.com/stats/team/1610612738
    Schedule: https://www.nba.com/celtics/schedule
  Team: Brooklyn Nets
    Profile: https://www.nba.com/team/1610612751/nets/
    Stats: https://www.nba.com/stats/team/1610612751
    Schedule: https://www.nba.com/nets/schedule
  Team: New York Knicks
    Profile: https://www.nba.com/team/1610612752/knicks/
    Stats: https://www.nba.com/stats/team/1610612752
    Schedule: https://www.nba.com/knicks/schedule
  Team: Philadelphia 76ers
    Profile: https://www.nba.com/team/1610612755/sixers/
    Stats: https://www.nba.com/stats/team/1610612755
    Schedule: https://www.nba.com/sixers/schedule
  Team: Toronto Raptors
    Profile: https://www.nba.com/team/1610612761/raptors/
    Stats: https://www.nba.com/stats/team/1610612761
    Schedule: https://www.nba.com/raptors/schedule
Division: CENTRAL
  Team: Chicago Bulls
    Profile: https://

### Team Profiles

In [17]:
# Build the URL lists the scraping cells below iterate over, from the CSV
# written by the teams cell.
csv_file_name = "teams_NBA.csv"
team_profile_list = []
team_stats = []

# Query strings appended to each team's stats URL, one per season to scrape.
url_season = [
    "?Season=2024-25",
    "?Season=2023-24",
    "?Season=2022-23",
    "?Season=2021-22",
    "?Season=2020-21",
]

# Development switch: when not None, `team_stats` is replaced by this list so a
# run only touches these pages. Set to None to scrape every team and season.
TEAM_STATS_OVERRIDE = ["https://www.nba.com/stats/team/1610612761?Season=2021-22"]

with open(csv_file_name, mode="r", encoding="utf-8") as csv_file:
    reader = csv.DictReader(csv_file)
    for row in reader:
        profile_url = row["Team Profile"]
        stats_url = row["Team Stats"]

        # The teams cell writes "N/A" when a link was missing; skip those so we
        # never build bogus URLs such as "N/A?Season=2024-25".
        if profile_url and profile_url != "N/A":
            team_profile_list.append(profile_url)
        if stats_url and stats_url != "N/A":
            team_stats.extend(stats_url + season_query for season_query in url_season)

if TEAM_STATS_OVERRIDE is not None:
    team_stats = list(TEAM_STATS_OVERRIDE)

In [18]:
# Function to process team profiles in batches (2 at a time)
def process_batches(team_profile_list, batch_size=2, type=1):
    """Scrape a list of URLs in batches and return all scraped rows.

    Args:
        team_profile_list: URLs to scrape, processed in order.
        batch_size: Number of URLs per batch.
        type: Selects the scraper applied to every URL:
            1 -> scrape_roster(url, url[-7:])  (last 7 chars are passed as the season)
            2 -> scrape_retired(url)
            3 -> scrape_hall_of_fame(url)
            4 -> scrape_all_time_record(url)
            5 -> scrape_achievements(url)
            Any other value scrapes nothing and logs a message per URL.

    Returns:
        A flat list with the rows returned by the scraper for every URL.
        A URL whose scraper raises is logged and contributes no rows.
    """
    # Lambdas defer the lookup of each scrape_* name until it is actually
    # called, so this function works even when only some scrapers are defined.
    scrapers = {
        1: lambda url: scrape_roster(url, url[-7:]),
        2: lambda url: scrape_retired(url),
        3: lambda url: scrape_hall_of_fame(url),
        4: lambda url: scrape_all_time_record(url),
        5: lambda url: scrape_achievements(url),
    }
    scraper = scrapers.get(type)

    all_rosters = []
    total = len(team_profile_list)

    for start in range(0, total, batch_size):
        batch = team_profile_list[start : start + batch_size]
        batch_data = []

        for url in batch:
            try:
                print(f"Scraping data for {url}...")
                if scraper is None:
                    print(f"Unknown scrape type {type!r}; skipping {url}")
                else:
                    rows = scraper(url)
                    if type == 1:
                        print(rows)
                    batch_data.extend(rows)
            except Exception as e:
                print(f"Error scraping {url}: {e}")

            # Random pause between requests to avoid overwhelming the server.
            sleep(random.uniform(2, 5))

        all_rosters.extend(batch_data)

        # Only wait when another batch follows; pausing after the last one
        # just delays the caller.
        if start + batch_size < total:
            print(f"Batch of {batch_size} teams processed. Waiting before next batch...")
            time.sleep(10)

    return all_rosters

### Roster

In [19]:
# Browser session used, as a module-level global, by scrape_roster in this cell.
driver = webdriver.Chrome()


# Function to scrape the team roster data


def scrape_roster(url, season):
    """Scrape the roster table from a team stats page.

    Uses the module-level `driver`.

    Args:
        url: Page to load.
        season: Value appended unchanged as the last field of every row.

    Returns:
        One list per table row that has `<td>` cells: the stripped text of
        every cell, followed by the team name, the href of the first link in
        the first cell (None when there is no link) and `season`.
        Returns [] when the team header or the roster table does not appear.
    """
    driver.get(url)

    time.sleep(2)  # Wait for the page to load completely

    # Wait for the team name section to load. `except Exception` keeps the
    # best-effort behaviour but lets KeyboardInterrupt stop a long run.
    try:
        team_name_section = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "TeamHeader_name__MmHlP"))
        )
    except Exception:
        print(f"Unable to load the team name for {url}")
        return []

    # The team name is the non-empty text of the header's <div> children.
    team_name_parts = team_name_section.find_elements(By.TAG_NAME, "div")
    stripped_parts = (part.text.strip() for part in team_name_parts)
    team_name = " ".join(text for text in stripped_parts if text)

    # Wait for the roster table body to load.
    try:
        rows_section = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "Crom_body__UYOcU"))
        )
    except Exception:
        print(f"Unable to load the roster section for {team_name}")
        return []

    data = []
    for row in rows_section.find_elements(By.TAG_NAME, "tr"):
        columns = row.find_elements(By.TAG_NAME, "td")

        # Rows without <td> cells carry no player data.
        if not columns:
            continue

        player_link_elements = columns[0].find_elements(By.TAG_NAME, "a")
        player_link = (
            player_link_elements[0].get_attribute("href")
            if player_link_elements
            else None
        )

        row_data = [col.text.strip() for col in columns]
        row_data.extend([team_name, player_link, season])
        data.append(row_data)

    return data


# Usage of the function


try:
    # Scrape every URL in `team_stats`; type=1 selects scrape_roster.
    all_rosters = process_batches(team_stats, batch_size=2, type=1)

    # Write the data to CSV
    csv_file_name = "nba_roster_1.csv"

    header = [
        "Player",
        "#",
        "Pos",
        "Height",
        "Weight",
        "Birthdate",
        "Age",
        "Exp",
        "School",
        "How Acquired",
        "Team Name",
        "Player Link",
        "Season",
    ]

    with open(csv_file_name, mode="w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)  # Write header row
        writer.writerows(all_rosters)  # Write roster rows

    print(f"Roster data saved to {csv_file_name}")
finally:
    # Close the browser even when the cell is interrupted or fails part-way.
    driver.quit()

Scraping data for https://www.nba.com/stats/team/1610612761?Season=2021-22...
[['Armoni Brooks', '#1', 'G', '6-3', '195 lbs', 'JUN 05, 1998', '24', '1', 'Houston', '', 'TORONTO RAPTORS', 'https://www.nba.com/stats/player/1629717/', '2021-22'], ['OG Anunoby', '#3', 'F', '6-7', '232 lbs', 'JUL 17, 1997', '24', '4', 'Indiana', '', 'TORONTO RAPTORS', 'https://www.nba.com/stats/player/1628384/', '2021-22'], ['Scottie Barnes', '#4', 'F', '6-7', '225 lbs', 'AUG 01, 2001', '20', 'R', 'Florida State', '#4 Pick in 2021 Draft', 'TORONTO RAPTORS', 'https://www.nba.com/stats/player/1630567/', '2021-22'], ['Precious Achiuwa', '#5', 'F', '6-8', '225 lbs', 'SEP 19, 1999', '22', '1', 'Memphis', '', 'TORONTO RAPTORS', 'https://www.nba.com/stats/player/1630173/', '2021-22'], ['Justin Champagnie', '#11', 'G-F', '6-6', '206 lbs', 'JUN 29, 2001', '21', 'R', 'Pittsburgh', '', 'TORONTO RAPTORS', 'https://www.nba.com/stats/player/1630551/', '2021-22'], ['David Johnson', '#13', 'G', '6-4', '203 lbs', 'FEB 26, 2

### RETIRED NUMBERS

In [None]:
# `service` and `chrome_options` are not defined anywhere in this notebook, so
# the original call raised NameError on a fresh kernel. Start Chrome with its
# defaults, as the earlier cells do.
driver = webdriver.Chrome()


# Function to scrape a team's retired numbers


def scrape_retired(url):
    """Scrape the retired-numbers table from a team profile page.

    Uses the module-level `driver`.

    Args:
        url: Page to load.

    Returns:
        One list per data row:
        [team name, player link or None, player, jersey #, position,
         seasons with team, year of induction].
        The first table row is treated as the header and skipped, as are rows
        with fewer than five cells. Returns [] when the team header or the
        retired-numbers section does not appear.
    """
    driver.get(url)

    time.sleep(5)  # Wait for the page to load completely

    # Wait for the team name section to load. `except Exception` keeps the
    # best-effort behaviour but lets KeyboardInterrupt stop a long run.
    try:
        team_name_section = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "TeamHeader_name__MmHlP"))
        )
        team_name_parts = team_name_section.find_elements(By.TAG_NAME, "div")
        team_name = " ".join(
            [part.text.strip() for part in team_name_parts if part.text.strip()]
        )
    except Exception:
        print(f"Unable to load the team name for {url}")
        return []

    # Wait for the retired-numbers section to load.
    try:
        rows_section = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.CLASS_NAME, "TeamRetired_content__nb7Qt")
            )
        )
        rows = rows_section.find_elements(By.XPATH, ".//tr")
    except Exception:
        print(f"Unable to load the retired numbers section for {team_name}")
        return []

    data = []

    for row in rows[1:]:  # Skip header row
        columns = row.find_elements(By.TAG_NAME, "td")

        # All five cells are indexed below; a shorter row would otherwise
        # raise IndexError and discard every row already collected.
        if len(columns) < 5:
            continue

        # find_elements returns [] when the cell has no link.
        links = columns[1].find_elements(By.TAG_NAME, "a")
        player_link = links[0].get_attribute("href") if links else None

        data.append(
            [
                team_name,  # Team name
                player_link,  # Player link
                columns[1].text.strip(),  # Player name
                columns[0].text.strip(),  # Jersey #
                columns[2].text.strip(),  # Position
                columns[3].text.strip(),  # Seasons with team
                columns[4].text.strip(),  # Year of induction
            ]
        )

    return data


try:
    # Scrape every team profile; type=2 selects scrape_retired.
    all_retired = process_batches(team_profile_list, batch_size=2, type=2)

    # Write the data to CSV
    csv_file_name = "nba_retired.csv"

    header = [
        "Team Name",
        "Player Link",
        "Player",
        "#",
        "Pos",
        "Seasons With Team",
        "Year of Induction",
    ]

    with open(csv_file_name, mode="w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)  # Write header row
        writer.writerows(all_retired)  # Write retired-number rows

    print(f"Retired numbers data saved to {csv_file_name}")
finally:
    # Close the browser even when the cell is interrupted or fails part-way.
    driver.quit()

### Hall of Fame

In [None]:
# `service` and `chrome_options` are not defined anywhere in this notebook, so
# the original call raised NameError on a fresh kernel. Start Chrome with its
# defaults, as the earlier cells do.
driver = webdriver.Chrome()


# Function to scrape Hall of Fame data


def scrape_hall_of_fame(url):
    """Scrape the Hall of Fame table from a team profile page.

    Uses the module-level `driver`.

    Args:
        url: Page to load.

    Returns:
        One list per data row:
        [team name, player link or None, player, position,
         seasons with team, year of induction].
        The first table row is treated as the header and skipped, as are rows
        with fewer than four cells. Returns [] when the team header or the
        Hall of Fame section does not appear.
    """
    driver.get(url)

    time.sleep(2)  # Wait for the page to load completely

    # Wait for the team name section to load. `except Exception` keeps the
    # best-effort behaviour but lets KeyboardInterrupt stop a long run.
    try:
        team_name_section = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "TeamHeader_name__MmHlP"))
        )
        team_name_parts = team_name_section.find_elements(By.TAG_NAME, "div")
        team_name = " ".join(
            [part.text.strip() for part in team_name_parts if part.text.strip()]
        )
    except Exception:
        print(f"Unable to load the team name for {url}")
        return []

    # Wait for the Hall of Fame section to load.
    try:
        hall_of_fame_section = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.CLASS_NAME, "TeamHallOfFame_content__IZSl2")
            )
        )
        rows = hall_of_fame_section.find_elements(By.XPATH, ".//tr")
    except Exception:
        print(f"Unable to load the Hall of Fame section for {team_name}")
        return []

    data = []

    for row in rows[1:]:  # Skip header row
        columns = row.find_elements(By.TAG_NAME, "td")

        # All four cells are indexed below; a shorter row would otherwise
        # raise IndexError and discard every row already collected.
        if len(columns) < 4:
            continue

        # find_elements returns [] when the cell has no link.
        links = columns[0].find_elements(By.TAG_NAME, "a")
        player_link = links[0].get_attribute("href") if links else None

        data.append(
            [
                team_name,  # Team name
                player_link,  # Player link
                columns[0].text.strip(),  # Player name
                columns[1].text.strip(),  # Position
                columns[2].text.strip(),  # Seasons with team
                columns[3].text.strip(),  # Year of induction
            ]
        )

    return data


try:
    # Scrape every team profile; type=3 selects scrape_hall_of_fame.
    all_hall_of_fame = process_batches(team_profile_list, batch_size=2, type=3)

    # Write the data to CSV
    csv_file_name = "nba_hall_of_fame.csv"

    header = [
        "Team Name",
        "Player Link",
        "Player",
        "Pos",
        "Seasons With Team",
        "Year of Induction",
    ]

    with open(csv_file_name, mode="w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)  # Write header row
        writer.writerows(all_hall_of_fame)  # Write Hall of Fame rows

    print(f"Hall of Fame data saved to {csv_file_name}")
finally:
    # Close the browser even when the cell is interrupted or fails part-way.
    driver.quit()

### All-time records

In [None]:
# `service` and `chrome_options` are not defined anywhere in this notebook, so
# the original call raised NameError on a fresh kernel. Start Chrome with its
# defaults, as the earlier cells do.
driver = webdriver.Chrome()


# Function to scrape All-Time Record data


def scrape_all_time_record(url):
    """Scrape the all-time records table from a team profile page.

    Uses the module-level `driver`.

    Args:
        url: Page to load.

    Returns:
        One list per data row:
        [team name, record type, player, player link or None, stat value].
        Rows with fewer than three `<td>` cells are skipped. Returns [] when
        the team header or the records table does not appear.
    """
    driver.get(url)

    time.sleep(5)  # Wait for the page to load completely

    # Wait for the team name section to load. `except Exception` keeps the
    # best-effort behaviour but lets KeyboardInterrupt stop a long run.
    try:
        team_name_section = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "TeamHeader_name__MmHlP"))
        )
        team_name_parts = team_name_section.find_elements(By.TAG_NAME, "div")
        team_name = " ".join(
            [part.text.strip() for part in team_name_parts if part.text.strip()]
        )
    except Exception:
        print(f"Unable to load the team name for {url}")
        return []

    # Wait for the All-Time Record section to load.
    try:
        records_section = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "TeamRecords_table__0iapO"))
        )
        rows = records_section.find_elements(By.XPATH, ".//tr")
    except Exception:
        print(f"Unable to load the All-Time Record section for {team_name}")
        return []

    data = []

    for row in rows:
        columns = row.find_elements(By.TAG_NAME, "td")

        # Three cells are indexed below; a shorter row would otherwise raise
        # IndexError and discard every row already collected.
        if len(columns) < 3:
            continue

        # find_elements returns [] when the cell has no link.
        links = columns[1].find_elements(By.TAG_NAME, "a")
        player_link = links[0].get_attribute("href") if links else None

        data.append(
            [
                team_name,  # Team name
                columns[0].text.strip(),  # Record type (e.g., "Total Points")
                columns[1].text.strip(),  # Player name
                player_link,  # Player link
                columns[2].text.strip(),  # Stat value
            ]
        )

    return data


try:
    # Scrape every team profile; type=4 selects scrape_all_time_record.
    all_time_records = process_batches(team_profile_list, batch_size=2, type=4)

    # Write the data to CSV
    csv_file_name = "nba_all_time_records.csv"

    header = ["Team Name", "Record Type", "Player", "Player Link", "Stat Value"]

    with open(csv_file_name, mode="w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)  # Write header row
        writer.writerows(all_time_records)  # Write All-Time Record rows

    print(f"All-Time Record data saved to {csv_file_name}")
finally:
    # Close the browser even when the cell is interrupted or fails part-way.
    driver.quit()

### Achievements

In [None]:
# `service` and `chrome_options` are not defined anywhere in this notebook, so
# the original call raised NameError on a fresh kernel. Start Chrome with its
# defaults, as the earlier cells do.
driver = webdriver.Chrome()


def scrape_achievements(url):
    """Scrape the awards groups from a team profile page.

    Uses the module-level `driver`.

    Args:
        url: Page to load.

    Returns:
        One `[team name, heading, item text]` list for every list item found
        under every awards group. Groups without a heading are skipped.
        Returns [] when the awards groups or the team header do not appear.
    """
    driver.get(url)

    time.sleep(2)  # Wait for the page to load completely

    # Wait for the Achievements section to load.
    try:
        awards_sections = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (By.CLASS_NAME, "TeamAwards_group__XU0o9")
            )
        )
    except TimeoutException:
        print(f"Unable to load the Achievements section for {url}")
        return []

    # Wait for the team name section to load. `except Exception` keeps the
    # best-effort behaviour but lets KeyboardInterrupt stop a long run.
    try:
        team_name_section = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "TeamHeader_name__MmHlP"))
        )
        team_name_parts = team_name_section.find_elements(By.TAG_NAME, "div")
        team_name = " ".join(
            [part.text.strip() for part in team_name_parts if part.text.strip()]
        )
    except Exception:
        print(f"Unable to load the team name for {url}")
        return []

    data = []

    try:
        for group in awards_sections:
            # Heading of the group (e.g., "Championship Wins").
            try:
                heading = group.find_element(
                    By.CLASS_NAME, "TeamAwards_heading__BvLNE"
                ).text.strip()
            except NoSuchElementException:
                print("Heading not found in group.")
                continue

            # find_elements returns an empty list rather than raising, so the
            # "nothing found" case has to be checked explicitly.
            years_list = group.find_elements(
                By.CLASS_NAME, "TeamAwards_listItem__rb4hz"
            )
            if not years_list:
                print(f"No years found for {heading}.")
                continue

            for year in years_list:
                data.append([team_name, heading, year.text.strip()])

    except Exception as e:
        # Best effort: keep whatever was collected before the failure.
        print(f"Error while processing awards: {e}")

    return data


try:
    # Process the team profiles; type=5 selects scrape_achievements.
    all_achievements = process_batches(team_profile_list, batch_size=2, type=5)

    # Write the data to CSV
    csv_file_name = "nba_achievements.csv"

    header = ["Team Name", "Achievement Type", "Year"]

    with open(csv_file_name, mode="w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)  # Write header row
        writer.writerows(all_achievements)  # Write achievement rows

    print(f"Achievement data saved to {csv_file_name}")
finally:
    # Close the browser even when the cell is interrupted or fails part-way.
    driver.quit()

### Team Stats

In [None]:
# Point each stats URL at the team's "/traditional" page.
# - Any "?Season=..." query is dropped first: appending the path segment after
#   the query string would make it part of the query value, and
#   scrape_all_years walks every season in the page's dropdown anyway.
# - URLs are de-duplicated (order kept) and an existing "/traditional" suffix
#   is not repeated, so re-running this cell does not change the result.
traditional_urls = []
for team_stat in team_stats:
    base_url = team_stat.split("?", 1)[0].rstrip("/")
    if not base_url.endswith("/traditional"):
        base_url += "/traditional"
    if base_url not in traditional_urls:
        traditional_urls.append(base_url)
team_stats = traditional_urls

team_stats

In [None]:
# `service` and `chrome_options` are not defined anywhere in this notebook, so
# the original call raised NameError on a fresh kernel. Start Chrome with its
# defaults, as the earlier cells do.
driver = webdriver.Chrome()


# Function to scrape the table data for a given year


def scrape_table_data():
    """Read the stats table currently shown by the module-level `driver`.

    Returns:
        A `(header_data, table_data)` tuple. `header_data` holds one
        `(field, title)` pair per `<th>` in the table head, taken from the
        attributes of those names. `table_data` holds one list per `<tr>` in
        the table body with the stripped text of each `<td>`.
    """
    table_locator = (By.CLASS_NAME, "Crom_table__p1iZz")

    # Block (up to 10 s) until the table is in the DOM, then grab it.
    WebDriverWait(driver, 10).until(EC.presence_of_element_located(table_locator))
    stats_table = driver.find_element(*table_locator)

    # Column descriptors, e.g. ("GP", "Game Played").
    head_cells = stats_table.find_element(By.TAG_NAME, "thead").find_elements(
        By.TAG_NAME, "th"
    )
    header_data = [
        (cell.get_attribute("field"), cell.get_attribute("title"))
        for cell in head_cells
    ]

    # Cell text for every body row.
    body_rows = stats_table.find_element(By.TAG_NAME, "tbody").find_elements(
        By.TAG_NAME, "tr"
    )
    table_data = [
        [cell.text.strip() for cell in body_row.find_elements(By.TAG_NAME, "td")]
        for body_row in body_rows
    ]

    return header_data, table_data


# Function to get the team name


def get_team_name():
    """Return the team name shown on the page the module-level `driver` is on.

    The name is the non-empty, stripped text of the `<div>` children of the
    team header, joined by single spaces.

    Returns:
        The team name, or "Unknown Team" when the header cannot be read.
    """
    try:
        team_name_section = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "TeamHeader_name__MmHlP"))
        )
        team_name_parts = team_name_section.find_elements(By.TAG_NAME, "div")
        stripped_parts = (part.text.strip() for part in team_name_parts)
        return " ".join(text for text in stripped_parts if text)

    # `except Exception` keeps the fallback for scraping errors but lets
    # KeyboardInterrupt stop a long run.
    except Exception:
        print("Unable to load the team name")
        return "Unknown Team"


# Main function to scrape all years


def scrape_all_years(url):
    """Scrape the stats table for every option of the page's dropdown.

    Uses the module-level `driver`, `get_team_name` and `scrape_table_data`.

    Args:
        url: Page to load.

    Returns:
        A `(header_data, all_data)` tuple. `header_data` comes from the last
        option that was scraped successfully. `all_data` contains every
        scraped row, prefixed with the team name and the option value.

    Raises:
        RuntimeError: when no option could be scraped at all (previously this
            surfaced as an UnboundLocalError on `header_data`).
    """
    driver.get(url)

    time.sleep(2)  # Wait for the page to load

    team_name = get_team_name()
    print(f"Team Name: {team_name}")

    # Wait for the dropdown to appear
    dropdown = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "DropDown_select__4pIg9"))
    )
    select = Select(dropdown)

    # Options without a value are skipped.
    option_values = (option.get_attribute("value") for option in select.options)
    all_years = [value for value in option_values if value]

    header_data = None
    all_data = []

    for year in all_years:
        print(f"Scraping data for year: {year}")

        select.select_by_value(year)

        time.sleep(2)  # Allow the page to refresh with the new data

        try:
            header_data, table_data = scrape_table_data()

            # Prefix each row with the team name and the year.
            for row in table_data:
                row.insert(0, year)
                row.insert(0, team_name)

            all_data.extend(table_data)

        except Exception as e:
            print(f"Error scraping data for year {year}: {e}")

    if header_data is None:
        raise RuntimeError(f"No table data could be scraped for {url}")

    return header_data, all_data


# Initialize CSV file only once


csv_file_name = "nba_team_stats.csv"

try:
    with open(csv_file_name, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)

        # The header row is written once, after the first successful scrape
        # has told us the column names.
        header_written = False

        for url in team_stats:  # `team_stats` should be a list of URLs
            print(f"Scraping data for URL: {url}")

            try:
                header_data, all_data = scrape_all_years(url)

                if not header_written:
                    writer.writerow(
                        ["Team Name", "Year"]
                        + [f"{field} - {title}" for field, title in header_data]
                    )
                    header_written = True

                writer.writerows(all_data)

            except Exception as e:
                # Best effort: log the failure and continue with the next URL.
                print(f"Error scraping data for {url}: {e}")

    print(f"Data saved to {csv_file_name}")
finally:
    # Close the browser even when the cell is interrupted or fails part-way.
    driver.quit()