# **[Data4Life] - Introduction to Data Science**
Topic ***NBA*** - Group ***16***

## **Team Members**

<center>

| No. | Name                   | Student ID |
|-----|------------------------|------------|
| 1   | Trần Nguyễn Nhật Cường | 22127048   |
| 2   | Huỳnh Tấn Đạt          | 22127059   |
| 3   | Nguyễn Công Tuấn       | 22127436   |  
| 4   | Trần Đăng Tuấn         | 22127438   |

</center>

## **Overview**

In this section, we will identify two specific URLs which are:
- [Official NBA Stats](https://www.nba.com/stats)
- [Basketball Reference](https://www.basketball-reference.com/)

Our team will crawl these sites to collect the data that supports our analysis. The targeted data types include:
- Team Statistics (Total): Comprehensive data summarizing team performance metrics.
- Player Statistics (Total): Aggregate performance data for individual players across relevant metrics.
- Player Information: Detailed player profiles, including demographic, biographical, and career-specific information.
- Rookie Player Statistics: Performance metrics specific to rookie players in the league.

These URLs will serve as primary data sources, enabling us to compile and analyze the necessary information effectively.

### **Import modules**

In [2]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.support.ui import Select
from selenium.webdriver.edge.service import Service
import pandas as pd
import time
import csv
import json
import time
import random

### **Data Collection**

#### **Official NBA Stats**

**Player's profile**

In [None]:
# Shared Chrome session: scrape_roster() below reads this module-level `driver`,
# and this cell calls driver.quit() after the roster CSV has been written.
driver = webdriver.Chrome()


# Function to scrape the team roster data
def scrape_roster(url, season):
    """Scrape one team's roster table from an NBA.com team page.

    Loads ``url`` in the module-level ``driver``, waits for the team header
    and for the roster table body, then reads every row of that table.

    Args:
        url: Team page URL to load.
        season: Season label appended verbatim to every returned row.

    Returns:
        A list of rows. Each row holds the stripped text of every ``<td>``
        cell in page order, followed by the team name, the player's profile
        link (``None`` when the first cell contains no anchor) and ``season``.
        An empty list is returned when either wait times out.
    """
    driver.get(url)

    # Fixed pause before the explicit waits below
    time.sleep(2)

    # Only a timeout means "section missing"; anything else (including
    # Ctrl+C, which the former bare `except:` swallowed) should propagate.
    try:
        team_name_section = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "TeamHeader_name__MmHlP"))
        )
    except TimeoutException:
        print(f"Unable to load the team name for {url}")
        return []

    # The team name is the non-empty text of the header's <div> children
    name_parts = (
        part.text.strip()
        for part in team_name_section.find_elements(By.TAG_NAME, "div")
    )
    team_name = " ".join(text for text in name_parts if text)

    try:
        rows_section = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "Crom_body__UYOcU"))
        )
    except TimeoutException:
        print(f"Unable to load the roster section for {team_name}")
        return []

    data = []
    for row in rows_section.find_elements(By.TAG_NAME, "tr"):
        columns = row.find_elements(By.TAG_NAME, "td")
        if not columns:
            continue  # rows without <td> cells carry no player data

        # The player's profile link, if any, lives in the first cell
        anchors = columns[0].find_elements(By.TAG_NAME, "a")
        player_link = anchors[0].get_attribute("href") if anchors else None

        row_data = [col.text.strip() for col in columns]
        row_data.extend([team_name, player_link, season])
        data.append(row_data)

    return data


# Scrape every team/season page and persist the combined roster as CSV.
# NOTE(review): process_batches and team_stats are not defined in this cell;
# they must already exist in the notebook session - confirm the run order.
csv_file_name = "nba_roster_1.csv"

header = [
    "Player",
    "#",
    "Pos",
    "Height",
    "Weight",
    "Birthdate",
    "Age",
    "Exp",
    "School",
    "How Acquired",
    "Team Name",
    "Player Link",
    "Season",
]

try:
    all_rosters = process_batches(team_stats, batch_size=2, type=1)

    with open(csv_file_name, mode="w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(header)  # Write header row
        writer.writerows(all_rosters)  # Write roster rows

    print(f"Roster data saved to {csv_file_name}")
finally:
    # Release the browser even when scraping or writing the file fails
    driver.quit()

**Player stat total**

In [None]:
# Setup for the league-leaders crawl: a fresh browser session, the page URL
# and one "?Season=..." query string per season to fetch.
driver = webdriver.Chrome()
url = "https://www.nba.com/stats/leaders"
url_season = [
    f"?Season={season}"
    for season in ("2024-25", "2023-24", "2022-23", "2021-22", "2020-21")
]

# Accumulator that the crawl loop extends with each season's rows
data = pd.DataFrame()

In [None]:
# Crawl every page of the league-leaders table for each season and append the
# rows to `data`. A season whose pagination widget cannot be read is skipped.
for season_query in url_season:
    driver.get(url + season_query)
    time.sleep(2)  # Allow the page to load

    # Wait for the pagination element to appear. The last word of its 4th
    # <div> is parsed as the number of pages to visit.
    try:
        pagination = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, '//div[contains(@class, "Pagination_content")]')
            )
        )
        page = int(pagination.find_element(By.XPATH, ".//div[4]").text.split(" ")[-1])
    except Exception as e:
        print(f"Pagination element not found: {e}")
        continue

    # Column names come from the table header row, plus a trailing "Season"
    headers = driver.find_element(By.XPATH, '//tr[contains(@class, "Crom_headers")]')
    columns = [th.text for th in headers.find_elements(By.TAG_NAME, "th")]
    columns.append("Season")

    season_label = season_query.split("=")[-1]

    # Rows are collected in a plain list and turned into a DataFrame once per
    # season; growing a DataFrame row by row copies it on every insert.
    season_rows = []

    for page_index in range(page):
        table = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, '//tbody[contains(@class, "Crom_body")]')
            )
        )

        for row in table.find_elements(By.XPATH, ".//tr"):
            row_data = [cell.text for cell in row.find_elements(By.XPATH, ".//td")]
            if row_data:  # Avoid empty rows
                row_data.append(season_label)
                season_rows.append(row_data)

        # Nothing to open after the final page, so skip the Next click there
        # instead of waiting for a button that leads nowhere.
        if page_index == page - 1:
            break

        try:
            button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(
                    (
                        By.XPATH,
                        './/button[@type="button" and @title="Next Page Button"]',
                    )
                )
            )
            # JavaScript click so that an overlay cannot intercept it
            driver.execute_script("arguments[0].click();", button)
            time.sleep(3)  # Wait for the next page to load
        except Exception as e:
            print(f"Pagination error: {e}")
            break

    # Append the season's data to the main DataFrame
    season_data = pd.DataFrame(season_rows, columns=columns)
    data = pd.concat([data, season_data], ignore_index=True)

In [None]:
# Save the accumulated leaderboard rows, then release the browser.
path = "../../Data_NBA_1/nba_stats.csv"
try:
    data.to_csv(path, index=False, header=True)
    print(f"Data saved to {path}")
finally:
    # Close the browser even if the CSV cannot be written (e.g. missing folder)
    driver.quit()

**Team's Profile**

In [None]:
# Target URL for NBA Teams page
nba_url = "https://www.nba.com/teams"

# CSV file name to store the team data
csv_file_name = "teams_NBA.csv"

# Start a dedicated browser session for this cell: the earlier cells end by
# calling driver.quit(), after which that `driver` can no longer load pages.
driver = webdriver.Chrome()

try:
    with open(csv_file_name, mode="w", newline="", encoding="utf-8") as csv_file:
        csv_writer = csv.writer(csv_file)

        # Write the header row for the CSV
        csv_writer.writerow(
            ["Division", "Team Name", "Team Profile", "Team Stats", "Team Schedule"]
        )

        driver.get(nba_url)

        # Wait for the divisions wrapper to be present before reading the page
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.CLASS_NAME, "TeamDivisions_wrapper__5_SVo")
            )
        )

        # Locate all team divisions
        divisions = driver.find_elements(By.CLASS_NAME, "TeamDivisions_division__u3KUS")

        for division in divisions:
            division_name = division.find_element(
                By.CLASS_NAME, "TeamDivisions_divisionName__KFlSk"
            ).text
            print(f"Division: {division_name}")

            # Locate teams within the division
            teams = division.find_elements(By.CLASS_NAME, "TeamFigure_tf__jA5HW")
            for team in teams:
                team_name = team.find_element(
                    By.CLASS_NAME, "TeamFigure_tfMainLink__OPLFu"
                ).text

                # The first three links are read as Profile, Stats and
                # Schedule (in that order); missing ones become "N/A".
                team_links = team.find_elements(
                    By.CLASS_NAME, "TeamFigureLink_teamFigureLink__uqnNO"
                )
                link_urls = [link.get_attribute("href") for link in team_links[:3]]
                link_urls += ["N/A"] * (3 - len(link_urls))

                # The "_url" suffix keeps these loop variables from rebinding
                # `team_stats`, the list of season URLs that the roster cell
                # passes to process_batches().
                team_profile_url, team_stats_url, team_schedule_url = link_urls

                print(f"  Team: {team_name}")
                print(f"    Profile: {team_profile_url}")
                print(f"    Stats: {team_stats_url}")
                print(f"    Schedule: {team_schedule_url}")

                # Write the team data into the CSV
                csv_writer.writerow(
                    [
                        division_name,
                        team_name,
                        team_profile_url,
                        team_stats_url,
                        team_schedule_url,
                    ]
                )

finally:
    driver.quit()  # Ensure driver quits even if an error occurs

**Team's stat total**

In [None]:
# Build the crawl inputs from the teams CSV: one profile URL per team and one
# stats URL per (team, season) pair.
csv_file_name = "teams_NBA.csv"

url_season = [
    "?Season=2024-25",
    "?Season=2023-24",
    "?Season=2022-23",
    "?Season=2021-22",
    "?Season=2020-21",
]

team_profile_list = []
team_stats = []

with open(csv_file_name, mode="r", encoding="utf-8") as csv_file:
    for team_row in csv.DictReader(csv_file):
        team_profile_list.append(team_row["Team Profile"])
        # Season query strings are appended to the team's stats URL, in list order
        team_stats.extend(team_row["Team Stats"] + query for query in url_season)

#### **Basketball Reference**


In [None]:
# Root of every Basketball Reference URL built below (note the trailing slash).
BASE_URL = "https://www.basketball-reference.com/"
# Raw string: the Windows path is full of backslashes, and "\P", "\W" and "\m"
# are invalid escape sequences that newer Python versions warn about.
service = Service(r"C:\Program Files\WebDriver\msedgedriver.exe")

**Player's profile**

In [None]:
def get_players():
    """Crawl the Basketball Reference player index for the letters a-z.

    A separate Edge session is opened (and always closed) for each letter
    page. Rows whose name link sits inside ``<strong>`` are recorded with
    ``Retired = 0``; rows with a plain name link get ``Retired = 1``; rows
    without any name link are skipped.

    If reading a page fails, the error is printed and the crawl stops; the
    rows collected so far are still returned.

    Returns:
        DataFrame with the columns Name, From, To, Pos, Height, Weight,
        Birth Date, Colleges and Retired. The same frame is also written to
        ``players_data.csv``.
    """
    data = []
    for code in range(ord('a'), ord('z') + 1):
        letter = chr(code)
        print(f"Crawling in {letter}")

        # BASE_URL already ends with "/", so strip it to avoid "//players/..."
        url = f"{BASE_URL.rstrip('/')}/players/{letter}"

        driver = webdriver.Edge(service=service)
        try:
            driver.get(url)

            try:
                players = driver.find_elements(By.XPATH, "//table[@id='players']/tbody/tr")
                for player in players:
                    # find_elements returns [] instead of raising, so no
                    # exception handling is needed to probe for the link.
                    retired = 0
                    name_links = player.find_elements(By.XPATH, "./th/strong/a")
                    if not name_links:
                        retired = 1
                        name_links = player.find_elements(By.XPATH, "./th/a")
                    if not name_links:
                        continue
                    name = name_links[0].text.replace('*', '')

                    # td[1]..td[7]: From, To, Pos, Height, Weight, Birth Date, Colleges
                    cells = [
                        player.find_element(By.XPATH, f"./td[{position}]").text
                        for position in range(1, 8)
                    ]
                    data.append([name, *cells, retired])

            except Exception as e:
                print(e)
                break
        finally:
            # Close the browser on every path, including errors
            driver.quit()

    # Build the DataFrame and save it to a CSV file
    df = pd.DataFrame(data, columns=["Name", "From", "To", "Pos", "Height", "Weight", "Birth Date", "Colleges", "Retired"])
    df.to_csv("players_data.csv", index=False)

    return df

# Run the player-index crawl and save the result.
# NOTE(review): get_players() itself already writes the same frame to
# "players_data.csv", so this creates a second copy under another name -
# confirm that both files are wanted.
df = get_players()
df.to_csv("player_profile.csv", index=False)

**Player's total stats**

In [None]:
def get_player_links():
    """Collect profile URLs from the Basketball Reference player index.

    Visits the index page of every letter a-z and keeps the ``href`` of each
    row whose name link sits inside ``<strong>``; other rows are ignored.

    A separate Edge session is opened and closed for each letter. The
    previous version never created ``driver`` itself yet called
    ``driver.quit()`` inside the loop, so every letter after the first ran
    against a closed session.

    If reading a page fails, the error is printed and the crawl stops; the
    links collected so far are still returned.

    Returns:
        List of profile URLs in page order.
    """
    links = []
    for code in range(ord('a'), ord('z') + 1):
        letter = chr(code)
        print(f"Crawling in {letter}")

        # BASE_URL already ends with "/", so strip it to avoid "//players/..."
        url = f"{BASE_URL.rstrip('/')}/players/{letter}"

        driver = webdriver.Edge(service=service)
        try:
            driver.get(url)

            try:
                players = driver.find_elements(By.XPATH, "//table[@id='players']/tbody/tr")
                for player in players:
                    # find_elements returns [] when the row has no such link
                    anchors = player.find_elements(By.XPATH, "./th/strong/a")
                    if anchors:
                        links.append(anchors[0].get_attribute("href"))

            except Exception as e:
                print(e)
                break
        finally:
            # Close the browser on every path, including errors
            driver.quit()

    return links

# Collect the profile URLs and store them one per line in links.txt, the file
# that the stats crawlers read back.
player_active_links = get_player_links()
with open("links.txt", "w") as links_file:
    links_file.writelines(link + "\n" for link in player_active_links)

def get_player_stats():
    """Scrape the ``totals_stats`` table of every player listed in links.txt.

    Each line of ``links.txt`` is opened in its own Edge session, which is
    closed on every path. For every body row of the table the season
    (``<th>``) and the cells td[2], td[4] and td[5]..td[29] are read; rows
    missing any of those cells are skipped. The player name is read from the
    ``<h1>`` inside the second ``<div>`` of the ``#meta`` element.

    If a page cannot be processed (for example the table is absent), the
    error is printed and the crawl stops; rows collected so far are returned.

    Returns:
        DataFrame with the columns Season, Team, Pos, G ... PTS and Name.
    """
    # XPath positions of the <td> cells to keep, in output order:
    # td[2] (Team), td[4] (Pos), then td[5]..td[29] (G through PTS).
    td_positions = [2, 4, *range(5, 30)]
    data = []

    with open("links.txt", "r") as file:
        for line in file:
            url = line.strip()

            driver = webdriver.Edge(service=service)
            try:
                # Open the player's page
                driver.get(url)

                try:
                    # Read the rows of the "totals" table
                    totals_table = driver.find_element(By.XPATH, "//table[@id='totals_stats']")
                    rows = totals_table.find_elements(By.XPATH, ".//tbody/tr")

                    name = driver.find_element(By.ID, "meta").find_elements(By.TAG_NAME, "div")[1].find_element(By.TAG_NAME, "h1").text

                    for row in rows:
                        try:
                            record = [row.find_element(By.XPATH, "./th").text]
                            record.extend(
                                row.find_element(By.XPATH, f"./td[{position}]").text
                                for position in td_positions
                            )
                        except NoSuchElementException:
                            # Row does not have the full set of cells
                            continue
                        record.append(name)
                        data.append(record)

                except Exception as e:
                    print(e)
                    break
            finally:
                # Close the browser on every path, including errors
                driver.quit()

    # Build the DataFrame; "Name" is the last column
    df = pd.DataFrame(data, columns=["Season", "Team", "Pos", "G", "GS", "MP", "FG", "FGA", "FG%", "3P", "3PA", "3P%", "2P", "2PA", "2P%", "eFG%", "FT", "FTA", "FT%", "ORB", "DRB", "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS", "Name"])

    return df

# Crawl the totals table of every player in links.txt and save it without the index column.
df = get_player_stats()
df.to_csv("totals_data.csv", index=False)

**Player's per game stats**

In [None]:
def get_player_stats_pg():
    """Scrape the ``per_game_stats`` table of every player listed in links.txt.

    Each line of ``links.txt`` is opened in its own Edge session, which is
    closed on every path. For every body row of the table the season
    (``<th>``) and the cells td[2], td[4] and td[5]..td[29] are read; rows
    missing any of those cells are skipped. The player name is read from the
    ``<h1>`` inside the second ``<div>`` of the ``#meta`` element.

    If a page cannot be processed (for example the table is absent), the
    error is printed and the crawl stops; rows collected so far are returned.

    Returns:
        DataFrame with the columns Season, Team, Pos, G ... PTS and Name.
    """
    # XPath positions of the <td> cells to keep, in output order:
    # td[2] (Team), td[4] (Pos), then td[5]..td[29] (G through PTS).
    td_positions = [2, 4, *range(5, 30)]
    data = []

    with open("links.txt", "r") as file:
        for line in file:
            url = line.strip()

            driver = webdriver.Edge(service=service)
            try:
                # Open the player's page
                driver.get(url)

                try:
                    # Read the rows of the "per game" table
                    per_game_table = driver.find_element(By.XPATH, "//table[@id='per_game_stats']")
                    rows = per_game_table.find_elements(By.XPATH, ".//tbody/tr")

                    name = driver.find_element(By.ID, "meta").find_elements(By.TAG_NAME, "div")[1].find_element(By.TAG_NAME, "h1").text

                    for row in rows:
                        try:
                            record = [row.find_element(By.XPATH, "./th").text]
                            record.extend(
                                row.find_element(By.XPATH, f"./td[{position}]").text
                                for position in td_positions
                            )
                        except NoSuchElementException:
                            # Row does not have the full set of cells
                            continue
                        record.append(name)
                        data.append(record)

                except Exception as e:
                    print(e)
                    break
            finally:
                # Close the browser on every path, including errors
                driver.quit()

    # Build the DataFrame; "Name" is the last column
    df = pd.DataFrame(data, columns=["Season", "Team", "Pos", "G", "GS", "MP", "FG", "FGA", "FG%", "3P", "3PA", "3P%", "2P", "2PA", "2P%", "eFG%", "FT", "FTA", "FT%", "ORB", "DRB", "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS", "Name"])

    return df

# Crawl the per-game table of every player in links.txt and save it without the index column.
df = get_player_stats_pg()
df.to_csv("player_stats_per_game.csv", index=False)

**Team's total stats**

In [None]:
def get_stats(year_interval: str='2020:2022', div_id: str='totals_team'):
    """Scrape one league-wide team table for every season in a year range.

    For each year the page ``leagues/NBA_<year>.html`` is opened in its own
    Edge session (always closed afterwards) and the element with id
    ``div_id`` is read. Years where that element is missing are skipped.
    Only rows whose ``<th>`` text parses as an integer rank are kept.

    Args:
        year_interval: Inclusive range written as ``"start:end"``.
        div_id: ``id`` of the table element to read.
            NOTE(review): the calls in this notebook pass ``'totals-team'``
            and ``'per_game-team'`` (hyphen); the default ``'totals_team'``
            (underscore) may not match any table - confirm.

    Returns:
        DataFrame with the table's own column names (split from the header
        text on spaces) plus a ``Season`` column such as ``"2023-2024"``.

    Raises:
        ValueError: If no year in the range yielded a table.
    """
    all_data = []
    start_year, end_year = year_interval.split(':')
    for year in range(int(start_year), int(end_year) + 1):
        # Initialize the Edge browser
        driver = webdriver.Edge(service=service)
        try:
            # Construct the URL for each year
            url = f"{BASE_URL}leagues/NBA_{year}.html"
            driver.get(url)

            try:
                tbl = driver.find_element(By.ID, div_id)
            except NoSuchElementException:
                continue  # no such table for this year; `finally` closes the browser

            # Extract column names
            header = tbl.find_element(By.TAG_NAME, 'thead')
            cols = header.text.strip().split(' ')

            body = tbl.find_element(By.TAG_NAME, 'tbody')
            records = []
            for row in body.find_elements(By.TAG_NAME, 'tr'):
                # Rows without an integer rank are not data rows
                try:
                    rank = int(row.find_element(By.TAG_NAME, 'th').text)
                except (NoSuchElementException, ValueError):
                    continue

                stats = row.find_elements(By.TAG_NAME, 'td')
                # First cell is the team name (any '*' marker removed),
                # followed by the 23 numeric columns.
                team = stats[0].text.replace('*', '')
                records.append((rank, team, *(cell.text for cell in stats[1:24])))
        finally:
            driver.quit()

        # Create a DataFrame for the current year
        df_year = pd.DataFrame(records, columns=cols)
        df_year['Season'] = f"{year - 1}-{year}"  # Add a column for the season

        # Append the year data to the all_data list
        all_data.append(df_year)

    # pd.concat([]) would raise an opaque ValueError; keep the exception type
    # but say what actually went wrong.
    if not all_data:
        raise ValueError(
            f"No table with id {div_id!r} found for years {year_interval}"
        )

    # Concatenate all the data frames
    return pd.concat(all_data, axis=0, ignore_index=True)

# Team totals: read the 'totals-team' table for the years 2024 through 2025
# and save it with a header row and without the index column.
totals_df = get_stats('2024:2025', 'totals-team')
totals_df.to_csv("total_stats.csv", header=True, index=False)

**Team's per game stats**

In [None]:
# Team per-game stats: read the 'per_game-team' table for the years 1995
# through 2024 and save it with a header row and without the index column.
pgs_df = get_stats('1995:2024', 'per_game-team')
pgs_df.to_csv("per_game_stats.csv", header=True, index=False)

**Rookies' info**

In [None]:
def get_rookies(start_year, end_year):
    """Scrape the rookies career-stats table for each year in a range.

    For every year the page ``leagues/NBA_<year>_rookies-career-stats.html``
    is opened in its own Edge session. The session is now closed on every
    path; previously a failing year skipped ``driver.quit()`` and left the
    browser running. Rows with class ``full_table`` inside the ``rookies``
    table are read; each output row is the season label, a 1-based position
    in page order, and the text of every ``<td>`` cell.

    A year that fails is reported with ``print`` and skipped.

    Args:
        start_year: First year to crawl (inclusive).
        end_year: Last year to crawl (inclusive).

    Returns:
        DataFrame with the columns Season, Rank, Player, ... BLK (Per Game).
    """
    data = []
    # Raw string: the Windows path contains backslashes that must not be
    # parsed as escape sequences.
    service = Service(r'C:\Program Files\WebDriver\msedgedriver.exe')

    for year in range(start_year, end_year + 1):
        print(f"Crawling in {year}")
        # BASE_URL already ends with "/", so strip it to avoid "//leagues/..."
        url = f"{BASE_URL.rstrip('/')}/leagues/NBA_{year}_rookies-career-stats.html"

        # Initialize the Edge browser
        driver = webdriver.Edge(service=service)
        try:
            driver.get(url)

            try:
                # Read the data rows of the table
                table = driver.find_element(By.ID, "rookies")
                rows = table.find_element(By.TAG_NAME, "tbody").find_elements(By.CLASS_NAME, "full_table")
                season = f'{year - 1}-{year}'

                for rank, row in enumerate(rows, start=1):
                    cells = [col.text for col in row.find_elements(By.TAG_NAME, "td")]
                    data.append([season, rank, *cells])
            except Exception as e:
                # Report the failure and move on to the next year
                print(e)
        finally:
            # Close the browser on every path, including errors
            driver.quit()

    # Build the DataFrame
    df = pd.DataFrame(data, columns=["Season", "Rank", "Player", "Debut", "Age", "Yrs", "G", "MP", "FG", "FGA", "3P", "3PA", "FT", "FTA", "ORB", "TRB", "AST", "STL", "BLK", "TOV", "PF", "PTS", "FG%", "3P%", "FT%", "MP (Per Game)", "PTS (Per Game)", "TRB (Per Game)", "AST (Per Game)", "STL (Per Game)", "BLK (Per Game)"])

    return df

# Crawl the rookies table for the single year 2015 and save it.
# NOTE(review): header=False writes the rows without column names, unlike the
# other exports in this notebook - confirm this is intended (e.g. for
# appending to an existing file).
df = get_rookies(2015, 2015)
df.to_csv("rookies_stats.csv", index=False, header=False)
# The message below is Vietnamese for "Data has been saved to file rookies_stats.csv"
print("Dữ liệu đã được lưu vào file rookies_stats.csv")