In [None]:
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import os

# Root folder that holds every scraped file
FOLDER_PATH = "./pdata"
# Sub-directories (relative to FOLDER_PATH) and, at the same position, the
# file-name prefix used for files saved inside that directory.
# Index 0 is the root folder itself.
DATA_DIRS = ["", "/advanced_data", "/per_game_data", "/per_100_data", "/regular_total_data", "/defense_data", "/usage_data"]
FILE_NAMES = ["", "/advanced", "/per_game_regular", "/per_100_regular", "/regular_total", "/defense", "/usage"]
DATA_DIRS = [FOLDER_PATH + sub_dir for sub_dir in DATA_DIRS]

# The loop variable is deliberately not called `dir`: a module-level loop would
# leave the builtin dir() rebound to a path string for the rest of the session.
for data_dir in DATA_DIRS:
    # Check if folder exists, and if not create it
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
        print(f"Folder '{data_dir}' created.")

# Page item locators (absolute XPaths into the stats page)
YEAR_DROPDOWN_XPATH = "/html/body/div[1]/div[2]/div[2]/div[3]/section[1]/div/div/div[1]/label/div/select"
PAGE_SELECTION_XPATH = "/html/body/div[1]/div[2]/div[2]/div[3]/section[2]/div/div[2]/div[2]/div[1]/div[3]/div/label/div/select"

# og "/html/body/main/div/div/div[2]/div/div/div[1]/div[3]/div/div/label/select"
PER_MODE_XPATH = "/html/body/div[1]/div[2]/div[2]/div[3]/section[1]/div/div/div[3]/label/div/select"
# CSS class of the <table> element that holds the statistics
DATA_TABLE_CLASS = "Crom_table__p1iZz"
# Year that the season dropdown index is counted back from
CURRENT_YEAR = 2024

In [None]:
# Find the advanced data of all players from year_a to year_b
def find_advanced(year_a,year_b):
    """Scrape the NBA.com advanced player table once per year, from year_b down to year_a.

    Each iteration opens a new Firefox window on the advanced-stats page, picks a
    season in the year dropdown and option 0 of the page selector, parses the
    rendered table with BeautifulSoup and writes it to
    ``{DATA_DIRS[1]}{FILE_NAMES[1]}<year>.xlsx``.

    The season is chosen by dropdown index ``CURRENT_YEAR - year``; this assumes
    the dropdown lists seasons newest-first starting at CURRENT_YEAR -- TODO
    confirm against the live page.

    Args:
        year_a: Earliest year label to scrape (inclusive).
        year_b: Latest year label to scrape (inclusive).

    Returns:
        None. Results are written to disk and progress is printed.

    Raises:
        RuntimeError: If no table with class DATA_TABLE_CLASS is in the page source.
    """
    year_gap = year_b - year_a
    year_to_present = CURRENT_YEAR - year_b
    url = r"https://www.nba.com/stats/players/advanced/?sort=GP&dir=-1&Season=2021-22&SeasonType=Regular%20Season"

    for i in range(year_gap+1):
        year = year_b - i
        # Open the page
        driver = webdriver.Firefox()
        try:
            driver.get(url)
            # Select year (fixed sleeps give the page time to render)
            time.sleep(5)
            select = Select(driver.find_element(By.XPATH, YEAR_DROPDOWN_XPATH))
            select.select_by_index(i+year_to_present)

            time.sleep(10)
            # Select the page number to all
            select = Select(driver.find_element(By.XPATH, PAGE_SELECTION_XPATH))
            select.select_by_index(0)

            src = driver.page_source
        finally:
            # Always release the browser, even when a lookup above fails
            driver.quit()

        # Find data table
        parser = BeautifulSoup(src,"lxml")
        table = parser.find("table",attrs = { "class" : DATA_TABLE_CLASS })
        if table is None:
            raise RuntimeError(f"No table with class '{DATA_TABLE_CLASS}' found for {year}")

        # Header texts, skipping the first cell and any "RANK" column
        header_texts = [th.text.strip() for th in table.find_all('th')[1:]]
        columns = [text.upper() for text in header_texts if "RANK" not in text]
        print(columns)

        # Cell texts of every body row, skipping the first cell of each row
        rows = table.find_all('tr')[1:]
        print(f"{len(rows)} rows found")
        player_stats = [[td.get_text().strip() for td in row.find_all('td')[1:]] for row in rows]

        # Drop the last 5 attributes that are hidden in the table
        columns = columns[:-5]

        # Add the data into the DataFrame
        stats = pd.DataFrame(player_stats,columns=columns)

        # Prepend the year to the player's name
        prefix = "_" + str(year) + "_"
        stats["PLAYER"] = stats["PLAYER"].map(lambda name: prefix + name)
        # Save the data as an excel table
        stats.to_excel(f"{DATA_DIRS[1]}{FILE_NAMES[1]}{year}.xlsx")
        print(f"Saved data from {year} to excel")
    return

In [None]:
# Scrape the advanced table for the single year label 1996
# (year_a == year_b, so the loop inside find_advanced runs once)
find_advanced(1996,1996)

In [None]:
# Find the regular data per game of all players from year_a to year_b
def find_regular_pg(year_a,year_b):
    """Scrape the NBA.com traditional (per game) player table, from year_b down to year_a.

    Each iteration opens a new Firefox window on the traditional-stats page, picks
    a season in the year dropdown and option 0 of the page selector, parses the
    rendered table with BeautifulSoup and writes it to
    ``{DATA_DIRS[2]}{FILE_NAMES[2]}<year>.xlsx``.

    The season is chosen by dropdown index ``CURRENT_YEAR - year``; this assumes
    the dropdown lists seasons newest-first starting at CURRENT_YEAR -- TODO
    confirm against the live page.

    Args:
        year_a: Earliest year label to scrape (inclusive).
        year_b: Latest year label to scrape (inclusive).

    Returns:
        None. Results are written to disk and progress is printed.

    Raises:
        RuntimeError: If no table with class DATA_TABLE_CLASS is in the page source.
    """
    year_gap = year_b - year_a
    year_to_present = CURRENT_YEAR - year_b
    url = r"https://www.nba.com/stats/players/traditional/?SeasonType=Regular%20Season&sort=PTS&dir=-1&Season=2021-22"

    for i in range(year_gap+1):
        year = year_b - i
        # Open the page
        driver = webdriver.Firefox()
        try:
            driver.get(url)
            # Select year (fixed sleeps give the page time to render)
            time.sleep(5)
            select = Select(driver.find_element(By.XPATH, YEAR_DROPDOWN_XPATH))
            select.select_by_index(i+year_to_present)

            time.sleep(10)
            # Select the page number to all
            select = Select(driver.find_element(By.XPATH, PAGE_SELECTION_XPATH))
            select.select_by_index(0)

            src = driver.page_source
        finally:
            # Always release the browser, even when a lookup above fails
            driver.quit()

        # Find data table
        parser = BeautifulSoup(src,"lxml")
        table = parser.find("table",attrs = { "class" : DATA_TABLE_CLASS })
        if table is None:
            raise RuntimeError(f"No table with class '{DATA_TABLE_CLASS}' found for {year}")

        # Header texts, skipping the first cell and any "RANK" column
        header_texts = [th.text.strip() for th in table.find_all('th')[1:]]
        columns = [text.upper() for text in header_texts if "RANK" not in text]
        print(columns)

        # Cell texts of every body row, skipping the first cell of each row
        rows = table.find_all('tr')[1:]
        player_stats = [[td.get_text().strip() for td in row.find_all('td')[1:]] for row in rows]

        # Add the data into the DataFrame
        stats = pd.DataFrame(player_stats,columns=columns)

        # Prepend the year to the player's name
        prefix = "_" + str(year) + "_"
        stats["PLAYER"] = stats["PLAYER"].map(lambda name: prefix + name)
        # Save the data as an excel table
        stats.to_excel(f"{DATA_DIRS[2]}{FILE_NAMES[2]}{year}.xlsx")
        print(f"Saved data from {year} to excel")
    return

In [None]:
# Scrape the per-game table for every year label from 2024 down to 1996;
# each year opens its own Firefox window and writes its own workbook
find_regular_pg(1996, 2024)

In [None]:
# Find the regular per-100-possessions data of all players from year_a to year_b
def find_regular_p100(year_a,year_b):
    """Scrape the NBA.com traditional player table in per-100-possessions mode.

    Works from year_b down to year_a. Each iteration opens a new Firefox window on
    the traditional-stats page, picks a season in the year dropdown, option 2 of
    the per-mode dropdown and option 0 of the page selector, parses the rendered
    table with BeautifulSoup and writes it to
    ``{DATA_DIRS[3]}{FILE_NAMES[3]}<year>.xlsx``.

    The season is chosen by dropdown index ``CURRENT_YEAR - year``; this assumes
    the dropdown lists seasons newest-first starting at CURRENT_YEAR, and that
    per-mode option 2 is "Per 100 Poss" -- TODO confirm against the live page.

    Args:
        year_a: Earliest year label to scrape (inclusive).
        year_b: Latest year label to scrape (inclusive).

    Returns:
        None. Results are written to disk and progress is printed.

    Raises:
        RuntimeError: If no table with class DATA_TABLE_CLASS is in the page source.
    """
    year_gap = year_b - year_a
    year_to_present = CURRENT_YEAR - year_b
    url = r"https://www.nba.com/stats/players/traditional/?sort=PTS&dir=-1&Season=2021-22&SeasonType=Regular%20Season"

    for i in range(year_gap+1):
        year = year_b - i
        driver = webdriver.Firefox()
        try:
            driver.get(url)

            # Choose year (fixed sleeps give the page time to render)
            time.sleep(10)
            select = Select(driver.find_element(By.XPATH,YEAR_DROPDOWN_XPATH))
            select.select_by_index(i+year_to_present)

            # Choose per 100 poss
            time.sleep(10)
            select = Select(driver.find_element(By.XPATH, PER_MODE_XPATH))
            select.select_by_index(2)

            time.sleep(10)
            # Select the page number to all
            select = Select(driver.find_element(By.XPATH, PAGE_SELECTION_XPATH))
            select.select_by_index(0)

            src = driver.page_source
        finally:
            # Always release the browser, even when a lookup above fails
            driver.quit()

        # Find data table
        parser = BeautifulSoup(src,"lxml")
        table = parser.find("table",attrs = { "class" : DATA_TABLE_CLASS })
        if table is None:
            raise RuntimeError(f"No table with class '{DATA_TABLE_CLASS}' found for {year}")

        # Header texts, skipping the first cell and any "RANK" column
        header_texts = [th.text.strip() for th in table.find_all('th')[1:]]
        columns = [text.upper() for text in header_texts if "RANK" not in text]

        # Cell texts of every body row, skipping the first cell of each row
        rows = table.find_all('tr')[1:]
        player_stats = [[td.get_text().strip() for td in row.find_all('td')[1:]] for row in rows]

        # Add the data into the DataFrame
        stats = pd.DataFrame(player_stats,columns=columns)

        # Prepend the year to the player's name
        prefix = "_" + str(year) + "_"
        stats["PLAYER"] = stats["PLAYER"].map(lambda name: prefix + name)
        # Save the data as an excel table
        stats.to_excel(f"{DATA_DIRS[3]}{FILE_NAMES[3]}{year}.xlsx")
        print(f"Saved data from {year} to excel")
    return

In [None]:
# Scrape the per-100-possessions table for year labels 2024 down to 1996.
# Run and rerun as necessary to get the data for each season
find_regular_p100(1996, 2024)

In [None]:
def find_regular_total(year_a,year_b):
    """Scrape the NBA.com traditional player table in totals mode, from year_b down to year_a.

    Each iteration opens a new Firefox window on the traditional-stats page
    (``PerMode=Totals`` in the URL), picks a season in the year dropdown and
    option 0 of the page selector, parses the rendered table with BeautifulSoup
    and writes it to ``{DATA_DIRS[4]}{FILE_NAMES[4]}<year>.xlsx``. Both the PLAYER
    and the TEAM column are prefixed with ``_<year>_``.

    The season is chosen by dropdown index ``CURRENT_YEAR - year``; this assumes
    the dropdown lists seasons newest-first starting at CURRENT_YEAR -- TODO
    confirm against the live page.

    Args:
        year_a: Earliest year label to scrape (inclusive).
        year_b: Latest year label to scrape (inclusive).

    Returns:
        None. Results are written to disk and progress is printed.

    Raises:
        RuntimeError: If no table with class DATA_TABLE_CLASS is in the page source.
    """
    year_gap = year_b - year_a
    year_to_present = CURRENT_YEAR - year_b
    url = r"https://www.nba.com/stats/players/traditional/?SeasonType=Regular%20Season&sort=PTS&dir=-1&Season=2021-22&PerMode=Totals"

    for i in range(year_gap+1):
        year = year_b - i
        # Open the page
        driver = webdriver.Firefox()
        try:
            driver.get(url)
            # Select year (fixed sleeps give the page time to render)
            time.sleep(5)
            select = Select(driver.find_element(By.XPATH, YEAR_DROPDOWN_XPATH))
            select.select_by_index(i+year_to_present)

            time.sleep(10)
            # Select the page number to all
            select = Select(driver.find_element(By.XPATH, PAGE_SELECTION_XPATH))
            select.select_by_index(0)

            src = driver.page_source
        finally:
            # Always release the browser, even when a lookup above fails
            driver.quit()

        # Find data table
        parser = BeautifulSoup(src,"lxml")
        table = parser.find("table",attrs = { "class" : DATA_TABLE_CLASS })
        if table is None:
            raise RuntimeError(f"No table with class '{DATA_TABLE_CLASS}' found for {year}")

        # Header texts, skipping the first cell and any "RANK" column
        header_texts = [th.text.strip() for th in table.find_all('th')[1:]]
        columns = [text.upper() for text in header_texts if "RANK" not in text]

        # Cell texts of every body row, skipping the first cell of each row
        rows = table.find_all('tr')[1:]
        player_stats = [[td.get_text().strip() for td in row.find_all('td')[1:]] for row in rows]

        # Add the data into the DataFrame
        stats = pd.DataFrame(player_stats,columns=columns)

        # Prepend the year to the player's name and to the team
        prefix = "_" + str(year) + "_"
        for key in ("PLAYER", "TEAM"):
            stats[key] = stats[key].map(lambda value: prefix + value)
        # Save the data as an excel table
        stats.to_excel(f"{DATA_DIRS[4]}{FILE_NAMES[4]}{year}.xlsx")
        print(f"Saved data from {year} to excel")
    return

In [None]:
# Scrape the totals table for year labels 2000 down to 1996.
# Run and rerun as necessary to get the data for each season
find_regular_total(1996,2000)

In [None]:
# Find the defense data of all players from year_a to year_b
def find_defense(year_a,year_b):
    """Scrape the NBA.com player defense table once per year, from year_b down to year_a.

    Each iteration opens a new Firefox window on the defense-stats page, picks a
    season in the year dropdown and option 0 of the page selector, parses the
    rendered table with BeautifulSoup and writes it to
    ``{DATA_DIRS[5]}{FILE_NAMES[5]}<year>.xlsx``.

    The season is chosen by dropdown index ``CURRENT_YEAR - year``; this assumes
    the dropdown lists seasons newest-first starting at CURRENT_YEAR -- TODO
    confirm against the live page.

    Args:
        year_a: Earliest year label to scrape (inclusive).
        year_b: Latest year label to scrape (inclusive).

    Returns:
        None. Results are written to disk and progress is printed.

    Raises:
        RuntimeError: If no table with class DATA_TABLE_CLASS is in the page source.
    """
    year_gap = year_b - year_a
    year_to_present = CURRENT_YEAR - year_b
    url = r"https://www.nba.com/stats/players/defense?sort=USG_PCT&dir=-1&Season=2021-22&SeasonType=Regular%20Season"

    for i in range(year_gap+1):
        year = year_b - i
        # Open the browser
        driver = webdriver.Firefox()
        try:
            driver.get(url)
            # Select year (fixed sleeps give the page time to render)
            time.sleep(15)
            select = Select(driver.find_element(By.XPATH,YEAR_DROPDOWN_XPATH))
            select.select_by_index(i+year_to_present)

            time.sleep(10)
            # Select the page number to all
            select = Select(driver.find_element(By.XPATH, PAGE_SELECTION_XPATH))
            select.select_by_index(0)

            src = driver.page_source
        finally:
            # Always release the browser, even when a lookup above fails
            driver.quit()

        # Find data table
        parser = BeautifulSoup(src,"lxml")
        table = parser.find("table",attrs = { "class" : DATA_TABLE_CLASS })
        if table is None:
            raise RuntimeError(f"No table with class '{DATA_TABLE_CLASS}' found for {year}")

        # Header texts, skipping the first cell and any "RANK" column
        header_texts = [th.text.strip() for th in table.find_all('th')[1:]]
        columns = [text.upper() for text in header_texts if "RANK" not in text]

        # Cell texts of every body row, skipping the first cell of each row
        rows = table.find_all('tr')[1:]
        player_stats = [[td.get_text().strip() for td in row.find_all('td')[1:]] for row in rows]

        # Add the data into the DataFrame
        stats = pd.DataFrame(player_stats,columns=columns)

        # Prepend the year to the player's name
        prefix = "_" + str(year) + "_"
        stats["PLAYER"] = stats["PLAYER"].map(lambda name: prefix + name)
        # Save the data as an excel table
        stats.to_excel(f"{DATA_DIRS[5]}{FILE_NAMES[5]}{year}.xlsx")
        print(f"Saved data from {year} to excel")
    return

In [None]:
# Scrape the defense table for year labels 2024 down to 1996.
# Run and rerun as necessary to get the data for each season
find_defense(1996, 2024)

In [None]:
# Find the usage data of all players from year_a to year_b
def find_usage(year_a,year_b):
    """Scrape the NBA.com player usage table once per year, from year_b down to year_a.

    Each iteration opens a new Firefox window on the usage-stats page, picks a
    season in the year dropdown and option 0 of the page selector, parses the
    rendered table with BeautifulSoup and writes it to
    ``{DATA_DIRS[6]}{FILE_NAMES[6]}<year>.xlsx``.

    Scraping is best effort: any exception raised after the page has been
    requested is printed as ``Error: ...`` and that year is skipped.

    The season is chosen by dropdown index ``CURRENT_YEAR - year``; this assumes
    the dropdown lists seasons newest-first starting at CURRENT_YEAR -- TODO
    confirm against the live page.

    Args:
        year_a: Earliest year label to scrape (inclusive).
        year_b: Latest year label to scrape (inclusive).

    Returns:
        None. Results are written to disk and progress is printed.
    """
    year_gap = year_b - year_a
    year_to_present = CURRENT_YEAR - year_b
    url = r"https://www.nba.com/stats/players/usage/?sort=USG_PCT&dir=-1&Season=2021-22&SeasonType=Regular%20Season"

    for i in range(year_gap+1):
        year = year_b - i
        # Open the browser
        driver = webdriver.Firefox()
        try:
            # A failure to load the page is not swallowed (it was outside the
            # original try block too), but the browser is still released.
            driver.get(url)

            try:
                # Select year (fixed sleeps give the page time to render)
                time.sleep(15)
                select = Select(driver.find_element(By.XPATH,YEAR_DROPDOWN_XPATH))
                select.select_by_index(i+year_to_present)

                time.sleep(10)
                # Select the page number to all
                select = Select(driver.find_element(By.XPATH, PAGE_SELECTION_XPATH))
                select.select_by_index(0)

                # Find data table
                parser = BeautifulSoup(driver.page_source,"lxml")
                table = parser.find("table",attrs = { "class" : DATA_TABLE_CLASS })
                if table is None:
                    raise RuntimeError(f"No table with class '{DATA_TABLE_CLASS}' found for {year}")

                # Header texts, skipping the first cell and any "RANK" column
                header_texts = [th.text.strip() for th in table.find_all('th')[1:]]
                columns = [text.upper() for text in header_texts if "RANK" not in text]

                # Cell texts of every body row, skipping the first cell of each row
                rows = table.find_all('tr')[1:]
                player_stats = [[td.get_text().strip() for td in row.find_all('td')[1:]] for row in rows]

                # Add the data into the DataFrame
                stats = pd.DataFrame(player_stats,columns=columns)

                # Prepend the year to the player's name
                prefix = "_" + str(year) + "_"
                stats["PLAYER"] = stats["PLAYER"].map(lambda name: prefix + name)
                # Save the data as an excel table
                stats.to_excel(f"{DATA_DIRS[6]}{FILE_NAMES[6]}{year}.xlsx")
            except Exception as e:
                # Best effort: report the failure and move on to the next year
                print(f"Error: {e}")
                continue
        finally:
            # Runs on success, on the `continue` above and on any propagating error
            driver.quit()
        print(f"Saved data from {year} to excel")
    return

In [None]:
# Scrape the usage table for year labels 2024 down to 1996.
# Run and rerun as necessary to get the data for each season
find_usage(1996, 2024)

In [None]:
# find the game logs of each team and player from year_a to year_b (only interested in most recent year(s))
def find_team_box_score(year_a,year_b):
    """Scrape the NBA.com team box-score table once per year, from year_b down to year_a.

    Each iteration opens a new Firefox window on the team box-scores page, picks a
    season in the year dropdown and option 0 of the page selector, parses the
    rendered table with BeautifulSoup and writes it to
    ``{DATA_DIRS[0]}/team_box_score<year>.xlsx``. Unlike the player-table
    scrapers, every header cell and every data cell is kept.

    The season is chosen by dropdown index ``CURRENT_YEAR - year``; this assumes
    the dropdown lists seasons newest-first starting at CURRENT_YEAR -- TODO
    confirm against the live page.

    Args:
        year_a: Earliest year label to scrape (inclusive).
        year_b: Latest year label to scrape (inclusive).

    Returns:
        None. Results are written to disk and progress is printed.

    Raises:
        RuntimeError: If no table with class DATA_TABLE_CLASS is in the page source.
    """
    year_gap = year_b - year_a
    year_to_present = CURRENT_YEAR - year_b
    url = r"https://www.nba.com/stats/teams/boxscores?Season=2021-22"

    for i in range(year_gap+1):
        year = year_b - i
        # Open the browser
        driver = webdriver.Firefox()
        try:
            driver.get(url)
            time.sleep(15)

            # Select season
            select = Select(driver.find_element(By.XPATH,YEAR_DROPDOWN_XPATH))
            select.select_by_index(i+year_to_present)

            time.sleep(10)
            # Select the page number to all
            select = Select(driver.find_element(By.XPATH, PAGE_SELECTION_XPATH))
            select.select_by_index(0)

            src = driver.page_source
        finally:
            # Always release the browser, even when a lookup above fails
            driver.quit()

        # Find the table
        parser = BeautifulSoup(src,"lxml")
        table = parser.find("table",attrs = { "class" : DATA_TABLE_CLASS })
        if table is None:
            raise RuntimeError(f"No table with class '{DATA_TABLE_CLASS}' found for {year}")

        # Upper-cased text of every header cell
        headerlist = [th.text.strip().upper() for th in table.find_all('th')]
        print(headerlist)

        # Cell texts of every body row
        rows = table.find_all('tr')[1:]
        team_rows = [[td.get_text().strip() for td in row.find_all('td')] for row in rows]
        # Put data into pandas dataframe
        team_stats = pd.DataFrame(team_rows,columns=headerlist)
        print(f"length of team stats: {len(team_stats)}")

        team_stats.to_excel(f"{DATA_DIRS[0]}/team_box_score{year}.xlsx")
        print(f"Saved data from {year} to excel")
    return


In [None]:
# Scrape the team box-score table for year labels 2024 down to 1996.
# Run and rerun as necessary to get the data for each season
find_team_box_score(1996, 2024)

In [None]:

def find_player_box_score(year_a,year_b):
    """Scrape the NBA.com player box-score table once per year, from year_b down to year_a.

    Each iteration opens a new Firefox window on the player box-scores page, picks
    a season in the year dropdown and option 0 of the page selector, parses the
    rendered table with BeautifulSoup and writes it to
    ``{DATA_DIRS[0]}/player_box_score<year>.xlsx``. Every header cell and every
    data cell is kept.

    The season is chosen by dropdown index ``CURRENT_YEAR - year``; this assumes
    the dropdown lists seasons newest-first starting at CURRENT_YEAR -- TODO
    confirm against the live page.

    Args:
        year_a: Earliest year label to scrape (inclusive).
        year_b: Latest year label to scrape (inclusive).

    Returns:
        None. Results are written to disk and progress is printed.

    Raises:
        RuntimeError: If no table with class DATA_TABLE_CLASS is in the page source.
    """
    year_gap = year_b - year_a
    year_to_present = CURRENT_YEAR - year_b
    url = r"https://www.nba.com/stats/players/boxscores?Season=2021-22"

    for i in range(year_gap+1):
        year = year_b - i
        # Open the browser
        driver = webdriver.Firefox()
        try:
            driver.get(url)
            time.sleep(15)

            # Select season
            select = Select(driver.find_element(By.XPATH,YEAR_DROPDOWN_XPATH))
            select.select_by_index(i+year_to_present)

            time.sleep(10)
            # Select the page number to all
            select = Select(driver.find_element(By.XPATH, PAGE_SELECTION_XPATH))
            select.select_by_index(0)

            src = driver.page_source
        finally:
            # Always release the browser, even when a lookup above fails
            driver.quit()

        # Find the table
        parser = BeautifulSoup(src,"lxml")
        table = parser.find("table",attrs = { "class" : DATA_TABLE_CLASS })
        if table is None:
            raise RuntimeError(f"No table with class '{DATA_TABLE_CLASS}' found for {year}")

        # Upper-cased text of every header cell
        headerlist = [th.text.strip().upper() for th in table.find_all('th')]
        print(headerlist)

        # Cell texts of every body row
        rows = table.find_all('tr')[1:]
        player_rows = [[td.get_text().strip() for td in row.find_all('td')] for row in rows]
        players_stats = pd.DataFrame(player_rows,columns=headerlist)
        players_stats.to_excel(f"{DATA_DIRS[0]}/player_box_score{year}.xlsx")
        print(f"Saved data from {year} to excel")
    return

In [None]:
# Scrape the player box-score table for year labels 2024 down to 2022.
# Run and rerun as necessary to get the data for each season
find_player_box_score(2022,2024)

In [None]:
# Preview player stats
# NOTE(review): in the cells shown, players_stats is only assigned as a local
# variable inside find_player_box_score, so this cell presumably raises
# NameError on a fresh kernel unless the name is defined elsewhere -- confirm.
players_stats

In [None]:
# Number of rows in team_stats
# NOTE(review): in the cells shown, team_stats is first assigned at module level
# by the later read_excel cell (the one inside find_team_box_score is a local),
# so a top-to-bottom run presumably fails here -- confirm.
len(team_stats)

In [None]:
# Preview team stats
# NOTE(review): in the cells shown, team_stats is first assigned at module level
# by the later read_excel cell, so a top-to-bottom run presumably fails here -- confirm.
team_stats

In [None]:
# Read team box score data back in.
# The workbook was written by DataFrame.to_excel with its default index=True, so
# its first sheet column is the saved row index. index_col=0 restores it as the
# index instead of a leading "Unnamed: 0" data column, which would shift every
# column by one and break the positional lookups in the cells below
# (column 3 compared with "W"/"L", columns 5..22 subtracted as numbers).
team_stats = pd.read_excel(f"{DATA_DIRS[0]}/team_box_score2024.xlsx", index_col=0)

In [None]:
# Convert the frame to a NumPy array so the following cells can index it by position
team_stats_arr = team_stats.to_numpy()
# Spot-check a single cell: row 0, column 7
team_stats_arr[0,7]

In [None]:
# One row per game: 18 stat differences followed by the result label in the last
# column. 1230 is presumably the number of regular-season games
# (30 teams x 82 games / 2) -- confirm.
# Filled with NaN rather than left uninitialised (np.empty), so any row the fill
# loop does not write is recognisably empty instead of holding arbitrary memory.
team_difference = np.full([1230, 19], np.nan)

In [None]:
# Fill one row of team_difference per game. Rows 2*game and 2*game + 1 of
# team_stats_arr are treated as the two sides of the same game -- assumes the
# sheet lists each game's two teams on consecutive rows; TODO confirm.
# NOTE(review): when the first row is "W" the difference is first minus second,
# when it is "L" it is second minus first, so the sign convention flips together
# with the label -- confirm this encoding is intended.
for game in range(1230):
    first_row = 2 * game
    second_row = first_row + 1
    outcome = team_stats_arr[first_row, 3]

    if outcome == "W":
        label, minuend, subtrahend = 1, first_row, second_row
    elif outcome == "L":
        label, minuend, subtrahend = 0, second_row, first_row
    else:
        # Neither a win nor a loss marker: leave this row untouched
        continue

    team_difference[game, -1] = label
    # Columns 5..22 of the source hold the 18 statistics being compared
    for stat in range(18):
        source_col = stat + 5
        team_difference[game, stat] = team_stats_arr[minuend, source_col] - team_stats_arr[subtrahend, source_col]

In [None]:
# Inspect the filled difference matrix
team_difference

In [None]:
# Column labels for the difference table: 18 statistics, then the result label
column = [
    'PTS',
    # field goals
    'FGM', 'FGA', 'FG%',
    # three-pointers
    '3PM', '3PA', '3P%',
    # free throws
    'FTM', 'FTA', 'FT%',
    # rebounds
    'OREB', 'DREB', 'REB',
    # remaining box-score columns
    'AST', 'STL', 'BLK', 'TOV', 'PF',
    # outcome label written into the last column
    'result',
]

In [None]:
# Wrap the difference matrix in a DataFrame, labelling its 19 columns with `column`
team_difference_df = pd.DataFrame(team_difference, columns = column)

In [None]:
# Save the per-game differences as a workbook in the data root folder (DATA_DIRS[0])
team_difference_df.to_excel(f"{DATA_DIRS[0]}/team_difference.xlsx")