In [146]:
import bs4
import pandas as pd
import numpy as np
from urllib import request

In [147]:
def get_seasons_urls(url : str) -> list:
    """
    Get the list of season URLs linked from a menu page.

    Downloads ``url``, parses it with the lxml parser and collects the
    ``href`` of every ``<a>`` tag found inside an ``<h2>`` tag, in document
    order.

    Parameters
    ----------
    url : str
        Address of the menu page to download.

    Returns
    -------
    list of str
        The collected ``href`` values; empty when no ``<h2>`` contains a
        link. Anchors without an ``href`` attribute are skipped.
    """
    # The context manager closes the HTTP response even if reading fails
    with request.urlopen(url) as response:
        html = response.read()

    soup = bs4.BeautifulSoup(html, "lxml")

    # Search each <h2> directly in the parsed tree (no need to stringify and
    # re-parse); href=True ignores anchors that carry no link target.
    return [
        a_tag['href']
        for h2_tag in soup.find_all('h2')
        for a_tag in h2_tag.find_all('a', href=True)
    ]

In [191]:
def get_table_data(url : str) -> pd.DataFrame:
    """
    Return a data frame built from the first table of the page's <article>.

    The column names are the texts of the ``<th>`` cells of the table's
    ``<thead>``. Every ``<tr>`` after the first one becomes a row made of
    the texts of its ``<td>`` cells. A ``Title`` column holding the text of
    the article's ``<h1>`` is appended to every row.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    pd.DataFrame
        One row per table line, all values as strings. When the article has
        no ``<table>`` the frame is empty and only has the ``Title`` column.

    Raises
    ------
    ValueError
        If the page has no ``<article>`` tag or the article has no ``<h1>``.
    """
    # The context manager closes the HTTP response even if reading fails
    with request.urlopen(url) as response:
        html = response.read()

    soup = bs4.BeautifulSoup(html, "lxml")

    # Navigate the parsed tree directly: re-parsing str(tag) is wasteful and,
    # without an explicit parser, makes bs4 guess one and emit a warning.
    article_tag = soup.find('article')
    if article_tag is None:
        raise ValueError(f"No <article> tag found at {url}")

    h1_tag = article_tag.find('h1')
    if h1_tag is None:
        raise ValueError(f"No <h1> title found in the <article> at {url}")

    table_tag = article_tag.find('table')

    # Column names come from the <th> cells of the <thead>
    thead_tag = table_tag.find('thead') if table_tag is not None else None
    th_head_tag = thead_tag.find_all('th') if thead_tag is not None else []
    columns_name = [column.get_text() for column in th_head_tag]

    # Skip the first <tr>, which is the header line
    tr_tag = table_tag.find_all('tr') if table_tag is not None else []
    lines_list = tr_tag[1:]

    data = []
    rank = 1
    for tr in lines_list:
        attributes = []
        for td in tr.find_all('td'):
            attribute = td.get_text()
            if attribute == "":  # If empty, it's the rank column
                # NOTE(review): any empty cell is filled with the running
                # counter, not only the rank column - confirm that no other
                # column can be empty on the scraped pages.
                attributes.append(str(rank))
                rank += 1
            else:
                attributes.append(attribute)

        data.append(attributes)

    # Build the frame straight from the list of rows
    df = pd.DataFrame(data, columns=columns_name)

    # Add the title (season years + Player/Team/Referee stats) to every row
    df['Title'] = h1_tag.get_text()

    return df

In [207]:
# Example season URL of each category; every season page we want has the same
# layout with only the "YYYY-YYYY" season part changing.
players_template_url = "https://www.nbastuffer.com/2023-2024-nba-player-stats/"
teams_template_url = "https://www.nbastuffer.com/2023-2024-nba-team-stats/"
referees_template_url = "https://www.nbastuffer.com/2023-2024-nba-referee-stats/"

# Lengths of the template URLs (kept under their original names)
len_url_players = len(players_template_url)
len_url_teams = len(teams_template_url)
len_url_referees = len(referees_template_url)

# Get the menu URL where the seasons URLs are
players_menu_url = "https://www.nbastuffer.com/nba-stats/player/"
teams_menu_url = "https://www.nbastuffer.com/nba-stats/team/"
referees_menu_url = "https://www.nbastuffer.com/nba-stats/referee/"


def select_season_urls(candidate_urls, template_url, season="2023-2024"):
    """
    Keep the URLs that look like ``template_url`` with another season.

    A URL is kept when it has the same length as ``template_url`` and shares
    the text located before and after ``season`` in the template. Checking
    the surrounding text on top of the length rejects unrelated links that
    merely happen to have the same number of characters.

    Parameters
    ----------
    candidate_urls : iterable of str
        URLs to filter.
    template_url : str
        Example URL containing ``season`` once.
    season : str
        The season part of ``template_url``.

    Returns
    -------
    list of str
        The matching URLs, in their original order.
    """
    prefix, _, suffix = template_url.partition(season)
    return [
        url for url in candidate_urls
        if len(url) == len(template_url)
        and url.startswith(prefix)
        and url.endswith(suffix)
    ]


# Get all the seasons URLs for players
players_urls_list = select_season_urls(get_seasons_urls(players_menu_url), players_template_url)

# Get all the seasons URLs for teams
teams_urls_list = select_season_urls(get_seasons_urls(teams_menu_url), teams_template_url)

# Get all the seasons URLs for referees
referees_urls_list = select_season_urls(get_seasons_urls(referees_menu_url), referees_template_url)

In [211]:
def collect_season_tables(season_urls):
    """
    Download the stats table of every season URL.

    Calls ``get_table_data`` on each URL in order and prints one progress
    line per table with its title and URL.

    Parameters
    ----------
    season_urls : iterable of str
        Season page URLs to download.

    Returns
    -------
    list of pd.DataFrame
        One data frame per URL, in the same order as ``season_urls``.
    """
    tables = []
    for season_url in season_urls:
        season_df = get_table_data(season_url)
        # .iloc[0] is positional; guard the empty case so that logging never
        # fails on a page whose table has no rows.
        title = season_df['Title'].iloc[0] if not season_df.empty else "<empty table>"
        print(f"Data downloaded successfully : title : {title} - url : {season_url}")
        tables.append(season_df)
    return tables


# One list of season data frames per category
players_list = collect_season_tables(players_urls_list)
teams_list = collect_season_tables(teams_urls_list)
referee_list = collect_season_tables(referees_urls_list)


url : https://www.nbastuffer.com/2023-2024-nba-player-stats/ - title :  2023-2024 NBA Player Stats
url : https://www.nbastuffer.com/2022-2023-nba-player-stats/ - title :  2022-2023 NBA Player Stats
url : https://www.nbastuffer.com/2021-2022-nba-player-stats/ - title :  2021-2022 NBA Player Stats
url : https://www.nbastuffer.com/2020-2021-nba-player-stats/ - title :  2020-2021 NBA Player Stats
url : https://www.nbastuffer.com/2019-2020-nba-player-stats/ - title :  2019-2020 NBA Player Stats
url : https://www.nbastuffer.com/2018-2019-nba-player-stats/ - title :  2018-2019 NBA Player Stats
url : https://www.nbastuffer.com/2017-2018-nba-player-stats/ - title :  2017-2018 NBA Player Stats
url : https://www.nbastuffer.com/2023-2024-nba-team-stats/ - title :  2023-2024 NBA Advanced Team Stats
url : https://www.nbastuffer.com/2022-2023-nba-team-stats/ - title :  2022-2023 NBA Advanced Team Stats
url : https://www.nbastuffer.com/2021-2022-nba-team-stats/ - title :  2021-2022 NBA Advanced Team S