In [146]:
import bs4
import pandas as pd
import numpy as np
from urllib import request

In [147]:
def get_seasons_urls(url : str) -> list[str]:
    """
    Get the list of season URLs linked from a stats menu page.

    The page at ``url`` is downloaded and every ``<a>`` element nested inside
    an ``<h2>`` heading is inspected; the ``href`` of each such link is
    collected in document order.

    Parameters
    ----------
    url : str
        Address of the menu page to scan.

    Returns
    -------
    list[str]
        The ``href`` values of the links found inside ``<h2>`` tags. Anchors
        that carry no ``href`` attribute are skipped.
    """
    # Read the page body and release the connection as soon as it is consumed
    with request.urlopen(url) as response:
        html = response.read()

    # Parse the HTML of the menu page
    soup = bs4.BeautifulSoup(html, "lxml")

    # Walk the parsed tree directly: for each <h2>, keep the target of every
    # <a> that actually has an href (href=True filters out bare anchors)
    urls = [
        a['href']
        for h2 in soup.find_all('h2')
        for a in h2.find_all('a', href=True)
    ]

    return urls

In [191]:
def get_table_data(url : str) -> pd.DataFrame:
    """
    Download a page and return the first table of its <article> as a data frame.

    Column names are the texts of the ``<th>`` cells inside the table's
    ``<thead>``. Every ``<tr>`` of the table except the first one is treated
    as a data row, built from the texts of its ``<td>`` cells. A cell whose
    text is empty is replaced by a running counter (as a string) that starts
    at 1; the scraped pages leave the rank column empty, so this restores it.
    All values are kept as strings.

    A ``Title`` column holding the text of the article's ``<h1>`` is added,
    with the same value on every row.

    Parameters
    ----------
    url : str
        Address of the page containing the table.

    Returns
    -------
    pd.DataFrame
        One row per table line, plus the ``Title`` column.

    Raises
    ------
    ValueError
        If the page has no ``<article>``, or the article has no ``<table>``
        or no ``<h1>``.
    """

    # Read the page body and release the connection as soon as it is consumed
    with request.urlopen(url) as response:
        html = response.read()

    # Create BeautifulSoup object to parse the HTML
    soup = bs4.BeautifulSoup(html, "lxml")

    # Locate the pieces of the page we need, failing with a clear message
    # instead of an AttributeError on None further down
    article_tag = soup.find('article')
    if article_tag is None:
        raise ValueError(f"No <article> element found at {url}")

    table_tag = article_tag.find('table')
    if table_tag is None:
        raise ValueError(f"No <table> element found in the article at {url}")

    h1_tag = article_tag.find('h1')
    if h1_tag is None:
        raise ValueError(f"No <h1> element found in the article at {url}")

    # Get the column names from the <th> cells of the <thead>
    thead_tag = table_tag.find('thead')
    th_head_tag = thead_tag.find_all('th') if thead_tag is not None else []
    columns_name = [column.get_text() for column in th_head_tag]

    # Get the data: skip the first <tr>, which is the header line
    data = []
    rank = 1
    for tr in table_tag.find_all('tr')[1:]:
        attributes = []
        for td in tr.find_all('td'):
            attribute = td.get_text()
            if attribute == "":  # If empty, it's the rank column
                attributes.append(str(rank))
                rank += 1
            else:
                attributes.append(attribute)

        data.append(attributes)

    # Build the frame straight from the list of rows; no intermediate NumPy
    # array is needed (and one would fail on rows of unequal length)
    df = pd.DataFrame(data, columns=columns_name)

    # Add the Title of the dataframe (Season years + Player/Team/Referee stats)
    df['Title'] = h1_tag.get_text() # Same title repeated on every row

    return df

In [217]:
def upload_data(urls_list : list[str]) -> list[pd.DataFrame]:
    """
    Fetch the table behind each URL and return the resulting DataFrames.

    Each URL is passed to ``get_table_data``; a one-line status message is
    printed per URL and the frames are returned in the same order as the URLs.

    Parameters
    ----------
    urls_list : list[str]
        Addresses of the pages to scrape.

    Returns
    -------
    list[pd.DataFrame]
        One DataFrame per URL; an empty list when ``urls_list`` is empty.
    """
    data_list = []
    for url in urls_list:
        # Retrieve table data from the URL
        df = get_table_data(url)
        if df.empty:
            # A frame without rows has no 'Title' value to read; report it
            # instead of failing on the lookup
            print(f"No rows found in the table at {url}")
        else:
            # Print a success message with the title and URL
            print(f"{df['Title'].iloc[0]} data uploaded successfully from {url}")
        # Append the DataFrame to the data_list
        data_list.append(df)
    return data_list

In [207]:
# Menu pages that link to every season page, one menu per category
players_menu_url = "https://www.nbastuffer.com/nba-stats/player/"
teams_menu_url = "https://www.nbastuffer.com/nba-stats/team/"
referees_menu_url = "https://www.nbastuffer.com/nba-stats/referee/"

# Within a category, the season URLs we want all have the same number of
# characters, so the length of one known season URL serves as the filter
len_url_players = len("https://www.nbastuffer.com/2023-2024-nba-player-stats/")
len_url_teams = len("https://www.nbastuffer.com/2023-2024-nba-team-stats/")
len_url_referees = len("https://www.nbastuffer.com/2023-2024-nba-referee-stats/")

# Season URLs for players: menu links whose length matches the reference
players_urls_list = list(
    filter(lambda link: len(link) == len_url_players, get_seasons_urls(players_menu_url))
)

# Season URLs for teams
teams_urls_list = list(
    filter(lambda link: len(link) == len_url_teams, get_seasons_urls(teams_menu_url))
)

# Season URLs for referees
referees_urls_list = list(
    filter(lambda link: len(link) == len_url_referees, get_seasons_urls(referees_menu_url))
)

In [218]:
# Download the season tables of each category. upload_data prints one status
# line per URL and returns a list with one DataFrame per URL, in URL order.
# The three calls are kept as separate statements so that a failure in a later
# category leaves the lists already downloaded available in the kernel.

# Players' stats: one DataFrame per season URL
players_list = upload_data(players_urls_list)

# Teams' stats: one DataFrame per season URL
teams_list = upload_data(teams_urls_list)

# Referees' stats: one DataFrame per season URL
referee_list = upload_data(referees_urls_list)

2023-2024 NBA Player Stats data uploaded successfully from https://www.nbastuffer.com/2023-2024-nba-player-stats/
2022-2023 NBA Player Stats data uploaded successfully from https://www.nbastuffer.com/2022-2023-nba-player-stats/
2021-2022 NBA Player Stats data uploaded successfully from https://www.nbastuffer.com/2021-2022-nba-player-stats/
2020-2021 NBA Player Stats data uploaded successfully from https://www.nbastuffer.com/2020-2021-nba-player-stats/
2019-2020 NBA Player Stats data uploaded successfully from https://www.nbastuffer.com/2019-2020-nba-player-stats/
2018-2019 NBA Player Stats data uploaded successfully from https://www.nbastuffer.com/2018-2019-nba-player-stats/
2017-2018 NBA Player Stats data uploaded successfully from https://www.nbastuffer.com/2017-2018-nba-player-stats/
2023-2024 NBA Advanced Team Stats data uploaded successfully from https://www.nbastuffer.com/2023-2024-nba-team-stats/
2022-2023 NBA Advanced Team Stats data uploaded successfully from https://www.nbast

In [219]:
# Data Frame visualisation

# For each category, print its heading and then the first three rows of
# every season DataFrame in that category's list
for heading, frames in (
    ("Players Data:", players_list),
    ("Teams Data:", teams_list),
    ("Referees Data:", referee_list),
):
    print(heading)
    for df in frames:
        print(df.head(3))

Players Data:
  RANK          NAME TEAM POS   AGE  GP   MPG  USG%   TO%  FTA  ...  SPG  BPG  \
0    1  LeBron James  Lal   F  39.1  48    35    29  15.2  266  ...  1.3  0.6   
1    2    Chris Paul  Gol   G  38.8  32  27.6  14.3  13.4   42  ...  1.1  0.1   
2    3    Kyle Lowry  Mia   G  37.9  37    28  13.3  17.7   42  ...  1.1  0.4   

   TPG   P+R   P+A P+R+A    VI   ORtg   DRtg                       Title  
0  3.2  32.1  32.6  39.9  12.8  116.8  113.2  2023-2024 NBA Player Stats  
1  1.2  12.7  16.1  19.9   9.1  127.8  112.9  2023-2024 NBA Player Stats  
2  1.4  11.6  12.2  15.6   6.9  115.6  107.5  2023-2024 NBA Player Stats  

[3 rows x 30 columns]
  RANK              NAME TEAM  POS   AGE  GP   MPG  USG%   TO%  FTA  ...  SPG  \
0    1  Precious Achiuwa  Tor    F  23.6  55  20.7  19.3  12.7  124  ...  0.6   
1    2      Steven Adams  Mem    C  29.7  42    27  14.6  23.1  129  ...  0.9   
2    3       Bam Adebayo  Mia  C-F  25.7  75  34.6  25.3  14.4  402  ...  1.2   

   BPG  TPG  

TESTS

In [None]:
# Scratch test: scrape the first table of a schedule page into a DataFrame.
# In this table the column names are taken from the <td> cells of the second
# <tr>, and the data rows follow it.
url_test_preseason = 'https://www.espn.com/nba/team/schedule/_/name/den/season/2024/seasontype/2'
dataFrame_title = 'title'

# Read the page body and release the connection as soon as it is consumed
with request.urlopen(url_test_preseason) as response:
    html = response.read()

# Create BeautifulSoup object to parse the HTML
soup = bs4.BeautifulSoup(html, "lxml")

# Find the <table> tag; stop with a clear message if the page has none
table_tag = soup.find('table')
if table_tag is None:
    raise ValueError(f"No <table> element found at {url_test_preseason}")
# The parser is named explicitly on every re-parse so the result does not
# depend on which parsers happen to be installed
table_soup = bs4.BeautifulSoup(str(table_tag), "lxml")

# Find the <thead> tag
thead_tag = table_soup.find('thead')
thead_soup = bs4.BeautifulSoup(str(thead_tag), "lxml")

# Find all the <th> tags within the <thead> tag
th_head_tag = thead_soup.find_all('th')

# Find all the <tr> tags within the <table> tag
tr_tag = table_soup.find_all('tr')

# Get the table lines
lines_list = tr_tag
if len(lines_list) < 2:
    # The column names are read from the second line, so it must exist
    raise ValueError(
        f"Expected at least two <tr> lines in the table at {url_test_preseason}, "
        f"found {len(lines_list)}"
    )

# Get the column names from the cells of the second line
tr_soup = bs4.BeautifulSoup(str(lines_list[1]), "lxml")
td_tags = tr_soup.find_all('td')
columns_name = [td.get_text() for td in td_tags]

# Get the data from the lines after the column-name line
data = []
for tr in lines_list[2:]:
    tr_soup = bs4.BeautifulSoup(str(tr), "lxml")
    td_tags = tr_soup.find_all('td')

    # Stop at the first line with fewer cells than there are columns:
    # everything from there on is not part of the data we want
    if len(td_tags) < len(columns_name):
        break

    data.append([td.get_text() for td in td_tags])


# Get the data into a dataframe
data_array = np.asarray(data)
df = pd.DataFrame(data, columns=columns_name)

# Add the Title of the dataframe
df['Title'] = dataFrame_title # Same title repeated on every row