In [5]:
import bs4
import pandas as pd
import numpy as np
from urllib import request
import urllib.parse

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [7]:
schedule_url = 'https://www.espn.com/nba/schedule'
standings_url = 'https://www.espn.com/nba/standings/_/group/league'
player_stats_url = 'https://www.espn.com/nba/stats/player'
team_stats_url = 'https://www.espn.com/nba/stats/team'

In [88]:
def parse_schedule(url : str, title : str) -> pd.DataFrame:
    """
    Parses the HTML content of a given URL representing an NBA team's schedule,
    extracts the schedule data, and stores it in a pandas DataFrame.

    Args:
        url (str): The URL of the webpage containing the schedule.
        title (str): The title to be assigned to the DataFrame.

    Returns:
        pandas.DataFrame: A DataFrame containing the parsed schedule data,
        with each column representing a different attribute of the games
        (e.g., opponent, result, date), and the 'Title' column containing
        the specified title for reference.
    """
    # Open the URL and read the HTML content
    response = request.urlopen(url)
    html = response.read()

    # Create BeautifulSoup object to parse the HTML
    soup = bs4.BeautifulSoup(html, "lxml")

    # Find the <table> tag
    table_tag = soup.find('table')
    table_soup = bs4.BeautifulSoup(str(table_tag))

    # Get the table lines
    lines_list = table_soup.find_all('tr')

    # Get the column names
    if(len(lines_list[0]) == len(lines_list[1])):
        names_ligne_position = 0
    else:
        names_ligne_position = 1

    tr_soup = bs4.BeautifulSoup(str(lines_list[names_ligne_position]))
    td_tags = tr_soup.find_all('td')
    columns_name = []
    for td in td_tags:
        attribute = td.get_text()
        columns_name.append(attribute)

    columns_name.append('Home_Away') # Add the home away column

    # Get the data
    data = [] 
    for tr in lines_list[names_ligne_position+1:]:
        tr_soup = bs4.BeautifulSoup(str(tr))
        td_tags = tr_soup.find_all('td')

        if len(td_tags) < len(columns_name) - 1: # Get only the information we want
            continue

        attributes = []
        for td in td_tags:
            attribute = td.get_text()
            attributes.append(attribute)

        home_away = attributes[1][:3] # Get if the match played in home or away
        oppenent_name = attributes[1][2:]
        
        # Add the home_away attribute
        if(attributes[1][-1] == '*'):
            attributes[1] = oppenent_name[:-2]
            home_away = 'N' # Neutral
        else: 
            if attributes[1][:2] == 'vs':
                home_away = 'A' # Away
            else:
                home_away = 'H' # Home
            attributes[1] = oppenent_name

        attributes.append(home_away) 
        
        data.append(attributes)

    # Get the data into a dataframe
    df = pd.DataFrame(data, columns=columns_name)

    # Add the Title of the dataframe 
    df['Title'] = title

    return df

In [315]:
parse_schedule('https://www.espn.com/nba/team/schedule/_/name/hou/season/2021/seasontype/1','a')

Unnamed: 0,DATE,OPPONENT,RESULT,W-L,Hi Points,Hi Rebounds,Hi Assists,home_away,Title
0,"Fri, Dec 11",Chicago,W125-104,1-0,Caboclo 17,Caboclo 7,Wall 9,home,a
1,"Sun, Dec 13",Chicago,L104-91,1-1,Wall 21,Cousins 6,Wall 4,home,a
2,"Tue, Dec 15",San Antonio,W112-98,2-1,Wall 15,Cousins 11,Harden 4,away,a
3,"Thu, Dec 17",San Antonio,W128-106,3-1,Wood 27,Wood 10,Harden 9,away,a


In [11]:
def upload_schedule(schedule_url : str) -> list[pd.DataFrame]:
    # Open the URL and read the HTML content
    response = request.urlopen(schedule_url)
    html = response.read()

    # Create BeautifulSoup object to parse the HTML
    soup = bs4.BeautifulSoup(html, "lxml")

    # Get the <select> tags
    tag = soup.find_all('select')
    team_select_tag = tag[0] # Select tag for teams
    year_select_tag = tag[2] # Select tag for years

    # Get all the option tags for teams
    team_soup = bs4.BeautifulSoup(str(team_select_tag), "lxml")
    team_option_tag = team_soup.find_all('option')

    # Get all the option tags for years
    year_soup = bs4.BeautifulSoup(str(year_select_tag), "lxml")
    year_option_tag = year_soup.find_all('option')

    # Get the teams 
    teams = [('Atlanta Hawks', 'atl')]
    for i in team_option_tag[1:]:
        teams.append((i.get_text(),i['data-param-value']))

    # Get the years 
    years = []
    for i in year_option_tag:
        years.append((i.get_text(),i['data-param-value']))


    schedules = [] # Create a schedules list

    for (team,team_code) in teams:
        for (year,year_code) in years:
            # Get the URL of the team in the specified year
            team_url = schedule_url[:-3] + team_code + '/season/' + year_code
            response = urllib.request.urlopen(team_url)
            html_content = response.read()

            soup = bs4.BeautifulSoup(html_content, "lxml")

            # Get the <select> tags
            tag = soup.find_all('select')

            # Select tag for season types
            if(len(tag) <= 4):
                seasontype_tag = tag[-1] 
            else:
                seasontype_tag = tag[4] 

            seasontype_soup = bs4.BeautifulSoup(str(seasontype_tag), "lxml")
            seasontype_tag = seasontype_soup.find_all('option') # Get all the option tags for season types
            
            seasonstypes = []
            for i in seasontype_tag:
                try:
                    seasonstypes.append((i.get_text(), i['data-param-value']))
                except KeyError: # No Data Available
                    continue 
            
            for (seasontype,seasontype_code) in seasonstypes:

                url = team_url + '/seasontype/' + seasontype_code[:-1]
                title = f'{team} {year} {seasontype} NBA Schedule'

                df = parse_schedule(url,title) # Parse the schedule and obtain a DataFrame
                
                schedules.append(df) # Append the DataFrame to the schedules list

                print(f'{title} uploaded successfully from {url}')

    return schedules

In [320]:
upload_schedule('https://www.espn.com/nba/team/schedule/_/name/atl')

Atlanta Hawks 2023-24 Preseason NBA Schedule uploaded successfully from https://www.espn.com/nba/team/schedule/_/name/atl/season/2024/seasontype/1
Atlanta Hawks 2023-24 Regular Season NBA Schedule uploaded successfully from https://www.espn.com/nba/team/schedule/_/name/atl/season/2024/seasontype/2
Atlanta Hawks 2022-23 Preseason NBA Schedule uploaded successfully from https://www.espn.com/nba/team/schedule/_/name/atl/season/2023/seasontype/1
Atlanta Hawks 2022-23 Regular Season NBA Schedule uploaded successfully from https://www.espn.com/nba/team/schedule/_/name/atl/season/2023/seasontype/2
Atlanta Hawks 2022-23 Postseason NBA Schedule uploaded successfully from https://www.espn.com/nba/team/schedule/_/name/atl/season/2023/seasontype/3
Atlanta Hawks 2022-23 Play-In NBA Schedule uploaded successfully from https://www.espn.com/nba/team/schedule/_/name/atl/season/2023/seasontype/5
Atlanta Hawks 2021-22 Preseason NBA Schedule uploaded successfully from https://www.espn.com/nba/team/schedul

In [103]:
def parse_standings(url : str, title : str) -> pd.DataFrame:
    """
    Parses the HTML content of a given URL representing an NBA team's schedule,
    extracts the schedule data, and stores it in a pandas DataFrame.

    Args:
        url (str): The URL of the webpage containing the schedule.
        title (str): The title to be assigned to the DataFrame.

    Returns:
        pandas.DataFrame: A DataFrame containing the parsed schedule data,
        with each column representing a different attribute of the games
        (e.g., opponent, result, date), and the 'Title' column containing
        the specified title for reference.
    """
    # Open the URL and read the HTML content
    response = request.urlopen(url)
    html = response.read()

    # Create BeautifulSoup object to parse the HTML
    soup = bs4.BeautifulSoup(html, "lxml")

    # Get the teams name table 
    names_table_tag = soup.find('table')
    names_table_soup = bs4.BeautifulSoup(str(names_table_tag))

    # Get the info table 
    info_table_tag = soup.find('div', {"class":"Table__Scroller"})
    info_table_soup = bs4.BeautifulSoup(str(info_table_tag))


    # Get the tables lines
    name_lines_list = names_table_soup.find_all('tr')
    info_lines_list = info_table_soup.find_all('tr')

    # Get the column names
    tr_tag = info_table_soup.find('tr', {"class":"Table__sub-header Table__TR Table__even"})
    columns_name = [column.get_text() for column in tr_tag]
    columns_name = ['Team Name'] + columns_name

    data = [] 
    for i in range(len(name_lines_list)):

        # Get the team name
        tr_soup_name = bs4.BeautifulSoup(str(name_lines_list[i]))
        tr_soup_info = bs4.BeautifulSoup(str(info_lines_list[i]))
        
        # Get the team informations
        td_tags = tr_soup_name.find_all('td') + tr_soup_info.find_all('td')

        attributes = []
        for td in td_tags:
            attribute = td.get_text()
            if(attributes == []):
                offset = 3
                if(attribute[3] == '-'):
                    offset += 1
                attributes.append(attribute[offset:])
            else:
                attributes.append(attribute)
        data.append(attributes)

    # Get the data into a dataframe
    df = pd.DataFrame(data[1:], columns=columns_name)

    # Add the Title of the dataframe 
    df['Title'] = title

    return df

In [106]:
def upload_standings(standings_url : str) -> list[pd.DataFrame]:
    # Open the URL and read the HTML content
    response = request.urlopen(standings_url)
    html = response.read()

    # Create BeautifulSoup object to parse the HTML
    soup = bs4.BeautifulSoup(html, "lxml")

    # Get the <select> tags
    tag = soup.find_all('select')
    year_select_tag = tag[0] # Select tag for year

    # Get all the option tags for teams
    year_soup = bs4.BeautifulSoup(str(year_select_tag), "lxml")
    year_option_tag = year_soup.find_all('option')

    # Get the teams 
    years = []
    for y in year_option_tag:
        years.append((y.get_text(),y['value']))

    standings = []
    for (year_label,year) in years:
        # Get the Regular Season data
        url = f'https://www.espn.com/nba/standings/_/season/{year}/group/league'
        title = f'NBA Standings Regular Season {year_label}'

        df = parse_standings(url, title)
        standings.append(df)

        print(f'{title} uploaded successfully from {url}')

        # Get the Pre Season data
        url = f'https://www.espn.com/nba/standings/_/seasontype/pre/season/{year}/group/league'
        title = f'NBA Standings Pre Season {year_label}'

        df = parse_standings(url, title)
        standings.append(df)

        print(f'{title} uploaded successfully from {url}')

    return standings

In [107]:
upload_standings(standings_url)

NBA Standings Regular Season 2023-24 uploaded successfully from https://www.espn.com/nba/standings/_/season/2024/group/league
NBA Standings Pre Season 2023-24 uploaded successfully from https://www.espn.com/nba/standings/_/seasontype/pre/season/2024/group/league
NBA Standings Regular Season 2022-23 uploaded successfully from https://www.espn.com/nba/standings/_/season/2023/group/league
NBA Standings Pre Season 2022-23 uploaded successfully from https://www.espn.com/nba/standings/_/seasontype/pre/season/2023/group/league
NBA Standings Regular Season 2021-22 uploaded successfully from https://www.espn.com/nba/standings/_/season/2022/group/league
NBA Standings Pre Season 2021-22 uploaded successfully from https://www.espn.com/nba/standings/_/seasontype/pre/season/2022/group/league
NBA Standings Regular Season 2020-21 uploaded successfully from https://www.espn.com/nba/standings/_/season/2021/group/league
NBA Standings Pre Season 2020-21 uploaded successfully from https://www.espn.com/nba/

[                 Team Name   W   L   PCT    GB   HOME   AWAY   DIV   CONF  \
 0           Boston Celtics  43  12  .782     -   26-3   17-9  13-1   30-6   
 1   Minnesota Timberwolves  39  16  .709     4   19-5  20-11   8-2   27-8   
 2    Oklahoma City Thunder  37  17  .685   5.5   21-6  16-11  10-4  22-13   
 3      Cleveland Cavaliers  36  17  .679     6   19-9   17-8   8-4  24-11   
 4              LA Clippers  36  17  .679     6   20-6  16-11   8-3  22-13   
 5           Denver Nuggets  36  19  .655     7   21-5  15-14   4-5  20-13   
 6          Milwaukee Bucks  35  21  .625   8.5   23-7  12-14   9-7  25-13   
 7           ew York Knicks  33  22  .600    10   19-8  14-14   6-3  22-12   
 8             Phoenix Suns  33  22  .600    10  18-11  15-11   6-8  19-15   
 9      ew Orleans Pelicans  33  22  .600    10  16-10  17-12   8-6  23-18   
 10      Philadelphia 76ers  32  22  .593  10.5  17-11  15-11   5-4  21-14   
 11        Dallas Mavericks  32  23  .582    11  17-13  15-10   

In [230]:
def parse_teams(url : str, title : str) -> pd.DataFrame:
    # Open the URL and read the HTML content
    response = request.urlopen(url)
    html = response.read()

    # Create BeautifulSoup object to parse the HTML
    soup = bs4.BeautifulSoup(html, "lxml")

    # Get the teams name table 
    names_table_tag = soup.find_all('div', {'class' : 'ResponsiveTable ResponsiveTable--fixed-left mt4 Table2__title--remove-capitalization'}) ########################################################################################
    names_table_soup = bs4.BeautifulSoup(str(names_table_tag))
    names_table_tag2 = names_table_soup.find_all('img') ########################################################################################
    names_table_soup2 = bs4.BeautifulSoup(str(names_table_tag2))

    # Get the info table 
    info_table_tag = soup.find('td', {"class":"Table__TD"}) 
    info_table_soup = bs4.BeautifulSoup(str(info_table_tag))

    # Get the tables lines
    name_lines_list = names_table_soup2.find_all('a') ########################################################################################
    info_lines_list = info_table_soup.find_all('tr')

    print('print')
    for i in name_lines_list:
        print(i.get_text())

    # Get the column names
    tr_tag = info_table_soup.find('tr', {"class":"Table__sub-header Table__TR Table__even"})
    columns_name = [column.get_text() for column in tr_tag]
    columns_name = ['RK', 'Team Name'] + columns_name

    data = [] 
    rank = 0
    for i in range(len(name_lines_list)):

        # Get the team name
        tr_soup_name = bs4.BeautifulSoup(str(name_lines_list[i]))
        tr_soup_info = bs4.BeautifulSoup(str(info_lines_list[i]))
        
        # Get the team informations
        td_tags = tr_soup_name.find_all('td') + tr_soup_info.find_all('td')

        attributes = []
        for td in td_tags:
            attribute = td.get_text()
            if(attribute == '-'):
                attributes.append(str(rank))
            else:
                attributes.append(attribute)
        data.append(attributes)
        rank += 1

    # Get the data into a dataframe
    df = pd.DataFrame(data[1:], columns=columns_name)

    # Add the Title of the dataframe 
    df['Title'] = title

    return df

parse_teams('https://www.espn.com/nba/stats/team','azer')

print


TypeError: 'NoneType' object is not iterable

In [234]:

# Open the URL and read the HTML content
response = request.urlopen('https://www.espn.com/nba/stats/team')
html = response.read()

# Create BeautifulSoup object to parse the HTML
soup = bs4.BeautifulSoup(html, "lxml")

# Get the teams name table 
names_table_tag = soup.find_all('div', {'class' : 'layout is-full'}) ########################################################################################
names_table_soup = bs4.BeautifulSoup(str(names_table_tag))


names_table_tag2 = names_table_soup.find_all('img') ########################################################################################
names_table_soup2 = bs4.BeautifulSoup(str(names_table_tag2))

#----------------------------------------------------------------------------------------------------------------------------------
#----------------------------------------------------------------------------------------------------------------------------------

print('debut : ')
print(names_table_tag)

print(' teams : ')
for i in names_table_tag2:
    print(i.get_text())


debut : 
[<div class="Wrapper Card__Content"><div class="flex justify-between mt3 mb5 items-center"><h1 class="headline headline__h1 dib">NBA Team Stats 2023-24</h1></div><div class="tabs__wrapper mb5"><nav class="tabs__nav tabs__nav--bb tabs__nav--removeMobileMargin tabs__nav--brdr-clr-gray-08"><ul class="tabs__list" role="tablist"><li aria-selected="true" class="tabs__list__item tabs__list__item--clicked tabs__list__item--active" role="tab" title="Team"><a class="AnchorLink Button--unstyled tabs__link" data-track-name="" href="/nba/stats/team" tabindex="0">Team</a></li><li aria-selected="false" class="tabs__list__item" role="tab" title="Opponent"><a class="AnchorLink Button--unstyled tabs__link" data-track-name="" href="/nba/stats/team/_/view/opponent" tabindex="0">Opponent</a></li><li aria-selected="false" class="tabs__list__item" role="tab" title="Differential"><a class="AnchorLink Button--unstyled tabs__link" data-track-name="" href="/nba/stats/team/_/view/differential" tabindex="