In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import os
import re
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

# Driver startup

In [2]:
driver_path = os.getenv('chrome_driver_path')

In [3]:
# Set up the WebDriver
service = Service(driver_path)
options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome(service=service, options=options)

## nba.com

In [15]:
team = 'nuggets'

In [16]:
# Load the website
driver.get(f'https://www.nba.com/{team}/schedule')

In [17]:
html_content = driver.page_source

In [18]:
# Load the HTML content into BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

In [19]:
# Clean up: Quit the web driver
driver.quit()

In [20]:
# Define a function to process text by applying all the desired transformations
def process_text(text):
    # Add space between lowercase and uppercase transitions
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)
    
    # Add space between the day and time
    pattern = r'(Apr \d{1,2})(\d{1,2}:\d{2} [AP]M)'
    text = re.sub(pattern, r'\1 \2', text)
    
    # Add spaces around "TBD"
    text = re.sub(r'\s*TBD\s*', ' TBD ', text)
    
    return text

In [21]:
# Define a function to parse game details from a schedule item
def parse_game_date(schedule_item):
    # Pattern for date and time
    date_time_pattern = r'(\w+ \w+ \d{1,2} \d{1,2}:\d{2} [AP]M MDT|\w+ \w+ \d{1,2} TBD)'
    
    
    # Extract date and time
    date_time_match = re.search(date_time_pattern, schedule_item)
    date_time = date_time_match.group(0) if date_time_match else ''

    # game_details = f'{date_time}' 
    
    return date_time

In [22]:
# Example: Extract schedule items
def extract_schedule_items(soup):
    schedule_items = soup.find_all(class_="ScheduleItem_scheduleCardMinHeight__bg2g4")
    for item in schedule_items:
        clean_text = process_text(item.text)
        game_date = parse_game_date(clean_text)
        print(game_date)

In [23]:
extract_schedule_items(soup)

Saturday Apr 20 6:30 PM MDT
Monday Apr 22 8:00 PM MDT
Thursday Apr 25 8:00 PM MDT
Saturday Apr 27 6:30 PM MDT
Monday Apr 29 TBD
Thursday May 2 TBD
Saturday May 4 TBD


## espn.com

In [4]:
url = 'https://www.espn.com/nba/team/schedule/_/name/den/denver-nuggets'

In [5]:
# Load the website
driver.get(url)

In [8]:
# Locate the table container by its class "Table__Scroller"
table_scroller = driver.find_element(By.CLASS_NAME, 'Table__Scroller')

<selenium.webdriver.remote.webelement.WebElement (session="c5686e833f1008d06edff59a5c02df47", element="f.010F35E83847263CE1A83B0CABFF15F2.d.5E1AB21B2DD6A0B2B956C45F38EB1FAD.e.27")>

In [9]:
# Locate the table within the container
table = table_scroller.find_element(By.TAG_NAME, 'table')

In [11]:
# Initialize a list to hold the rows of the table
rows = []

# Iterate through each row in the table
for row in table.find_elements(By.TAG_NAME, 'tr'):
    # Initialize a list to hold the cells in the row
    cells = []
    
    # Iterate through each cell in the row
    for cell in row.find_elements(By.TAG_NAME, 'td'):
        # Append the cell's text to the list of cells
        cells.append(cell.text)
    
    # Append the list of cells (row) to the list of rows
    rows.append(cells)

# Convert the list of rows into a Pandas DataFrame
df = pd.DataFrame(rows)

In [12]:
df

Unnamed: 0,0,1,2,3,4
0,Conference First Round,,,,
1,DATE,OPPONENT,TIME,TV,TICKETS
2,"Sat, Apr 20",vs\nLos Angeles,6:30 PM,,Tickets as low as $158
3,"Mon, Apr 22",vs\nLos Angeles,8:00 PM,TNT,Tickets as low as $91
4,"Thu, Apr 25",@\nLos Angeles,8:00 PM,TNT,Tickets as low as $168
5,"Sat, Apr 27",@\nLos Angeles,6:30 PM,,Tickets as low as $169
6,"Mon, Apr 29",vs\nLos Angeles,TBD,,Tickets as low as $100
7,"Thu, May 2",@\nLos Angeles,TBD,,Tickets as low as $186
8,"Sat, May 4",vs\nLos Angeles,TBD,TNT,Tickets as low as $146


In [13]:
driver.quit()

In [21]:
df_clean = df.iloc[1:]
df_clean.columns = df_clean.iloc[0]
df_clean = df_clean.iloc[1:, :-1]
df_clean['location'] = df_clean['OPPONENT'].str.contains('^vs\n').map({True: 'vs', False: '@'})
df_clean['OPPONENT'] = df_clean['OPPONENT'].replace({'^vs\n': '', '^@\n': ''}, regex=True)
df_clean

1,DATE,OPPONENT,TIME,TV,location
2,"Sat, Apr 20",Los Angeles,6:30 PM,,vs
3,"Mon, Apr 22",Los Angeles,8:00 PM,TNT,vs
4,"Thu, Apr 25",Los Angeles,8:00 PM,TNT,@
5,"Sat, Apr 27",Los Angeles,6:30 PM,,@
6,"Mon, Apr 29",Los Angeles,TBD,,vs
7,"Thu, May 2",Los Angeles,TBD,,@
8,"Sat, May 4",Los Angeles,TBD,TNT,vs


## foxsports.com

In [4]:
# city, team = "denver", "nuggets"
city, team = "oklahoma-city", "thunder"
# city, team = "phoenix", "suns"

In [5]:
url = f'https://www.foxsports.com/nba/{city}-{team}-team-schedule'
url

'https://www.foxsports.com/nba/oklahoma-city-thunder-team-schedule'

In [6]:
# Load the website
driver.get(url)

In [7]:
# Locate the <div class="table"> element
table_div = driver.find_element(By.CLASS_NAME, 'table')

# Locate the <table id="table-0" class="data-table"> within the table div
table = table_div.find_element(By.ID, 'table-0')

In [8]:
# Initialize a list to hold the rows of the table
rows = []

# Iterate through each row in the table
for row in table.find_elements(By.TAG_NAME, 'tr'):
    # Initialize a list to hold the cells in the row
    cells = []
    
    # Iterate through each cell in the row
    for cell in row.find_elements(By.TAG_NAME, 'td'):
        # Append the cell's text to the list of cells
        cells.append(cell.text)
    
    # Append the list of cells (row) to the list of rows
    rows.append(cells)

# Convert the list of rows into a Pandas DataFrame
df_raw = pd.DataFrame(rows)

# Print the DataFrame to view the scraped data
print(df_raw)

      0     1            2                                  3
0  None  None         None                               None
1  4/21   TBD  7:30PM\nTNT  Paycom Center,\nOklahoma City, OK
2  4/24   TBD  7:30PM\nTNT  Paycom Center,\nOklahoma City, OK
3  4/27  @TBD  1:30PM\nTNT                                  -
4  4/29  @TBD          TBA                                  -
5   5/1   TBD      11:00AM  Paycom Center,\nOklahoma City, OK
6   5/3  @TBD      11:00AM                                  -
7   5/5   TBD      11:00AM  Paycom Center,\nOklahoma City, OK


In [9]:
# Quit the driver
driver.quit()

In [11]:
df

Unnamed: 0,date,opponent,time,arena,game_type,TV
1,4/21,TBD,7:30PM,"Paycom Center,\nOklahoma City, OK",Home,TNT
2,4/24,TBD,7:30PM,"Paycom Center,\nOklahoma City, OK",Home,TNT
3,4/27,TBD,1:30PM,-,Away,TNT
4,4/29,TBD,TBA,-,Away,
5,5/1,TBD,11:00AM,"Paycom Center,\nOklahoma City, OK",Home,
6,5/3,TBD,11:00AM,-,Away,
7,5/5,TBD,11:00AM,"Paycom Center,\nOklahoma City, OK",Home,


In [10]:
col_names = ['date','opponent','time','arena']
df = df_raw.iloc[1:]
df.columns = col_names
df['game_type'] = df['opponent'].apply(lambda x: 'Away' if '@' in x else 'Home')
df['opponent'] = df['opponent'].str.replace('@', '').str.split().str[-1]
df['TV'] = df['time'].apply(lambda x: x.split('\n')[1] if '\n' in x else '')
df['time'] = df['time'].apply(lambda x: x.split('\n')[0] if '\n' in x else x)
df['location'] = df['arena'].apply(lambda x: x.split('\n')[1])
df['arena'] = df['arena'].apply(lambda x: x.split('\n')[0]).str.replace(',','')
df['display_text'] = df.apply(lambda row: f"{row['date']} @ {row['time']}: {row['opponent']} @ {team.title()} at {row['arena']} in {row['location']}" if row['game_type'] == 'Home' else f"{row['date']} @ {row['time']}: {team.title()} @ {row['opponent']} at {row['arena']} in {row['location']}",
    axis=1)
df

IndexError: list index out of range

In [14]:
df['display_text']

1    4/20 @ 6:30PM: Lakers @ Nuggets at Ball Arena ...
2    4/22 @ 8:00PM: Lakers @ Nuggets at Ball Arena ...
3    4/25 @ 8:00PM: Nuggets @ Lakers at Crypto.com ...
4    4/27 @ 6:30PM: Nuggets @ Lakers at Crypto.com ...
5    4/29 @ 11:00AM: Lakers @ Nuggets at Ball Arena...
6    5/2 @ 11:00AM: Nuggets @ Lakers at Crypto.com ...
7    5/4 @ 11:00AM: Lakers @ Nuggets at Ball Arena ...
Name: display_text, dtype: object

In [15]:
df['display_text'][1]

'4/20 @ 6:30PM: Lakers @ Nuggets at Ball Arena in Denver, CO'

In [16]:
df_final = df[['display_text']]
final_msg = df_final.display_text.to_string(index=False)
print(final_msg)

4/20 @ 6:30PM: Lakers @ Nuggets at Ball Arena i...
4/22 @ 8:00PM: Lakers @ Nuggets at Ball Arena i...
4/25 @ 8:00PM: Nuggets @ Lakers at Crypto.com A...
4/27 @ 6:30PM: Nuggets @ Lakers at Crypto.com A...
4/29 @ 11:00AM: Lakers @ Nuggets at Ball Arena ...
5/2 @ 11:00AM: Nuggets @ Lakers at Crypto.com A...
5/4 @ 11:00AM: Lakers @ Nuggets at Ball Arena i...
