# Goal: take any team as input, look it up in a list, and return that team, rather than maintaining a separate class for each one

## Soccer

In [None]:
import pandas as pd

import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options

from bs4 import BeautifulSoup

In [None]:
# Load the soccer team lookup table; later cells read its `team` and `url` columns.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
fb_data = pd.read_csv(r"C:\Users\Owner\Documents\Data Projects\GitHub\Apps\project_w\data\football_data.csv")

In [None]:
# Pick one team and take the URL from its first matching row in the lookup table
team = 'Aston Villa'
team_rows = fb_data.loc[fb_data['team'] == team]
team_url = team_rows['url'].values[0]

### test scraping

In [None]:
# Location of the ChromeDriver binary, read from the `chrome_driver_path`
# environment variable (None when the variable is not set)
chrome_driver_path = os.environ.get('chrome_driver_path')

In [None]:
# Scrape the first "matches" table on the team page into `df_fixtures`.
# `driver` is bound before the try block so the cleanup in `finally` cannot
# raise NameError (and mask the real error) when Chrome fails to start.
driver = None
try:

    # Setup WebDriver
    service = Service(chrome_driver_path)  # Use the path from environment variable
    driver = webdriver.Chrome(service=service)

    # Open the page
    driver.get(team_url)

    # Dismiss the cookie banner; if it never becomes clickable within 5 seconds
    # a TimeoutException is raised and the scrape below is skipped
    WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.XPATH, "//button[text()='Reject All']"))
    ).click()

    # Parse the page source as rendered by the browser
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Only the first table with class "matches" is read
    table = soup.find_all('table', class_='matches')
    matches_table = table[0]
    team_cells = matches_table.find_all('td', class_="team")  # queried once, sliced twice below

    dates = [cell.text for cell in matches_table.find_all('td', class_="full-date")]
    leagues = [cell.text.strip() for cell in matches_table.find_all('td', class_="competition")]
    # presumably "team" cells alternate home, away within each fixture row — verify against the page markup
    homes = [cell.text.strip() for cell in team_cells[::2]]
    aways = [cell.text.strip() for cell in team_cells[1::2]]
    times = [cell.text.strip() for cell in matches_table.find_all('td', class_="score-time")]

    # Create dataframes
    df_fixtures = pd.DataFrame(
        {
            'Date': dates,
            'League': leagues,
            'Home team': homes,
            'Time': times,
            'Away team': aways
        }
    )

except TimeoutException:
    print("Timed out waiting for cookie pop-up or other elements")
finally:
    # Close the browser, but only if one was actually started
    if driver is not None:
        driver.quit()

In [None]:
# Display the scraped fixtures.
# NOTE(review): if the scrape above timed out, `df_fixtures` was not assigned by that cell.
df_fixtures

In [None]:
# Show the base URL that was scraped for the selected team
team_url

### Try the matches tab scrape

In [None]:
# URL of the team's matches tab: the base team URL with 'matches/' appended
team_url_m = f"{team_url}matches/"
team_url_m

In [None]:
# Location of the ChromeDriver binary, read from the `chrome_driver_path`
# environment variable (None when the variable is not set)
chrome_driver_path = os.environ.get('chrome_driver_path')

In [None]:
# Scrape the first "matches" table on the team's matches tab into `df_fixtures`.
# `driver` is bound before the try block so the cleanup in `finally` cannot
# raise NameError (and mask the real error) when Chrome fails to start.
driver = None
try:

    # Setup WebDriver
    service = Service(chrome_driver_path)  # Use the path from environment variable
    driver = webdriver.Chrome(service=service)

    # Open the page
    driver.get(team_url_m)

    # Dismiss the cookie banner; if it never becomes clickable within 5 seconds
    # a TimeoutException is raised and the scrape below is skipped
    WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.XPATH, "//button[text()='Reject All']"))
    ).click()

    # Parse the page source as rendered by the browser
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Only the first table with class "matches" is read
    table = soup.find_all('table', class_='matches')
    matches_table = table[0]
    team_cells = matches_table.find_all('td', class_="team")  # queried once, sliced twice below

    dates = [cell.text for cell in matches_table.find_all('td', class_="full-date")]
    leagues = [cell.text.strip() for cell in matches_table.find_all('td', class_="competition")]
    # presumably "team" cells alternate home, away within each fixture row — verify against the page markup
    homes = [cell.text.strip() for cell in team_cells[::2]]
    aways = [cell.text.strip() for cell in team_cells[1::2]]
    times = [cell.text.strip() for cell in matches_table.find_all('td', class_="score-time")]

    # Create dataframes
    df_fixtures = pd.DataFrame(
        {
            'Date': dates,
            'League': leagues,
            'Home team': homes,
            'Time': times,
            'Away team': aways
        }
    )

except TimeoutException:
    print("Timed out waiting for cookie pop-up or other elements")
finally:
    # Close the browser, but only if one was actually started
    if driver is not None:
        driver.quit()

In [None]:
# Preview the first rows scraped from the matches tab
df_fixtures.head()

### Loop through each soccer team for df

In [None]:
# Map each team name to the URL of its matches tab (base URL + 'matches/')
team_url_dict = {
    team_name: base_url + 'matches/'
    for team_name, base_url in zip(fb_data['team'], fb_data['url'])
}

In [None]:
# # Get the first three key-value pairs
# first_three = {k: team_url_dict[k] for k in list(team_url_dict.keys())[:3]}
# first_three

In [None]:
# Location of the ChromeDriver binary, read from the `chrome_driver_path`
# environment variable (None when the variable is not set)
chrome_driver_path = os.environ.get('chrome_driver_path')

In [None]:
# Scrape every team's matches tab and stack the results into `df_fb_master`.
# Per-team frames are collected in a list and concatenated once, so re-running
# this cell rebuilds the master frame instead of appending to whatever
# `df_fb_master` a previous run left in the kernel.
fixture_frames = []

for team, url in team_url_dict.items():

    driver = None  # bound up front so `finally` is safe even if Chrome fails to start
    try:

        # Setup WebDriver (a fresh browser is started for every team)
        service = Service(chrome_driver_path)  # Use the path from environment variable
        driver = webdriver.Chrome(service=service)

        # Open the page
        driver.get(url)

        # Dismiss the cookie banner; a timeout here skips this team
        WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//button[text()='Reject All']"))
        ).click()

        # Parse the page source as rendered by the browser
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Only the first table with class "matches" is read
        table = soup.find_all('table', class_='matches')
        matches_table = table[0]
        team_cells = matches_table.find_all('td', class_="team")  # queried once, sliced twice below

        dates = [cell.text for cell in matches_table.find_all('td', class_="full-date")]
        leagues = [cell.text.strip() for cell in matches_table.find_all('td', class_="competition")]
        # presumably "team" cells alternate home, away within each fixture row — verify against the page markup
        homes = [cell.text.strip() for cell in team_cells[::2]]
        aways = [cell.text.strip() for cell in team_cells[1::2]]
        times = [cell.text.strip() for cell in matches_table.find_all('td', class_="score-time")]

        # One frame per team; the scalar `team` is repeated on every row
        df_team_fixtures = pd.DataFrame(
            {
                'Team': team,
                'Date': dates,
                'League': leagues,
                'Home Team': homes,
                'Time': times,
                'Away Team': aways
            }
        )

        fixture_frames.append(df_team_fixtures)

    except TimeoutException:
        # Name the team so a skipped scrape can be retried individually
        print(f"Timed out waiting for cookie pop-up or other elements ({team})")

    finally:
        # Close the browser, but only if one was actually started
        if driver is not None:
            driver.quit()

if fixture_frames:
    df_fb_master = pd.concat(fixture_frames, ignore_index=True)
else:
    # Nothing was scraped: keep the expected columns so the cells below still run
    df_fb_master = pd.DataFrame(
        columns=['Team', 'Date', 'League', 'Home Team', 'Time', 'Away Team']
    )

In [None]:
# Display the combined fixtures for all scraped teams
df_fb_master

In [None]:
# Number of fixture rows scraped per team, largest first
df_fb_master.Team.value_counts()

In [None]:
# Same per-team row counts, ordered by team name instead of by count
df_fb_master.groupby('Team').size()

In [None]:
# Save the combined soccer fixtures to the working directory, without the index column.
# NOTE(review): the date in the filename is hard-coded.
df_fb_master.to_csv('df_fb_master_2024_10_28.csv',index=False)

## NBA

In [None]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np

import requests

from datetime import datetime

import time

### Init vars

In [None]:
# NBA team lookup table; retrieve_schedule reads its `abbr` and `team` columns.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
nba_abbr_data = pd.read_csv(r"C:\Users\Owner\Documents\Data Projects\GitHub\Apps\project_w\data\nba_team_abbr.csv")
# Date on which the notebook is run; get_game_time_status compares game dates against it.
today = datetime.now().date()

### Init helper fns

In [None]:
def get_game_time_status(date):
    """Label a game as 'Upcoming' or 'Past' relative to the module-level `today`.

    `date` must expose a `.date()` method (e.g. a pandas Timestamp). A game
    dated today counts as 'Upcoming'.
    """
    return 'Upcoming' if date.date() >= today else 'Past'


def get_season(row_number):
    """Classify a game by its zero-based position in the team's schedule.

    Positions 82 and above are labelled 'playoffs', everything before that
    'regular season'.
    """
    # presumably 82 is the number of regular-season games — TODO confirm
    return 'playoffs' if row_number >= 82 else 'regular season'

### Build scraper fn

In [None]:
# Get the schedule from nba ref
def retrieve_schedule(team_abbr):
    """Fetch one NBA team's 2025 schedule from basketball-reference.com.

    Args:
        team_abbr: Team abbreviation used in the site's URL. It must also
            appear in the `abbr` column of `nba_abbr_data`.

    Returns:
        A DataFrame of the team's games with the added columns
        `computer_date`, `user_team`, `GameTimeStatus` and `game_type`,
        or None (after printing a message) when the HTTP status is not 200.

    Raises:
        IndexError: if `team_abbr` has no row in `nba_abbr_data`.
    """
    from io import StringIO  # local import keeps this cell self-contained

    # URL of the website
    url = f"https://www.basketball-reference.com/teams/{team_abbr}/2025_games.html"

    headers = {
        "User-Agent": "YourAppName/1.0 (https://yourwebsite.com; contact@yourwebsite.com)"
    }

    # Send a GET request to the URL
    response = requests.get(url, headers=headers)

    # Bail out early when the request was not successful
    if response.status_code != 200:
        print(f"Error retrieving data for {team_abbr}")
        return None

    # Parse the HTML that was already downloaded instead of requesting the URL
    # a second time; this also makes the pause between the two requests unnecessary
    dfs_list = pd.read_html(StringIO(response.text))

    # Prep data
    drop_cols = ['Unnamed: 3','Unnamed: 4']
    col_mapping = {
        'Unnamed: 5': 'Home/Away',
        'Unnamed: 7': 'Result',
        'Unnamed: 8': 'OT'
    }

    # Organize
    df = pd.concat(dfs_list, ignore_index=True).drop(columns=drop_cols).rename(columns=col_mapping)
    df['Home/Away'] = df['Home/Away'].fillna("vs.")  # a blank marker is treated as "vs."
    df = df[df['Date'] != 'Date'].reset_index(drop=True)  # drop repeated header rows
    df['computer_date'] = pd.to_datetime(df['Date'], format='%a, %b %d, %Y')
    df['user_team'] = nba_abbr_data.loc[nba_abbr_data['abbr'] == team_abbr, 'team'].iloc[0]

    # Create new cols from fns
    df['GameTimeStatus'] = df['computer_date'].apply(get_game_time_status)
    df['game_type'] = df.index.to_series().apply(get_season)
    # Rows flagged in the Notes column override the position-based label
    df['game_type'] = np.where(df['Notes'] == 'In-Season Tournament', 'In-Season Tournament', df['game_type'])

    return df

In [None]:
# pd.read_html("https://www.basketball-reference.com/teams/ATL/2025_games.html")

### Loop through each nba team

In [None]:
# Scrape every NBA team and stack the schedules into `df_nba_master`.
# Frames are collected in a list and concatenated once, so re-running this
# cell rebuilds the master frame instead of appending to a previous run's copy.
nba_frames = []

for nba_team in nba_abbr_data['abbr']:

    # Scrape
    nba_team_sch = retrieve_schedule(nba_team)

    # retrieve_schedule returns None when the request fails; skip those teams
    if nba_team_sch is not None:
        nba_frames.append(nba_team_sch)

    # Pause for n seconds to address HTTPError: HTTP Error 429: Too Many Requests
    time.sleep(4)

# An empty frame is used when no team could be scraped
df_nba_master = pd.concat(nba_frames, ignore_index=True) if nba_frames else pd.DataFrame()

In [None]:
# Sanity check: rows per team, followed by the number of distinct teams scraped
print(df_nba_master.user_team.value_counts(), len(df_nba_master.user_team.value_counts()))

In [None]:
# pho_team_sch = retrieve_schedule('PHO')
# df_nba_master = pd.concat([df_nba_master, pho_team_sch], ignore_index=True)

In [None]:
# Save the combined NBA schedules to the working directory, without the index column.
# NOTE(review): the date in the filename is hard-coded.
df_nba_master.to_csv('df_nba_master_2024_10_29.csv',index=False)

## NHL

In [None]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np

import requests

from datetime import datetime

import time

### Init vars

In [None]:
# NHL team lookup table; retrieve_nhl_schedule reads its `abbr` and `team` columns.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
nhl_abbr_data = pd.read_csv(r"C:\Users\Owner\Documents\Data Projects\GitHub\Apps\project_w\data\nhl_team_abbr.csv")
# Date on which the notebook is run; get_game_time_status compares game dates against it.
today = datetime.now().date()

### Init helper fns

In [None]:
# Define a function to apply the condition
def get_game_time_status(date):
    if date.date() >= today:
        return 'Upcoming'
    else:
        return 'Past'
    
# Define a function to apply the condition
def get_season(row_number):
    if row_number >= 82:
        return 'playoffs'
    else:
        return 'regular season'

### Build scraper fn

In [None]:
# Get the schedule from hockey-reference
def retrieve_nhl_schedule(team_abbr):
    """Fetch one NHL team's 2025 schedule from hockey-reference.com.

    Args:
        team_abbr: Team abbreviation used in the site's URL. It must also
            appear in the `abbr` column of `nhl_abbr_data`.

    Returns:
        A DataFrame of the team's games with `Date` reformatted as
        '%a, %b %d, %Y' and the added columns `computer_date`, `user_team`,
        `GameTimeStatus` and `game_type`, or None (after printing a message)
        when the HTTP status is not 200.

    Raises:
        IndexError: if `team_abbr` has no row in `nhl_abbr_data`.
    """
    from io import StringIO  # local import keeps this cell self-contained

    # URL of the website
    url = f"https://www.hockey-reference.com/teams/{team_abbr}/2025_games.html"

    headers = {
        "User-Agent": "YourAppName/1.0 (https://yourwebsite.com; contact@yourwebsite.com)"
    }

    # Send a GET request to the URL
    response = requests.get(url, headers=headers)

    # Bail out early when the request was not successful
    if response.status_code != 200:
        print(f"Error retrieving data for {team_abbr}")
        return None

    # Parse the HTML that was already downloaded instead of requesting the URL
    # a second time; this also makes the pause between the two requests unnecessary
    dfs_list = pd.read_html(StringIO(response.text))

    # Prep data
    col_mapping = {
        'Unnamed: 3': 'Home/Away',
        'Unnamed: 7': 'Result',
        'Unnamed: 8': 'OT'
    }

    # Organize
    df = pd.concat(dfs_list, ignore_index=True).rename(columns=col_mapping)
    df['Home/Away'] = df['Home/Away'].fillna("vs.")  # a blank marker is treated as "vs."
    df = df[df['Date'] != 'Date'].reset_index(drop=True)  # drop repeated header rows
    df['computer_date'] = pd.to_datetime(df['Date'])
    df['Date'] = df['computer_date'].dt.strftime('%a, %b %d, %Y')
    df['user_team'] = nhl_abbr_data.loc[nhl_abbr_data['abbr'] == team_abbr, 'team'].iloc[0]

    # Create new cols from fns
    df['GameTimeStatus'] = df['computer_date'].apply(get_game_time_status)
    df['game_type'] = df.index.to_series().apply(get_season)

    return df

### Loop through each nhl team

In [None]:
# Scrape every NHL team and stack the schedules into `df_nhl_master`.
# Frames are collected in a list and concatenated once, so re-running this
# cell rebuilds the master frame instead of appending to a previous run's copy.
nhl_frames = []

for nhl_team in nhl_abbr_data['abbr']:

    # Scrape
    nhl_team_sch = retrieve_nhl_schedule(nhl_team)

    # retrieve_nhl_schedule returns None when the request fails; skip those teams
    if nhl_team_sch is not None:
        nhl_frames.append(nhl_team_sch)

    # Pause for n seconds to address HTTPError: HTTP Error 429: Too Many Requests
    time.sleep(4)

# An empty frame is used when no team could be scraped
df_nhl_master = pd.concat(nhl_frames, ignore_index=True) if nhl_frames else pd.DataFrame()

In [None]:
# Sanity check: rows per team, followed by the number of distinct teams scraped
print(df_nhl_master.user_team.value_counts(), len(df_nhl_master.user_team.value_counts()))

In [None]:
# veg_team_sch = retrieve_nhl_schedule('VEG')
# df_nhl_master = pd.concat([df_nhl_master, veg_team_sch], ignore_index=True)

In [None]:
# Save the combined NHL schedules to the working directory, without the index column.
# NOTE(review): the date in the filename is hard-coded.
df_nhl_master.to_csv('df_nhl_master_2024_10_30.csv',index=False)

In [None]:
# Spot check: upcoming games for a single team
df_nhl_master[(df_nhl_master['user_team'] == 'Colorado Avalanche') & (df_nhl_master['GameTimeStatus'] == 'Upcoming')]

## NFL (reference)

In [None]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np

import requests

from datetime import datetime

import time

import re

### Init vars

In [None]:
# NFL team lookup table; retrieve_nfl_schedule reads its `abbr` and `team` columns.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
nfl_abbr_data = pd.read_csv(r"C:\Users\Owner\Documents\Data Projects\GitHub\Apps\project_w\data\nfl_team_abbr.csv")
# Date on which the notebook is run
today = datetime.now().date()

### Build scraper fn

In [None]:
# Get the game log from pro-football-reference
def retrieve_nfl_schedule(team_abbr):
    """Fetch one NFL team's 2024 game log from pro-football-reference.com.

    Args:
        team_abbr: Team abbreviation used in the site's URL. It must also
            appear in the `abbr` column of `nfl_abbr_data`.

    Returns:
        A DataFrame built from every table on the page, with flattened column
        names and the added columns `user_team` and `game_type`, or None
        (after printing a message) when the HTTP status is not 200.

    Raises:
        IndexError: if `team_abbr` has no row in `nfl_abbr_data`.
    """
    from io import StringIO  # local import keeps this cell self-contained

    # URL of the website
    url = f"https://www.pro-football-reference.com/teams/{team_abbr}/2024/gamelog/"

    headers = {
        "User-Agent": "YourAppName/1.0 (https://yourwebsite.com; contact@yourwebsite.com)"
    }

    # Send a GET request to the URL
    response = requests.get(url, headers=headers)

    # Bail out early when the request was not successful
    if response.status_code != 200:
        print(f"Error retrieving data for {team_abbr}")
        return None

    # Parse the HTML that was already downloaded instead of requesting the URL
    # a second time; this also makes the pause between the two requests unnecessary
    dfs_list = pd.read_html(StringIO(response.text))

    # Concatenate all DataFrames in dfs_list
    df = pd.concat(dfs_list, ignore_index=True)

    # Flatten the multi-level columns: join the levels, strip the "Unnamed: " /
    # "_level_" placeholders, then rename by the digits that are left over.
    # NOTE(review): the digit replacements apply to every column name and depend
    # on the page's exact column layout — re-check if the site changes.
    df.columns = [
        re.sub(r'\d+\s', '', ' '.join(col).strip().replace("Unnamed: ", "").replace("_level_", "")).replace("0", "").replace("1", "").replace("6","Home/Away").replace("4","Result")
        for col in df.columns
    ]

    df = df.drop(columns='3')

    # Organize
    df['Home/Away'] = df['Home/Away'].fillna("vs.")  # a blank marker is treated as "vs."
    df['user_team'] = nfl_abbr_data.loc[nfl_abbr_data['abbr'] == team_abbr, 'team'].iloc[0]
    df['game_type'] = "regular season"

    return df

### Loop through each nfl team

In [None]:
# Scrape every NFL team and stack the game logs into `df_nfl_master`.
# Frames are collected in a list and concatenated once, so re-running this
# cell rebuilds the master frame instead of appending to a previous run's copy.
nfl_frames = []

for nfl_team in nfl_abbr_data['abbr']:

    # Scrape
    nfl_team_sch = retrieve_nfl_schedule(nfl_team)

    # retrieve_nfl_schedule returns None when the request fails; skip those teams
    if nfl_team_sch is not None:
        nfl_frames.append(nfl_team_sch)

    # Pause for n seconds to address HTTPError: HTTP Error 429: Too Many Requests
    time.sleep(4)

# An empty frame is used when no team could be scraped
df_nfl_master = pd.concat(nfl_frames, ignore_index=True) if nfl_frames else pd.DataFrame()

In [None]:
# Sanity check: rows per team, followed by the number of distinct teams scraped
print(df_nfl_master.user_team.value_counts(), len(df_nfl_master.user_team.value_counts()))

In [None]:
# Save the combined NFL game logs to the working directory, without the index column.
# NOTE(review): the date in the filename is hard-coded.
df_nfl_master.to_csv('df_nfl_master_2024_10_31.csv',index=False)

## NFL (espn)

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np

import requests

from datetime import datetime

import time

import re

### Init vars

In [2]:
# ESPN team lookup table; retrieve_espn_nfl_schedule reads its `abbr` and `team` columns.
# NOTE(review): this rebinds `nfl_abbr_data`, the name the pro-football-reference
# section also assigns; the path is absolute and machine-specific.
nfl_abbr_data = pd.read_csv(r"C:\Users\Owner\Documents\Data Projects\GitHub\Apps\project_w\data\nfl_espn_team_abbr.csv")
# today = datetime.now().date()

### Build scraper fn

In [None]:
# Get the schedule from ESPN
def retrieve_espn_nfl_schedule(team_abbr):
    """Fetch one NFL team's schedule rows from its ESPN schedule page.

    Only the rows between the "WK / DATE / OPPONENT / TIME" header row and the
    "Preseason" row of the first table on the page are kept.

    Args:
        team_abbr: Team abbreviation used in the ESPN URL. It must also
            appear in the `abbr` column of `nfl_abbr_data`.

    Returns:
        A DataFrame with bye weeks removed and the added columns `Home/Away`,
        `user_team` and `game_type`, or None (after printing a message) when
        the HTTP status is not 200.

    Raises:
        IndexError: if the header row or the "Preseason" row is not found, or
            if `team_abbr` has no row in `nfl_abbr_data`.
    """
    from io import StringIO  # local import keeps this cell self-contained

    # URL of the website
    url = f"https://www.espn.com/nfl/team/schedule/_/name/{team_abbr}"

    headers = {
        "User-Agent": "YourAppName/1.0 (https://yourwebsite.com; contact@yourwebsite.com)"
    }

    # Send a GET request to the URL
    response = requests.get(url, headers=headers)

    # Bail out early when the request was not successful
    if response.status_code != 200:
        print(f"Error retrieving data for {team_abbr}")
        return None

    # Parse the HTML that was already downloaded instead of requesting the URL
    # a second time; this also makes the pause between the two requests unnecessary
    df = pd.read_html(StringIO(response.text))[0]

    # Find the index of the row that starts with "WK", "DATE", etc.
    start_row = df[(df[0] == "WK") & (df[1] == "DATE") & (df[2] == "OPPONENT") & (df[3] == "TIME")].index[0]

    # Find the index of the row that starts with "Preseason"
    end_row = df[df[0] == "Preseason"].index[0]

    # Keep the header row and everything up to (not including) the "Preseason" row
    df = df.iloc[start_row:end_row].reset_index(drop=True)
    df.columns = df.iloc[0]  # Set the first row of the selection as column headers
    df = df[1:]  # Drop the header row itself

    # Filter out rows where any cell has "BYE WEEK"
    df = df[~df.eq("BYE WEEK").any(axis=1)]

    # Drop the last two columns
    df = df.iloc[:, :-2].reset_index(drop=True)

    # Extract "vs" or "@" into a new column "Home/Away"
    df['Home/Away'] = df['OPPONENT'].str.extract(r'(vs|@)')
    df['Home/Away'] = df['Home/Away'].replace('vs', 'vs.')

    # Remove "vs" or "@" from the "OPPONENT" column and strip whitespace
    df['OPPONENT'] = df['OPPONENT'].str.replace(r'vs |@ ', '', regex=True).str.strip()

    ## Organize
    df['user_team'] = nfl_abbr_data.loc[nfl_abbr_data['abbr'] == team_abbr, 'team'].iloc[0]
    df['game_type'] = "regular season"
    df = df.drop(columns=df.filter(regex="^tickets$").columns)
    df['TIME'] = df['TIME'].replace("TBD", pd.NA)

    # Blank out dates that contain "TBD"; na=False keeps the mask boolean
    # when a DATE cell is already missing
    df.loc[df['DATE'].str.contains('TBD', na=False), 'DATE'] = np.nan

    return df

### Loop through each nfl team

In [4]:
# Scrape every NFL team from ESPN and stack the schedules into `df_nfl_master`.
# The frame is rebuilt from scratch on every run: `df_nfl_master` is also the
# name used by the pro-football-reference section, so appending to an existing
# variable would mix rows from both sources (and duplicate rows on a re-run).
espn_nfl_frames = []

for nfl_team in nfl_abbr_data['abbr']:

    # Scrape
    nfl_team_sch = retrieve_espn_nfl_schedule(nfl_team)

    # retrieve_espn_nfl_schedule returns None when the request fails; skip those teams
    if nfl_team_sch is not None:
        espn_nfl_frames.append(nfl_team_sch)

    # Pause for n seconds to address HTTPError: HTTP Error 429: Too Many Requests
    time.sleep(4)

# An empty frame is used when no team could be scraped
df_nfl_master = pd.concat(espn_nfl_frames, ignore_index=True) if espn_nfl_frames else pd.DataFrame()

In [5]:
# Sanity check: rows per team, followed by the number of distinct teams scraped
print(df_nfl_master.user_team.value_counts(), len(df_nfl_master.user_team.value_counts()))

Miami Dolphins           10
Dallas Cowboys           10
Kansas City Chiefs       10
Philadelphia Eagles      10
Detroit Lions            10
Los Angeles Chargers     10
Chicago Bears            10
Los Angeles Rams         10
Tennessee Titans         10
Minnesota Vikings        10
Seattle Seahawks          9
San Francisco 49ers       9
New England Patriots      9
Pittsburgh Steelers       9
Tampa Bay Buccaneers      9
Las Vegas Raiders         9
New York Giants           9
New Orleans Saints        9
Arizona Cardinals         9
Atlanta Falcons           9
Jacksonville Jaguars      9
Indianapolis Colts        9
Green Bay Packers         9
Denver Broncos            9
Cleveland Browns          9
Cincinnati Bengals        9
Carolina Panthers         9
Buffalo Bills             9
Baltimore Ravens          9
Washington Commanders     9
New York Jets             8
Houston Texans            8
Name: user_team, dtype: int64 32


In [6]:
# Compare the teams that were scraped with the teams in the lookup table
user_team_set = set(df_nfl_master['user_team'].unique())
abbr_team_set = set(nfl_abbr_data['team'].unique())

# Names present on one side only
only_in_user_team = user_team_set.difference(abbr_team_set)
only_in_abbr_team = abbr_team_set.difference(user_team_set)

# Report both directions; two empty sets mean every team was scraped
print("Teams in `df_nfl_master.user_team` but not in `nfl_abbr_data.team`:", only_in_user_team)
print("Teams in `nfl_abbr_data.team` but not in `df_nfl_master.user_team`:", only_in_abbr_team)


Teams in `df_nfl_master.user_team` but not in `nfl_abbr_data.team`: set()
Teams in `nfl_abbr_data.team` but not in `df_nfl_master.user_team`: set()


In [16]:
# Normalise any remaining 'vs' markers to 'vs.'; retrieve_espn_nfl_schedule
# already applies the same replacement to each team's frame.
df_nfl_master['Home/Away'] = df_nfl_master['Home/Away'].replace('vs', 'vs.')

In [18]:
# Save the combined ESPN NFL schedules to the working directory, without the index column.
# NOTE(review): the date in the filename is hard-coded.
df_nfl_master.to_csv('df_nfl_master_2024_11_01.csv',index=False)

In [17]:
# Display the combined ESPN-based NFL schedule
df_nfl_master

Unnamed: 0,WK,DATE,OPPONENT,TIME,TV,Home/Away,user_team,game_type
0,9,"Sun, Nov 3",Chicago,4:05 PM,CBS,vs.,Arizona Cardinals,regular season
1,10,"Sun, Nov 10",New York,4:25 PM,CBS,vs.,Arizona Cardinals,regular season
2,12,"Sun, Nov 24",Seattle,4:25 PM,FOX,@,Arizona Cardinals,regular season
3,13,"Sun, Dec 1",Minnesota,1:00 PM,FOX,@,Arizona Cardinals,regular season
4,14,"Sun, Dec 8",Seattle,4:05 PM,CBS,vs.,Arizona Cardinals,regular season
...,...,...,...,...,...,...,...,...
291,13,"Sun, Dec 1",Tennessee,1:00 PM,CBS,vs.,Washington Commanders,regular season
292,15,"Sun, Dec 15",New Orleans,1:00 PM,FOX,@,Washington Commanders,regular season
293,16,"Sun, Dec 22",Philadelphia,1:00 PM,FOX,vs.,Washington Commanders,regular season
294,17,,Atlanta,,,vs.,Washington Commanders,regular season
