# The goal here is to input any team, search through a list, and return that team rather than having disparate classes for each one

## Soccer

In [12]:
import pandas as pd

import os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options

from bs4 import BeautifulSoup

In [13]:
fb_data = pd.read_csv(r"C:\Users\Owner\Documents\Data Projects\GitHub\Apps\project_w\data\football_data.csv")

In [14]:
# Pick the team to look up and fetch its base URL from the football data.
team = 'Aston Villa'
team_url_matches = fb_data.loc[fb_data['team'] == team, 'url']
if team_url_matches.empty:
    # Fail with the team name in the message instead of a bare IndexError
    raise LookupError(f"No row found for team {team!r} in fb_data")
# If the team appears more than once, the first row wins (as before)
team_url = team_url_matches.iloc[0]

### test scraping

In [15]:
# Get the ChromeDriver path from your environment variable
chrome_driver_path = os.getenv('chrome_driver_path')

In [18]:
# Scrape the fixtures table from the team's page into `df_fixtures`.
# `driver` is pre-set to None so the `finally` clause only quits a browser that
# this cell actually started; without it, a failure inside Service() or
# webdriver.Chrome() would be masked by a NameError (or would quit a stale
# driver left over from an earlier cell).
driver = None
try:

    # Setup WebDriver
    service = Service(chrome_driver_path)  # Use the path from environment variable
    driver = webdriver.Chrome(service=service)

    # Open the page
    driver.get(team_url)

    # Wait until the cookie popup is present and clickable, then dismiss it
    WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.XPATH, "//button[text()='Reject All']"))
    ).click()

    # Parse the rendered page source with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Find the matches table; fail with a descriptive error instead of a bare
    # IndexError when the page has none
    table = soup.find_all('table', class_='matches')
    if not table:
        raise ValueError(f"No table with class 'matches' found at {team_url}")
    matches_table = table[0]

    dates = [cell.text for cell in matches_table.find_all('td', class_="full-date")]
    leagues = [cell.text.strip() for cell in matches_table.find_all('td', class_="competition")]
    # Even-indexed team cells are treated as home teams, odd-indexed as away
    # teams; look the cells up once and split them by parity
    team_cells = matches_table.find_all('td', class_="team")
    homes = [cell.text.strip() for cell in team_cells[::2]]
    aways = [cell.text.strip() for cell in team_cells[1::2]]
    times = [cell.text.strip() for cell in matches_table.find_all('td', class_="score-time")]

    # Create dataframes
    df_fixtures = pd.DataFrame(
        {
            'Date': dates,
            'League': leagues,
            'Home team': homes,
            'Time': times,
            'Away team': aways
        }
    )

except TimeoutException:
    print("Timed out waiting for cookie pop-up or other elements")
finally:
    # Close the browser (only if one was actually opened)
    if driver is not None:
        driver.quit()

In [19]:
df_fixtures

Unnamed: 0,Date,League,Home team,Time,Away team
0,02/10/24,UCL,Aston Villa,1 - 0,Bayern Munich
1,06/10/24,PRL,Aston Villa,0 - 0,Manchester United
2,19/10/24,PRL,Fulham,1 - 3,Aston Villa
3,22/10/24,UCL,Aston Villa,2 - 0,Bologna
4,26/10/24,PRL,Aston Villa,1 - 1,AFC Bournemouth
5,30/10/24,LEC,Aston Villa,13 : 45,Crystal Palace
6,03/11/24,PRL,Tottenham Hotspur,07 : 00,Aston Villa
7,06/11/24,UCL,Club Brugge,10 : 45,Aston Villa
8,09/11/24,PRL,Liverpool,13 : 00,Aston Villa
9,23/11/24,PRL,Aston Villa,08 : 00,Crystal Palace


In [6]:
team_url

'https://ng.soccerway.com//teams/england/aston-villa-football-club/665/'

### Try the matches tab scrape

In [7]:
team_url_m = team_url + 'matches/'
team_url_m

'https://ng.soccerway.com//teams/england/aston-villa-football-club/665/matches/'

In [8]:
# Get the ChromeDriver path from your environment variable
chrome_driver_path = os.getenv('chrome_driver_path')

In [9]:
# Scrape the fixtures table from the team's matches tab into `df_fixtures`.
# `driver` is pre-set to None so the `finally` clause only quits a browser that
# this cell actually started; without it, a failure inside Service() or
# webdriver.Chrome() would be masked by a NameError (or would quit a stale
# driver left over from an earlier cell).
driver = None
try:

    # Setup WebDriver
    service = Service(chrome_driver_path)  # Use the path from environment variable
    driver = webdriver.Chrome(service=service)

    # Open the page
    driver.get(team_url_m)

    # Wait until the cookie popup is present and clickable, then dismiss it
    WebDriverWait(driver, 5).until(
        EC.element_to_be_clickable((By.XPATH, "//button[text()='Reject All']"))
    ).click()

    # Parse the rendered page source with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Find the matches table; fail with a descriptive error instead of a bare
    # IndexError when the page has none
    table = soup.find_all('table', class_='matches')
    if not table:
        raise ValueError(f"No table with class 'matches' found at {team_url_m}")
    matches_table = table[0]

    dates = [cell.text for cell in matches_table.find_all('td', class_="full-date")]
    leagues = [cell.text.strip() for cell in matches_table.find_all('td', class_="competition")]
    # Even-indexed team cells are treated as home teams, odd-indexed as away
    # teams; look the cells up once and split them by parity
    team_cells = matches_table.find_all('td', class_="team")
    homes = [cell.text.strip() for cell in team_cells[::2]]
    aways = [cell.text.strip() for cell in team_cells[1::2]]
    times = [cell.text.strip() for cell in matches_table.find_all('td', class_="score-time")]

    # Create dataframes
    df_fixtures = pd.DataFrame(
        {
            'Date': dates,
            'League': leagues,
            'Home team': homes,
            'Time': times,
            'Away team': aways
        }
    )

except TimeoutException:
    print("Timed out waiting for cookie pop-up or other elements")
finally:
    # Close the browser (only if one was actually opened)
    if driver is not None:
        driver.quit()

In [20]:
df_fixtures.head()

Unnamed: 0,Date,League,Home team,Time,Away team
0,02/05/24,UCL,Aston Villa,2 - 4,Olympiakos
1,05/05/24,PRL,Brighton & Hove Albion,1 - 0,Aston Villa
2,09/05/24,UCL,Olympiakos,2 - 0,Aston Villa
3,13/05/24,PRL,Aston Villa,3 - 3,Liverpool
4,19/05/24,PRL,Crystal Palace,5 - 0,Aston Villa


### Loop through each soccer team for df

In [3]:
# Map every team name to the URL of its matches tab, i.e. the base URL from
# the football data with 'matches/' appended
team_url_dict = {
    team_name: base_url + 'matches/'
    for team_name, base_url in fb_data.set_index('team')['url'].items()
}

In [4]:
# # Get the first three key-value pairs
# first_three = {k: team_url_dict[k] for k in list(team_url_dict.keys())[:3]}
# first_three

In [5]:
# Get the ChromeDriver path from your environment variable
chrome_driver_path = os.getenv('chrome_driver_path')

In [6]:
# Scrape every team's matches tab and stack the results into `df_fb_master`.
# Per-team frames are collected in `team_frames` and concatenated once after
# the loop: re-running the cell rebuilds the master frame from scratch instead
# of appending duplicates to a leftover one, and it avoids re-copying the
# growing frame on every iteration. If the loop is interrupted, the frames
# scraped so far are still available in `team_frames`.
team_frames = []

for team, url in team_url_dict.items():

    # Pre-set so `finally` only quits a browser this iteration actually opened
    driver = None
    try:

        # Setup WebDriver
        service = Service(chrome_driver_path)  # Use the path from environment variable
        driver = webdriver.Chrome(service=service)

        # Open the page
        driver.get(url)

        # Wait until the cookie popup is present and clickable, then dismiss it
        WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//button[text()='Reject All']"))
        ).click()

        # Parse the rendered page source with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Find the matches table; skip this team (with a message) rather than
        # aborting the whole run on an IndexError when the page has none
        table = soup.find_all('table', class_='matches')
        if not table:
            print(f"No matches table found for {team}; skipping")
            continue
        matches_table = table[0]

        dates = [cell.text for cell in matches_table.find_all('td', class_="full-date")]
        leagues = [cell.text.strip() for cell in matches_table.find_all('td', class_="competition")]
        # Even-indexed team cells are treated as home teams, odd-indexed as
        # away teams; look the cells up once and split them by parity
        team_cells = matches_table.find_all('td', class_="team")
        homes = [cell.text.strip() for cell in team_cells[::2]]
        aways = [cell.text.strip() for cell in team_cells[1::2]]
        times = [cell.text.strip() for cell in matches_table.find_all('td', class_="score-time")]

        # Create dataframes
        df_team_fixtures = pd.DataFrame(
            {
                'Team': team,
                'Date': dates,
                'League': leagues,
                'Home Team': homes,
                'Time': times,
                'Away Team': aways
            }
        )

        ## Queue for the master df
        team_frames.append(df_team_fixtures)

    except TimeoutException:
        # Name the team so a failure among many scraped pages can be traced
        print(f"Timed out waiting for cookie pop-up or other elements ({team})")

    finally:
        # Close the browser (only if one was actually opened)
        if driver is not None:
            driver.quit()

# Build the master df in one go; fall back to an empty frame with the expected
# columns when nothing was scraped (pd.concat raises on an empty list)
if team_frames:
    df_fb_master = pd.concat(team_frames, ignore_index=True)
else:
    df_fb_master = pd.DataFrame(
        columns=['Team', 'Date', 'League', 'Home Team', 'Time', 'Away Team']
    )

In [7]:
df_fb_master

Unnamed: 0,Team,Date,League,Home Team,Time,Away Team
0,Arsenal,17/04/24,UCL,Bayern Munich,1 - 0,Arsenal
1,Arsenal,20/04/24,PRL,Wolverhampton Wanderers,0 - 2,Arsenal
2,Arsenal,23/04/24,PRL,Arsenal,5 - 0,Chelsea
3,Arsenal,28/04/24,PRL,Tottenham Hotspur,2 - 3,Arsenal
4,Arsenal,04/05/24,PRL,Arsenal,3 - 0,AFC Bournemouth
...,...,...,...,...,...,...
4895,Elche,22/03/25,SED,Elche,-,Eldense
4896,Elche,29/03/25,SED,Córdoba,-,Elche
4897,Elche,05/04/25,SED,Elche,-,Racing Ferrol
4898,Elche,12/04/25,SED,Cadiz,-,Elche


In [8]:
df_fb_master.Team.value_counts()

Arsenal        50
Salernitana    50
Lecce          50
Monza          50
Empoli         50
               ..
Reims          50
Nice           50
Lyon           50
Clermont       50
Elche          50
Name: Team, Length: 98, dtype: int64

In [9]:
df_fb_master.groupby('Team').size()

Team
AC Milan           50
AFC Bournemouth    50
Ajaccio            50
Almeria            50
Angers             50
                   ..
Villarreal         50
Werder Bremen      50
West Ham United    50
Wolfsburg          50
Wolverhampton …    50
Length: 98, dtype: int64

In [11]:
df_fb_master.to_csv('df_fb_master_2024_10_28.csv',index=False)

## NBA

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np

import requests

from datetime import datetime

import time

### Init vars

In [11]:
nba_abbr_data = pd.read_csv(r"C:\Users\Owner\Documents\Data Projects\GitHub\Apps\project_w\data\nba_team_abbr.csv")
today = datetime.now().date()

### Init helper fns

In [3]:
# Define a function to apply the condition
def get_game_time_status(date):
    """Label a game date as 'Upcoming' or 'Past' relative to the module-level `today`.

    `date` must expose a `.date()` method; a game whose date equals `today`
    counts as 'Upcoming'.
    """
    return 'Upcoming' if date.date() >= today else 'Past'
    
# Define a function to apply the condition
def get_season(row_number):
    """Return 'playoffs' for row numbers of 82 and above, else 'regular season'."""
    return 'playoffs' if row_number >= 82 else 'regular season'

### Build scraper fn

In [4]:
# Get the schedule from nba ref
def retrieve_schedule(team_abbr, season=2025, timeout=30):
    """Download and tidy one NBA team's schedule from basketball-reference.com.

    The page is fetched exactly once with ``requests`` and the tables are
    parsed from that response body. (Previously the response was only used
    for its status code and ``pd.read_html(url)`` downloaded the page a second
    time without the custom headers, doubling traffic against a site that
    answers with HTTP 429 when hit too often.)

    Parameters
    ----------
    team_abbr : str
        Team abbreviation used in the URL; it must also appear in
        ``nba_abbr_data['abbr']`` so the full team name can be looked up.
    season : int, optional
        Season segment of the URL (``{season}_games.html``). Defaults to 2025.
    timeout : float, optional
        Seconds ``requests`` waits for the server before giving up.

    Returns
    -------
    pandas.DataFrame or None
        The schedule with the extra columns ``computer_date``, ``user_team``,
        ``GameTimeStatus`` and ``game_type``; ``None`` (after printing a
        message) when the response status is not 200.

    Raises
    ------
    requests.RequestException
        On connection problems or when ``timeout`` is exceeded.
    IndexError
        If ``team_abbr`` is not present in ``nba_abbr_data``.
    """
    # Local import so this cell does not depend on the notebook's import cell
    from io import StringIO

    # URL of the website
    url = f"https://www.basketball-reference.com/teams/{team_abbr}/{season}_games.html"

    headers = {
        "User-Agent": "YourAppName/1.0 (https://yourwebsite.com; contact@yourwebsite.com)"
    }

    # Send a GET request to the URL; the timeout keeps a stalled server from
    # hanging the whole scraping loop
    response = requests.get(url, headers=headers, timeout=timeout)

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Error retrieving data for {team_abbr}")
        return None

    # Parse the tables from the body we already have. No pause is needed here
    # any more because there is no second request to space out.
    dfs_list = pd.read_html(StringIO(response.text))

    # Prep data
    drop_cols = ['Unnamed: 3', 'Unnamed: 4']
    col_mapping = {
        'Unnamed: 5': 'Home/Away',
        'Unnamed: 7': 'Result',
        'Unnamed: 8': 'OT'
    }

    # Organize
    df = pd.concat(dfs_list, ignore_index=True).drop(columns=drop_cols).rename(columns=col_mapping)
    df['Home/Away'] = df['Home/Away'].fillna("vs.")
    # Drop rows whose 'Date' cell is the literal header text 'Date'
    df = df[df['Date'] != 'Date'].reset_index(drop=True)
    df['computer_date'] = pd.to_datetime(df['Date'], format='%a, %b %d, %Y')
    # Full team name for this abbreviation (first match)
    df['user_team'] = nba_abbr_data.loc[nba_abbr_data['abbr'] == team_abbr, 'team'].iloc[0]

    # Create new cols from fns
    df['GameTimeStatus'] = df['computer_date'].apply(get_game_time_status)
    # game_type is derived from the row position after the header rows are removed
    df['game_type'] = df.index.to_series().apply(get_season)
    # Rows flagged in 'Notes' as In-Season Tournament override the positional label
    df['game_type'] = np.where(df['Notes'] == 'In-Season Tournament', 'In-Season Tournament', df['game_type'])

    return df

In [5]:
# pd.read_html("https://www.basketball-reference.com/teams/ATL/2025_games.html")

### Loop through each nba team

In [6]:
# Scrape every NBA team's schedule and stack the results into `df_nba_master`.
# Frames are collected in a list and concatenated once after the loop, so
# re-running the cell rebuilds the master frame instead of appending
# duplicates to a leftover one.
nba_schedules = []

for nba_team in nba_abbr_data['abbr']:

    # Scrape
    nba_team_sch = retrieve_schedule(nba_team)

    # retrieve_schedule returns None when the request fails; skip those teams
    # (previously a failure on the first team crashed on `None.copy()`)
    if nba_team_sch is not None:
        nba_schedules.append(nba_team_sch)

    # Pause for n seconds to address HTTPError: HTTP Error 429: Too Many Requests
    time.sleep(4)

# pd.concat raises on an empty list, so fall back to an empty frame
if nba_schedules:
    df_nba_master = pd.concat(nba_schedules, ignore_index=True)
else:
    df_nba_master = pd.DataFrame()

Error retrieving data for PHX


In [18]:
print(df_nba_master.user_team.value_counts(), len(df_nba_master.user_team.value_counts()))

Atlanta Hawks             80
Boston Celtics            80
Washington Wizards        80
Utah Jazz                 80
Toronto Raptors           80
San Antonio Spurs         80
Sacramento Kings          80
Portland Trail Blazers    80
Philadelphia 76ers        80
Orlando Magic             80
Oklahoma City Thunder     80
New York Knicks           80
New Orleans Pelicans      80
Minnesota Timberwolves    80
Milwaukee Bucks           80
Miami Heat                80
Memphis Grizzlies         80
Los Angeles Lakers        80
Los Angeles Clippers      80
Indiana Pacers            80
Houston Rockets           80
Golden State Warriors     80
Detroit Pistons           80
Denver Nuggets            80
Dallas Mavericks          80
Cleveland Cavaliers       80
Chicago Bulls             80
Charlotte Hornets         80
Brooklyn Nets             80
Phoenix Suns              80
Name: user_team, dtype: int64 30


In [12]:
# pho_team_sch = retrieve_schedule('PHO')
# df_nba_master = pd.concat([df_nba_master, pho_team_sch], ignore_index=True)

In [19]:
df_nba_master.to_csv('df_nba_master_2024_10_29.csv',index=False)

## NHL

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np

import requests

from datetime import datetime

import time

### Init vars

In [2]:
nhl_abbr_data = pd.read_csv(r"C:\Users\Owner\Documents\Data Projects\GitHub\Apps\project_w\data\nhl_team_abbr.csv")
today = datetime.now().date()

### Init helper fns

In [3]:
# Define a function to apply the condition
def get_game_time_status(date):
    """Classify a game date against the module-level `today`.

    Returns 'Upcoming' when `date.date()` is on or after `today`, and 'Past'
    otherwise.
    """
    is_upcoming = date.date() >= today
    if is_upcoming:
        return 'Upcoming'
    return 'Past'
    
# Define a function to apply the condition
def get_season(row_number):
    """Map a row number to a season phase.

    Row numbers of 82 or more are labelled 'playoffs'; anything lower is
    'regular season'.
    """
    if row_number >= 82:
        return 'playoffs'
    return 'regular season'

### Build scraper fn

In [4]:
# Get the schedule from hockey-reference
def retrieve_nhl_schedule(team_abbr, season=2025, timeout=30):
    """Download and tidy one NHL team's schedule from hockey-reference.com.

    The page is fetched exactly once with ``requests`` and the tables are
    parsed from that response body. (Previously the response was only used
    for its status code and ``pd.read_html(url)`` downloaded the page a second
    time without the custom headers, doubling the number of requests sent to
    the site.)

    Parameters
    ----------
    team_abbr : str
        Team abbreviation used in the URL; it must also appear in
        ``nhl_abbr_data['abbr']`` so the full team name can be looked up.
    season : int, optional
        Season segment of the URL (``{season}_games.html``). Defaults to 2025.
    timeout : float, optional
        Seconds ``requests`` waits for the server before giving up.

    Returns
    -------
    pandas.DataFrame or None
        The schedule with the extra columns ``computer_date``, ``user_team``,
        ``GameTimeStatus`` and ``game_type``; ``None`` (after printing a
        message) when the response status is not 200.

    Raises
    ------
    requests.RequestException
        On connection problems or when ``timeout`` is exceeded.
    IndexError
        If ``team_abbr`` is not present in ``nhl_abbr_data``.
    """
    # Local import so this cell does not depend on the notebook's import cell
    from io import StringIO

    # URL of the website
    url = f"https://www.hockey-reference.com/teams/{team_abbr}/{season}_games.html"

    headers = {
        "User-Agent": "YourAppName/1.0 (https://yourwebsite.com; contact@yourwebsite.com)"
    }

    # Send a GET request to the URL; the timeout keeps a stalled server from
    # hanging the whole scraping loop
    response = requests.get(url, headers=headers, timeout=timeout)

    # Check if the request was successful
    if response.status_code != 200:
        print(f"Error retrieving data for {team_abbr}")
        return None

    # Parse the tables from the body we already have. No pause is needed here
    # any more because there is no second request to space out.
    dfs_list = pd.read_html(StringIO(response.text))

    # Prep data
    col_mapping = {
        'Unnamed: 3': 'Home/Away',
        'Unnamed: 7': 'Result',
        'Unnamed: 8': 'OT'
    }

    # Organize
    df = pd.concat(dfs_list, ignore_index=True).rename(columns=col_mapping)
    df['Home/Away'] = df['Home/Away'].fillna("vs.")
    # Drop rows whose 'Date' cell is the literal header text 'Date'
    df = df[df['Date'] != 'Date'].reset_index(drop=True)
    df['computer_date'] = pd.to_datetime(df['Date'])
    # Re-render the display date as e.g. 'Wed, Oct 30, 2024'
    df['Date'] = df['computer_date'].dt.strftime('%a, %b %d, %Y')
    # Full team name for this abbreviation (first match)
    df['user_team'] = nhl_abbr_data.loc[nhl_abbr_data['abbr'] == team_abbr, 'team'].iloc[0]

    # Create new cols from fns
    df['GameTimeStatus'] = df['computer_date'].apply(get_game_time_status)
    # game_type is derived from the row position after the header rows are removed
    df['game_type'] = df.index.to_series().apply(get_season)

    return df

### Loop through each nhl team

In [5]:
# Scrape every NHL team's schedule and stack the results into `df_nhl_master`.
# Frames are collected in a list and concatenated once after the loop, so
# re-running the cell rebuilds the master frame instead of appending
# duplicates to a leftover one.
nhl_schedules = []

for nhl_team in nhl_abbr_data['abbr']:

    # Scrape
    nhl_team_sch = retrieve_nhl_schedule(nhl_team)

    # retrieve_nhl_schedule returns None when the request fails; skip those
    # teams (previously a failure on the first team crashed on `None.copy()`)
    if nhl_team_sch is not None:
        nhl_schedules.append(nhl_team_sch)

    # Pause for n seconds to address HTTPError: HTTP Error 429: Too Many Requests
    time.sleep(4)

# pd.concat raises on an empty list, so fall back to an empty frame
if nhl_schedules:
    df_nhl_master = pd.concat(nhl_schedules, ignore_index=True)
else:
    df_nhl_master = pd.DataFrame()

In [6]:
print(df_nhl_master.user_team.value_counts(), len(df_nhl_master.user_team.value_counts()))

Anaheim Ducks            82
Boston Bruins            82
Winnipeg Jets            82
Vegas Golden Knights     82
Vancouver Canucks        82
 Utah Hockey Club        82
Toronto Maple Leafs      82
St. Louis Blues          82
San Jose Sharks          82
Seattle Kraken           82
Pittsburgh Penguins      82
Philadelphia Flyers      82
Ottawa Senators          82
New York Rangers         82
New York Islanders       82
Nashville Predators      82
New Jersey Devils        82
Montreal Canadiens       82
Minnesota Wild           82
Los Angeles Kings        82
Florida Panthers         82
Edmonton Oilers          82
Detroit Red Wings        82
Dallas Stars             82
Colorado Avalanche       82
Chicago Blackhawks       82
Calgary Flames           82
Columbus Blue Jackets    82
Buffalo Sabres           82
Washington Capitals      82
Tampa Bay Lightning      81
Carolina Hurricanes      81
Name: user_team, dtype: int64 32


In [None]:
# veg_team_sch = retrieve_nhl_schedule('VEG')
# df_nhl_master = pd.concat([df_nhl_master, veg_team_sch], ignore_index=True)

In [7]:
df_nhl_master.to_csv('df_nhl_master_2024_10_30.csv',index=False)

In [14]:
df_nhl_master[(df_nhl_master['user_team'] == 'Colorado Avalanche') & (df_nhl_master['GameTimeStatus'] == 'Upcoming')]

Unnamed: 0,GP,Date,Time,Home/Away,Opponent,GF,GA,Result,OT,W,L,OL,Streak,Att.,LOG,Notes,computer_date,user_team,GameTimeStatus,game_type
583,11,"Wed, Oct 30, 2024",9:00 PM,vs.,Tampa Bay Lightning,,,,,,,,,,,,2024-10-30,Colorado Avalanche,Upcoming,regular season
584,12,"Sat, Nov 02, 2024",8:00 PM,@,Nashville Predators,,,,,,,,,,,,2024-11-02,Colorado Avalanche,Upcoming,regular season
585,13,"Tue, Nov 05, 2024",9:00 PM,vs.,Seattle Kraken,,,,,,,,,,,,2024-11-05,Colorado Avalanche,Upcoming,regular season
586,14,"Thu, Nov 07, 2024",8:00 PM,@,Winnipeg Jets,,,,,,,,,,,,2024-11-07,Colorado Avalanche,Upcoming,regular season
587,15,"Sat, Nov 09, 2024",9:00 PM,vs.,Carolina Hurricanes,,,,,,,,,,,,2024-11-09,Colorado Avalanche,Upcoming,regular season
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
650,78,"Sat, Apr 05, 2025",7:00 PM,@,St. Louis Blues,,,,,,,,,,,,2025-04-05,Colorado Avalanche,Upcoming,regular season
651,79,"Tue, Apr 08, 2025",9:30 PM,vs.,Vegas Golden Knights,,,,,,,,,,,,2025-04-08,Colorado Avalanche,Upcoming,regular season
652,80,"Thu, Apr 10, 2025",9:00 PM,vs.,Vancouver Canucks,,,,,,,,,,,,2025-04-10,Colorado Avalanche,Upcoming,regular season
653,81,"Sat, Apr 12, 2025",4:00 PM,@,Los Angeles Kings,,,,,,,,,,,,2025-04-12,Colorado Avalanche,Upcoming,regular season
