In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Notebook-wide Selenium Chrome session. The value returned by
# ChromeDriverManager().install() is handed to Service, which is then used to
# start Chrome.
# NOTE(review): no DRIVER.quit() is visible in this part of the notebook —
# confirm the browser session is closed once scraping is finished.
DRIVER = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
)

In [3]:
# FBref "Scores and Fixtures" page that the scraper loads.
# NOTE(review): this URL carries no season segment, while the table id used by
# the functions below is pinned to "sched_2022-2023_9_1" — presumably the page
# must still be serving the 2022-2023 season for that id to exist; confirm.
PREM_LEAGUE_SCORES_URL = (
    "https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures"
)

In [4]:
def load_scores_table_soup(url, driver, tag_id="sched_2022-2023_9_1", timeout=10):
    """Open ``url`` in ``driver`` and return the rendered page as BeautifulSoup.

    The page is only parsed once an element with id ``tag_id`` is present in
    the DOM, so the returned soup contains that element.

    Parameters
    ----------
    url : str
        Address of the page to load.
    driver : selenium WebDriver
        Browser session used to fetch and render the page.
    tag_id : str, optional
        HTML ``id`` of the element to wait for before parsing.
    timeout : float, optional
        Maximum number of seconds to wait for ``tag_id`` to appear
        (defaults to 10, the value that used to be hard-coded).

    Returns
    -------
    bs4.BeautifulSoup
        ``driver.page_source`` parsed with the ``html.parser`` backend.

    Raises
    ------
    selenium.common.exceptions.TimeoutException
        Raised by ``WebDriverWait.until`` when the element does not appear
        within ``timeout`` seconds.
    """
    # Navigate the browser to the page.
    driver.get(url)

    # Block until the target element exists in the DOM.
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.ID, tag_id))
    )

    # Parse the rendered HTML held by the browser.
    return BeautifulSoup(driver.page_source, "html.parser")

In [5]:
# Fetch the fixtures page through the shared browser session and parse it.
soup = load_scores_table_soup(url=PREM_LEAGUE_SCORES_URL, driver=DRIVER)

In [7]:
def _uid_from_href(href):
    """Return the '/'-separated piece at index 3 of ``href``, or None.

    For a link such as ``/en/squads/8602292d/Aston-Villa-Stats`` this is
    ``8602292d``. Non-string input and links with fewer than four pieces
    yield None instead of raising.
    """
    if not isinstance(href, str):
        return None
    pieces = href.split("/")
    return pieces[3] if len(pieces) > 3 else None


def get_matches_info(soup, table_id="sched_2022-2023_9_1"):
    """Turn the schedule table inside ``soup`` into a cleaned DataFrame.

    Column names come from the ``aria-label`` of each header cell of the
    table (falling back to the cell text when the attribute is absent).
    Every ``<tr>`` in the table body becomes one row: the leading ``<th>``
    supplies the first column and the ``<td>`` cells the remaining ones.
    For the "Home", "Away" and "Match Report" columns the link target
    (``href``) is stored instead of the visible text, and a companion
    ``<column>_uid`` column holds the id segment of that link.

    Rows whose first cell contains "Wk" (header rows repeated in the body)
    and rows in which every cell is empty are removed; remaining missing
    values are returned as empty strings.

    Parameters
    ----------
    soup : BeautifulSoup-like
        Parsed page exposing ``find`` / ``find_all``.
    table_id : str, optional
        ``id`` attribute of the ``<table>`` to read. Defaults to the id that
        was previously hard-coded in the function body.

    Returns
    -------
    pandas.DataFrame
        One row per match, index reset to 0..n-1.

    Raises
    ------
    ValueError
        If the table, its ``<thead>`` or its ``<tbody>`` cannot be found.
    KeyError
        If one of the link columns is missing from the table header.
    """
    link_columns = ("Home", "Away", "Match Report")

    table = soup.find("table", {"id": table_id})
    if table is None:
        raise ValueError(f"no <table> with id {table_id!r} in the parsed page")

    # Read header and body from the target table itself, not from whichever
    # <thead> happens to come first in the document.
    head = table.find("thead")
    body = table.find("tbody")
    if head is None or body is None:
        raise ValueError(f"table {table_id!r} has no <thead> or no <tbody>")

    col_names = [
        (header.get("aria-label") or header.text).strip()
        for header in head.find_all("th")
    ]

    missing = [name for name in link_columns if name not in col_names]
    if missing:
        raise KeyError(f"link column(s) missing from table header: {missing}")

    # The first column lives in the row's <th>, so a column's position among
    # the <td> cells is its header position minus one.
    link_positions = {col_names.index(name) - 1 for name in link_columns}

    width = len(col_names)
    records = []
    for row in body.find_all("tr"):
        label_cell = row.find("th")
        values = [label_cell.text.strip() if label_cell is not None else ""]

        for position, cell in enumerate(row.find_all("td")):
            if position in link_positions:
                anchor = cell.find("a")
                href = anchor.get("href") if anchor is not None else None
                values.append(href.strip() if isinstance(href, str) else None)
            else:
                values.append(cell.text.strip())

        # Header rows repeated inside the body have no <td> cells; pad them so
        # every record has one value per column.
        values.extend([None] * (width - len(values)))
        records.append(values)

    df = pd.DataFrame(records, columns=col_names)

    for col in link_columns:
        df[f"{col}_uid"] = df[col].apply(_uid_from_href)

    # Drop header rows repeated inside the body (their first cell is "Wk").
    is_header_row = df.iloc[:, 0].str.contains("Wk", na=False, regex=False)
    df = df[~is_header_row]

    # Normalise missing values to '' and drop rows that are blank everywhere.
    # This avoids DataFrame.replace('', None), whose meaning differs between
    # pandas versions.
    df = df.fillna("")
    is_blank_row = df.eq("").all(axis=1)
    return df.loc[~is_blank_row].reset_index(drop=True)

In [8]:
# Build the cleaned matches table from the parsed page.
df = get_matches_info(soup=soup)

In [32]:
# Preview rows at positions 10 through 14.
df.iloc[10:15]

Unnamed: 0,Matchweek Number,Day,Date,Time,Home,xG,Score,xG.1,Away,Attendance,Venue,Referee,Match Report,Notes,Home_uid,Away_uid,Match Report_uid
10,,,,,,,,,,,,,,,,,
11,2.0,Sat,2022-08-13,12:30 (19:30),/en/squads/8602292d/Aston-Villa-Stats,2.3,2–1,1.6,/en/squads/d3fd31cc/Everton-Stats,41883.0,Villa Park,Michael Oliver,/en/matches/8cd71c65/Aston-Villa-Everton-Augus...,,8602292d,d3fd31cc,8cd71c65
12,2.0,Sat,2022-08-13,15:00 (22:00),/en/squads/b8fd03ef/Manchester-City-Stats,1.7,4–0,0.1,/en/squads/4ba7cbea/Bournemouth-Stats,53453.0,Etihad Stadium,David Coote,/en/matches/311d705c/Manchester-City-Bournemou...,,b8fd03ef,4ba7cbea,311d705c
13,2.0,Sat,2022-08-13,15:00 (22:00),/en/squads/33c895d4/Southampton-Stats,1.2,2–2,1.8,/en/squads/5bfb9659/Leeds-United-Stats,30815.0,St. Mary's Stadium,Tony Harrington,/en/matches/54b33a13/Southampton-Leeds-United-...,,33c895d4,5bfb9659,54b33a13
14,2.0,Sat,2022-08-13,15:00 (22:00),/en/squads/8cec06e1/Wolverhampton-Wanderers-Stats,0.9,0–0,1.5,/en/squads/fd962109/Fulham-Stats,31178.0,Molineux Stadium,John Brooks,/en/matches/669b1665/Wolverhampton-Wanderers-F...,,8cec06e1,fd962109,669b1665


In [55]:
# Plain assignment: new_df is a second name for the same DataFrame object as
# df (no copy is made), so in-place changes through either name affect both.
new_df = df

In [58]:
# Bare expression as the last line of the cell: displays new_df as the cell output.
new_df


Unnamed: 0,Matchweek Number,Day,Date,Time,Home,xG,Score,xG.1,Away,Attendance,Venue,Referee,Match Report,Notes,Home_uid,Away_uid,Match Report_uid
0,1,Fri,2022-08-05,20:00 (03:00),/en/squads/47c64c55/Crystal-Palace-Stats,1.2,0–2,1.0,/en/squads/18bb7c10/Arsenal-Stats,25286,Selhurst Park,Anthony Taylor,/en/matches/e62f6e78/Crystal-Palace-Arsenal-Au...,,47c64c55,18bb7c10,e62f6e78
1,1,Sat,2022-08-06,12:30 (19:30),/en/squads/fd962109/Fulham-Stats,1.2,2–2,1.2,/en/squads/822bd0ba/Liverpool-Stats,22207,Craven Cottage,Andy Madley,/en/matches/6713c1dc/Fulham-Liverpool-August-6...,,fd962109,822bd0ba,6713c1dc
2,1,Sat,2022-08-06,15:00 (22:00),/en/squads/361ca564/Tottenham-Hotspur-Stats,1.5,4–1,0.5,/en/squads/33c895d4/Southampton-Stats,61732,Tottenham Hotspur Stadium,Andre Marriner,/en/matches/09d8a999/Tottenham-Hotspur-Southam...,,361ca564,33c895d4,09d8a999
3,1,Sat,2022-08-06,15:00 (22:00),/en/squads/b2b47a98/Newcastle-United-Stats,1.7,2–0,0.3,/en/squads/e4a775cb/Nottingham-Forest-Stats,52245,St James' Park,Simon Hooper,/en/matches/1ac96eb4/Newcastle-United-Nottingh...,,b2b47a98,e4a775cb,1ac96eb4
4,1,Sat,2022-08-06,15:00 (22:00),/en/squads/5bfb9659/Leeds-United-Stats,0.8,2–1,1.3,/en/squads/8cec06e1/Wolverhampton-Wanderers-Stats,36347,Elland Road,Robert Jones,/en/matches/82702941/Leeds-United-Wolverhampto...,,5bfb9659,8cec06e1,82702941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,38,Sun,2023-05-28,16:30 (23:30),/en/squads/d3fd31cc/Everton-Stats,1.0,1–0,0.5,/en/squads/4ba7cbea/Bournemouth-Stats,39201,Goodison Park,Stuart Attwell,/en/matches/94de848f/Everton-Bournemouth-May-2...,,d3fd31cc,4ba7cbea,94de848f
376,38,Sun,2023-05-28,16:30 (23:30),/en/squads/a2d435b3/Leicester-City-Stats,1.4,2–1,1.4,/en/squads/7c21e445/West-Ham-United-Stats,32183,King Power Stadium,Simon Hooper,/en/matches/a96c9915/Leicester-City-West-Ham-U...,,a2d435b3,7c21e445,a96c9915
377,38,Sun,2023-05-28,16:30 (23:30),/en/squads/8602292d/Aston-Villa-Stats,2.8,2–1,1.4,/en/squads/d07537b9/Brighton-and-Hove-Albion-S...,,Villa Park,David Coote,/en/matches/ac0e65e2/Aston-Villa-Brighton-and-...,,8602292d,d07537b9,ac0e65e2
378,38,Sun,2023-05-28,16:30 (23:30),/en/squads/5bfb9659/Leeds-United-Stats,1.5,1–4,2.2,/en/squads/361ca564/Tottenham-Hotspur-Stats,36871,Elland Road,Anthony Taylor,/en/matches/c9c73ddd/Leeds-United-Tottenham-Ho...,,5bfb9659,361ca564,c9c73ddd
