In [1]:
import os
import pandas as pd
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from constants import FBREF_HOMEPAGE_URL, FBREF_MATCHES_URL_PATH, PREM_LEAGUE_SCORES_URL
from fbref_scraper_match_report import load_match_soup_combined, retrieve_match_soup_info_to_df
from fbref_all_matches_link_scraper import load_scores_table_soup, get_matches_info

In [2]:
# Browser configuration: run Chrome with the "--headless" argument
# (headless drivers are usually faster).
options = Options()
options.add_argument("--headless")

# Build the chromedriver service from the path returned by
# ChromeDriverManager().install(), then start the shared browser session
# that the scraping cells below reuse.
chromedriver_service = Service(ChromeDriverManager().install())
DRIVER = webdriver.Chrome(service=chromedriver_service, options=options)

In [3]:
def create_folder(folder_name):
    """Ensure that the directory ``folder_name`` exists.

    Missing intermediate directories are created as well. Calling this for a
    directory that already exists does nothing.

    Args:
        folder_name: Path of the directory to create.

    Raises:
        FileExistsError: If ``folder_name`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate "exists?" check, so there is no
    # window in which another process can create the directory between the
    # check and the creation.
    os.makedirs(folder_name, exist_ok=True)

In [4]:
# Load the scores table identified by tag id "sched_2022-2023_9_1" from the
# Premier League scores page, using the shared DRIVER session.
soup = load_scores_table_soup(
    PREM_LEAGUE_SCORES_URL,
    DRIVER,
    tag_id="sched_2022-2023_9_1",
)

# Extract the per-match information into a DataFrame.
df_links = get_matches_info(soup)

# Persist the link table so later runs can start from the CSV.
create_folder("match_data")
df_links.to_csv("match_data/all_match_links_epl_22_23.csv", index=False)

In [4]:
# Reload the saved link table from disk, so this cell can be used as the
# starting point without re-running the scrape above.
links_csv_path = "match_data/all_match_links_epl_22_23.csv"
df_links = pd.read_csv(links_csv_path)

In [None]:
# Upper bound on how many not-yet-downloaded matches are scraped per execution
# of this cell. The default of 1 keeps the previous behaviour (stop after the
# first newly scraped match); set to None to scrape every remaining match.
# Loading a single match takes about 85s.
MAX_MATCHES_PER_RUN = 1

# Loop-invariant base URL for match pages. It is built in two steps because
# urljoin can only join two parts at a time.
int_url = urljoin(FBREF_HOMEPAGE_URL, FBREF_MATCHES_URL_PATH)

scraped_count = 0
for match_uid in df_links["Match Report_uid"]:
    match_dir = f'match_data/{match_uid}'
    outfield_path = f'{match_dir}/{match_uid}_outfield.csv'
    keeper_path = f'{match_dir}/{match_uid}_keeper.csv'

    # Skip a match only when BOTH output files are present. Checking just the
    # directory would permanently skip a match whose previous run was
    # interrupted after the folder was created but before both CSVs were saved.
    if os.path.isfile(outfield_path) and os.path.isfile(keeper_path):
        continue

    match_url = urljoin(int_url, match_uid)
    match_soup = load_match_soup_combined(match_url, DRIVER)  # Time taken to load match soup: about 85s
    outfield_df_whole, keeper_df_whole = retrieve_match_soup_info_to_df(match_soup)

    # Create the folder only after scraping succeeded, then write both tables.
    create_folder(match_dir)
    outfield_df_whole.to_csv(outfield_path, index=False)
    keeper_df_whole.to_csv(keeper_path, index=False)

    scraped_count += 1
    if MAX_MATCHES_PER_RUN is not None and scraped_count >= MAX_MATCHES_PER_RUN:
        break

In [None]:
# Shut down the shared browser session once scraping is finished; cells that
# use DRIVER need the driver-creation cell to be re-run after this.
DRIVER.quit()