In [1]:
import os, logging
import pandas as pd
from urllib.parse import urljoin
import concurrent.futures
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from constants import FBREF_HOMEPAGE_URL, FBREF_MATCHES_URL_PATH, PREM_LEAGUE_SCORES_URL
from fbref_scraper_match_report import load_match_soup_combined, retrieve_match_soup_info_to_df
from fbref_all_matches_link_scraper import load_scores_table_soup, get_matches_info

In [2]:
def create_driver():
    """Build and return a Chrome WebDriver configured to run headless.

    The chromedriver executable path comes from
    ``ChromeDriverManager().install()``.

    Returns:
        The ``webdriver.Chrome`` instance; the caller is responsible for
        calling ``quit()`` on it.
    """
    chrome_options = Options()
    # Headless mode: no visible browser window (usually faster).
    chrome_options.add_argument("--headless")

    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

def create_folder(folder_name):
    """Create ``folder_name`` (including missing parents) if it is absent.

    Uses ``exist_ok=True`` rather than a separate existence check so that
    concurrent callers (this function is called from worker threads) cannot
    race between the check and the creation.

    Args:
        folder_name: Path of the directory to create.

    Raises:
        FileExistsError: If ``folder_name`` exists but is not a directory.
        OSError: For other filesystem failures reported by ``os.makedirs``.
    """
    os.makedirs(folder_name, exist_ok=True)

In [3]:
# Scrape the season's fixture/link table only when no cached CSV exists,
# so re-running the notebook does not hit the site again.
if not os.path.isfile('match_data/all_match_links_epl_22_23.csv'):
    driver = create_driver()
    try:
        soup = load_scores_table_soup(PREM_LEAGUE_SCORES_URL, driver, tag_id = "sched_2022-2023_9_1")
        df_links = get_matches_info(soup)
    finally:
        # Always shut the browser down, even if scraping fails; otherwise the
        # headless Chrome process is left running.
        driver.quit()

    create_folder('match_data')
    df_links.to_csv('match_data/all_match_links_epl_22_23.csv', index=False)

In [4]:
# Load the match-link table from its CSV cache; later cells read the
# "Match Report_uid" column from this frame.
df_links = pd.read_csv(filepath_or_buffer="match_data/all_match_links_epl_22_23.csv")

In [5]:
def scrape_page(match_uid):
    """Scrape one match report and save its outfield and keeper tables as CSV.

    Files are written to ``match_data/<match_uid>/`` as
    ``<match_uid>_outfield.csv`` and ``<match_uid>_keeper.csv``. If that
    directory already exists the match is treated as already scraped and the
    function returns immediately without starting a browser.

    Args:
        match_uid: Match identifier. It is joined onto
            ``FBREF_HOMEPAGE_URL`` + ``FBREF_MATCHES_URL_PATH`` to form the
            report URL and is also used as the output folder name and
            file-name prefix.

    Returns:
        None.

    Raises:
        Any exception raised by the scraping helpers or by ``to_csv``
        propagates to the caller; the WebDriver is still shut down.
    """
    match_dir = f'match_data/{match_uid}'

    # If directory already exists, skip this match.
    # NOTE(review): a directory left behind by a failed CSV write would also
    # be skipped on the next run — confirm whether that is acceptable.
    if os.path.isdir(match_dir):
        return

    # Each call gets its own WebDriver instance.
    driver = create_driver()
    try:
        int_url = urljoin(FBREF_HOMEPAGE_URL, FBREF_MATCHES_URL_PATH)
        match_url = urljoin(int_url, match_uid)
        match_soup = load_match_soup_combined(match_url, driver)
        outfield_df_whole, keeper_df_whole = retrieve_match_soup_info_to_df(match_soup)

        # The folder is created only after scraping succeeded, so a failed
        # scrape does not leave a directory that would be skipped next time.
        create_folder(match_dir)

        outfield_df_whole.to_csv(f'match_data/{match_uid}/{match_uid}_outfield.csv', index=False)
        keeper_df_whole.to_csv(f'match_data/{match_uid}/{match_uid}_keeper.csv', index=False)
    finally:
        # Close the WebDriver even when scraping or writing fails; otherwise
        # every failed match leaks a browser process.
        driver.quit()

In [7]:
# Scrape every match report with a pool of 4 worker threads, printing progress
# for each page that finishes without error and logging failures to a file.
num_pages_scraped = 0
total_pages = len(df_links["Match Report_uid"])

logging.basicConfig(filename='logfile_match_report_errors.log', level=logging.ERROR)

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    # Map each future to its match_uid so a failure can be attributed to a
    # match. This has to be a dict: a set cannot be indexed by future.
    futures = {
        executor.submit(scrape_page, str(match_uid)): match_uid
        for match_uid in df_links["Match Report_uid"]
    }

    for future in concurrent.futures.as_completed(futures):
        match_uid = futures[future]  # Get the match_uid associated with this future
        try:
            # scrape_page returns nothing; result() is called only to
            # re-raise any exception from the worker thread.
            future.result()
        except Exception as e:
            logging.error(f"An error occurred with match_uid {match_uid}: {e}")
        else:
            # Matches skipped because their folder already exists also count here.
            num_pages_scraped += 1
            percentage_scraped = (num_pages_scraped / total_pages) * 100
            print(f"Page scraped successfully. Total pages scraped: {num_pages_scraped}/{total_pages} ({percentage_scraped:.2f}%)", flush=True)

KeyboardInterrupt: 