In [1]:
import os, logging
import pandas as pd
from urllib.parse import urljoin
import concurrent.futures
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from constants import FBREF_HOMEPAGE_URL, FBREF_MATCHES_URL_PATH, PREM_LEAGUE_SCORES_URL, PREM_URL, LEAGUE_TEAM_TABLE_ID
from fbref_scraper_match_report import load_match_soup_combined, retrieve_match_soup_info_to_df
from fbref_all_matches_link_scraper import load_scores_table_soup, get_matches_info
from fbref_league_teams import get_teams_table_soup, get_teams_info
from helper import create_folder

In [2]:
def create_driver():
    """Build and return a headless Chrome WebDriver.

    The chromedriver binary is resolved with ``ChromeDriverManager().install()``
    and handed to Selenium through a ``Service`` object.

    Returns:
        The new ``webdriver.Chrome`` instance. This function does not close
        it; the caller has to call ``quit()`` when finished.
    """
    chrome_options = Options()
    # Headless mode: no visible browser window, which is usually faster.
    chrome_options.add_argument("--headless")

    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

In [3]:
# Scrape the league's team table once and cache it as CSV; the whole step is
# skipped when the cached file already exists.
if not os.path.isfile('team_data/team_data_epl_22_23.csv'):
    driver = create_driver()
    try:
        teams_soup = get_teams_table_soup(PREM_URL, driver, LEAGUE_TEAM_TABLE_ID)
        df_team_urls = get_teams_info(teams_soup, LEAGUE_TEAM_TABLE_ID)
        create_folder('team_data')
        df_team_urls.to_csv('team_data/team_data_epl_22_23.csv', index=False)
    finally:
        # Always shut the browser down, even when scraping fails; otherwise
        # the headless Chrome process stays alive after the cell finishes.
        driver.quit()

In [3]:
# Scrape the season's scores table once and cache the match links as CSV; the
# whole step is skipped when the cached file already exists.
if not os.path.isfile('match_data/all_match_links_epl_22_23.csv'):
    driver = create_driver()
    try:
        soup = load_scores_table_soup(PREM_LEAGUE_SCORES_URL, driver, tag_id="sched_2022-2023_9_1")
        df_links = get_matches_info(soup)

        create_folder('match_data')
        df_links.to_csv('match_data/all_match_links_epl_22_23.csv', index=False)
    finally:
        # Always shut the browser down, even when scraping fails; otherwise
        # the headless Chrome process stays alive after the cell finishes.
        driver.quit()

In [4]:
# Load the match links from the CSV file (always read from disk, so this
# replaces any df_links already in memory).
df_links = pd.read_csv(filepath_or_buffer="match_data/all_match_links_epl_22_23.csv")

In [5]:
def setup_logging():
    """Configure error and success logging and return the success logger.

    Errors: ``logging.basicConfig`` points the root logger at
    ``logfile_match_report_errors.log`` with level ERROR.

    Successes: the logger named ``'success'`` (level INFO) writes
    ``'%(asctime)s - %(message)s'`` lines to ``success.log``.

    The function can be called repeatedly (for example when the notebook cell
    is re-run) without attaching a second handler for ``success.log``, so
    each success message is written once.

    Returns:
        logging.Logger: the ``'success'`` logger.
    """
    # Set up error logger (basicConfig does nothing if the root logger
    # already has handlers).
    logging.basicConfig(filename='logfile_match_report_errors.log', level=logging.ERROR)

    # Set up success logger
    success_logger = logging.getLogger('success')
    success_logger.setLevel(logging.INFO)
    # Without this, INFO records propagate to the root logger's handlers and
    # the success lines also end up in the error log file.
    success_logger.propagate = False

    # FileHandler stores its target as an absolute path in baseFilename, so
    # compare against the absolute path to spot a handler from an earlier call.
    success_path = os.path.abspath('success.log')
    already_attached = any(
        isinstance(handler, logging.FileHandler) and handler.baseFilename == success_path
        for handler in success_logger.handlers
    )
    if not already_attached:
        success_handler = logging.FileHandler('success.log')
        success_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
        success_logger.addHandler(success_handler)

    return success_logger

def scrape_page(match_uid):
    """Scrape one match report and save its two tables as CSV files.

    Does nothing when ``match_data/<match_uid>`` already exists. Otherwise it
    builds the match URL from ``FBREF_HOMEPAGE_URL``, ``FBREF_MATCHES_URL_PATH``
    and ``match_uid``, loads the page with a new WebDriver, and writes
    ``<match_uid>_outfield.csv`` and ``<match_uid>_keeper.csv`` into
    ``match_data/<match_uid>``.

    Args:
        match_uid: Identifier of the match; used as the last URL segment and
            as the output folder and file-name prefix.

    Returns:
        None.

    Raises:
        Any exception raised while loading, parsing or writing is passed on
        to the caller; the WebDriver is still shut down in that case.
    """
    match_dir = f'match_data/{match_uid}'

    # If directory already exists, skip this match
    if os.path.isdir(match_dir):
        return

    # create a new WebDriver instance
    driver = create_driver()
    try:
        int_url = urljoin(FBREF_HOMEPAGE_URL, FBREF_MATCHES_URL_PATH)
        match_url = urljoin(int_url, match_uid)
        match_soup = load_match_soup_combined(match_url, driver)
        outfield_df_whole, keeper_df_whole = retrieve_match_soup_info_to_df(match_soup)

        # The folder is created only after a successful scrape, because its
        # existence is what marks the match as done.
        create_folder(match_dir)
        outfield_df_whole.to_csv(f'{match_dir}/{match_uid}_outfield.csv', index=False)
        keeper_df_whole.to_csv(f'{match_dir}/{match_uid}_keeper.csv', index=False)
    finally:
        # close the WebDriver instance even on failure, so a failed match
        # does not leave a Chrome process behind
        driver.quit()

In [6]:
# Scrape every match report that has no output folder yet, four at a time,
# logging each success (with running progress) and each failure.

# Set up loggers
success_logger = setup_logging()
error_logger = logging.getLogger()

num_pages_scraped = 0
total_pages = len(df_links["Match Report_uid"])

# A match counts as already scraped when its output folder exists.
unscraped_uids = [
    str(match_uid)
    for match_uid in df_links["Match Report_uid"]
    if not os.path.isdir(f'match_data/{match_uid}')
]

num_pages_scraped_before = total_pages - len(unscraped_uids)

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    # Map each future to the uid it was submitted with. as_completed yields
    # futures in completion order, so a positional index into unscraped_uids
    # would attribute results to the wrong match.
    futures = {executor.submit(scrape_page, match_uid): match_uid for match_uid in unscraped_uids}

    for future in concurrent.futures.as_completed(futures):
        match_uid = futures[future]
        try:
            future.result()  # re-raises any exception from the worker
        except Exception as e:
            # Best effort: record the failure and carry on with other matches.
            error_logger.error(f"An error occurred with match_uid {match_uid}: {e}")
        else:
            num_pages_scraped += 1
            total_scraped = (num_pages_scraped+num_pages_scraped_before)
            percentage_scraped = (total_scraped / total_pages) * 100
            success_logger.info(f"Page {match_uid} scraped successfully. Total pages scraped: {total_scraped}/{total_pages} ({percentage_scraped:.2f}%)")
