### Notebook run on Colab for scraping data from the Olympedia website: https://www.olympedia.org

This method was taken from Joseph Cheng's project (https://github.com/josephwccheng/olympedia_web_scraping/tree/main) and slightly adapted to also collect the performance metrics when they are available.
He listed this as a future step: "Download Raw HTML files of all the results from the "_distinct_result_id.csv" file. This is a todo for future work for those who would like to dive down more on each specific events and their performances"

In [None]:
!pip install aiohttp nest_asyncio beautifulsoup4




In [None]:
import os

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the input and output CSV paths used
# later in this notebook are built under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import csv
from bs4 import BeautifulSoup
from typing import Dict, List, Tuple
import aiohttp
import asyncio
import nest_asyncio
import os
import time
import random

# Patch asyncio so that asyncio.run() can be called at the bottom of this
# cell (presumably needed because the Colab kernel already runs an event
# loop -- confirm if this is ever run outside a notebook).
nest_asyncio.apply()

# Folder on the mounted Google Drive that holds the input and output files.
dataroot = os.path.join("/content", "drive", "MyDrive", "Colab files")
# Input: CSV whose first column holds the result IDs (read by load_result_ids).
result_ids_path = os.path.join(dataroot, '_distinct_result_id.csv')
# Output: combined CSV written by main() through save_to_csv.
csv_save_path = os.path.join(dataroot, 'olympedia_all_results.csv')

# Function to identify if a row is part of the final round based on specific columns
def is_final_round(row, headers):
    """Return True when ``row`` holds an actual value in a final-round column.

    Args:
        row: Mapping of column name to cell text for one results-table row.
        headers: Column names of the table the row came from.

    Returns:
        True if a "Final" or "Final Round" column is listed in ``headers``
        and the row's cell for it contains a real value; False otherwise.
    """
    # Cell contents that mean "no result here": an empty cell, the dash used
    # as a placeholder, and the "N/A" marker that parse_results_table in this
    # file substitutes for empty cells before calling this function. Without
    # the "N/A" entry every row with a blank final cell would count as a
    # finalist.
    no_value_markers = ("", "–", "N/A")
    for col in ("Final", "Final Round"):
        if col not in headers:
            continue
        if row.get(col, "").strip() not in no_value_markers:
            return True
    return False

# Function to parse the event results and filter for final rounds
def parse_results_table(table) -> Tuple[List[str], List[Dict[str, str]]]:
    """Convert a results ``<table>`` element into headers plus row dicts.

    Args:
        table: Parsed table element exposing ``find_all`` (a BeautifulSoup
            tag in this file).

    Returns:
        ``(headers, results)``: ``headers`` is the stripped text of every
        ``<th>`` in the table; ``results`` has one dict per data row, keyed
        by header, with empty cells stored as "N/A". When the table has a
        "Final" or "Final Round" column, only rows accepted by
        ``is_final_round`` are kept.
    """
    headers = [th.get_text().strip() for th in table.find_all('th')]
    filter_on_final = "Final" in headers or "Final Round" in headers
    results = []

    # The first <tr> is skipped as the header row; the rest are data rows.
    for tr in table.find_all('tr')[1:]:
        cells = tr.find_all('td')
        if not cells:
            continue
        # zip() stops at the shorter sequence, so cells beyond the last
        # header are ignored.
        record = {
            header: (cell.get_text().strip() or "N/A")
            for header, cell in zip(headers, cells)
        }
        if filter_on_final and not is_final_round(record, headers):
            continue
        results.append(record)
    return headers, results

async def fetch(session, url, retries=5, backoff_factor=1):
    """Download ``url`` and return the body text, retrying transient failures.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with a ``status`` attribute and an awaitable
            ``text()`` (an ``aiohttp.ClientSession`` in this file).
        url: Address to request.
        retries: Maximum number of attempts.
        backoff_factor: Base delay in seconds. After failed attempt ``k``
            (0-based) the wait is ``backoff_factor * 2**k`` plus up to one
            second of random jitter.

    Returns:
        The body text of the first response with status 200, or ``None``
        when ``retries`` is less than 1 (no request is made).

    Raises:
        ValueError: A non-200 status was returned on the last attempt, or a
            client-error status (4xx other than 408/429) was returned on
            any attempt.
        Exception: Whatever the last attempt raised while requesting or
            reading the response.
    """
    for attempt in range(retries):
        try:
            async with session.get(url) as response:
                if response.status == 200:
                    return await response.text()
                status = response.status
        except Exception as e:
            # Connection-level problems are treated as transient.
            error, permanent = e, False
        else:
            error = ValueError(f"Request to {url} returned status {status}")
            # 4xx means the request itself is at fault (e.g. 404 for an
            # unknown ID), so repeating it only wastes time. 408 (timeout)
            # and 429 (rate limit) are the exceptions worth retrying.
            permanent = 400 <= status < 500 and status not in (408, 429)

        if permanent:
            print(f"Request to {url} returned status {status}; not retrying.")
            raise error
        if attempt == retries - 1:
            print(f"All {retries} attempts failed for {url}.")
            raise error

        sleep_time = backoff_factor * (2 ** attempt) + random.uniform(0, 1)
        print(f"Attempt {attempt + 1} failed: {error}. Retrying in {sleep_time:.2f} seconds...")
        await asyncio.sleep(sleep_time)

    # Only reached when retries < 1, i.e. the loop body never ran.
    return None

async def fetch_result_page(session, result_id):
    """Fetch the HTML of the Olympedia result page for ``result_id``.

    Delegates to ``fetch`` with its default retry settings, so the return
    value and raised exceptions are whatever ``fetch`` produces.
    """
    return await fetch(session, f"https://www.olympedia.org/results/{result_id}")

async def scrape_result(result_id, semaphore, session=None):
    """Download one result page without letting a failure propagate.

    Args:
        result_id: Identifier of the result page to download.
        semaphore: ``asyncio.Semaphore`` limiting how many downloads run at
            the same time.
        session: Optional shared ``aiohttp.ClientSession``. Passing one lets
            all downloads reuse the same connection pool. When omitted, a
            private session is opened and closed for this single request,
            as before.

    Returns:
        ``(result_id, html)`` on success, or ``(result_id, None)`` when the
        download failed; the error is printed, not raised.
    """
    async with semaphore:
        try:
            if session is not None:
                return result_id, await fetch_result_page(session, result_id)
            async with aiohttp.ClientSession() as own_session:
                return result_id, await fetch_result_page(own_session, result_id)
        except Exception as e:
            # Best effort: one bad page must not abort the whole scrape.
            print(f"Error fetching result ID {result_id}: {e}")
            return result_id, None

def get_result_info_from_page(result_id: str, page_content: str) -> Dict:
    """Extract event metadata and result rows from one result page.

    Args:
        result_id: Identifier of the page, copied into the output.
        page_content: Raw HTML of the page.

    Returns:
        A dict with 'result_id', 'event_title', 'edition', 'edition_id',
        'sport', the four 'result_*' fields read from the bio table ('na'
        when absent), and 'results' (list of row dicts, possibly empty).
        An empty dict is returned when the breadcrumb or event title cannot
        be parsed.
    """
    result_soup = BeautifulSoup(page_content, 'html.parser')
    try:
        breadcrumb = result_soup.select('body > div.container > ol.breadcrumb > li')
        edition = breadcrumb[2].get_text()
        edition_id = breadcrumb[2].find('a')['href'].split('/')[-1]
        sport = breadcrumb[3].get_text()
        event_title = result_soup.select('body > div.container > h1.event_title')[0].get_text()
    except (IndexError, TypeError, KeyError) as e:
        # IndexError: a breadcrumb entry or the title is missing.
        # TypeError: the crumb has no <a>, so find() returned None.
        # KeyError: the <a> has no href attribute.
        print(f"Error parsing breadcrumb or event title for result ID {result_id}: {e}")
        return {}

    event_info = {
        'result_id': result_id,
        'event_title': event_title,
        'edition': edition,
        'edition_id': edition_id,
        'sport': sport,
    }

    # Parsing the event bio table: label text in <th>, value in <td>.
    try:
        event_bio_table = result_soup.select('body > div.container > table.biodata')[0]
        event_bio_header = [item.get_text() for item in event_bio_table.select('table > tr > th')]
        event_bio_value = [item.get_text() for item in event_bio_table.select('table > tr > td')]

        # Bio-table label -> key used in the output dict.
        bio_fields = {
            'Date': 'result_date',
            'Location': 'result_location',
            'Participants': 'result_participants',
            'Format': 'result_format'
        }

        for field, key in bio_fields.items():
            event_info[key] = event_bio_value[event_bio_header.index(field)] if field in event_bio_header else 'na'
    except (IndexError, AttributeError) as e:
        print(f"Error parsing event bio table for result ID {result_id}: {e}")
        # 'result_detail' is only ever set on this fallback path; it is kept
        # so the returned keys stay the same as before.
        for key in ['result_date', 'result_location', 'result_participants', 'result_format', 'result_detail']:
            event_info[key] = 'na'

    # Parsing the results table
    results_table = result_soup.find('table', class_='table table-striped')
    if results_table:
        try:
            # Only the rows are needed here; the header list is discarded.
            _, results = parse_results_table(results_table)
            event_info['results'] = results
        except Exception as e:
            print(f"Error parsing results table for result ID {result_id}: {e}")
            event_info['results'] = []
    else:
        event_info['results'] = []

    return event_info

def save_to_csv(all_results: List[Dict], all_columns: set, file_path: str):
    """Write every result row of every event to one CSV file.

    Each output row is the event's base fields followed by the row's own
    result columns; result columns a row does not have are left empty.

    Args:
        all_results: Event dicts holding the base fields and a 'results'
            list of row dicts. Events without 'results' contribute no rows.
        all_columns: Names of every result column that occurs in any row.
        file_path: Destination path; an existing file is overwritten.

    Raises:
        ValueError: From ``csv.DictWriter`` when a row contains a key that
            is neither a base header nor listed in ``all_columns``.
    """
    base_headers = [
        'result_id', 'event_title', 'edition', 'edition_id', 'sport',
        'result_date', 'result_location', 'result_participants',
        'result_format'
    ]
    # Sorted so the column order is reproducible; iterating a set of strings
    # directly can give a different order on every run.
    result_headers = sorted(all_columns)
    headers = base_headers + result_headers
    # Template that gives every result column an empty default value.
    blank_result = dict.fromkeys(result_headers, '')

    with open(file_path, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=headers)
        writer.writeheader()

        for result_info in all_results:
            base_info = {k: result_info[k] for k in base_headers if k in result_info}
            for result in result_info.get('results', []):
                writer.writerow({**base_info, **blank_result, **result})

async def main(result_ids):
    """Scrape every result page and write the combined CSV.

    Downloads run concurrently, at most 10 at a time, and are processed in
    completion order. Pages that failed to download or to parse are
    skipped. The output goes to the module-level ``csv_save_path``.

    Args:
        result_ids: Iterable of result identifiers to scrape.
    """
    semaphore = asyncio.Semaphore(10)
    pending = [scrape_result(rid, semaphore) for rid in result_ids]

    all_results = []
    all_columns = set()
    for finished in asyncio.as_completed(pending):
        result_id, result_page = await finished
        if not result_page:
            continue
        result_info = get_result_info_from_page(result_id, result_page)
        if not result_info:
            continue
        all_results.append(result_info)
        # Collect the union of result column names for the CSV header.
        for row in result_info['results']:
            all_columns.update(row.keys())

    save_to_csv(all_results, all_columns, csv_save_path)
    print("Combined CSV file has been saved.")

def load_result_ids(file_path: str) -> List[str]:
    """Read result IDs from the first column of a CSV file.

    Blank lines and rows whose first cell is empty are skipped, and
    surrounding whitespace is removed from each ID. A header row, if the
    file has one, is NOT skipped and is returned like any other value.

    Args:
        file_path: Path to the CSV file.

    Returns:
        The IDs in file order.
    """
    with open(file_path, mode='r', newline='', encoding='utf-8') as file:
        # csv.reader yields [] for a blank line, so check the row before
        # indexing it.
        return [
            row[0].strip()
            for row in csv.reader(file)
            if row and row[0].strip()
        ]

# Run the script: read the result IDs from result_ids_path, then scrape every
# page and write the combined CSV. result_ids is bound at module level, so it
# stays available after this cell has run.
if __name__ == "__main__":
    result_ids = load_result_ids(result_ids_path)
    asyncio.run(main(result_ids))


Attempt 1 failed: Request to https://www.olympedia.org/results/45335 returned status 504. Retrying in 1.68 seconds...
Attempt 1 failed: Request to https://www.olympedia.org/results/85903 returned status 504. Retrying in 1.41 seconds...
Attempt 1 failed: Request to https://www.olympedia.org/results/354150 returned status 504. Retrying in 1.52 seconds...
Attempt 1 failed: Request to https://www.olympedia.org/results/34161 returned status 504. Retrying in 1.65 seconds...
Attempt 1 failed: Request to https://www.olympedia.org/results/111376 returned status 504. Retrying in 1.99 seconds...
Attempt 2 failed: Request to https://www.olympedia.org/results/34161 returned status 504. Retrying in 2.95 seconds...
Attempt 1 failed: Request to https://www.olympedia.org/results/27375 returned status 504. Retrying in 1.55 seconds...
Attempt 2 failed: Request to https://www.olympedia.org/results/111376 returned status 504. Retrying in 2.82 seconds...
Attempt 1 failed: Request to https://www.olympedia.or