# Olympic Swimming History 1912 to 2020

### 1. Import Packages

In [1]:
import re
import json
import time
import random
import pickle
import requests
import numpy as np
import pandas as pd
from datetime import date
from bs4 import BeautifulSoup
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc

### 2. Define Functions

In [95]:
def open_olympic_page(url):
    #Create Automated Full-Sized Selenium Webbrowser with Options to Reduce Bot Detection
    options = webdriver.ChromeOptions()
    options.add_argument("start-maximized")
    options.add_argument("disable-infobars")
    options.add_argument("--disable-extensions")
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36")
    driver = uc.Chrome()
    driver.get(url)
    time.sleep(2.5)
    
    #Prevent Cookie Check and Bot Detection
    pickle.dump(driver.get_cookies(), open("cookies.pkl","wb"))
    driver.maximize_window()
    time.sleep(2.5)
    
    #Accept Cookie Button If Exists
    cookie_btn = driver.find_element(By.ID, 'onetrust-accept-btn-handler')
    if cookie_btn:
        driver.execute_script("arguments[0].click();", cookie_btn)
        
    #Scroll to Bottom of Page - Then Scroll to Top of the Page (ensures entire page loads)
    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
    driver.execute_script("window.scrollTo(0, 220)")
    return driver

def get_all_events(driver):
    # Scrape Olympic Dropdown and Find How Many Olympics There Are
    driver.execute_script("window.scrollTo(0, 220)")
    olympic_dropdown = driver.find_element(By.CSS_SELECTOR, f"button[data-cy='game-select']")
    olympic_dropdown.click()
    time.sleep(1.5)
    
    num_of_olympics = int(len(driver.find_elements(By.CLASS_NAME, "styles__WrapperButton-sc-5sdhfb-1.iCbHQL.link-item")))
    olympic_dropdown.click()
    time.sleep(1.5)
    
    # Loop over all Olympics until 1912
    for i in range(num_of_olympics - 4):
        #Click Olympic Event Dropdown
        driver.execute_script("window.scrollTo(0, 220)")
        olympic_dropdown = driver.find_element(By.CSS_SELECTOR, f"button[data-cy='game-select']")
        olympic_dropdown.click()
        time.sleep(1.5)
        
        #Select Next Olympic Event (olympic locator class names dynamically change so use get atrribute on a specific button to find button class)
        olympic_class = driver.find_element(By.CSS_SELECTOR, f"button[data-cy='item-beijing-2022']")
        olympic_class_name = str(olympic_class.get_attribute("class")).replace(" ", ".").split("item")[0] + "item"
        
        olympic_buttons = driver.find_elements(By.CLASS_NAME, olympic_class_name)
        time.sleep(1.5)
        olympic_buttons[i].click()
    
        time.sleep(1.5)
        
        #Select the Discipline Dropdown
        driver.execute_script("window.scrollTo(0, 220)")
        disciple_dropdown = driver.find_element(By.CSS_SELECTOR, f"button[data-cy='discipline-select']")
        disciple_dropdown.click()
        time.sleep(1.5)
        
        #Determine if Olympics is Summer by Finding Swimming Tag, If No Swimming Tag - Choose Next Olympics
        swimming=False
        try:
            swimming_select = driver.find_element(By.CLASS_NAME, 'SWM')
            swimming_select.click()
            swimming = True
        except:
            pass
        
        if swimming:
            select_events(driver)
            
    driver.close()
    
    
def format_time(time_str):
    if not time_str.strip():
        # Empty string or string with only whitespace
        return time_str
    elif '.' in time_str:
        # Format of "hh:mm:ss.s"
        format_str = '%H:%M:%S.%f'
    else:
        # Format of "mm:ss"
        format_str = '%M:%S'
    try:
        dt = datetime.strptime(time_str, format_str)
        return dt.strftime('%H:%M:%S.%f')
    except ValueError:
        # String is not a valid time format
        return time_str
    
def scrape_results(driver):
    content = BeautifulSoup(driver.page_source, 'html.parser')
    event = content.select_one('h1[class^="styles__Heading-sc-"]').text

    gender = "Women" if 'women' in event.lower() else "Men"
    
    year_match = re.search(r'(\w+)\s+(\d{4})', event)
    location, year = year_match.groups()
    
    olympic_strokes = ['Underwater', 'Freestyle', 'Backstroke', 'Breaststroke', 'Butterfly', 'Individual Medley', 'Team', 'Obstacle', 'Freestyle For Sailors Men', 'Medley']
    for stroke in olympic_strokes:
        if stroke.lower() in event.lower():
            event_stroke = stroke.lower().capitalize()
            break
            
    athlete_text, time_text, countries = ([] for i in range(3))
    times = content.find_all('div', {'class': 'styles__ResultInfoWrapper-sc-rh9yz9-2 ktcvbA'})
    time_text = [str(time.text).split('Results:')[1] for time in times]
    
    #Scrape All Results and Competitors for All Relay Events
    if ('relay' and '4x') in str(event).lower():
        relay = 1
        distance_match = re.search(r'\d+\s*[xX]\s*\d+', event)
        
        # Extract the matched text from the regular expression object
        distance_str = distance_match.group(0)
        distance = distance_str.lower().replace('m', '')

        countries = [country.text for country in content.select('div.styles__Country-sc-rh9yz9-9.cwqEZX')]

        athlete_dropdown_len = int(len(driver.find_elements(By.CSS_SELECTOR, f"span[data-cy='icon-caret-down']")))

        for i in range(0, athlete_dropdown_len, 2):
                dropdown_buttons = driver.find_elements(By.CSS_SELECTOR, f"span[data-cy='icon-caret-down']")
                time.sleep(2.5)
                
                # Keep Trying Clicking Dropdown until It Works
                clicked = True
                while clicked:
                    try:
                        driver.execute_script("arguments[0].click();", dropdown_buttons[i])
                        clicked = False
                    except:
                        time.sleep(5)
                        pass
                
                time.sleep(2.5)

                team_members = list(map(lambda member: member.text, driver.find_elements(By.CSS_SELECTOR, f"a[data-cy='team-member']")))
                team_members = [member for member in team_members if member.strip()]
                athletes = ', '.join(str(member) for member in team_members)

                athlete_text.append(athletes)

                up_button = driver.find_element(By.CSS_SELECTOR, f"span[data-cy='icon-caret-up']")
                driver.execute_script("arguments[0].click();", up_button)
    
    #Scrape All Results and Competitors for all Non-Relay Events
    else:
        relay = 0
        time.sleep(2.5)
        match = re.search(r'\b(\d+)m\b', str(event))
        distance = str(match.group())
        
        athletes = content.find_all('div', {'class':'styles__AthleteData-sc-1yhe77y-2 bqCIEP'})
        country_codes = content.find_all('span', {'class':'styles__CountryName-sc-1r5phm6-1 bojjbG'})

        #Convert to All Scraped Data to Text
        athlete_text = [str(athlete.text.title()) for athlete in athletes]
        countries = [str(country.text) for country in country_codes]
        
    #Make Lists the Same Size in Case of DNF's (Common in Older Olympics)
    shortest_len = min(len(athlete_text), len(time_text), len(countries))  
    if len(athlete_text) != shortest_len or len(time_text) != shortest_len or len(countries) != shortest_len:
            print(location + ' ' + year + ' ' + distance + ' ' + event_stroke)
            print('shortening')
            athlete_text = athlete_text[:shortest_len]
            time_text = time_text[:shortest_len]
            countries = countries[:shortest_len]
            
    #Create Temporary Dataframe with all Scraped Results
    olympic_dict = {'Location': location, 'Year': year, 'Distance (in meters)': distance, 'Stroke': event_stroke, 'Relay?': relay, 'Gender': gender, 'Team': countries, 'Athlete':athlete_text,'Results':time_text}
    iterate_df = pd.DataFrame(olympic_dict, columns=['Location', 'Year', 'Distance (in meters)', 'Stroke', 'Relay?', 'Gender', 'Team', 'Athlete', 'Results'])
    
    #Standardize Times and Rank Them
    iterate_df = iterate_df.head(8)
    iterate_df['Results'] = iterate_df['Results'].apply(format_time)
    iterate_df['Rank'] = iterate_df['Results'].rank(ascending=True, method='min')

    #0 - Did not Start/Finish, Disqualfied
    #1 - Gold, 
    #2 - Silver, 
    #3 - Bronze
    #4 - No Medal
    #5 - No Data
    
    disqualifications = ["Did not start", "Disqualified", "Did not finish", 'at 5 metres']
    for index, row in iterate_df.iterrows():
        if str(row['Results']) in disqualifications:
            iterate_df.at[index, 'Rank'] = 0
        elif row['Results'] == None or row['Results'] == "":
            iterate_df.at[index, 'Rank'] = 5
        elif row['Rank'] >= 4:
            iterate_df.at[index, 'Rank'] = 4

    #Encode Dataframe to Selected Dataframe and Combine with Previous Results (if any)
    try:
        base_df = pd.read_excel(output_filename)
    except:
        base_df = pd.DataFrame(columns = ['Location', 'Year', 'Distance (in meters)', 'Stroke', 'Relay?', 'Gender', 'Team', 'Athlete', 'Results'])

    base_df = pd.concat([base_df, iterate_df], ignore_index=True)
    base_df.to_excel(output_filename, index=False)

### 3. Run Program (make sure to specify and output filename before running)

In [96]:
base_url = "https://olympics.com/en/olympic-games/tokyo-2020/results/swimming/men-s-100m-backstroke"
output_filename = r"___________"

if __name__ == "__main__":
    olympic_driver = open_olympic_page(base_url)
    get_all_events(olympic_driver)

Tokyo 2020 100m Medley
shortening
Tokyo 2020 200m Freestyle
shortening
Tokyo 2020 100m Medley
shortening
Tokyo 2020 100m Freestyle
shortening
Tokyo 2020 100m Medley
shortening
Tokyo 2020 200m Freestyle
shortening
Beijing 2008 4x100 Freestyle
shortening
Sydney 2000 4x100 Medley
shortening
Atlanta 1996 4x100 Freestyle
shortening
Atlanta 1996 4x100 Medley
shortening
Angeles 1984 4x100 Medley
shortening
Paris 1924 4x200 Freestyle
shortening
Stockholm 1912 100m Freestyle
shortening
London 1908 100m Freestyle
shortening
London 1908 200m Breaststroke
shortening
London 1908 400m Freestyle
shortening
London 1908 4x200 Freestyle
shortening
Louis 1904 100m Backstroke
shortening
Louis 1904 4x50 Freestyle
shortening
Paris 1900 200m Team
shortening


AttributeError: 'NoneType' object has no attribute 'group'