In [1]:
import re
from io import StringIO
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def flatten_list(list_of_lists):
    """
    Flatten one level of nesting, such that [[a1, a2, a3], [b1, b2], ..., [z1, z2, z3]]
    becomes [a1, a2, a3, b1, b2, ..., z1, z2, z3].

    Only the outermost level is removed; elements that are themselves lists are kept as they are.

    :param list_of_lists: List of lists to be flattened
    :type list_of_lists: list

    :return: Flattened list
    :rtype: list
    """
    # A comprehension avoids a local accumulator that shadowed the function's own name.
    return [element for the_list in list_of_lists for element in the_list]

In [3]:
# Load the rowing overview page and collect the event-name pages of the boat classes of interest.
rowing_url = "https://www.olympedia.org/sport_groups/RO"
# A timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(rowing_url, timeout=30)
if response.status_code != 200:
    # Stop here: without the overview page the code below would either fail with a
    # NameError on `page` (fresh kernel) or silently reuse a stale `page`.
    raise RuntimeError(f"Error loading {rowing_url}")
page = BeautifulSoup(response.content, 'lxml')

# One list of absolute URLs per boat class: every link whose text is exactly the class name.
boat_urls = []
for boat in ['Quadruple Sculls', 'Eights', 'Coxless Fours']:
    tags = page.find_all("a", string = boat)
    boat_urls.append(["https://www.olympedia.org"+tag['href'] for tag in tags])

boat_urls = flatten_list(boat_urls)
# NOTE(review): the last two collected links are discarded; the reason is not visible
# in this notebook (presumably links that are not wanted events) - TODO confirm.
boat_urls = boat_urls[:-2]

# For every boat class page, look up the results link of each Games edition from 1980 to 2020.
event_urls = []
for boat_url in boat_urls:
    sleep(1)  # throttle: wait one second before each request
    # A timeout keeps the loop from hanging forever on an unresponsive server.
    response = requests.get(boat_url, timeout=30)
    if response.status_code != 200:
        print(f"Error loading {boat_url}")
        continue

    page = BeautifulSoup(response.content, 'lxml')
    editions_table = page.find('table',attrs = {'class': "table table-striped"})
    for year in range(1980, 2024, 4):
        try:
            # The link sits in the cell that follows the cell holding the year.
            event_url = editions_table.find("td", string=str(year)).find_next_sibling("td").find("a")['href']
        except (AttributeError, TypeError, KeyError):
            # A missing table, year cell, sibling cell, link or href all mean
            # "no event for this year"; anything else (e.g. Ctrl-C) propagates.
            print(f"No data for {year} {boat_url}")
        else:
            event_urls.append("https://www.olympedia.org"+event_url)


No data for 1980 https://www.olympedia.org/event_names/397
No data for 1984 https://www.olympedia.org/event_names/397
No data for 1980 https://www.olympedia.org/event_names/2627
No data for 1984 https://www.olympedia.org/event_names/2627
No data for 1988 https://www.olympedia.org/event_names/2627
No data for 1992 https://www.olympedia.org/event_names/2627
No data for 1996 https://www.olympedia.org/event_names/2627
No data for 2000 https://www.olympedia.org/event_names/2627
No data for 2004 https://www.olympedia.org/event_names/2627
No data for 2008 https://www.olympedia.org/event_names/2627
No data for 2012 https://www.olympedia.org/event_names/2627
No data for 2016 https://www.olympedia.org/event_names/2627
No data for 2020 https://www.olympedia.org/event_names/2627


In [45]:
# HTML snippets collected by get_content(), one per successfully processed event page.
all_content = []

def get_content(url):
    """
    Download one event page and append a trimmed HTML snippet of it to ``all_content``.

    The snippet is a ``<div>`` holding the page's ``<h1>``, the source URL in an ``<h4>``,
    the element with class ``biodata`` and, for every ``<h3>`` whose text starts with "Final",
    that heading followed by the next sibling results table. When there is no such heading,
    the table following the ``<h2>`` "Final Round" is used instead.

    Nothing is stored when the request does not return status 200 or when no final-round
    table can be located; a message is printed in both cases. The function always waits one
    second after the request so that consecutive calls are throttled.

    :param url: Address of the event results page
    :type url: str

    :return: None, the snippet is appended to the module-level ``all_content``
    :rtype: None
    """
    # A timeout keeps a single unresponsive page from blocking the whole run.
    response = requests.get(url, timeout=30)
    try:
        if response.status_code != 200:
            print(f"Error fetching {url}")
            return

        page = BeautifulSoup(response.content, 'lxml')
        event = page.find("h1")
        bio_summary = page.find(attrs = {"class":"biodata"})
        table_attrs = {'class': "table table-striped"}

        content = "<div>" + str(event) + "\n" + f"<h4>{url}</h4>" + "\n" + str(bio_summary)
        finals_names = page.find_all("h3", string=re.compile("^Final"))
        if finals_names:
            for name in finals_names:
                finals_table = name.find_next_sibling('table', attrs=table_attrs)
                content += "\n" + str(name) + "\n" + str(finals_table)
        else:
            # In case the final round has a single final
            final_round = page.find("h2", string="Final Round")
            finals_table = None
            if final_round is not None:
                finals_table = final_round.find_next_sibling('table', attrs=table_attrs)
            if finals_table is None:
                print(f"Failed to find a table at {url}")
                return
            content += "\n" + "<h2>Final Round</h2>" + "\n" + str(finals_table)

        content += "</div>"
        all_content.append(content)
    finally:
        # Throttle on every path, including the early returns above.
        sleep(1)

In [87]:
# Snippets for which extract_data() found no 'Pos' column, kept for later inspection.
failed_extraction = []

def extract_data(finals_content):
    """
    Turn one HTML snippet produced by ``get_content`` into a table of final-round results.

    The boat class is read from the ``<h1>``, the year from the last four characters of the
    "Date" field and the place from the last comma-separated part of the "Location" field.
    Every table after the first one is treated as a results table and labelled with the text
    of the ``<h3>`` heading at the same position, or 'Standalone Final' when there are fewer
    headings than results tables. Rows without a value in 'Pos' are dropped and the remaining
    rows are numbered consecutively in 'Rank'.

    When the tables have no 'Pos' column a message is printed, the snippet is appended to the
    module-level ``failed_extraction`` and the unfiltered rows are returned.

    :param finals_content: HTML snippet of one event
    :type finals_content: str

    :return: Results with the columns FinalType, Competitors, NOC, Time, the split columns
        (when present), Rank, Boat, Year and City
    :rtype: pandas.DataFrame
    """
    base_columns = ["FinalType", 'Competitors', 'NOC', 'Time']
    split_columns = ["500 m", "1,000 m", "1,500 m", "500-1,000 m", "1,000-1,500 m", "1,500-2,000 m"]

    def get_boatclass_code(event_name):
        """ Turns 'Eights, Men' to 'M8+' and similar for the other boat classes"""
        gender_map = {'Men': 'M', 'Women': 'W'}
        boat_map = {'Eights': '8+', 'QuadrupleSculls' : '4x', 'CoxlessFours' : '4-'}
        parts = event_name.replace(" ","").split(",")
        return gender_map[parts[1]] + boat_map[parts[0]]

    def first_token(cell):
        """ Keeps the first whitespace-separated token of a text cell; other values pass through"""
        if isinstance(cell, str):
            tokens = cell.split()
            if tokens:
                return tokens[0]
        return cell

    finals_info = BeautifulSoup(finals_content, 'lxml')

    boat = get_boatclass_code(finals_info.find("h1").text)
    dates = finals_info.find('th', string='Date').find_next_sibling("td").text
    year = int(dates[-4:])
    full_location = finals_info.find('th', string='Location').find_next_sibling("td").text
    # Strip only the surrounding whitespace so that multi-word names keep their inner spaces.
    # NOTE(review): the last part is not always a city (the 2021 rows show 'Japan') - confirm.
    city = full_location.split(",")[-1].strip()
    final_types = finals_info.find_all("h3")

    # Literal HTML strings are deprecated as read_html input, hence the StringIO wrapper.
    tables = pd.read_html(StringIO(finals_content))
    labelled_tables = []
    # tables[0] is skipped - presumably the bio-data table that get_content puts first.
    for i in range(1, len(tables)):
        if i - 1 < len(final_types):
            tables[i]['FinalType'] = final_types[i-1].text
        else:
            tables[i]['FinalType'] = 'Standalone Final'
        labelled_tables.append(tables[i])
    # Concatenate once; pd.concat raises on an empty list, so fall back to an empty frame.
    results_df = pd.concat(labelled_tables) if labelled_tables else pd.DataFrame()

    available_columns = set(results_df.columns)
    if set(['Pos'] + split_columns).issubset(available_columns):
        results_df = results_df[results_df['Pos'].notnull()].loc[:, base_columns + split_columns].copy()
        # Column-wise Series.map replaces the deprecated DataFrame.applymap.
        results_df[split_columns] = results_df[split_columns].apply(lambda column: column.map(first_token))
        results_df['Rank'] = range(1, len(results_df)+1)
    elif 'Pos' in available_columns:
        results_df = results_df[results_df['Pos'].notnull()].loc[:, base_columns].copy()
        results_df['Rank'] = range(1, len(results_df)+1)
    else:
        print(f"No 'Pos' column for {boat}, {year}")
        failed_extraction.append(finals_content)

    results_df[['Boat', 'Year', 'City']] = boat,year,city

    return results_df


In [47]:
# Start from an empty list so that re-running this cell does not store every page twice.
all_content.clear()
for event_url in event_urls:
    get_content(event_url)

Failed to find a table at https://www.olympedia.org/results/159011


In [90]:
# Reset the failure log so that re-running this cell does not record the same snippet twice.
failed_extraction.clear()

# Parse every snippet first and concatenate once, instead of re-copying the growing frame
# on each iteration. pd.concat raises on an empty list, hence the fallback.
extracted_frames = [extract_data(content) for content in all_content]
results = pd.concat(extracted_frames) if extracted_frames else pd.DataFrame()

In [91]:
# Show the combined results table (the notebook abbreviates long frames with a "..." row).
results

Unnamed: 0,FinalType,Competitors,NOC,Time,500 m,"1,000 m","1,500 m","500-1,000 m","1,000-1,500 m","1,500-2,000 m",Rank,Boat,Year,City
0,Final A,East Germany,GDR,5:49.81,1:24.43,2:53.31,4:21.61,1:28.88,1:28.30,1:28.20,1,M4x,1980,Moskva
2,Final A,Soviet Union,URS,5:51.47,1:23.40,2:51.90,4:22.05,1:28.50,1:30.15,1:29.42,2,M4x,1980,Moskva
4,Final A,Bulgaria,BUL,5:52.38,1:23.79,2:55.26,4:25.29,1:31.47,1:30.03,1:27.09,3,M4x,1980,Moskva
6,Final A,France,FRA,5:53.45,1:24.79,2:54.99,4:24.72,1:30.20,1:29.73,1:28.73,4,M4x,1980,Moskva
8,Final A,Spain,ESP,6:01.19,1:25.83,2:57.17,4:28.43,1:31.34,1:31.26,1:32.76,5,M4x,1980,Moskva
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10,Final A,Netherlands,NED,5:50.81,1:25.63,2:53.44,4:22.17,1:27.81,1:28.73,1:28.64,6,M4-,2021,Japan
0,Final B,Poland,POL,5:57.17,1:27.36,2:57.36,4:27.53,1:30.00,1:30.17,1:29.64,7,M4-,2021,Japan
2,Final B,Canada,CAN,5:58.29,1:29.06,3:00.14,4:29.05,1:31.08,1:28.91,1:29.24,8,M4-,2021,Japan
4,Final B,Switzerland,SUI,6:02.32,1:29.14,2:59.56,4:30.14,1:30.42,1:30.58,1:32.18,9,M4-,2021,Japan


In [92]:
# Save the combined results. The row index is written as the first CSV column (to_csv default);
# it is not unique because it restarts with every source table.
results.to_csv("Olympics_results.csv")