# Scraping Data from Olympedia - Olympic Games Project

### This script contains the following:
#### 1. Importing libraries and packages
#### 2. Specify URL and create soup
#### 3. Find the tables to import
#### 4. Pull the data into the dataframe
#### 5. Export to .csv

## 1. Importing libraries and packages

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## 2. Specify URL and create soup

In [2]:
url = 'https://www.olympedia.org/editions/61/medal'

page = requests.get(url)

soup = BeautifulSoup(page.text, 'html')

In [3]:
print(soup)

<!DOCTYPE html>
<html>
<head>
<title>Olympedia – Medal winners Tokyo 2020</title>
<meta content="authenticity_token" name="csrf-param"/>
<meta content="fXfIQ/x4WMUUkRrzqLQrMe0w69m6JkwzS6jugSkOMBfFxzyG058o+/Ed1Nu/smTZZmL3KllVSz7oWeQ4/Tfllw==" name="csrf-token"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="EN" http-equiv="content-language"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="/assets/bootstrap.min-460a43de22fd9534d595e5aea2715cb154560291c9c6401b526e31c86a5ce32d.css" media="all" rel="stylesheet"/>
<link href="/assets/bootstrap-sortable-363d232309d54b549fa85446295ef2b5d290e3f8a49f1a646247340be3705ef9.css" media="all" rel="stylesheet"/>
<link href="/assets/jquery-ui-1.11.4.min-359ba1b9eb679ad05fb4c8fda710ee4c0239354f1ba635200b6065638295d646.css" media="all" rel="stylesheet"/>
<link href="/assets/lightbox-e29689e123fc27505d2b9d919f43ffcb6fade539cb4670f21c35aa07848105e7.css" media="screen" rel="stylesheet"/

## 3. Find the table to import

In [4]:
# Pull the table we are interested in

soup.find_all('table') # It appears to be only one big table

[<table class="table">
 <thead>
 <tr>
 <th>Sport/Event</th>
 <td><span class="Gold">Gold</span></td>
 <td></td>
 <td><span class="Silver">Silver</span></td>
 <td></td>
 <td><span class="Bronze">Bronze</span></td>
 </tr>
 </thead>
 <tr>
 <td colspan="7">
 <h2>3x3 Basketball</h2>
 </td>
 </tr>
 <tr class="top odd">
 <td class="top"><a href="/results/18000000">3x3 Basketball, Men</a></td>
 <td style="vertical-align: top">Latvia</td><td style="vertical-align: top"><a href="/countries/LAT"><img src="https://olympedia-flags.s3.eu-central-1.amazonaws.com/LAT.png" style="padding-right: 2px; vertical-align: middle"/>LAT</a></td>
 <td style="vertical-align: top">ROC</td><td style="vertical-align: top"><a href="/countries/ROC"><img src="https://olympedia-flags.s3.eu-central-1.amazonaws.com/ROC.png" style="padding-right: 2px; vertical-align: middle"/>ROC</a></td>
 <td style="vertical-align: top">Serbia</td><td style="vertical-align: top"><a href="/countries/SRB"><img src="https://olympedia-flags.s

In [16]:
# Assign the table to the object 'table'

table = soup.find_all('table')[0]

In [17]:
print(table)

<table class="table">
<thead>
<tr>
<th>Sport/Event</th>
<td><span class="Gold">Gold</span></td>
<td></td>
<td><span class="Silver">Silver</span></td>
<td></td>
<td><span class="Bronze">Bronze</span></td>
</tr>
</thead>
<tr>
<td colspan="7">
<h2>3x3 Basketball</h2>
</td>
</tr>
<tr class="top odd">
<td class="top"><a href="/results/18000000">3x3 Basketball, Men</a></td>
<td style="vertical-align: top">Latvia</td><td style="vertical-align: top"><a href="/countries/LAT"><img src="https://olympedia-flags.s3.eu-central-1.amazonaws.com/LAT.png" style="padding-right: 2px; vertical-align: middle"/>LAT</a></td>
<td style="vertical-align: top">ROC</td><td style="vertical-align: top"><a href="/countries/ROC"><img src="https://olympedia-flags.s3.eu-central-1.amazonaws.com/ROC.png" style="padding-right: 2px; vertical-align: middle"/>ROC</a></td>
<td style="vertical-align: top">Serbia</td><td style="vertical-align: top"><a href="/countries/SRB"><img src="https://olympedia-flags.s3.eu-central-1.amazon

## 4. Pull the data into the dataframe

### Get the headers

In [26]:
# Get the headers from the first row of the table body

thead = table.find('thead')

thead

<thead>
<tr>
<th>Sport/Event</th>
<td><span class="Gold">Gold</span></td>
<td></td>
<td><span class="Silver">Silver</span></td>
<td></td>
<td><span class="Bronze">Bronze</span></td>
</tr>
</thead>

In [27]:
# Extract the main header 'Sport/Event'

main_header = thead.find('th').text.strip()

main_header

'Sport/Event'

In [29]:
# Get the first row of the thead to find Gold, Silver, Bronze

first_row = thead.find('tr')

first_row

<tr>
<th>Sport/Event</th>
<td><span class="Gold">Gold</span></td>
<td></td>
<td><span class="Silver">Silver</span></td>
<td></td>
<td><span class="Bronze">Bronze</span></td>
</tr>

In [31]:
# Pull in the medal headers filtering the empty <td> elements

medal_headers = [td.text.strip() for td in first_row.find_all('td') if td.text.strip()]

medal_headers

['Gold', 'Silver', 'Bronze']

In [32]:
# Combine the main header with medal headers

olympic_table_titles = [main_header] + medal_headers

print(olympic_table_titles)

['Sport/Event', 'Gold', 'Silver', 'Bronze']


In [49]:
# Adjust the headers list

headers = ['Discipline', 'Sport/Event', 'Gold', 'Gold Country', 'Silver', 'Silver Country', 'Bronze', 'Bronze Country']

In [50]:
# Put the headers into a data frame

df = pd.DataFrame(columns = headers)

df

Unnamed: 0,Discipline,Sport/Event,Gold,Gold Country,Silver,Silver Country,Bronze,Bronze Country


### Get the data

In [51]:
# Get the data from the table

rows = table.find_all('tr')

In [52]:
# Initialize a variable to store the current sport

current_sport = None

In [53]:
# Iterate over the rows to extract data

for row in rows:
    cols = row.find_all(['th', 'td'])
    individual_row_data = [col.text.strip() for col in cols if col.text.strip()]
    
    # Check if the row represents a sport name
    
    if len(individual_row_data) == 1:
        current_sport = individual_row_data[0]
        continue
    
    # Debugging: Print the length and content of individual_row_data
    
    print(f"Row length: {len(individual_row_data)}, Data: {individual_row_data}")
    
    # Check if the length of individual_row_data matches the number of columns - 1 (since 'Discipline' is an extra column)
    
    if len(individual_row_data) == len(headers) - 1:
        
        # Add the current sport name to the row data
        
        individual_row_data.insert(0, current_sport)
        df.loc[len(df)] = individual_row_data
    else:
        print(f"Skipping row due to length mismatch: {individual_row_data}")

Row length: 4, Data: ['Sport/Event', 'Gold', 'Silver', 'Bronze']
Skipping row due to length mismatch: ['Sport/Event', 'Gold', 'Silver', 'Bronze']
Row length: 7, Data: ['3x3 Basketball, Men', 'Latvia', 'LAT', 'ROC', 'ROC', 'Serbia', 'SRB']
Row length: 7, Data: ['3x3 Basketball, Women', 'United States', 'USA', 'ROC', 'ROC', "People's Republic of China", 'CHN']
Row length: 7, Data: ['Individual, Men', 'Mete Gazoz', 'TUR', 'Mauro Nespoli', 'ITA', 'Takaharu Furukawa', 'JPN']
Row length: 7, Data: ['Team, Men', 'Republic of Korea', 'KOR', 'Chinese Taipei', 'TPE', 'Japan', 'JPN']
Row length: 7, Data: ['Individual, Women', 'An San', 'KOR', 'Yelena Osipova', 'ROC', 'Lucilla Boari', 'ITA']
Row length: 7, Data: ['Team, Women', 'Republic of Korea', 'KOR', 'ROC', 'ROC', 'Germany', 'GER']
Row length: 7, Data: ['Team, Mixed', 'Republic of Korea', 'KOR', 'Netherlands', 'NED', 'Mexico', 'MEX']
Row length: 7, Data: ['Individual All-Around, Men', 'Daiki Hashimoto', 'JPN', 'Xiao Ruoteng', 'CHN', 'Nikita Na

Row length: 7, Data: ['Singles, Women', 'Belinda Bencic', 'SUI', 'Markéta Vondroušová', 'CZE', 'Elina Svitolina', 'UKR']
Row length: 7, Data: ['Doubles, Women', 'Czech Republic 1', 'CZE', 'Switzerland', 'SUI', 'Brazil', 'BRA']
Row length: 7, Data: ['Doubles, Mixed', 'ROC 2', 'ROC', 'ROC 1', 'ROC', 'Australia', 'AUS']
Row length: 7, Data: ['Individual, Men', 'Ivan Litvinovich', 'BLR', 'Dong Dong', 'CHN', 'Dylan Schmidt', 'NZL']
Row length: 7, Data: ['Individual, Women', 'Zhu Xueying', 'CHN', 'Liu Lingling', 'CHN', 'Bryony Page', 'GBR']
Row length: 7, Data: ['Olympic Distance, Men', 'Kristian Blummenfelt', 'NOR', 'Alex Yee', 'GBR', 'Hayden Wilde', 'NZL']
Row length: 7, Data: ['Olympic Distance, Women', 'Flora Duffy', 'BER', 'Georgia Taylor-Brown', 'GBR', 'Katie Zaferes', 'USA']
Row length: 7, Data: ['Relay, Mixed', 'Great Britain', 'GBR', 'United States', 'USA', 'France', 'FRA']
Row length: 7, Data: ['Volleyball, Men', 'France', 'FRA', 'ROC', 'ROC', 'Argentina', 'ARG']
Row length: 7, Dat

In [54]:
df

Unnamed: 0,Discipline,Sport/Event,Gold,Gold Country,Silver,Silver Country,Bronze,Bronze Country
0,3x3 Basketball,"3x3 Basketball, Men",Latvia,LAT,ROC,ROC,Serbia,SRB
1,3x3 Basketball,"3x3 Basketball, Women",United States,USA,ROC,ROC,People's Republic of China,CHN
2,Archery,"Individual, Men",Mete Gazoz,TUR,Mauro Nespoli,ITA,Takaharu Furukawa,JPN
3,Archery,"Team, Men",Republic of Korea,KOR,Chinese Taipei,TPE,Japan,JPN
4,Archery,"Individual, Women",An San,KOR,Yelena Osipova,ROC,Lucilla Boari,ITA
...,...,...,...,...,...,...,...,...
334,Wrestling,"Featherweight, Freestyle, Women",Mayu Mukaida,JPN,Pang Qianyu,CHN,Vanesa KaladzinskayaBat Ochiryn Bolortuyaa,BLRMGL
335,Wrestling,"Lightweight, Freestyle, Women",Risako Kawai,JPN,Iryna Kurachkina,BLR,Evelina NikolovaHelen Maroulis,BULUSA
336,Wrestling,"Middleweight, Freestyle, Women",Yukako Kawai,JPN,Aysuluu Tynybekova,KGZ,Taybe YuseinIryna Koliadenko,BULUKR
337,Wrestling,"Light-Heavyweight, Freestyle, Women",Tamyra Mensah-Stock,USA,Blessing Oborududu,NGR,Meerim ZhumanazarovaAlla Cherkasova,KGZUKR


In [None]:
# Add 'Year' column with the value 2020 at the first position

df.insert(0, 'Year', 2020)

In [58]:
# Add 'Host city' column with the value Tokyo at the second position

df.insert(1, 'Host city', 'Tokyo')

In [59]:
df.head()

Unnamed: 0,Year,Host city,Discipline,Sport/Event,Gold,Gold Country,Silver,Silver Country,Bronze,Bronze Country
0,2020,Tokyo,3x3 Basketball,"3x3 Basketball, Men",Latvia,LAT,ROC,ROC,Serbia,SRB
1,2020,Tokyo,3x3 Basketball,"3x3 Basketball, Women",United States,USA,ROC,ROC,People's Republic of China,CHN
2,2020,Tokyo,Archery,"Individual, Men",Mete Gazoz,TUR,Mauro Nespoli,ITA,Takaharu Furukawa,JPN
3,2020,Tokyo,Archery,"Team, Men",Republic of Korea,KOR,Chinese Taipei,TPE,Japan,JPN
4,2020,Tokyo,Archery,"Individual, Women",An San,KOR,Yelena Osipova,ROC,Lucilla Boari,ITA


## 5. Export to .csv

In [60]:
df.to_csv(r'/Users/giadairene/Documents/Data Analytics Projects/Tokyo2020.csv', index = False)