## 1. Import Libraries

https://www.olympedia.org/static/faq

In [1]:
import os
import pandas as pd
import psycopg2 as pg2
import warnings

warnings.filterwarnings("ignore")

## 2. Tables of the Database
### Connecting the database from Jupyter Notebook

In [170]:
mypassword = os.getenv("POSTGRESQL_PASSWORD")

try:
    conn = pg2.connect(user = "postgres", password = mypassword, database = "olympics")
    cursor = conn.cursor()
    print("Database Connection Successful")
except pg2.connector.Error as err:
    print(f"Error: '{err}'") 

Database Connection Successful


Once the database is successfully connected, let's list the name of the tables that it contains.

In [3]:
cursor.execute("""
SELECT table_schema, table_name
FROM information_schema.tables
WHERE table_schema = 'olympics'
""")

table_names = []
print('--- Tables within "olympics" database --- ')
for table in cursor:
    print(table[1])
    table_names.append(table[1])

--- Tables within "olympics" database --- 
olympic_history
athletes_roles
host_cities


Here are the previews of the 2 tables under the olympics database:

In [4]:
for table in table_names:
    print("Table: ", table)
    display(pd.read_sql("SELECT * FROM olympics." + table, conn))

Table:  olympic_history


Unnamed: 0,id,name,gender,born,died,height,weight,team,game,noc,sport,event,medal
0,131892,Meryem Erdoğan,Female,24 April 1990,,172 cm,55 kg,Türkiye,2016 Summer Olympics,TUR,Athletics,"Athletics, Marathon, Women(Olympic)",
1,131892,Meryem Erdoğan,Female,24 April 1990,,172 cm,55 kg,Türkiye,2020 Summer Olympics,TUR,Athletics,"Athletics, Marathon, Women(Olympic)",
2,131892,Meryem Erdoğan,Female,24 April 1990,,172 cm,55 kg,Türkiye,2020 Summer Olympics,TUR,Athletics,"Athletics, Marathon, Women(Olympic)",
3,4300,Maurice Maina,Male,1 January 1963,,158 cm,47 kg,Kenya,1988 Summer Olympics,KEN,Boxing,"Boxing, Light-Flyweight, Men(Olympic)",
4,4300,Maurice Maina,Male,1 January 1963,,158 cm,47 kg,Kenya,1988 Summer Olympics,KEN,Boxing,"Boxing, Light-Flyweight, Men(Olympic)",
...,...,...,...,...,...,...,...,...,...,...,...,...,...
476343,20989,Caitlin Bilodeaux-Banos,Female,17 March 1965,,170 cm,64 kg,United States,1988 Summer Olympics,USA,Fencing,"Fencing, Foil, Individual, Women(Olympic)",
476344,20989,Caitlin Bilodeaux-Banos,Female,17 March 1965,,170 cm,64 kg,United States,1988 Summer Olympics,USA,Fencing,"Fencing, Foil, Team, Women(Olympic)",
476345,20989,Caitlin Bilodeaux-Banos,Female,17 March 1965,,170 cm,64 kg,United States,1992 Summer Olympics,USA,Fencing,"Fencing, Foil, Individual, Women(Olympic)",
476346,20989,Caitlin Bilodeaux-Banos,Female,17 March 1965,,170 cm,64 kg,United States,1992 Summer Olympics,USA,Fencing,"Fencing, Foil, Team, Women(Olympic)",


Table:  athletes_roles


Unnamed: 0,id,name,roles
0,131892,Meryem Erdoğan,Competed in Olympic Games
1,4300,Maurice Maina,Competed in Olympic Games
2,60239,Stanislav Tůma,Competed in Olympic Games
3,129369,Eunice Kirwa,Competed in Olympic Games
4,142670,Sinem Kurtbay,Competed in Olympic Games
...,...,...,...
155653,122196,Aleksa Šaponjić,Competed in Olympic Games
155654,52168,Zhang Yousheng,Competed in Olympic Games
155655,18974,Werner Delmes,Competed in Olympic Games • Coach
155656,126253,Tim Payne,Competed in Olympic Games


Table:  host_cities


Unnamed: 0,year,season,game,host_city
0,1896,Summer,1896 Summer Olympics,Athina
1,1900,Summer,1900 Summer Olympics,Paris
2,1904,Summer,1904 Summer Olympics,St. Louis
3,1908,Summer,1908 Summer Olympics,London
4,1912,Summer,1912 Summer Olympics,Stockholm
...,...,...,...,...
57,2010,Winter,2010 Winter Olympics,Vancouver
58,2014,Winter,2014 Winter Olympics,Sochi
59,2018,Winter,2018 Winter Olympics,PyeongChang
60,2022,Winter,2022 Winter Olympics,Beijing


## 3. Data Cleaning

### 3.1 Standardize inconsistent date values and Handle missing values

In both the `born` and `died` columns, the date values exhibit inconsistency. The following are the various formats found in both columns:
- `'(1880 or 1881)'` 
- `'(circa 1880)'` 
- `'(c. 1880)'`
- `'September 1880'`
- `'1933'` 
- `'NULL'`

In [6]:
# Check date values that do not start with digits
pd.read_sql("""
SELECT DISTINCT born
FROM olympics.olympic_history
WHERE (LENGTH(born) = 4 OR NOT born ~ E'^[0-9]+') AND born != 'NULL' 
""", conn)

Unnamed: 0,born
0,1910
1,January 1885
2,December 1881
3,1933
4,September 1880
...,...
270,(circa 1929)
271,April 1872
272,1875
273,(1942 or 1943)


In [7]:
# Create empty table
cursor.execute("DROP TABLE IF EXISTS olympics.olympic_history_cleaned;")
cursor.execute("""
CREATE TABLE olympics.olympic_history_cleaned
(
    id INTEGER,
    name VARCHAR,
    gender VARCHAR,
    born VARCHAR,
    birth_year INTEGER,
    died VARCHAR,
    death_year INTEGER,
    height_cm INTEGER,
    weight_kg INTEGER,
    noc VARCHAR,
    country VARCHAR,
    game VARCHAR,
    sport VARCHAR,
    event VARCHAR,
    medal VARCHAR
);

""")

cursor.execute("""
INSERT INTO olympics.olympic_history_cleaned
SELECT
    id,
    name,
    gender,
    
    -- Extract Year values from 'born' and 'died' columns 
    -- Handle missing values
    CASE WHEN born = 'NULL' THEN NULL ELSE born END AS born,
    CASE WHEN born = 'NULL' THEN NULL ELSE SUBSTRING(born from '\d{4}')::INTEGER END AS birth_year,
    CASE WHEN died = 'NULL' THEN NULL ELSE died END AS died,
    CASE WHEN died = 'NULL' THEN NULL ELSE SUBSTRING(died from '\d{4}')::INTEGER END AS death_year,
    
    -- Extract measurements
    CASE WHEN height = 'NULL' THEN NULL ELSE SUBSTRING(height from '\d{3}')::INTEGER END AS height_cm,
    CASE WHEN weight = 'NULL' THEN NULL ELSE SUBSTRING(weight from '\d{2}')::INTEGER END AS weight_kg,
    
    noc, 
    team AS country,
    game, 
    sport, 
    event,
    CASE WHEN medal = 'NULL' THEN NULL ELSE medal END AS medal
    
FROM olympics.olympic_history  
""")

conn.commit()

In [8]:
pd.read_sql("""
SELECT *
FROM olympics.olympic_history_cleaned
""", conn)

Unnamed: 0,id,name,gender,born,birth_year,died,death_year,height_cm,weight_kg,noc,country,game,sport,event,medal
0,131892,Meryem Erdoğan,Female,24 April 1990,1990.0,,,172.0,55.0,TUR,Türkiye,2016 Summer Olympics,Athletics,"Athletics, Marathon, Women(Olympic)",
1,131892,Meryem Erdoğan,Female,24 April 1990,1990.0,,,172.0,55.0,TUR,Türkiye,2020 Summer Olympics,Athletics,"Athletics, Marathon, Women(Olympic)",
2,131892,Meryem Erdoğan,Female,24 April 1990,1990.0,,,172.0,55.0,TUR,Türkiye,2020 Summer Olympics,Athletics,"Athletics, Marathon, Women(Olympic)",
3,4300,Maurice Maina,Male,1 January 1963,1963.0,,,158.0,47.0,KEN,Kenya,1988 Summer Olympics,Boxing,"Boxing, Light-Flyweight, Men(Olympic)",
4,4300,Maurice Maina,Male,1 January 1963,1963.0,,,158.0,47.0,KEN,Kenya,1988 Summer Olympics,Boxing,"Boxing, Light-Flyweight, Men(Olympic)",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
476343,20989,Caitlin Bilodeaux-Banos,Female,17 March 1965,1965.0,,,170.0,64.0,USA,United States,1988 Summer Olympics,Fencing,"Fencing, Foil, Individual, Women(Olympic)",
476344,20989,Caitlin Bilodeaux-Banos,Female,17 March 1965,1965.0,,,170.0,64.0,USA,United States,1988 Summer Olympics,Fencing,"Fencing, Foil, Team, Women(Olympic)",
476345,20989,Caitlin Bilodeaux-Banos,Female,17 March 1965,1965.0,,,170.0,64.0,USA,United States,1992 Summer Olympics,Fencing,"Fencing, Foil, Individual, Women(Olympic)",
476346,20989,Caitlin Bilodeaux-Banos,Female,17 March 1965,1965.0,,,170.0,64.0,USA,United States,1992 Summer Olympics,Fencing,"Fencing, Foil, Team, Women(Olympic)",


### 3.2 Check duplicates
Using a Window function, the following code confirms that the dataset contains duplicate records.

In [167]:
pd.read_sql("""
WITH count_occurences_cte AS
(
    SELECT 
        id, name, gender, born, died, game, noc, sport, event, medal,
        ROW_NUMBER() OVER (PARTITION BY id, name, gender, born, died, game, noc, sport, event, medal) AS occurence
    FROM olympics.olympic_history_cleaned
)
SELECT *
FROM count_occurences_cte
WHERE occurence > 1
""", conn)

Unnamed: 0,id,name,gender,born,died,game,noc,sport,event,medal,occurence
0,1,Jean-François Blanchy,Male,12 December 1886,2 October 1960,1920 Summer Olympics,FRA,Tennis,"Tennis, Doubles, Men(Olympic)",,2
1,2,Arnaud Boetsch,Male,1 April 1969,,1996 Summer Olympics,FRA,Tennis,"Tennis, Doubles, Men(Olympic)",,2
2,3,Jean Borotra,Male,13 August 1898,17 July 1994,1924 Summer Olympics,FRA,Tennis,"Tennis, Doubles, Men(Olympic)",Bronze,2
3,4,Jacques Brugnon,Male,11 May 1895,20 March 1978,1924 Summer Olympics,FRA,Tennis,"Tennis, Doubles, Men(Olympic)",Silver,2
4,5,Albert Canet,Male,17 April 1878,25 July 1930,1912 Summer Olympics,FRA,Tennis,"Tennis, Doubles, Men(Olympic)",Bronze,2
...,...,...,...,...,...,...,...,...,...,...,...
157436,3100007,E. P. Daniel,Male,,,1900 Summer Olympics,GBR,Cricket,"Cricket, Cricket, Men(Olympic)",,2
157437,3100008,J. M. Willcox,Male,,,1900 Summer Olympics,GBR,Cricket,"Cricket, Cricket, Men(Olympic)",,2
157438,3100009,G. P. Brook,Male,,,1900 Summer Olympics,GBR,Cricket,"Cricket, Cricket, Men(Olympic)",,2
157439,3100010,Seamus Kelly,Male,,28 January 2001,1948 Summer Olympics,IRL,Athletics,"Athletics, 800 metres, Men(Olympic)",,2


In [173]:
cursor.execute("""
DELETE
FROM olympics.olympic_history_cleaned h1
USING olympics.olympic_history_cleaned h2
WHERE 
    h1.id = h2.id 
    AND h1.name = h2.name 
    AND h1.gender = h2.gender
    AND h1.born = h2.born 
    AND h1.died = h2.died
    AND h1.noc = h2.noc 
    AND h1.game = h2.game
    AND h1.event = h2.event
    AND h1.medal = h2.medal 
""")

conn.commit()

In [176]:
pd.read_sql("""
WITH count_occurences_cte AS
(
    SELECT 
        id, name, gender, born, died, game, noc, sport, event, medal,
        ROW_NUMBER() OVER (PARTITION BY id, name, gender, born, died, game, noc, sport, event, medal) AS occurence
    FROM olympics.olympic_history_cleaned
)
SELECT *
FROM count_occurences_cte

""", conn)

Unnamed: 0,id,name,gender,born,died,game,noc,sport,event,medal,occurence
0,1,Jean-François Blanchy,Male,12 December 1886,2 October 1960,1912 Summer Olympics,FRA,Tennis,"Tennis, Doubles, Men(Olympic)",,1
1,1,Jean-François Blanchy,Male,12 December 1886,2 October 1960,1912 Summer Olympics,FRA,Tennis,"Tennis, Singles, Men(Olympic)",,1
2,1,Jean-François Blanchy,Male,12 December 1886,2 October 1960,1920 Summer Olympics,FRA,Tennis,"Tennis, Doubles, Men(Olympic)",,1
3,1,Jean-François Blanchy,Male,12 December 1886,2 October 1960,1920 Summer Olympics,FRA,Tennis,"Tennis, Doubles, Men(Olympic)",,2
4,1,Jean-François Blanchy,Male,12 December 1886,2 October 1960,1920 Summer Olympics,FRA,Tennis,"Tennis, Doubles, Mixed(Olympic)",,1
...,...,...,...,...,...,...,...,...,...,...,...
455368,3100009,G. P. Brook,Male,,,1900 Summer Olympics,GBR,Cricket,"Cricket, Cricket, Men(Olympic)",,2
455369,3100010,Seamus Kelly,Male,,28 January 2001,1948 Summer Olympics,IRL,Athletics,"Athletics, 800 metres, Men(Olympic)",,1
455370,3100010,Seamus Kelly,Male,,28 January 2001,1948 Summer Olympics,IRL,Athletics,"Athletics, 800 metres, Men(Olympic)",,2
455371,22000000,Jaap Kool,Male,31 December 1890,1 December 1959,1924 Summer Olympics,NED,Art Competitions,"Art Competitions, Music, Open(Olympic)",,1


## Case Study Questions

### 1. How many olympics games have been held?

In [202]:
pd.read_sql("""
SELECT COUNT(DISTINCT h.game) AS "Total Number Of Olympic Games"
FROM olympics.olympic_history_cleaned h
JOIN olympics.host_cities c ON h.game = c.game
""", conn)

Unnamed: 0,Total Number Of Olympic Games
0,53


#### 2. List down all Olympics games held so far.

In [203]:
pd.read_sql("""
SELECT DISTINCT c.year, h.game, c.host_city
FROM olympics.olympic_history_cleaned h
JOIN olympics.host_cities c ON h.game = c.game
ORDER BY c.year
""", conn)

Unnamed: 0,year,game,host_city
0,1896,1896 Summer Olympics,Athina
1,1900,1900 Summer Olympics,Paris
2,1904,1904 Summer Olympics,St. Louis
3,1908,1908 Summer Olympics,London
4,1912,1912 Summer Olympics,Stockholm
5,1920,1920 Summer Olympics,Antwerpen
6,1924,1924 Summer Olympics,Paris
7,1924,1924 Winter Olympics,Chamonix
8,1928,1928 Summer Olympics,Amsterdam
9,1928,1928 Winter Olympics,Sankt Moritz


#### 3. Mention the total number of nations who participated in each olympics game?

In [62]:
pd.read_sql("""
SELECT 
    h.game,
    c.host_city,
    COUNT(DISTINCT h.noc) AS nb_nations
FROM olympics.olympic_history_cleaned h
JOIN olympics.host_cities c ON h.game = c.game
JOIN olympics.athletes_roles r ON h.id = r.id
WHERE r.roles NOT LIKE '%(non-medal events)%'
GROUP BY h.game, c.host_city
""", conn)

Unnamed: 0,game,host_city,nb_nations
0,1896 Summer Olympics,Athina,14
1,1900 Summer Olympics,Paris,26
2,1904 Summer Olympics,St. Louis,10
3,1908 Summer Olympics,London,23
4,1912 Summer Olympics,Stockholm,29
5,1920 Summer Olympics,Antwerpen,29
6,1924 Summer Olympics,Paris,45
7,1924 Winter Olympics,Chamonix,20
8,1928 Summer Olympics,Amsterdam,46
9,1928 Winter Olympics,Sankt Moritz,25


#### 4. Which year saw the highest and lowest number of countries participating in olympics?

In [245]:
pd.read_sql("""
WITH count_nations_cte AS
( 
    SELECT h.game, COUNT(DISTINCT h.noc) AS nb_nations
    FROM olympics.olympic_history_cleaned h
    JOIN olympics.host_cities c ON h.game = c.game
    GROUP BY h.game
)
SELECT DISTINCT
    CONCAT(FIRST_VALUE(game) OVER (ORDER BY nb_nations ASC), ' - ', 
        FIRST_VALUE(nb_nations) OVER (ORDER BY nb_nations ASC), ' nations') AS lowest_nb_nations,
    CONCAT(FIRST_VALUE(game) OVER (ORDER BY nb_nations DESC), ' - ', 
        FIRST_VALUE(nb_nations) OVER (ORDER BY nb_nations DESC), ' nations') AS highest_nb_nations
FROM count_nations_cte
""", conn)

Unnamed: 0,lowest_nb_nations,highest_nb_nations
0,1904 Summer Olympics - 10 nations,2016 Summer Olympics - 207 nations


#### 5. Which nation has participated in all of the olympic games?

In [206]:
pd.read_sql("""
SELECT COUNT(DISTINCT h.game) AS "Total Olympic Games"
FROM olympics.olympic_history_cleaned h
JOIN olympics.host_cities c ON h.game = c.game
""", conn) 

Unnamed: 0,Total Olympic Games
0,53


In [209]:
pd.read_sql("""
WITH count_games_cte AS
(
    SELECT h.country, COUNT(DISTINCT h.game) AS total_olympics_attended
    FROM olympics.olympic_history_cleaned h
    JOIN olympics.host_cities c ON h.game = c.game
    GROUP BY h.country
)
SELECT country, total_olympics_attended
FROM count_games_cte
WHERE total_olympics_attended = 
(
    SELECT COUNT(DISTINCT h.game) AS total_games
    FROM olympics.olympic_history_cleaned h
    JOIN olympics.host_cities c ON h.game = c.game
)
""", conn)

Unnamed: 0,country,total_olympics_attended
0,Great Britain,53
1,Italy,53


#### 6. Identify the sport which was played in all summer olympics.

In [210]:
pd.read_sql("""
SELECT COUNT(DISTINCT h.game) AS "Total Number of Summer Games"
FROM olympics.olympic_history_cleaned h
JOIN olympics.host_cities c ON h.game = c.game
WHERE c.season = 'Summer'
""", conn) 

Unnamed: 0,Total Number of Summer Games
0,29


In [211]:
pd.read_sql("""
WITH count_summer_games_cte AS
(
    SELECT h.sport, COUNT(DISTINCT h.game) AS total_summer_games_attended
    FROM olympics.olympic_history_cleaned h
    JOIN olympics.host_cities c ON h.game = c.game
    WHERE c.season = 'Summer'
    GROUP BY h.sport
)
SELECT sport, total_summer_games_attended
FROM count_summer_games_cte
WHERE total_summer_games_attended =
(
    SELECT COUNT(DISTINCT h.game) AS "Total Number of Summer Games"
    FROM olympics.olympic_history_cleaned h
    JOIN olympics.host_cities c ON h.game = c.game
    WHERE c.season = 'Summer'
)
""", conn) 

Unnamed: 0,sport,total_summer_games_attended
0,Artistic Gymnastics (Gymnastics),29
1,Athletics,29
2,Fencing,29
3,Swimming (Aquatics),29


#### 7. Which Sports were just played only once in the olympics?

In [212]:
pd.read_sql("""
WITH count_sport_games_cte AS
(
    SELECT 
        h.sport, 
        COUNT(DISTINCT h.game) AS "nb_games"
    FROM olympics.olympic_history_cleaned h
    JOIN olympics.host_cities c ON h.game = c.game
    GROUP BY h.sport
    HAVING COUNT(DISTINCT h.game) = 1
)
SELECT 
    DISTINCT cte.sport, 
    c.year, 
    h.game, 
    c.host_city, 
    cte.nb_games
FROM count_sport_games_cte cte
LEFT JOIN olympics.olympic_history_cleaned h ON cte.sport = h.sport
JOIN olympics.host_cities c ON h.game = c.game
ORDER BY c.year 
""", conn) 

Unnamed: 0,sport,year,game,host_city,nb_games
0,Automobile Racing,1900,1900 Summer Olympics,Paris,1
1,Ballooning (Air Sports),1900,1900 Summer Olympics,Paris,1
2,Boules,1900,1900 Summer Olympics,Paris,1
3,Cricket,1900,1900 Summer Olympics,Paris,1
4,Croquet,1900,1900 Summer Olympics,Paris,1
5,Equestrian Driving (Equestrian),1900,1900 Summer Olympics,Paris,1
6,Firefighting,1900,1900 Summer Olympics,Paris,1
7,Fishing,1900,1900 Summer Olympics,Paris,1
8,Motorcycle Sports,1900,1900 Summer Olympics,Paris,1
9,Roque,1904,1904 Summer Olympics,St. Louis,1


#### 8. Fetch the total number of sports played in each olympic games.

In [65]:
pd.read_sql("""
SELECT 
    h.game, 
    c.host_city, 
    COUNT(DISTINCT h.sport) AS total_number_disciplines
FROM olympics.olympic_history_cleaned h
JOIN olympics.host_cities c ON h.game = c.game
JOIN olympics.athletes_roles r ON h.id = r.id
WHERE r.roles NOT LIKE '%(non-medal events)%'
GROUP BY h.game, c.host_city
""", conn)

Unnamed: 0,game,host_city,total_number_disciplines
0,1896 Summer Olympics,Athina,10
1,1900 Summer Olympics,Paris,22
2,1904 Summer Olympics,St. Louis,18
3,1908 Summer Olympics,London,24
4,1912 Summer Olympics,Stockholm,19
5,1920 Summer Olympics,Antwerpen,29
6,1924 Summer Olympics,Paris,23
7,1924 Winter Olympics,Chamonix,10
8,1928 Summer Olympics,Amsterdam,21
9,1928 Winter Olympics,Sankt Moritz,8


#### 9. Fetch details of the oldest athletes to win a gold medal.

In [274]:
pd.read_sql("""
SELECT 
    h.id, 
    h.name, 
    CASE 
        WHEN birth_year > 1910 AND death_year IS NOT NULL THEN death_year - birth_year
        WHEN birth_year > 1910 THEN 2023 - birth_year 
        ELSE NULL
    END AS age,
    h.medal
FROM olympics.olympic_history_cleaned h
JOIN olympics.host_cities c ON h.game = c.game
WHERE h.medal = 'Gold' 
GROUP BY h.id, h.name, h.birth_year, h.death_year, h.medal
ORDER BY age DESC
LIMIT 5
""", conn)

Unnamed: 0,id,name,age,medal
0,42764,Gyula Halasy,,Gold
1,62478,Christian Jebe,,Gold
2,18851,John Robinson,,Gold
3,22697,Nedo Nadi,,Gold
4,19145,Richard Allen,,Gold


#### 10. Find the Ratio of male and female athletes participated in all olympic games.

In [67]:
pd.read_sql("""
SELECT 
    TO_CHAR(SUM(CASE WHEN gender = 'Male' THEN 1 ELSE 0 END), 'FM999,999') AS nb_males,
    TO_CHAR(SUM(CASE WHEN gender = 'Female' THEN 1 ELSE 0 END), 'FM999,999') AS nb_females,
    TO_CHAR(COUNT(id), 'FM999,999') AS total_athletes,
    CONCAT(ROUND(SUM(CASE WHEN gender = 'Male' THEN 1 ELSE 0 END)/COUNT(id)::NUMERIC * 100, 1), ' %') AS males_percent,
    CONCAT(ROUND(SUM(CASE WHEN gender = 'Female' THEN 1 ELSE 0 END)/COUNT(id)::NUMERIC * 100, 1), ' %') AS females_percent,
    CONCAT(ROUND(SUM(CASE WHEN gender = 'Male' THEN 1 ELSE 0 END)/SUM(CASE WHEN gender = 'Female' THEN 1 ELSE 0 END)::NUMERIC), ' : 1') AS "Ration M:F"    
FROM 
(
    -- Retrieve unique IDs
    
    SELECT DISTINCT h.id, h.gender, h.game
    FROM olympics.olympic_history_cleaned h
    JOIN olympics.host_cities c ON h.game = c.game
    JOIN olympics.athletes_roles r ON h.id = r.id
    WHERE r.roles NOT LIKE '%(non-medal events)%'
    
) unique_id

""", conn)

Unnamed: 0,nb_males,nb_females,total_athletes,males_percent,females_percent,Ration M:F
0,152880,57796,210676,72.6 %,27.4 %,3 : 1


#### 11. In which year were female athletes permitted to participate in the Olympic Games? Provide a breakdown of the male and female participants for each Olympic Games.

In [159]:
pd.read_sql("""   
WITH unique_id_cte AS
(
    SELECT 
        DISTINCT id, 
        name, 
        gender, 
        game,
        year
    FROM 
    (
        SELECT h.id, h.name, h.gender, h.game, c.year
        FROM olympics.olympic_history_cleaned h
        JOIN olympics.host_cities c ON h.game = c.game
        JOIN olympics.athletes_roles r ON h.id = r.id
        WHERE r.roles NOT LIKE '%Non-starter%' AND h.event LIKE '%(Olympic)%'
    ) e
)
SELECT 
    year,
    game,
    SUM(CASE WHEN gender = 'Male' THEN 1 ELSE 0 END) AS nb_males,
    SUM(CASE WHEN gender = 'Female' THEN 1 ELSE 0 END) AS nb_females,
    COUNT(id) AS total_athletes
FROM unique_id_cte
GROUP BY year, game
ORDER BY year
""", conn)

Unnamed: 0,year,game,nb_males,nb_females,total_athletes
0,1896,1896 Summer Olympics,141,0,141
1,1900,1900 Summer Olympics,1181,22,1203
2,1904,1904 Summer Olympics,636,6,642
3,1908,1908 Summer Olympics,1893,44,1937
4,1912,1912 Summer Olympics,2265,54,2319
5,1920,1920 Summer Olympics,2478,76,2554
6,1924,1924 Winter Olympics,282,13,295
7,1924,1924 Summer Olympics,2999,156,3155
8,1928,1928 Summer Olympics,2888,313,3201
9,1928,1928 Winter Olympics,409,28,437


#### 12. In which sport female athletes first played at the Olympic Games?

In [112]:
pd.read_sql("""
SELECT sport, COUNT(DISTINCT h.id) AS nb_female_athletes
FROM olympics.olympic_history_cleaned h
JOIN olympics.athletes_roles r ON h.id = r.id
WHERE 
    r.roles NOT LIKE '%Non-starter%' 
    AND h.event LIKE '%(Olympic)%'
    AND h.gender = 'Female'
    AND h.game = '1900 Summer Olympics'
GROUP BY sport
ORDER BY nb_female_athletes DESC
""", conn)

Unnamed: 0,sport,nb_female_athletes
0,Golf,10
1,Tennis,5
2,Croquet,3
3,Equestrian Dressage (Equestrian),3
4,Sailing,1


### 13. Fetch the top 5 athletes who have won the most gold medals.

In [179]:
pd.read_sql("""
SELECT h.id, h.name, h.noc, COUNT(h.medal) AS gold_medals
FROM olympics.olympic_history_cleaned h
JOIN olympics.athletes_roles r ON h.id = r.id
WHERE r.roles NOT LIKE '%Non-starter%' AND h.event LIKE '%(Olympic)%' AND h.medal = 'Gold'
GROUP BY h.id, h.name, h.noc
ORDER BY gold_medals DESC
""", conn)

Unnamed: 0,id,name,noc,gold_medals
0,93860,Michael Phelps,USA,24
1,51572,Mark Spitz,USA,10
2,78692,Carl Lewis,USA,10
3,105512,Usain Bolt,JAM,9
4,85378,Bjørn Dæhlie,NOR,9
...,...,...,...,...
7380,36200,Archie MacKinnon,CAN,1
7381,20053,Riaz Ahmed,PAK,1
7382,104562,Taner Sağır,TUR,1
7383,93674,Manon van Rooijen,NED,1


In [175]:
pd.read_sql("""
SELECT h.id, h.game, h.sport, h.event, h.medal
FROM olympics.olympic_history_cleaned h
JOIN olympics.athletes_roles r ON h.id = r.id
WHERE h.id = 93860 AND h.medal = 'Gold'
""", conn)

Unnamed: 0,id,game,sport,event,medal
0,93860,2004 Summer Olympics,Swimming (Aquatics),"Swimming (Aquatics), 4 × 200 metres Freestyle ...",Gold
1,93860,2004 Summer Olympics,Swimming (Aquatics),"Swimming (Aquatics), 100 metres Butterfly, Men...",Gold
2,93860,2004 Summer Olympics,Swimming (Aquatics),"Swimming (Aquatics), 200 metres Butterfly, Men...",Gold
3,93860,2004 Summer Olympics,Swimming (Aquatics),"Swimming (Aquatics), 200 metres Individual Med...",Gold
4,93860,2004 Summer Olympics,Swimming (Aquatics),"Swimming (Aquatics), 400 metres Individual Med...",Gold
5,93860,2004 Summer Olympics,Swimming (Aquatics),"Swimming (Aquatics), 4 × 100 metres Medley Rel...",Gold
6,93860,2008 Summer Olympics,Swimming (Aquatics),"Swimming (Aquatics), 200 metres Freestyle, Men...",Gold
7,93860,2008 Summer Olympics,Swimming (Aquatics),"Swimming (Aquatics), 4 × 100 metres Freestyle ...",Gold
8,93860,2008 Summer Olympics,Swimming (Aquatics),"Swimming (Aquatics), 4 × 200 metres Freestyle ...",Gold
9,93860,2008 Summer Olympics,Swimming (Aquatics),"Swimming (Aquatics), 100 metres Butterfly, Men...",Gold


In [117]:
conn.close()

In [130]:
df = pd.read_csv('../data/athletes.csv')
df[df.duplicated()]

Unnamed: 0,id,name,gender,born,died,height,weight,team,game,noc,sport,event,medal
2,131892,Meryem Erdoğan,Female,24 April 1990,,172 cm,55 kg,Türkiye,2020 Summer Olympics,TUR,Athletics,"Athletics, Marathon, Women(Olympic)",
4,4300,Maurice Maina,Male,1 January 1963,,158 cm,47 kg,Kenya,1988 Summer Olympics,KEN,Boxing,"Boxing, Light-Flyweight, Men(Olympic)",
6,60239,Stanislav Tůma,Male,5 September 1948,,165 cm,62 kg,Czechoslovakia,1972 Summer Olympics,TCH,Wrestling,"Wrestling, Featherweight, Freestyle, Men(Olympic)",
8,129369,Eunice Kirwa,Female,20 May 1984,,155 cm,49 kg,Bahrain,2016 Summer Olympics,BRN,Athletics,"Athletics, Marathon, Women(Olympic)",Silver
10,142670,Sinem Kurtbay,Female,24 May 1991,,,,Finland,2020 Summer Olympics,FIN,Sailing,"Sailing, Multihull, Mixed(Olympic)",
...,...,...,...,...,...,...,...,...,...,...,...,...,...
476335,122196,Aleksa Šaponjić,Male,4 June 1992,,191 cm,98 kg,Serbia,2012 Summer Olympics,SRB,Water Polo (Aquatics),"Water Polo (Aquatics), Water Polo, Men(Olympic)",Bronze
476337,52168,Zhang Yousheng,Male,5 August 1956,,196 cm,85 kg,People's Republic of China,1984 Summer Olympics,CHN,Volleyball (Volleyball),"Volleyball (Volleyball), Volleyball, Men(Olympic)",
476340,18974,Werner Delmes,Male,28 September 1930,13 January 2022,177 cm,75 kg,Germany West Germany,1960 Summer Olympics,GER,Hockey,"Hockey, Hockey, Men(Olympic)",
476342,126253,Tim Payne,Male,10 January 1994,,179 cm,73 kg,New Zealand,2012 Summer Olympics,NZL,Football (Football),"Football (Football), Football, Men(Olympic)",


In [134]:
df[df['id']==126253]

Unnamed: 0,id,name,gender,born,died,height,weight,team,game,noc,sport,event,medal
476341,126253,Tim Payne,Male,10 January 1994,,179 cm,73 kg,New Zealand,2012 Summer Olympics,NZL,Football (Football),"Football (Football), Football, Men(Olympic)",
476342,126253,Tim Payne,Male,10 January 1994,,179 cm,73 kg,New Zealand,2012 Summer Olympics,NZL,Football (Football),"Football (Football), Football, Men(Olympic)",
