## 1. Import Libraries

In [2]:
import os
import pandas as pd
import psycopg2 as pg2
import warnings

warnings.filterwarnings("ignore")

## 2. Tables of the Database
### Connecting the database from Jupyter Notebook

In [197]:
mypassword = os.getenv("POSTGRESQL_PASSWORD")

try:
    conn = pg2.connect(user = "postgres", password = mypassword, database = "olympics")
    cursor = conn.cursor()
    print("Database Connection Successful")
except pg2.connector.Error as err:
    print(f"Error: '{err}'") 

Database Connection Successful


Once the database is successfully connected, let's list the name of the tables that it contains.

In [194]:
cursor.execute("""
SELECT table_schema, table_name
FROM information_schema.tables
WHERE table_schema = 'olympics'
""")

table_names = []
print('--- Tables within "olympics" database --- ')
for table in cursor:
    print(table[1])
    table_names.append(table[1])

--- Tables within "olympics" database --- 
olympic_history
host_cities


Here are the previews of the 2 tables under the olympics database:

In [12]:
for table in table_names:
    print("Table: ", table)
    display(pd.read_sql("SELECT * FROM olympics." + table, conn))

Table:  olympic_history


Unnamed: 0,id,name,gender,born,died,height,weight,noc,game,team,sport,event,medal
0,131892,Meryem Erdoğan,Female,24 April 1990,,172 cm,55 kg,Türkiye,2016 Summer Olympics,TUR,Athletics,"Athletics, Marathon, Women(Olympic)",
1,131892,Meryem Erdoğan,Female,24 April 1990,,172 cm,55 kg,Türkiye,2020 Summer Olympics,TUR,Athletics,"Athletics, Marathon, Women(Olympic)",
2,131892,Meryem Erdoğan,Female,24 April 1990,,172 cm,55 kg,Türkiye,2020 Summer Olympics,TUR,Athletics,"Athletics, Marathon, Women(Olympic)",
3,4300,Maurice Maina,Male,1 January 1963,,158 cm,47 kg,Kenya,1988 Summer Olympics,KEN,Boxing,"Boxing, Light-Flyweight, Men(Olympic)",
4,4300,Maurice Maina,Male,1 January 1963,,158 cm,47 kg,Kenya,1988 Summer Olympics,KEN,Boxing,"Boxing, Light-Flyweight, Men(Olympic)",
...,...,...,...,...,...,...,...,...,...,...,...,...,...
476343,20989,Caitlin Bilodeaux-Banos,Female,17 March 1965,,170 cm,64 kg,United States,1988 Summer Olympics,USA,Fencing,"Fencing, Foil, Individual, Women(Olympic)",
476344,20989,Caitlin Bilodeaux-Banos,Female,17 March 1965,,170 cm,64 kg,United States,1988 Summer Olympics,USA,Fencing,"Fencing, Foil, Team, Women(Olympic)",
476345,20989,Caitlin Bilodeaux-Banos,Female,17 March 1965,,170 cm,64 kg,United States,1992 Summer Olympics,USA,Fencing,"Fencing, Foil, Individual, Women(Olympic)",
476346,20989,Caitlin Bilodeaux-Banos,Female,17 March 1965,,170 cm,64 kg,United States,1992 Summer Olympics,USA,Fencing,"Fencing, Foil, Team, Women(Olympic)",


Table:  host_cities


Unnamed: 0,year,season,game,host_city
0,1896,Summer,1896 Summer Olympics,Athina
1,1900,Summer,1900 Summer Olympics,Paris
2,1904,Summer,1904 Summer Olympics,St. Louis
3,1908,Summer,1908 Summer Olympics,London
4,1912,Summer,1912 Summer Olympics,Stockholm
...,...,...,...,...
57,2010,Winter,2010 Winter Olympics,Vancouver
58,2014,Winter,2014 Winter Olympics,Sochi
59,2018,Winter,2018 Winter Olympics,PyeongChang
60,2022,Winter,2022 Winter Olympics,Beijing


## 3. Data Cleaning

### 3.1 Check duplicates
Using a Window function, the following code confirms that the dataset does not contain duplicate records.

In [3]:
pd.read_sql("""
WITH count_occurence_cte AS
(
    SELECT *, 
    RANK() OVER (PARTITION BY id, name, gender, born, died, height, weight, team, noc, game, sport, event, medal) AS occurence
    FROM olympics.olympic_history
)
SELECT *
FROM count_occurence_cte
WHERE occurence > 1
ORDER BY occurence DESC 
""", conn)

Unnamed: 0,id,name,gender,born,died,height,weight,team,game,noc,sport,event,medal,occurence


### 3.2 Standardize inconsistent date values and Handle missing values

In both the `born` and `died` columns, the date values exhibit inconsistency. The following are the various formats found in both columns:
- `'(1880 or 1881)'` 
- `'(circa 1880)'` 
- `'(c. 1880)'`
- `'September 1880'`
- `'1933'` 
- `'NULL'`

In [173]:
# Check date values that do not start with digits
pd.read_sql("""
SELECT DISTINCT born
FROM olympics.olympic_history
WHERE (LENGTH(born) = 4 OR NOT born ~ E'^[0-9]+') AND born != 'NULL' 
""", conn)

Unnamed: 0,born
0,1910
1,January 1885
2,December 1881
3,1933
4,September 1880
...,...
270,(circa 1929)
271,April 1872
272,1875
273,(1942 or 1943)


In [325]:
# Create empty table
cursor.execute("DROP TABLE IF EXISTS olympics.olympic_history_cleaned;")
cursor.execute("""
CREATE TABLE olympics.olympic_history_cleaned
(
    id INTEGER,
    name VARCHAR,
    gender VARCHAR,
    born VARCHAR,
    birth_year INTEGER,
    died VARCHAR,
    death_year INTEGER,
    height_cm INTEGER,
    weight_kg INTEGER,
    noc VARCHAR,
    country VARCHAR,
    game VARCHAR,
    sport VARCHAR,
    event VARCHAR,
    medal VARCHAR
);

""")

cursor.execute("""
INSERT INTO olympics.olympic_history_cleaned
SELECT
    id,
    name,
    gender,
    
    -- Extract Year values from 'born' and 'died' columns 
    -- Handle missing values
    CASE WHEN born = 'NULL' THEN NULL ELSE born END AS born,
    CASE WHEN born = 'NULL' THEN NULL ELSE SUBSTRING(born from '\d{4}')::INTEGER END AS birth_year,
    CASE WHEN died = 'NULL' THEN NULL ELSE died END AS died,
    CASE WHEN died = 'NULL' THEN NULL ELSE SUBSTRING(died from '\d{4}')::INTEGER END AS death_year,
    
    -- Extract measurements
    CASE WHEN height = 'NULL' THEN NULL ELSE SUBSTRING(height from '\d{3}')::INTEGER END AS height_cm,
    CASE WHEN weight = 'NULL' THEN NULL ELSE SUBSTRING(weight from '\d{2}')::INTEGER END AS weight_kg,
    
    noc, 
    team AS country,
    game, 
    sport, 
    event,
    CASE WHEN medal = 'NULL' THEN NULL ELSE medal END AS medal
    
FROM olympics.olympic_history  
""")

conn.commit()

In [326]:
pd.read_sql("""
SELECT *
FROM olympics.olympic_history_cleaned
""", conn)

Unnamed: 0,id,name,gender,born,birth_year,died,death_year,height_cm,weight_kg,noc,country,game,sport,event,medal
0,61659,Peter Cooke,Male,23 September 1924,1924.0,27 December 2001,2001.0,172.0,69.0,KEN,Kenya,1964 Summer Olympics,Sailing,"Sailing, One Person Dinghy, Open(Olympic)",
1,61659,Peter Cooke,Male,23 September 1924,1924.0,27 December 2001,2001.0,172.0,69.0,KEN,Kenya,1964 Summer Olympics,Sailing,"Sailing, One Person Dinghy, Open(Olympic)",
2,11427,Jennifer Parlevliet,Female,27 April 1960,1960.0,,,165.0,53.0,AUS,Australia,1996 Summer Olympics,Equestrian Jumping (Equestrian),"Equestrian Jumping (Equestrian), Individual, O...",
3,11427,Jennifer Parlevliet,Female,27 April 1960,1960.0,,,165.0,53.0,AUS,Australia,1996 Summer Olympics,Equestrian Jumping (Equestrian),"Equestrian Jumping (Equestrian), Team, Open(Ol...",
4,11427,Jennifer Parlevliet,Female,27 April 1960,1960.0,,,165.0,53.0,AUS,Australia,1996 Summer Olympics,Equestrian Jumping (Equestrian),"Equestrian Jumping (Equestrian), Team, Open(Ol...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
476343,90774,Li Na,Female,9 March 1981,1981.0,,,177.0,67.0,CHN,People's Republic of China,2012 Summer Olympics,Fencing,"Fencing, Épée, Team, Women(Olympic)",Gold
476344,90774,Li Na,Female,9 March 1981,1981.0,,,177.0,67.0,CHN,People's Republic of China,2012 Summer Olympics,Fencing,"Fencing, Épée, Team, Women(Olympic)",Gold
476345,114612,Lopez Lomong,Male,1 January 1985,1985.0,,,180.0,69.0,USA,United States,2008 Summer Olympics,Athletics,"Athletics, 1,500 metres, Men(Olympic)",
476346,114612,Lopez Lomong,Male,1 January 1985,1985.0,,,180.0,69.0,USA,United States,2012 Summer Olympics,Athletics,"Athletics, 5,000 metres, Men(Olympic)",


## Case Study Questions

### 1. How many olympics games have been held?

In [202]:
pd.read_sql("""
SELECT COUNT(DISTINCT h.game) AS "Total Number Of Olympic Games"
FROM olympics.olympic_history_cleaned h
JOIN olympics.host_cities c ON h.game = c.game
""", conn)

Unnamed: 0,Total Number Of Olympic Games
0,53


#### 2. List down all Olympics games held so far.

In [203]:
pd.read_sql("""
SELECT DISTINCT c.year, h.game, c.host_city
FROM olympics.olympic_history_cleaned h
JOIN olympics.host_cities c ON h.game = c.game
ORDER BY c.year
""", conn)

Unnamed: 0,year,game,host_city
0,1896,1896 Summer Olympics,Athina
1,1900,1900 Summer Olympics,Paris
2,1904,1904 Summer Olympics,St. Louis
3,1908,1908 Summer Olympics,London
4,1912,1912 Summer Olympics,Stockholm
5,1920,1920 Summer Olympics,Antwerpen
6,1924,1924 Summer Olympics,Paris
7,1924,1924 Winter Olympics,Chamonix
8,1928,1928 Summer Olympics,Amsterdam
9,1928,1928 Winter Olympics,Sankt Moritz


#### 3. Mention the total number of nations who participated in each olympics game?

In [204]:
pd.read_sql("""
SELECT 
    h.game,
    c.host_city,
    COUNT(DISTINCT h.noc) AS nb_nations
FROM olympics.olympic_history_cleaned h
JOIN olympics.host_cities c ON h.game = c.game
GROUP BY h.game, c.host_city
""", conn)

Unnamed: 0,game,host_city,nb_nations
0,1896 Summer Olympics,Athina,14
1,1900 Summer Olympics,Paris,29
2,1904 Summer Olympics,St. Louis,10
3,1908 Summer Olympics,London,24
4,1912 Summer Olympics,Stockholm,30
5,1920 Summer Olympics,Antwerpen,29
6,1924 Summer Olympics,Paris,45
7,1924 Winter Olympics,Chamonix,20
8,1928 Summer Olympics,Amsterdam,46
9,1928 Winter Olympics,Sankt Moritz,25


#### 4. Which year saw the highest and lowest number of countries participating in olympics?

In [245]:
pd.read_sql("""
WITH count_nations_cte AS
( 
    SELECT h.game, COUNT(DISTINCT h.noc) AS nb_nations
    FROM olympics.olympic_history_cleaned h
    JOIN olympics.host_cities c ON h.game = c.game
    GROUP BY h.game
)
SELECT DISTINCT
    CONCAT(FIRST_VALUE(game) OVER (ORDER BY nb_nations ASC), ' - ', 
        FIRST_VALUE(nb_nations) OVER (ORDER BY nb_nations ASC), ' nations') AS lowest_nb_nations,
    CONCAT(FIRST_VALUE(game) OVER (ORDER BY nb_nations DESC), ' - ', 
        FIRST_VALUE(nb_nations) OVER (ORDER BY nb_nations DESC), ' nations') AS highest_nb_nations
FROM count_nations_cte
""", conn)

Unnamed: 0,lowest_nb_nations,highest_nb_nations
0,1904 Summer Olympics - 10 nations,2016 Summer Olympics - 207 nations


#### 5. Which nation has participated in all of the olympic games?

In [206]:
pd.read_sql("""
SELECT COUNT(DISTINCT h.game) AS "Total Olympic Games"
FROM olympics.olympic_history_cleaned h
JOIN olympics.host_cities c ON h.game = c.game
""", conn) 

Unnamed: 0,Total Olympic Games
0,53


In [209]:
pd.read_sql("""
WITH count_games_cte AS
(
    SELECT h.country, COUNT(DISTINCT h.game) AS total_olympics_attended
    FROM olympics.olympic_history_cleaned h
    JOIN olympics.host_cities c ON h.game = c.game
    GROUP BY h.country
)
SELECT country, total_olympics_attended
FROM count_games_cte
WHERE total_olympics_attended = 
(
    SELECT COUNT(DISTINCT h.game) AS total_games
    FROM olympics.olympic_history_cleaned h
    JOIN olympics.host_cities c ON h.game = c.game
)
""", conn)

Unnamed: 0,country,total_olympics_attended
0,Great Britain,53
1,Italy,53


#### 6. Identify the sport which was played in all summer olympics.

In [210]:
pd.read_sql("""
SELECT COUNT(DISTINCT h.game) AS "Total Number of Summer Games"
FROM olympics.olympic_history_cleaned h
JOIN olympics.host_cities c ON h.game = c.game
WHERE c.season = 'Summer'
""", conn) 

Unnamed: 0,Total Number of Summer Games
0,29


In [211]:
pd.read_sql("""
WITH count_summer_games_cte AS
(
    SELECT h.sport, COUNT(DISTINCT h.game) AS total_summer_games_attended
    FROM olympics.olympic_history_cleaned h
    JOIN olympics.host_cities c ON h.game = c.game
    WHERE c.season = 'Summer'
    GROUP BY h.sport
)
SELECT sport, total_summer_games_attended
FROM count_summer_games_cte
WHERE total_summer_games_attended =
(
    SELECT COUNT(DISTINCT h.game) AS "Total Number of Summer Games"
    FROM olympics.olympic_history_cleaned h
    JOIN olympics.host_cities c ON h.game = c.game
    WHERE c.season = 'Summer'
)
""", conn) 

Unnamed: 0,sport,total_summer_games_attended
0,Artistic Gymnastics (Gymnastics),29
1,Athletics,29
2,Fencing,29
3,Swimming (Aquatics),29


#### 7. Which Sports were just played only once in the olympics?

In [212]:
pd.read_sql("""
WITH count_sport_games_cte AS
(
    SELECT 
        h.sport, 
        COUNT(DISTINCT h.game) AS "nb_games"
    FROM olympics.olympic_history_cleaned h
    JOIN olympics.host_cities c ON h.game = c.game
    GROUP BY h.sport
    HAVING COUNT(DISTINCT h.game) = 1
)
SELECT 
    DISTINCT cte.sport, 
    c.year, 
    h.game, 
    c.host_city, 
    cte.nb_games
FROM count_sport_games_cte cte
LEFT JOIN olympics.olympic_history_cleaned h ON cte.sport = h.sport
JOIN olympics.host_cities c ON h.game = c.game
ORDER BY c.year 
""", conn) 

Unnamed: 0,sport,year,game,host_city,nb_games
0,Automobile Racing,1900,1900 Summer Olympics,Paris,1
1,Ballooning (Air Sports),1900,1900 Summer Olympics,Paris,1
2,Boules,1900,1900 Summer Olympics,Paris,1
3,Cricket,1900,1900 Summer Olympics,Paris,1
4,Croquet,1900,1900 Summer Olympics,Paris,1
5,Equestrian Driving (Equestrian),1900,1900 Summer Olympics,Paris,1
6,Firefighting,1900,1900 Summer Olympics,Paris,1
7,Fishing,1900,1900 Summer Olympics,Paris,1
8,Motorcycle Sports,1900,1900 Summer Olympics,Paris,1
9,Roque,1904,1904 Summer Olympics,St. Louis,1


#### 8. Fetch the total number of sports played in each olympic games.

In [213]:
pd.read_sql("""
SELECT 
    h.game, 
    c.host_city, 
    COUNT(DISTINCT h.sport) AS total_number_disciplines
FROM olympics.olympic_history_cleaned h
JOIN olympics.host_cities c ON h.game = c.game
GROUP BY h.game, c.host_city
""", conn)

Unnamed: 0,game,host_city,total_number_disciplines
0,1896 Summer Olympics,Athina,10
1,1900 Summer Olympics,Paris,28
2,1904 Summer Olympics,St. Louis,20
3,1908 Summer Olympics,London,26
4,1912 Summer Olympics,Stockholm,21
5,1920 Summer Olympics,Antwerpen,30
6,1924 Summer Olympics,Paris,27
7,1924 Winter Olympics,Chamonix,10
8,1928 Summer Olympics,Amsterdam,23
9,1928 Winter Olympics,Sankt Moritz,10


#### 9. Fetch details of the oldest athletes to win a gold medal.

In [274]:
pd.read_sql("""
SELECT 
    h.id, 
    h.name, 
    CASE 
        WHEN birth_year > 1910 AND death_year IS NOT NULL THEN death_year - birth_year
        WHEN birth_year > 1910 THEN 2023 - birth_year 
        ELSE NULL
    END AS age,
    h.medal
FROM olympics.olympic_history_cleaned h
JOIN olympics.host_cities c ON h.game = c.game
WHERE h.medal = 'Gold' 
GROUP BY h.id, h.name, h.birth_year, h.death_year, h.medal
ORDER BY age DESC
LIMIT 5
""", conn)

Unnamed: 0,id,name,age,medal
0,42764,Gyula Halasy,,Gold
1,62478,Christian Jebe,,Gold
2,18851,John Robinson,,Gold
3,22697,Nedo Nadi,,Gold
4,19145,Richard Allen,,Gold


#### 10. Find the Ratio of male and female athletes participated in all olympic games.

In [238]:
pd.read_sql("""
SELECT 
    TO_CHAR(SUM(CASE WHEN gender = 'Male' THEN 1 ELSE 0 END), 'FM999,999') AS nb_males,
    TO_CHAR(SUM(CASE WHEN gender = 'Female' THEN 1 ELSE 0 END), 'FM999,999') AS nb_females,
    TO_CHAR(COUNT(id), 'FM999,999') AS total_athletes,
    CONCAT(ROUND(SUM(CASE WHEN gender = 'Male' THEN 1 ELSE 0 END)/COUNT(id)::NUMERIC * 100, 1), ' %') AS males_percent,
    CONCAT(ROUND(SUM(CASE WHEN gender = 'Female' THEN 1 ELSE 0 END)/COUNT(id)::NUMERIC * 100, 1), ' %') AS females_percent,
    CONCAT(ROUND(SUM(CASE WHEN gender = 'Male' THEN 1 ELSE 0 END)/SUM(CASE WHEN gender = 'Female' THEN 1 ELSE 0 END)::NUMERIC), ': 1') AS "Ration M:F"    
FROM 
(
    -- Retrieve unique IDs
    
    SELECT DISTINCT h.id, h.gender, h.game
    FROM olympics.olympic_history_cleaned h
    JOIN olympics.host_cities c ON h.game = c.game
    
) unique_id

""", conn)

Unnamed: 0,nb_males,nb_females,total_athletes,males_percent,females_percent,Ration M:F
0,156845,58644,215489,72.8 %,27.2 %,3: 1


#### 11. In which year were female athletes permitted to participate in the Olympic Games? Provide a breakdown of the male and female participants for each Olympic Games.

In [386]:
pd.read_sql("""    

    
SELECT DISTINCT h.id, h.name, h.gender, h.country, h.game

FROM olympics.olympic_history_cleaned h
JOIN olympics.host_cities c ON h.game = c.game
    
WHERE 
    -- h.noc = 'CZE' AND
    h.gender = 'Female'
    -- h.gender = 'Male' 
    -- AND c.year = 1896 
    -- AND event LIKE '%(Olympic)'
    AND h.game = '1924 Winter Olympics'

""", conn)

Unnamed: 0,id,name,gender,country,game
0,80676,Theresa Weld-Blanchard,Female,United States,1924 Winter Olympics
1,81010,Ludowika Jakobsson-Eilers,Female,Finland,1924 Winter Olympics
2,81024,Helene Engelmann,Female,Austria,1924 Winter Olympics
3,81280,Sonja Henie,Female,Norway,1924 Winter Olympics
4,81285,Gérardine Herbos,Female,Belgium,1924 Winter Olympics
5,81421,Andrée Brunet-Joly,Female,France,1924 Winter Olympics
6,81707,Beatrix Loughran,Female,United States,1924 Winter Olympics
7,81885,Ethel Muckelt,Female,Great Britain,1924 Winter Olympics
8,81948,Svea Norén,Female,Sweden,1924 Winter Olympics
9,82188,Mildred Richardson,Female,Great Britain,1924 Winter Olympics


In [382]:
pd.read_sql("""   
WITH unique_id_cte AS
(
    SELECT 
        DISTINCT id, 
        name, 
        gender, 
        
        game,
        year
    FROM 
    (
        SELECT h.id, h.name, h.gender, h.game, c.year
        FROM olympics.olympic_history_cleaned h
        JOIN olympics.host_cities c ON h.game = c.game
        WHERE h.event LIKE '%(Olympic)' 
    ) e
)
SELECT 
    year,
    game,
    SUM(CASE WHEN gender = 'Male' THEN 1 ELSE 0 END) AS nb_males,
    SUM(CASE WHEN gender = 'Female' THEN 1 ELSE 0 END) AS nb_females,
    COUNT(id) AS total_athletes
FROM unique_id_cte

GROUP BY year, game
ORDER BY year
""", conn)

Unnamed: 0,year,game,nb_males,nb_females,total_athletes
0,1896,1896 Summer Olympics,242,2,244
1,1900,1900 Summer Olympics,1303,26,1329
2,1904,1904 Summer Olympics,746,6,752
3,1908,1908 Summer Olympics,2351,53,2404
4,1912,1912 Summer Olympics,2803,62,2865
5,1920,1920 Summer Olympics,2759,81,2840
6,1924,1924 Winter Olympics,360,16,376
7,1924,1924 Summer Olympics,3757,170,3927
8,1928,1928 Summer Olympics,3422,345,3767
9,1928,1928 Winter Olympics,457,28,485


In [365]:
pd.read_sql("""    
SELECT 
    year,
    game,
    SUM(CASE WHEN gender = 'Male' THEN 1 ELSE 0 END) AS nb_males,
    SUM(CASE WHEN gender = 'Female' THEN 1 ELSE 0 END) AS nb_females,
    COUNT(id) AS total_athletes
FROM 
(
    -- Retrieve unique IDs
    
    SELECT DISTINCT h.id, h.gender, h.game, c.year
    FROM olympics.olympic_history_cleaned h
    JOIN olympics.host_cities c ON h.game = c.game
    WHERE 
        h.event LIKE '%(Olympic)' 
        AND NOT h.event LIKE '%Open%' 
        -- AND NOT h.event LIKE '%Mixed%' 

    
) i
GROUP BY year, game
ORDER BY year
""", conn)

Unnamed: 0,year,game,nb_males,nb_females,total_athletes
0,1896,1896 Summer Olympics,242,2,244
1,1900,1900 Summer Olympics,1131,19,1150
2,1904,1904 Summer Olympics,746,6,752
3,1908,1908 Summer Olympics,2275,51,2326
4,1912,1912 Summer Olympics,2662,61,2723
5,1920,1920 Summer Olympics,2649,79,2728
6,1924,1924 Winter Olympics,339,16,355
7,1924,1924 Summer Olympics,3526,146,3672
8,1928,1928 Summer Olympics,2918,303,3221
9,1928,1928 Winter Olympics,457,28,485


In [117]:
conn.close()