In [1]:
import os
import pandas as pd
import psycopg2 as pg2
import warnings

warnings.filterwarnings("ignore")

In [2]:
mypassword = os.getenv("POSTGRESQL_PASSWORD")

try:
    conn = pg2.connect(user = "postgres", password = mypassword, database = "olympics")
    cursor = conn.cursor()
    print("Database Connection Successful")
except pg2.connector.Error as err:
    print(f"Error: '{err}'") 

Database Connection Successful


Once the database is successfully connected, let's list the name of the tables that it contains.

In [9]:
cursor.execute("""
SELECT table_schema, table_name
FROM information_schema.tables
WHERE table_schema = 'olympics'
""")

table_names = []
print('--- Tables within "olympics" database --- ')
for table in cursor:
    print(table[1])
    table_names.append(table[1])

--- Tables within "olympics" database --- 
olympic_history
host_cities


In [12]:
for table in table_names:
    print("Table: ", table)
    display(pd.read_sql("SELECT * FROM olympics." + table, conn))

Table:  olympic_history


Unnamed: 0,id,name,gender,born,died,height,weight,noc,game,team,sport,event,medal
0,131892,Meryem Erdoğan,Female,24 April 1990,,172 cm,55 kg,Türkiye,2016 Summer Olympics,TUR,Athletics,"Athletics, Marathon, Women(Olympic)",
1,131892,Meryem Erdoğan,Female,24 April 1990,,172 cm,55 kg,Türkiye,2020 Summer Olympics,TUR,Athletics,"Athletics, Marathon, Women(Olympic)",
2,131892,Meryem Erdoğan,Female,24 April 1990,,172 cm,55 kg,Türkiye,2020 Summer Olympics,TUR,Athletics,"Athletics, Marathon, Women(Olympic)",
3,4300,Maurice Maina,Male,1 January 1963,,158 cm,47 kg,Kenya,1988 Summer Olympics,KEN,Boxing,"Boxing, Light-Flyweight, Men(Olympic)",
4,4300,Maurice Maina,Male,1 January 1963,,158 cm,47 kg,Kenya,1988 Summer Olympics,KEN,Boxing,"Boxing, Light-Flyweight, Men(Olympic)",
...,...,...,...,...,...,...,...,...,...,...,...,...,...
476343,20989,Caitlin Bilodeaux-Banos,Female,17 March 1965,,170 cm,64 kg,United States,1988 Summer Olympics,USA,Fencing,"Fencing, Foil, Individual, Women(Olympic)",
476344,20989,Caitlin Bilodeaux-Banos,Female,17 March 1965,,170 cm,64 kg,United States,1988 Summer Olympics,USA,Fencing,"Fencing, Foil, Team, Women(Olympic)",
476345,20989,Caitlin Bilodeaux-Banos,Female,17 March 1965,,170 cm,64 kg,United States,1992 Summer Olympics,USA,Fencing,"Fencing, Foil, Individual, Women(Olympic)",
476346,20989,Caitlin Bilodeaux-Banos,Female,17 March 1965,,170 cm,64 kg,United States,1992 Summer Olympics,USA,Fencing,"Fencing, Foil, Team, Women(Olympic)",


Table:  host_cities


Unnamed: 0,year,season,game,host_city
0,1896,Summer,1896 Summer Olympics,Athina
1,1900,Summer,1900 Summer Olympics,Paris
2,1904,Summer,1904 Summer Olympics,St. Louis
3,1908,Summer,1908 Summer Olympics,London
4,1912,Summer,1912 Summer Olympics,Stockholm
...,...,...,...,...
57,2010,Winter,2010 Winter Olympics,Vancouver
58,2014,Winter,2014 Winter Olympics,Sochi
59,2018,Winter,2018 Winter Olympics,PyeongChang
60,2022,Winter,2022 Winter Olympics,Beijing


## Data Cleaning

In [3]:
pd.read_sql("""
WITH count_occurence_cte AS
(
    SELECT *, RANK() OVER (PARTITION BY id, name, gender, born, died,height, weight, team, noc, game, sport, event, medal) AS occurence
    FROM olympics.olympic_history
)
SELECT *
FROM count_occurence_cte
WHERE occurence > 1
ORDER BY occurence DESC 
""", conn)

Unnamed: 0,id,name,gender,born,died,height,weight,team,game,noc,sport,event,medal,occurence


## Case Study Questions

### 1. How many olympics games have been held?

In [4]:
pd.read_sql("""
SELECT COUNT(DISTINCT h.game) AS "Total Number Of Olympic Games"
FROM olympics.olympic_history h
JOIN olympics.host_cities c ON h.game = c.game
""", conn)

Unnamed: 0,Total Number Of Olympic Games
0,53


#### 2. List down all Olympics games held so far.

In [5]:
pd.read_sql("""
SELECT DISTINCT c.year, h.game, c.host_city
FROM olympics.olympic_history h
JOIN olympics.host_cities c ON h.game = c.game
ORDER BY c.year
""", conn)

Unnamed: 0,year,game,host_city
0,1896,1896 Summer Olympics,Athina
1,1900,1900 Summer Olympics,Paris
2,1904,1904 Summer Olympics,St. Louis
3,1908,1908 Summer Olympics,London
4,1912,1912 Summer Olympics,Stockholm
5,1920,1920 Summer Olympics,Antwerpen
6,1924,1924 Summer Olympics,Paris
7,1924,1924 Winter Olympics,Chamonix
8,1928,1928 Winter Olympics,Sankt Moritz
9,1928,1928 Summer Olympics,Amsterdam


#### 3. Mention the total number of nations who participated in each olympics game?

In [6]:
pd.read_sql("""
SELECT 
    h.game,
    c.host_city,
    COUNT(DISTINCT h.noc) AS nb_nations
FROM olympics.olympic_history h
JOIN olympics.host_cities c ON h.game = c.game
GROUP BY h.game, c.host_city
""", conn)

Unnamed: 0,game,host_city,nb_nations
0,1896 Summer Olympics,Athina,14
1,1900 Summer Olympics,Paris,29
2,1904 Summer Olympics,St. Louis,10
3,1908 Summer Olympics,London,24
4,1912 Summer Olympics,Stockholm,30
5,1920 Summer Olympics,Antwerpen,29
6,1924 Summer Olympics,Paris,45
7,1924 Winter Olympics,Chamonix,20
8,1928 Summer Olympics,Amsterdam,46
9,1928 Winter Olympics,Sankt Moritz,25


#### 4. Which year saw the highest and lowest number of countries participating in olympics?

In [47]:
pd.read_sql("""
WITH count_nations_cte AS
( 
    SELECT h.game, COUNT(DISTINCT h.noc) AS nb_nations
    FROM olympics.olympic_history h
    JOIN olympics.host_cities c ON h.game = c.game
    GROUP BY h.game
)
SELECT 
    CONCAT(FIRST_VALUE(game) OVER (ORDER BY nb_nations ASC), ' - ', 
        FIRST_VALUE(nb_nations) OVER (ORDER BY nb_nations ASC), ' nations') AS lowest_nb_nations,
    CONCAT(FIRST_VALUE(game) OVER (ORDER BY nb_nations DESC), ' - ', 
        FIRST_VALUE(nb_nations) OVER (ORDER BY nb_nations DESC), ' nations') AS highest_nb_nations
FROM count_nations_cte
LIMIT 1
""", conn)

Unnamed: 0,lowest_nb_nations,highest_nb_nations
0,1904 Summer Olympics - 10 nations,2016 Summer Olympics - 207 nations


#### 5. Which nation has participated in all of the olympic games?

In [15]:
pd.read_sql("""
SELECT COUNT(DISTINCT h.game) AS "Total Olympic Games"
FROM olympics.olympic_history h
JOIN olympics.host_cities c ON h.game = c.game
""", conn) 

Unnamed: 0,Total Olympic Games
0,53


In [71]:
pd.read_sql("""
WITH count_games_cte AS
(
    SELECT h.team, COUNT(DISTINCT h.game) AS total_olympics_attended
    FROM olympics.olympic_history h
    JOIN olympics.host_cities c ON h.game = c.game
    GROUP BY h.team
)
SELECT team, total_olympics_attended
FROM count_games_cte
WHERE total_olympics_attended = 
(
    SELECT COUNT(DISTINCT h.game) AS total_games
    FROM olympics.olympic_history h
    JOIN olympics.host_cities c ON h.game = c.game
)
""", conn)

Unnamed: 0,team,total_olympics_attended
0,Great Britain,53
1,Italy,53


#### 6. Identify the sport which was played in all summer olympics.

In [92]:
pd.read_sql("""
SELECT COUNT(DISTINCT h.game) AS "Total Number of Summer Games"
FROM olympics.olympic_history h
JOIN olympics.host_cities c ON h.game = c.game
WHERE c.season = 'Summer'
""", conn) 

Unnamed: 0,Total Number of Summer Games
0,29


In [95]:
pd.read_sql("""
WITH count_summer_games_cte AS
(
    SELECT h.sport, COUNT(DISTINCT h.game) AS total_summer_games_attended
    FROM olympics.olympic_history h
    JOIN olympics.host_cities c ON h.game = c.game
    WHERE c.season = 'Summer'
    GROUP BY h.sport
)
SELECT sport, total_summer_games_attended
FROM count_summer_games_cte
WHERE total_summer_games_attended =
(
    SELECT COUNT(DISTINCT h.game) AS "Total Number of Summer Games"
    FROM olympics.olympic_history h
    JOIN olympics.host_cities c ON h.game = c.game
    WHERE c.season = 'Summer'
)
""", conn) 

Unnamed: 0,sport,total_summer_games_attended
0,Artistic Gymnastics (Gymnastics),29
1,Athletics,29
2,Fencing,29
3,Swimming (Aquatics),29


#### 7. Which Sports were just played only once in the olympics?

In [111]:
pd.read_sql("""
WITH count_sport_games_cte AS
(
    SELECT 
        h.sport, 
        COUNT(DISTINCT h.game) AS "nb_games"
    FROM olympics.olympic_history h
    JOIN olympics.host_cities c ON h.game = c.game
    GROUP BY h.sport
    HAVING COUNT(DISTINCT h.game) = 1
)
SELECT 
    DISTINCT cte.sport, 
    c.year, 
    h.game, 
    c.host_city, 
    cte.nb_games
FROM count_sport_games_cte cte
LEFT JOIN olympics.olympic_history h ON cte.sport = h.sport
JOIN olympics.host_cities c ON h.game = c.game
ORDER BY c.year 
""", conn) 

Unnamed: 0,sport,year,game,host_city,nb_games
0,Automobile Racing,1900,1900 Summer Olympics,Paris,1
1,Ballooning (Air Sports),1900,1900 Summer Olympics,Paris,1
2,Boules,1900,1900 Summer Olympics,Paris,1
3,Cricket,1900,1900 Summer Olympics,Paris,1
4,Croquet,1900,1900 Summer Olympics,Paris,1
5,Equestrian Driving (Equestrian),1900,1900 Summer Olympics,Paris,1
6,Firefighting,1900,1900 Summer Olympics,Paris,1
7,Fishing,1900,1900 Summer Olympics,Paris,1
8,Motorcycle Sports,1900,1900 Summer Olympics,Paris,1
9,Roque,1904,1904 Summer Olympics,St. Louis,1


#### 8. Fetch the total number of sports played in each olympic games.

In [114]:
pd.read_sql("""
SELECT 
    h.game, 
    c.host_city, 
    COUNT(DISTINCT h.sport) AS total_number_disciplines
FROM olympics.olympic_history h
JOIN olympics.host_cities c ON h.game = c.game
GROUP BY h.game, c.host_city
""", conn)

Unnamed: 0,game,host_city,total_number_disciplines
0,1896 Summer Olympics,Athina,10
1,1900 Summer Olympics,Paris,28
2,1904 Summer Olympics,St. Louis,20
3,1908 Summer Olympics,London,26
4,1912 Summer Olympics,Stockholm,21
5,1920 Summer Olympics,Antwerpen,30
6,1924 Summer Olympics,Paris,27
7,1924 Winter Olympics,Chamonix,10
8,1928 Summer Olympics,Amsterdam,23
9,1928 Winter Olympics,Sankt Moritz,10


#### 9. Fetch details of the oldest athletes to win a gold medal.

In [None]:
pd.read_sql("""
SELECT 
    h.game, 
    c.host_city, 
    COUNT(DISTINCT h.sport) AS total_number_disciplines
FROM olympics.olympic_history h
JOIN olympics.host_cities c ON h.game = c.game
GROUP BY h.game, c.host_city
""", conn)

#### 10. Find the Ratio of male and female athletes participated in all olympic games.

In [116]:
pd.read_sql("""
SELECT 
    SUM(CASE WHEN h.gender = 'Male' THEN 1 ELSE 0 END) AS males,
    SUM(CASE WHEN h.gender = 'Female' THEN 1 ELSE 0 END) AS females
FROM olympics.olympic_history h
JOIN olympics.host_cities c ON h.game = c.game
""", conn)

Unnamed: 0,males,females
0,338549,129524


In [117]:
conn.close()