In [1]:
import pandas as pd
from sqlalchemy import create_engine,text
import logging

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# DB connection
src_db = 'postgresql://postgres:postgres@pgdb:5432/Olympics'
trg_db = 'postgresql://postgres:postgres@pgdb:5432/OlympicsDW'

src_engine = create_engine(src_db)
trg_engine = create_engine(trg_db)

In [2]:
def extract(query, engine):
    """Extract data from the source database."""
    try:
        df = pd.read_sql_query(query, con=engine)
        logging.info(f"Data extracted successfully for query: {query}")
        return df
    except Exception as e:
        logging.error(f"Error extracting data: {e}")
        raise

def load(df, table_name, engine):
    """Load data into the target database."""
    try:
        df.to_sql(table_name, con=engine, if_exists='replace', index=False)
        logging.info(f"Data loaded into {table_name}")
    except Exception as e:
        logging.error(f"Error loading data into {table_name}: {e}")
        raise

In [3]:
# Extract source data
global_population_df = extract('SELECT * FROM globalpopulation', src_engine)
life_expectancy_df = extract('SELECT * FROM lifeexpectancy', src_engine)
list_of_countries_df = extract('SELECT * FROM listofcountriesareasbycontinent', src_engine)
mental_illness_df = extract('SELECT * FROM mentalillness', src_engine)
olympic_hosts_df = extract('SELECT * FROM olympichosts', src_engine)
olympic_medals_df = extract('SELECT * FROM olympicmedals', src_engine)
economic_df = extract('SELECT * FROM economic', src_engine)

2024-03-23 12:19:11,496 - INFO - Data extracted successfully for query: SELECT * FROM globalpopulation
2024-03-23 12:19:11,887 - INFO - Data extracted successfully for query: SELECT * FROM lifeexpectancy
2024-03-23 12:19:11,901 - INFO - Data extracted successfully for query: SELECT * FROM listofcountriesareasbycontinent
2024-03-23 12:19:12,039 - INFO - Data extracted successfully for query: SELECT * FROM mentalillness
2024-03-23 12:19:12,053 - INFO - Data extracted successfully for query: SELECT * FROM olympichosts
2024-03-23 12:19:12,345 - INFO - Data extracted successfully for query: SELECT * FROM olympicmedals
2024-03-23 12:19:12,356 - INFO - Data extracted successfully for query: SELECT * FROM economic


# DimCountry

In [44]:
# DimCountry(country_name, country_code, region)

# Rename col for consistency
olympic_medals_df_ = olympic_medals_df.rename(columns={'country_code': 'country_code_2', 'country_3_letter_code': 'country_code'})
life_expectancy_df_ = life_expectancy_df.rename(columns={'entity': 'country_name', 'country_3_letter_code': 'country_code'})
economic_df_ = economic_df.rename(columns={'country_3_letter_code': 'country_code'})

combined_df = list_of_countries_df.rename(columns={'country': 'country_name'}).copy()

# Merge with olympic medals for country code
combined_df = pd.merge(combined_df, olympic_medals_df_[['country_name', 'country_code']].drop_duplicates(), on='country_name', how='left')

# Merge + Prioritize non-null values
combined_df = pd.merge(combined_df, life_expectancy_df_[['country_name', 'country_code']].drop_duplicates(), on='country_name', how='left', suffixes=('', '_from_life'))
combined_df['country_code'] = combined_df.apply(lambda row: row['country_code'] if pd.notna(row['country_code']) else row['country_code_from_life'], axis=1)
combined_df.drop(columns='country_code_from_life', inplace=True)
combined_df = pd.merge(combined_df, economic_df_[['country_name', 'country_code']].drop_duplicates(), on='country_name', how='left', suffixes=('', '_from_econ'))
combined_df['country_code'] = combined_df.apply(lambda row: row['country_code'] if pd.notna(row['country_code']) else row['country_code_from_econ'], axis=1)
combined_df.drop(columns='country_code_from_econ', inplace=True)

# No duplicate country_name
combined_df = combined_df.drop_duplicates(subset=['country_name'], keep='first')
combined_df = combined_df[['country_name', 'country_code', 'region']]

# Prioritize country code over NaN
combined_df = combined_df.sort_values(by=['country_name', 'country_code']).drop_duplicates(subset=['country_name'], keep='last')

combined_df.to_sql('dimcountry', trg_engine, if_exists='append', index=False, method='multi')
print("Data successfully loaded into DimCountry.")
combined_df

Data successfully loaded into DimCountry.


Unnamed: 0,country_name,country_code,region
35,Afghanistan,AFG,Asia
138,Albania,ALB,Europe
33,Algeria,ALG,Africa
213,American Samoa,ASM,Oceania
202,Andorra,AND,Europe
...,...,...,...
226,Wallis and Futuna,WLF,Oceania
171,Western Sahara,ESH,Africa
43,Yemen,YEM,Asia
62,Zambia,ZAM,Africa


# DimTime

In [54]:
# DimTime (year, game_season)
dim_time_df = olympic_hosts_df.drop_duplicates().rename(columns={'game_year': 'year'})
dim_time_df = dim_time_df[['year', 'game_season']]
dim_time_df.to_sql('dimtime', trg_engine, if_exists='append', index=False)
print("DimTime populated successfully.")
dim_time_df

DimTime populated successfully.


Unnamed: 0,year,game_season
0,2022,Winter
1,2020,Summer
2,2018,Winter
3,2016,Summer
4,2014,Winter
5,2012,Summer
6,2010,Winter
7,2008,Summer
8,2006,Winter
9,2004,Summer


# DimAthlete

In [82]:
# DimAthlete (fullname, gender)
gender_map = {
    'Mixed': 'Mixed',
    'Women': 'Female',
    'Men': 'Male'
}
olympic_medals_df['gender'] = olympic_medals_df['event_gender'].map(gender_map)
dim_athlete_df = olympic_medals_df[['athlete_full_name', 'gender']].drop_duplicates().rename(columns={'athlete_full_name': 'full_name'})

# Replace NULL with 'Team Event' for 'full_name'
dim_athlete_df.fillna({'full_name': 'Team Event'}, inplace=True)

dim_athlete_df = dim_athlete_df[['full_name', 'gender']]
dim_athlete_df.to_sql('dimathlete', trg_engine, if_exists='append', index=False)
print("DimAthlete populated successfully.")
dim_athlete_df

DimAthlete populated successfully.


Unnamed: 0,full_name,gender
0,Stefania CONSTANTINI,Mixed
1,Amos MOSANER,Mixed
2,Kristin SKASLIEN,Mixed
3,Magnus NEDREGOTTEN,Mixed
4,Almida DE VAL,Mixed
...,...,...
12830,Konstantinos PASPATIS,Male
12832,Dimitrios PETROKOKKINOS,Male
12834,Georgios TSITAS,Male
12835,Stefanos Khristopoulos,Male


# DimEvent

In [86]:
# DimEvent (event_id, discipline, game_slug)
dim_event_df = olympic_medals_df[['event_title', 'discipline_title', 'slug_game']].drop_duplicates()
dim_event_df = dim_event_df.rename(columns={'discipline_title': 'discipline', 'slug_game': 'game_slug'})
dim_event_df.to_sql('dimevent', con=trg_engine, if_exists='append', index=False)
print("DimEvent populated successfully.")
dim_event_df

DimEvent populated successfully.


Unnamed: 0,event_title,discipline,game_slug
0,Mixed Doubles,Curling,beijing-2022
6,Women,Curling,beijing-2022
8,Men,Curling,beijing-2022
10,Men's Moguls,Freestyle Skiing,beijing-2022
13,Men's Freeski Halfpipe,Freestyle Skiing,beijing-2022
...,...,...,...
12828,Singles men,Tennis,athens-1896
12831,doubles men,Tennis,athens-1896
12833,"Unlimited Class, Greco-Roman Men",Wrestling,athens-1896
12836,heavyweight - one hand lift men,Weightlifting,athens-1896


# FactMedalWins

In [114]:
dim_country_df = extract('SELECT * FROM DimCountry', trg_engine)
dim_athlete_df = extract('SELECT * FROM DimAthlete', trg_engine)
dim_event_df = extract('SELECT * FROM DimEvent', trg_engine)
dim_time_df = extract('SELECT * FROM DimTime', trg_engine)

2024-03-23 09:25:29,963 - INFO - Data extracted successfully for query: SELECT * FROM DimCountry
2024-03-23 09:25:29,985 - INFO - Data extracted successfully for query: SELECT * FROM DimAthlete
2024-03-23 09:25:30,029 - INFO - Data extracted successfully for query: SELECT * FROM DimEvent
2024-03-23 09:25:30,050 - INFO - Data extracted successfully for query: SELECT * FROM DimTime


In [115]:
country_id_map = dim_country_df.set_index('country_name')['country_id'].to_dict()
athlete_id_map = dim_athlete_df.set_index('full_name')['athlete_id'].to_dict()
event_id_map = dim_event_df.set_index('event_title')['event_id'].to_dict()
time_id_map = dim_time_df.set_index('year')['time_id'].to_dict()

In [116]:
olympic_medals_df['country_id'] = olympic_medals_df['country_name'].map(country_id_map)
olympic_medals_df.fillna({'athlete_full_name': 'Team Event'}, inplace=True)
olympic_medals_df['athlete_id'] = olympic_medals_df['athlete_full_name'].map(athlete_id_map)
olympic_medals_df['event_id'] = olympic_medals_df['event_title'].map(event_id_map)
olympic_medals_df['year'] = olympic_medals_df['slug_game'].str.extract(r'-(\d{4})$').astype(int)
olympic_medals_df['time_id'] = olympic_medals_df['year'].map(time_id_map)

In [117]:
fact_medal_wins_df = olympic_medals_df[['country_id', 'athlete_id', 'event_id', 'time_id', 'medal_type']]

In [118]:
fact_medal_wins_df

Unnamed: 0,country_id,athlete_id,event_id,time_id,medal_type
0,100,8,479,1,GOLD
1,100,9,479,1,GOLD
2,155,10,479,1,SILVER
3,155,11,479,1,SILVER
4,201,12,479,1,BRONZE
...,...,...,...,...,...
12835,78,7732,5818,53,BRONZE
12836,53,7726,5819,53,SILVER
12837,78,7733,5819,53,BRONZE
12838,53,7726,5820,53,GOLD


In [120]:
fact_medal_wins_df.to_sql('factmedalwins', trg_engine, if_exists='append', index=False)
print("FactMedalWins populated successfully.")

FactMedalWins populated successfully.


# OLAP 
## Cube Set Up

In [135]:
import atoti as tt
session = tt.Session(
    user_content_storage=".content",
    port=9092,
    java_options=["-Xms1G", "-Xmx10G"]
)

2024-03-23 13:56:57,013 - INFO - Closing down clientserver connection
2024-03-23 13:56:57,014 - INFO - Closing down clientserver connection
2024-03-23 13:56:57,017 - INFO - Closing down clientserver connection
2024-03-23 13:56:57,023 - INFO - Closing down clientserver connection
2024-03-23 13:56:57,025 - INFO - Callback Server Shutting Down
2024-03-23 13:56:57,162 - INFO - Callback Server Starting
2024-03-23 13:56:57,164 - INFO - Socket listening on ('127.0.0.1', 42679)
2024-03-23 13:56:57,164 - INFO - Closing down clientserver connection
2024-03-23 13:57:00,171 - INFO - Callback Server Starting
2024-03-23 13:57:00,172 - INFO - Socket listening on ('127.0.0.1', 42405)
2024-03-23 13:57:00,819 - INFO - Closing down clientserver connection


In [136]:
dim_country_df = extract('SELECT * FROM DimCountry', trg_engine)
dim_athlete_df = extract('SELECT * FROM DimAthlete', trg_engine)
dim_event_df = extract('SELECT * FROM DimEvent', trg_engine)
dim_time_df = extract('SELECT * FROM DimTime', trg_engine)
fact_medal_wins_df = extract('SELECT * FROM FactMedalWins', trg_engine)

# Load your dimension tables
dim_country_table = session.read_pandas(
    dim_country_df,
    table_name="Country",
    keys=["country_id"]
)
dim_athlete_table = session.read_pandas(
    dim_athlete_df,
    table_name="Athlete",
    keys=["athlete_id"]
)
dim_event_table = session.read_pandas(
    dim_event_df,
    table_name="Event",
    keys=["event_id"]
)
dim_time_table = session.read_pandas(
    dim_time_df,
    table_name="Time",
    keys=["time_id"]
)
fact_medal_wins_table = session.read_pandas(
    fact_medal_wins_df,
    table_name="MedalWins",
    keys=["medal_win_id"]
)

2024-03-23 13:57:13,495 - INFO - Data extracted successfully for query: SELECT * FROM DimCountry
2024-03-23 13:57:13,555 - INFO - Data extracted successfully for query: SELECT * FROM DimAthlete
2024-03-23 13:57:13,585 - INFO - Data extracted successfully for query: SELECT * FROM DimEvent
2024-03-23 13:57:13,590 - INFO - Data extracted successfully for query: SELECT * FROM DimTime
2024-03-23 13:57:13,637 - INFO - Data extracted successfully for query: SELECT * FROM FactMedalWins


In [137]:
# Join your fact table with the dimension tables
fact_medal_wins_table.join(dim_country_table, fact_medal_wins_table["country_id"] == dim_country_table["country_id"])
fact_medal_wins_table.join(dim_athlete_table, fact_medal_wins_table["athlete_id"] == dim_athlete_table["athlete_id"])
fact_medal_wins_table.join(dim_event_table, fact_medal_wins_table["event_id"] == dim_event_table["event_id"])
fact_medal_wins_table.join(dim_time_table, fact_medal_wins_table["time_id"] == dim_time_table["time_id"])

# Create the cube
cube = session.create_cube(fact_medal_wins_table)

# Define measures and levels
m = cube.measures
l = cube.levels
h = cube.hierarchies

In [138]:
session.tables.schema

```mermaid
erDiagram
  "MedalWins" {
    _ long PK "medal_win_id"
    nullable long "country_id"
    nullable long "athlete_id"
    nullable long "event_id"
    nullable long "time_id"
    _ String "medal_type"
  }
  "Country" {
    _ long PK "country_id"
    _ String "country_name"
    _ String "country_code"
    _ String "region"
  }
  "Event" {
    _ long PK "event_id"
    _ String "event_title"
    _ String "discipline"
    _ String "game_slug"
  }
  "Time" {
    _ long PK "time_id"
    nullable long "year"
    _ String "game_season"
  }
  "Athlete" {
    _ long PK "athlete_id"
    _ String "full_name"
    _ String "gender"
  }
  "MedalWins" }o--o| "Event" : "`event_id` == `event_id`"
  "MedalWins" }o--o| "Athlete" : "`athlete_id` == `athlete_id`"
  "MedalWins" }o--o| "Country" : "`country_id` == `country_id`"
  "MedalWins" }o--o| "Time" : "`time_id` == `time_id`"
```


# Hierarchies Clean Up

In [139]:
del h[('MedalWins', 'medal_type')]
del h[('MedalWins', 'medal_win_id')]

In [140]:
h["Athlete"] = [l["Athlete", "gender", "gender"], l["Athlete", "full_name", "full_name"]]
h["Country"] = [l["Country", "region", "region"], l["Country", "country_name", "country_name"], l["Country", "country_code", "country_code"]]
h["Event"] = [l["Event", "discipline", "discipline"], l["Event", "game_slug", "game_slug"], l["Event", "event_title", "event_title"]]
h["Time"] = [l["Time", "game_season", "game_season"]]

In [141]:
del h[('Athlete', 'full_name')]
del h[('Athlete', 'gender')]
del h[('Country', 'country_code')]
del h[('Country', 'country_name')]
del h[('Country', 'region')]
del h[('Event', 'event_title')]
del h[('Event', 'discipline')]
del h[('Event', 'game_slug')]
del h[('Time', 'game_season')]

In [142]:
h

# Measures Cleanup

In [143]:
del m["country_id.MEAN"]
del m["country_id.SUM"]
del m["athlete_id.SUM"]
del m["athlete_id.MEAN"]
del m["event_id.SUM"]
del m["event_id.MEAN"]
del m["time_id.SUM"]
del m["time_id.MEAN"]

In [169]:
m["Total Medals"] = tt.agg.count_distinct(fact_medal_wins_table["medal_win_id"])
m["Total Athletes"] = tt.agg.count_distinct(fact_medal_wins_table["athlete_id"])

# Which country has won the most medals in a specific discipline over all Olympic Games?
m["Top Country by Discipline"] = tt.agg.max_member(
    m["Total Medals"], 
    l["country_name"]
)

# Which discipline has gotten the most medals for a specific country for all Olympic Games?
m["Total Medals by Discipline"] = tt.agg.sum(
    m["Total Medals"],
    scope=tt.OriginScope(l["discipline"], l["region"])
)
m["Top Discipline per Region"] = tt.agg.max_member(
    m["Total Medals by Discipline"], 
    l["discipline"]
)
del m["Top Discipline per Country"]
#########
# How has the performance (in terms of medals won) of a specific country evolved over different Olympic Games?
m["Medals by Country"] = tt.agg.count_distinct(
    m["Total Medals"],
    scope=tt.OriginScope( l["country_name"])
)

# Which event has the most gender diversity in terms of medal winners across all Olympic Games?
# You will need to create a measure that could be used in combination with a level representing the gender diversity, like:
m["Gender Diversity"] = tt.agg.count_distinct(
    m["Total Athletes"],
    scope=tt.OriginScope(l["event_title"])
)

# What is the distribution of medals won by region?
m["Medals by Region"] = tt.agg.count_distinct(
    m["Total Medals"],
    scope=tt.OriginScope(l["region"])
)

# Which are the top-performing countries in each region?
m["Top Countries by Region"] = tt.agg.count_distinct(
    m["Total Medals"],
    scope=tt.OriginScope(l["region"])
)

# Define a measure to calculate the distinct count of medal types for the gender diversity measure
m["Distinct Medal Types"] = tt.agg.count_distinct(fact_medal_wins_table["medal_type"])

  s.__pydantic_validator__.validate_python(ArgsKwargs(args, kwargs), self_instance=s)


In [170]:
m

In [146]:
session.link

http://localhost:9092

_Note_: This is the session's local URL: it may not be reachable if Atoti is running on another machine.

# Business Query

In [147]:
import pandas as pd

pd.set_option('display.max_rows', None)

In [161]:
# Which country has won the most medals in a specific discipline over all Olympic Games?
top_country_by_discipline = cube.query(
    m["Total Medals"],
    m["Top Country by Discipline"],
    levels=[l["discipline"]],
    filter=l["discipline"] == "Athletics"
)
top_country_by_discipline.sort_values("Total Medals", ascending=False)

Unnamed: 0_level_0,Total Medals,Top Country by Discipline
discipline,Unnamed: 1_level_1,Unnamed: 2_level_1
Athletics,1566,Finland


In [177]:
top_country_by_discipline = cube.query(
    m["Total Medals"],
    m["Top Country by Discipline"],
    levels=[l["discipline"]],
)
top_country_by_discipline.sort_values("Total Medals", ascending=False)

Unnamed: 0_level_0,Total Medals,Top Country by Discipline
discipline,Unnamed: 1_level_1,Unnamed: 2_level_1
Athletics,1566,Finland
Wrestling,844,Sweden
Swimming,828,Australia
Rowing,684,Germany
Boxing,641,Cuba
Canoe Sprint,614,Hungary
Cycling Track,593,France
Sailing,538,France
Fencing,486,Italy
Gymnastics Artistic,481,Japan


In [173]:
# Which discipline has gotten the most medals for a specific region for all Olympic Games?
top_discipline_per_region = cube.query(
    m["Top Discipline per Region"],
    m["Total Medals"],
    levels=[l["region"]],
    filter=l["region"] == "Asia"
)
result.sort_values("Total Medals", ascending=False)

Unnamed: 0_level_0,Top Discipline per Region,Total Medals
region,Unnamed: 1_level_1,Unnamed: 2_level_1
Asia,Wrestling,1172


In [178]:
top_discipline_per_region = cube.query(
    m["Top Discipline per Region"],
    m["Total Medals"],
    levels=[l["region"]],
)
top_discipline_per_region.sort_values("Total Medals", ascending=False)

Unnamed: 0_level_0,Top Discipline per Region,Total Medals
region,Unnamed: 1_level_1,Unnamed: 2_level_1
Europe,Athletics,8989
Asia,Wrestling,1172
North America,Athletics,1079
Oceania,Swimming,805
Africa,Athletics,436
South America,Sailing,359


In [None]:
# How has the performance (in terms of medals won) of a specific country evolved over different Olympic Games?
