In [None]:
import pandas as pd
import yaml

# Transforming Data for the dashboard

This notebook simulates a pipeline that takes in the CSV files with the medal information and returns clean files, to limit the amount of processing carried out within the app.

In [None]:
# Read the two raw CSV inputs (host-city data and medal records) used by every later cell.
df_olympic_cities = pd.read_csv(
    filepath_or_buffer="../original_data/olympic_cities.csv",
)
df_olympic_medals = pd.read_csv(
    filepath_or_buffer="../original_data/olympic_medals.csv",
)

In [None]:
# Peek at a few random medal rows; the seed is fixed so the preview is the
# same on every Restart & Run All (keeps the saved notebook output stable).
df_olympic_medals.sample(3, random_state=42)

In [None]:
# Show the dtype pandas inferred for each medals column when reading the CSV.
df_olympic_medals.dtypes

## Transform Stockholm

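Stockholm 1956 hosted only the equestrian events. The dashboard used to take care of that dynamically, but that is kind of heavy for such a curious little event... Instead, the Stockholm 1956 and Melbourne 1956 rows are given a single shared label, "Melbourne 1956 (*)", here.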

In [None]:
# Relabel the Stockholm 1956 rows with the starred Melbourne label.
# Only the "Olympiad" column is rewritten; other columns are left as they are.
stockholm_rows = df_olympic_medals["Olympiad"].eq("Stockholm 1956")
df_olympic_medals.loc[stockholm_rows, "Olympiad"] = "Melbourne 1956 (*)"

In [None]:
# Give the plain "Melbourne 1956" rows the same starred label that the
# Stockholm 1956 rows receive, so both appear as a single Olympiad.
melbourne_rows = df_olympic_medals["Olympiad"].eq("Melbourne 1956")
df_olympic_medals.loc[melbourne_rows, "Olympiad"] = "Melbourne 1956 (*)"

## Changing Data Types

In [None]:
def change_column_dtypes(df, dtype, columns):
    """
    Convert specified columns in a DataFrame to a given dtype.

    The input DataFrame is never modified; all conversions are applied to a
    copy, which is returned.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - dtype (str or type): The target data type (e.g., 'int', 'float', 'datetime64[ns]', etc.).
    - columns (list): List of column names to convert.

    Returns:
    - pd.DataFrame: A new DataFrame with updated column types.

    Raises:
    - ValueError: If any column is missing, if dtype is invalid, or if the
      values of a column cannot be converted. The underlying pandas error is
      chained as the exception's ``__cause__``.
    """
    # Validate the arguments first so no copy is made when the call is invalid.
    missing_cols = [col for col in columns if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Columns not found in DataFrame: {missing_cols}")

    try:
        # Ask pandas whether it understands the dtype, rather than casting a
        # dummy integer Series: that would test "can 0 be cast to this dtype",
        # not "is this a dtype".
        pd.api.types.pandas_dtype(dtype)
    except Exception as e:
        raise ValueError(f"Invalid dtype '{dtype}': {e}") from e

    df_copy = df.copy()

    # Convert one column at a time so a failure can name the offending column.
    for col in columns:
        try:
            df_copy[col] = df_copy[col].astype(dtype)
        except Exception as e:
            raise ValueError(
                f"Could not convert column '{col}' to {dtype}: {e}"
            ) from e

    return df_copy

In [None]:
# Label-like text columns of the medals table are stored as categoricals.
medal_category_columns = [
    "Olympiad",
    "Discipline",
    "Event",
    "Olympic_city",
    "Olympic_season",
    "Gender",
    "Code",
    "Committee",
    "Committee_type",
    "Medal_type",
]
df_olympic_medals = change_column_dtypes(
    df_olympic_medals, "category", medal_category_columns
)

# Winner is converted to the "str" dtype instead of a categorical.
# NOTE(review): on pandas < 3, astype("str") turns missing values into the
# literal text "nan" — confirm Winner has no nulls.
df_olympic_medals = change_column_dtypes(df_olympic_medals, "str", ["Winner"])

In [None]:
# Peek at a few random rows after the dtype conversion; the seed is fixed so
# the preview is the same on every Restart & Run All.
df_olympic_medals.sample(3, random_state=42)

In [None]:
# Show the dtype pandas inferred for each cities column when reading the CSV.
df_olympic_cities.dtypes

In [None]:
# Label-like text columns of the cities table are stored as categoricals.
city_category_columns = [
    "Olympiad",
    "Olympic_city",
    "Olympic_season",
    "Country",
    "Continent",
    "ISO_code_mapping",
]
df_olympic_cities = change_column_dtypes(
    df_olympic_cities, "category", city_category_columns
)

## Create transformed DataFrames 

In [None]:
# Count medals per (committee, medal type), keeping only observed combinations.
medal_counts = df_olympic_medals.groupby(
    ["Committee", "Medal_type"], observed=True
).size()

# Pivot the medal types into columns; committees lacking a medal type get 0.
df_grouped_medals = medal_counts.unstack(fill_value=0).reset_index()

# Overall tally per committee across the three medal columns.
df_grouped_medals["Total"] = df_grouped_medals[["Gold", "Silver", "Bronze"]].sum(
    axis=1
)

In [None]:
# Small DataFrame to display as summary table
df_olympic_cities_simplified = df_olympic_cities[
    [
        "Olympiad",
        "Olympic_year",
        "Olympic_season",
        "total_medals",
        "total_medals_gold",
        "total_medals_silver",
        "total_medals_bronze",
        "number_committees",
        "number_disciplines",
        "number_events",
        "Country",
        "Continent",
    ]
]

In [None]:
# Define a custom sorting order for 'Medal_type'
medal_order = {"Bronze": 0, "Silver": 1, "Gold": 2}

df_medals_by_olympiad = (
    df_olympic_medals.groupby(
        ["Olympiad", "Olympic_year", "Medal_type", "Olympic_season"], observed=True
    )
    .size()
    .reset_index(name="Medal_count")
)

# Sort the DataFrame first by 'Olympic_year' and then by 'Medal_type' using the custom sorting order
df_medals_by_olympiad["Medal_type_code"] = df_medals_by_olympiad["Medal_type"].map(
    medal_order
)
df_medals_by_olympiad = df_medals_by_olympiad.sort_values(
    by=["Olympic_year", "Medal_type_code"]
)

# Reset index without creating a new column
df_medals_by_olympiad.reset_index(drop=True, inplace=True)

## Create lists

In [None]:
# Every distinct Olympiad label in the medals table, preceded by an "All" entry.
olympiad_labels = df_olympic_medals["Olympiad"].unique().tolist()
list_olympiads = ["All", *olympiad_labels]

# Write the list as a block-style YAML sequence.
with open("../parameters/list_olympiads.yml", "w") as yaml_out:
    yaml.dump(list_olympiads, yaml_out, default_flow_style=False)

In [None]:
# Distinct committee names from the medals table, in sorted order.
list_committees = df_olympic_medals["Committee"].unique().tolist()
list_committees.sort()

# Write the list as a block-style YAML sequence.
with open("../parameters/list_committees.yml", "w") as yaml_out:
    yaml.dump(list_committees, yaml_out, default_flow_style=False)

## Save as Parquet files

In [None]:
# Write the medals table to Parquet, without the row index.
df_olympic_medals.to_parquet(
    path="../data/olympic_medals.parquet",
    index=False,
)

In [None]:
# Write the cities table to Parquet, without the row index.
df_olympic_cities.to_parquet(
    path="../data/olympic_cities.parquet",
    index=False,
)

In [None]:
# Write the per-committee medal tally to Parquet, without the row index.
df_grouped_medals.to_parquet(
    path="../data/grouped_medals.parquet",
    index=False,
)

In [None]:
# Write the summary-table subset of the cities data to Parquet, without the row index.
simplified_cities_path = "../data/olympic_cities_simplified.parquet"
df_olympic_cities_simplified.to_parquet(simplified_cities_path, index=False)

In [None]:
# Write the per-Olympiad medal counts to Parquet, without the row index.
df_medals_by_olympiad.to_parquet(
    path="../data/medals_by_olympiad.parquet",
    index=False,
)