In [None]:
import pandas as pd

# Transforming Data for the dashboard

This notebook simulates a pipeline that takes in the CSV files with the medal information and writes out cleaned files, limiting the amount of processing that has to be carried out within the app.

In [None]:
# Read the two raw CSV inputs into DataFrames (paths are relative to this notebook).
df_olympic_cities = pd.read_csv(
    filepath_or_buffer="../original_data/olympic_cities.csv",
)
df_olympic_medals = pd.read_csv(
    filepath_or_buffer="../original_data/olympic_medals.csv",
)

In [None]:
# Fixed seed so the preview shows the same rows on every Restart & Run All.
df_olympic_medals.sample(3, random_state=0)

In [None]:
# Show the dtype pandas assigned to each column of the medals frame.
df_olympic_medals.dtypes

## Changing Data Types

In [None]:
def change_column_dtypes(df: pd.DataFrame, dtype, columns) -> pd.DataFrame:
    """
    Return a copy of ``df`` with the specified columns cast to ``dtype``.

    The input DataFrame is never modified. All validation happens before the
    copy is made, so an invalid call fails fast without duplicating the data.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - dtype (str or type): The target data type (e.g., 'int', 'float',
      'category', 'datetime64[ns]', etc.). Anything accepted by
      ``pd.api.types.pandas_dtype`` is treated as a valid dtype.
    - columns (list): Column names to convert. Any iterable of names is
      accepted; it is materialised once so one-shot iterators work too.

    Returns:
    - pd.DataFrame: A new DataFrame with updated column types.

    Raises:
    - ValueError: If any column is missing, if dtype is invalid, or if a
      column's values cannot be converted to dtype. In the last two cases
      the underlying pandas/NumPy error is chained as ``__cause__``.
    """
    # `columns` is walked twice (missing-column check, then conversion), so
    # materialise it first; otherwise a generator would be exhausted by the
    # check and the conversion loop would silently do nothing.
    columns = list(columns)

    missing_cols = [col for col in columns if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Columns not found in DataFrame: {missing_cols}")

    # Validate the dtype specification itself rather than test-casting a dummy
    # Series: a dummy cast would reject valid dtypes that merely cannot
    # represent the dummy value.
    try:
        pd.api.types.pandas_dtype(dtype)
    except Exception as e:
        raise ValueError(f"Invalid dtype '{dtype}': {e}") from e

    # Copy only after validation so failing calls do not pay for a deep copy.
    df_copy = df.copy()

    # Convert column by column so the error message can name the offender.
    for col in columns:
        try:
            df_copy[col] = df_copy[col].astype(dtype)
        except Exception as e:
            raise ValueError(f"Could not convert column '{col}' to {dtype}: {e}") from e

    return df_copy

In [None]:
# Apply both conversions as one chained pass: the descriptive columns become
# `category`, then `Winner` is cast to `str`.
# NOTE(review): on pandas < 3, astype("str") renders missing values as the
# literal text "nan" — confirm `Winner` has no nulls, or consider "string".
df_olympic_medals = (
    df_olympic_medals
    .pipe(
        change_column_dtypes,
        "category",
        [
            "Olympiad",
            "Discipline",
            "Event",
            "Olympic_city",
            "Olympic_season",
            "Gender",
            "Code",
            "Committee",
            "Committee_type",
            "Medal_type",
        ],
    )
    .pipe(change_column_dtypes, "str", ["Winner"])
)


In [None]:
# Fixed seed keeps this preview reproducible across Restart & Run All.
df_olympic_medals.sample(3, random_state=0)

## Save as Parquet files

In [None]:
# Persist the medals frame as Parquet at the path below.
df_olympic_medals.to_parquet(
    path="../data/olympic_medals.parquet",
)