In [6]:
from pathlib import Path

import pandas as pd


def read_raw_dataset():
    """Load the raw test dataset into a DataFrame.

    The file is looked up at ``datasets/test_dataset.csv`` relative to the
    current working directory and parsed with pandas' default CSV settings.

    Returns:
        pd.DataFrame: the parsed contents of the CSV file.
    """
    csv_file = Path().absolute().joinpath("datasets", "test_dataset.csv")
    return pd.read_csv(csv_file)


def calculate_memory_usage(df_input: pd.DataFrame) -> pd.Series:
    """Measure, print and return the per-column memory footprint of a DataFrame.

    ``deep=True`` makes pandas inspect object-dtype columns so the reported
    numbers include the Python objects they hold, not only the pointers.

    Args:
        df_input: DataFrame to measure.

    Returns:
        pd.Series: bytes used by the index and by each column.
    """
    bytes_per_column = df_input.memory_usage(deep=True)

    # Header line, then the Series, then one blank line as a separator.
    print("--- Memory consumption ---", bytes_per_column, sep="\n", end="\n\n")

    return bytes_per_column


def convert_to_categorical(
    df_to_convert: pd.DataFrame,
    columns: list[str],
    column_to_datatype_map: "dict | None" = None,
) -> pd.DataFrame:
    """Return a copy of the DataFrame with the selected columns cast to new dtypes.

    Each column named in ``columns`` is cast to the dtype registered for it in
    ``column_to_datatype_map``; columns without an entry are cast to
    ``"category"``. Columns not named in ``columns`` keep their dtype, and the
    input DataFrame is never modified.

    Args:
        df_to_convert: source DataFrame.
        columns: names of the columns to cast.
        column_to_datatype_map: optional ``{column name: dtype}`` overrides.
            When omitted (or empty) every selected column becomes categorical.
            Note that an entry equal to the column's current dtype makes the
            cast a no-op for that column.

    Returns:
        pd.DataFrame: a new DataFrame with the requested dtypes applied.

    Raises:
        KeyError: if a name in ``columns`` is not a column of the DataFrame.
    """
    dtype_overrides = column_to_datatype_map or {}
    target_dtypes = {
        column: dtype_overrides.get(column, "category") for column in columns
    }

    # Fail early with a message that names the offending columns.
    missing_columns = [
        column for column in target_dtypes if column not in df_to_convert.columns
    ]
    if missing_columns:
        raise KeyError(f"Columns not found in DataFrame: {missing_columns}")

    if not target_dtypes:
        # Nothing to cast; still hand back an independent copy.
        return df_to_convert.copy()

    # A single astype call casts every selected column and returns a new frame.
    return df_to_convert.astype(target_dtypes)


def calculate_percentage_difference(inicial: pd.Series, final: pd.Series) -> pd.Series:
    """Compute the element-wise relative change from ``inicial`` to ``final``, in percent.

    The result is ``(final - inicial) / inicial * 100``: positive where the
    final value is larger than the initial one, negative where it is smaller.

    Args:
        inicial: baseline values.
        final: values to compare against the baseline.

    Returns:
        pd.Series: percentage change for each label.
    """
    absolute_change = final - inicial
    relative_change = absolute_change / inicial
    return relative_change * 100





In [9]:
# --- Baseline: load the raw data and record its dtypes and memory footprint ---
df_before_type_conversion = read_raw_dataset()
print("--- Types before conversion ---", df_before_type_conversion.dtypes, sep="\n", end="\n\n")

initial_memory_usage = calculate_memory_usage(df_before_type_conversion)

# Target dtype per column. Candidate dtypes include:
# int64, string, float64, boolean, datetime[timezone], category.
# NOTE(review): "Datetime" is mapped to "object" here, which overrides the
# "category" fallback inside convert_to_categorical. The recorded run reports
# the column as object both before and after, with a 0.0 change for every
# column - confirm whether a different target dtype was intended.
column_to_datatype_map = {
    "Datetime": "object",
    **dict.fromkeys(("Open", "High", "Low", "Close", "Adj Close"), "float64"),
    "Volume": "int64",
}

# --- Conversion: only the columns listed here are cast ---
df_after_type_conversion = convert_to_categorical(
    df_before_type_conversion,
    columns=["Datetime"],
    column_to_datatype_map=column_to_datatype_map,
)

print("--- Types after conversion ---", df_after_type_conversion.dtypes, sep="\n", end="\n\n")

final_memory_usage = calculate_memory_usage(df_after_type_conversion)

# --- Comparison ---
# The helper returns (final - initial) / initial * 100, so a negative value
# means the column got smaller and a positive value means it grew.
memory_reduction_percent = calculate_percentage_difference(
    initial_memory_usage, final_memory_usage
)

print("--- Memory reduction percentage ---", memory_reduction_percent, sep="\n")


--- Types before conversion ---
Datetime      object
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

--- Memory consumption ---
Index          132
Datetime     31980
Open          3120
High          3120
Low           3120
Close         3120
Adj Close     3120
Volume        3120
dtype: int64

--- Types after conversion ---
Datetime      object
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume         int64
dtype: object

--- Memory consumption ---
Index          132
Datetime     31980
Open          3120
High          3120
Low           3120
Close         3120
Adj Close     3120
Volume        3120
dtype: int64

--- Memory reduction percentage ---
Index        0.0
Datetime     0.0
Open         0.0
High         0.0
Low          0.0
Close        0.0
Adj Close    0.0
Volume       0.0
dtype: float64
