In [None]:
from pathlib import Path

import pandas as pd

from analytics.plotting.common.dataset_histogram import (
    build_countplot,
    build_histogram_multicategory_barnorm,
    build_pieplot,
)
from analytics.plotting.common.save import save_plot

%load_ext autoreload
%autoreload 2

In [None]:
# Switch for interactive (plotly) rendering; left off here.
# NOTE(review): no cell visible in this notebook reads this flag - confirm it is still needed.
interactive: bool = False

In [None]:
# Directory holding one `<year>.bin` file per yearbook year.
# This is a machine-specific absolute path; adjust it here when running elsewhere.
yb_data_dir = Path("/Users/robinholzinger/robin/dev/eth/modyn-2/.data/datasets/yearbook/all")

# Binary record layout: the label occupies the first `label_size` bytes of
# every record, the remaining bytes are the image payload.
label_size = 4
record_size = 12288 + label_size  # 32 * 32 * 3 * 4 (width*height*channels*float32) + 4 (label)

# One (year, label) pair per sample across all year files.
yb_samples: list[tuple[int, int]] = []

for year in range(1930, 2014 + 1):
    year_bytes = (yb_data_dir / f"{year}.bin").read_bytes()

    # Only complete records are counted; trailing bytes that do not fill a
    # whole record are ignored.
    num_items = len(year_bytes) // record_size

    # Decode just the big-endian label at the start of each record; the image
    # bytes are not needed for the label statistics.
    yb_samples.extend(
        (year, int.from_bytes(year_bytes[offset : offset + label_size], byteorder="big"))
        for offset in range(0, num_items * record_size, record_size)
    )

In [None]:
# Tabulate the collected (year, label) pairs: one row per sample.
yb_df = pd.DataFrame(
    data=yb_samples,
    columns=["year", "label"],
)
yb_df

In [None]:
# Polished figure: number of samples per year, saved as "yearbook_samples_over_time".
fig1 = build_countplot(
    yb_df,
    x="year",
    # axis labelling
    x_label="Sample Time",
    y_label="Num. Samples",
    x_ticks=[1950, 1975, 2000],
    y_ticks_bins=3,
    # figure size scaling
    width_factor=0.48,
    height_factor=0.45,
)

save_plot(fig1, "yearbook_samples_over_time")

In [None]:
# Display names for the integer class labels stored in the dataset files.
label_map: dict[int, str] = {
    0: "Male",
    1: "Female",
}

In [None]:
# Per-label sample counts (largest first) with each label's share of the total
# and the integer label replaced by its display name.
# Relies on `value_counts().reset_index()` producing a "count" column.
sorted_categories = (
    yb_df["label"]
    .value_counts()
    .reset_index()
    .sort_values("count", ascending=False)
    .assign(
        ratio=lambda counts: counts["count"] / counts["count"].sum(),
        label=lambda counts: counts["label"].map(label_map),
    )
)
sorted_categories

In [None]:
# Replace the integer class ids with their display names. The mapping is
# guarded on the column still holding integers so that re-running this cell is
# a no-op; an unguarded second `map` would turn the already converted strings
# into NaN because they are not keys of `label_map`.
if pd.api.types.is_integer_dtype(yb_df["label"]):
    yb_df["label"] = yb_df["label"].map(label_map)

# Datetime version of the sample year (parsed with the "%Y" format).
yb_df["date"] = pd.to_datetime(yb_df["year"], format="%Y")

In [None]:
# Pie chart of the overall label split, saved as "yearbook_samples_ratio".
pie_counts = sorted_categories["count"].tolist()
pie_labels = tuple(sorted_categories["label"])

ratio = build_pieplot(
    x=pie_counts,
    labels=pie_labels,
    height_factor=0.35,
    width_factor=0.4,
)
save_plot(ratio, "yearbook_samples_ratio")

In [None]:
# Label distribution per year. The legend is switched off and replaced by the
# text annotations placed inside the plot further down.

from analytics.plotting.common.color import main_color

# Fixed color per label name.
label_colors = {
    "Male": main_color(0, light=True),
    "Female": main_color(1),
}

fig_ratio = build_histogram_multicategory_barnorm(
    yb_df,
    x="year",
    label="label",
    sorted_coloring_categories=sorted_categories["label"],
    manual_color_map=label_colors,
    nbins=84,
    # axes
    # x_ticks=[pd.to_datetime(f"{y}-01-01") for y in list(range(1930, 2014, 10))],
    x_label="Sample Time",
    y_label="Label Distribution   ",
    y_ticks=[1.0, 0.75, 0.5, 0.25, 0.0],
    # legend (disabled)
    # NOTE(review): "Article Category" looks carried over from another dataset - confirm.
    legend=False,
    legend_labels=["Male", "Female"],
    legend_title="Article Category",
    # appearance
    height_factor=0.45,
    width_factor=0.48,
    grid_opacity=0.5,
    col_alpha=1.0,
)

# Add manual text labels into the plot.
# NOTE(review): the percentages and counts are hard-coded - confirm they still
# match `sorted_categories` whenever the dataset changes.
for text_y, caption in (
    (0.77, "Male: 53.3% (19808)"),
    (0.33, "Female: 46.7% (17382)"),
):
    fig_ratio.text(0.28, text_y, caption, color="white")

save_plot(fig_ratio, "yearbook_label_distribution")