In [None]:
from collections import deque
from pathlib import Path

import pandas as pd
import plotly.express as px

from analytics.plotting.common.dataset_histogram import (
    build_countplot,
    build_cum_barplot,
    build_histogram_multicategory_barnorm,
    build_histogram_multicategory_facets,
)
from analytics.plotting.common.save import save_plot
from benchmark.huffpost_kaggle.data_generation import HuffpostKaggleDataGenerator

%load_ext autoreload
%autoreload 2

In [None]:
# Switch for the plotting cells below: True uses the exploratory plotly express figures,
# False builds the polished figures via the build_* helpers and writes them with save_plot.
interactive = False

In [None]:
# Single place to adjust the dataset location; the raw archive path is derived from it.
# NOTE(review): machine-specific absolute path - point this at your own checkout before running.
HUFFPOST_DATA_DIR = Path("/Users/robinholzinger/robin/dev/eth/modyn-2/.data/datasets/huffpost_kaggle/")
HUFFPOST_RAW_ZIP = HUFFPOST_DATA_DIR / "raw" / "news-category-dataset.zip"

huffpost_dataset = HuffpostKaggleDataGenerator(HUFFPOST_DATA_DIR, HUFFPOST_RAW_ZIP)
# huffpost_dataset.extract_data(HUFFPOST_RAW_ZIP)
hp_df = huffpost_dataset.load_into_dataframe(keep_true_category=True)

In [None]:
# Show the loaded frame as cell output.
hp_df

In [None]:
# List the distinct values of the "category" column.
hp_df["category"].unique()

In [None]:
# Add a monthly period column ("M" frequency) derived from each row's date (mutates hp_df in place).
hp_df["month"] = hp_df["date"].dt.to_period("M")

# x = build_histogram(
#     hp_df,
#     x="month",
# )

In [None]:
# Number of samples over time.
# Adds a "year" column to hp_df in place; the polished plot below counts samples per year.
hp_df["year"] = hp_df["date"].dt.year

if interactive:
    # Call .show() explicitly: an expression nested inside an `if` body is not
    # auto-displayed by the notebook, so the figure would otherwise be discarded.
    px.histogram(hp_df, x="date").show()
else:
    # polished
    fig1 = build_countplot(
        hp_df,
        x="year",
        x_ticks=list(range(2012, 2022, 2)),  # 2012, 2014, ..., 2020
        y_ticks_bins=3,
        height_factor=0.4,
        width_factor=1.0,
        x_label="Sample Time",
        y_label="Num. Samples",
        palette_strip=None,
    )

    save_plot(fig1, "huffpost_kaggle_samples_over_time")

In [None]:
# For every category, count in how many distinct calendar years it occurs.
# Built as one non-mutating chain: assigning a column onto the column subset
# `hp_df[["category", "date"]]` would raise pandas' SettingWithCopyWarning.
category_and_years = (
    hp_df[["category", "date"]]
    .assign(year=lambda frame: frame["date"].dt.year)
    .loc[:, ["category", "year"]]
    .drop_duplicates()  # one row per (category, year) pair
    .groupby("category")
    .size()
    .reset_index(name="num_years")
)

# Display only: categories present in more than 9 distinct years.
# category_and_years itself still contains every category.
category_and_years[category_and_years["num_years"] > 9]

In [None]:
# Join the per-category table onto the samples on "category" (merge's default inner join),
# which attaches its num_years column to every sample.
hp_df_reduced = hp_df.merge(category_and_years, on="category")

In [None]:
# Cut at 2018.
# .copy() turns each half into an independent frame: both halves are boolean-mask
# selections of hp_df_reduced, and adding columns to / sorting such a selection in
# place later on would raise pandas' SettingWithCopyWarning.
hp_df_reduced_before_2018 = hp_df_reduced[hp_df_reduced["date"] < "2018-01-01"].copy()
hp_df_reduced_after_2018 = hp_df_reduced[hp_df_reduced["date"] >= "2018-01-01"].copy()

# Per-period category frequencies, most frequent category first.
category_counts_before_2018 = (
    hp_df_reduced_before_2018["category"].value_counts().reset_index().sort_values("count", ascending=False)
)
category_counts_after_2018 = (
    hp_df_reduced_after_2018["category"].value_counts().reset_index().sort_values("count", ascending=False)
)

In [None]:
def find_category_ratios(df: pd.DataFrame) -> pd.DataFrame:
    """Count the samples per category and their share of all rows in ``df``.

    Args:
        df: Frame with a ``category`` column; one row per sample.

    Returns:
        A frame with the columns ``category``, ``count`` and ``ratio``
        (``count`` divided by the number of rows in ``df``), sorted by
        ``count`` in descending order.
    """
    total_samples = df.shape[0]
    # Name both columns explicitly. The labels produced by a bare
    # ``value_counts().reset_index()`` changed in pandas 2.0, so relying on them
    # makes the "count" lookups below fail with a KeyError on older versions.
    category_counts = (
        df["category"]
        .value_counts()
        .rename_axis("category")
        .reset_index(name="count")
        .sort_values("count", ascending=False)
    )
    category_counts["ratio"] = category_counts["count"] / total_samples
    return category_counts

In [None]:
# Category counts and ratios for the whole dataset and for each side of the 2018 cut.
category_counts = find_category_ratios(hp_df_reduced)
category_counts_before_2018 = find_category_ratios(hp_df_reduced_before_2018)
category_counts_after_2018 = find_category_ratios(hp_df_reduced_after_2018)

# Left-join the two period tables onto the overall table. The overall columns keep their
# names, the period columns get a "_before_2018" / "_after_2018" suffix; a category that
# is missing from a period ends up with NaN in that period's columns.
category_counts = category_counts.merge(
    category_counts_before_2018,
    how="left",
    on="category",
    suffixes=("", "_before_2018"),
)
category_counts = category_counts.merge(
    category_counts_after_2018,
    how="left",
    on="category",
    suffixes=("", "_after_2018"),
)
category_counts.head()

In [None]:
# Category names ordered from the most to the least frequent one in the whole dataset.
sorted_categories = category_counts.sort_values("count", ascending=False)["category"]

# Tag every sample with an ordered categorical built from that ranking and sort the
# samples by it in descending order, i.e. samples of the least frequent categories first.
hp_df_reduced = hp_df_reduced.assign(
    sort_idx=pd.Categorical(hp_df_reduced["category"], categories=sorted_categories, ordered=True)
).sort_values("sort_idx", ascending=False)

In [None]:
# # we want to find out the ratio of the dataset (all, <2018, >=2018) that we cover when only
# # showing the top 12 categories from <2018 and the top 4 from >=2018 (some might overlap)
# top_12_before_2018 = category_counts_before_2018.head(12)
# df_top_12_before_2018 = hp_df_reduced_before_2018[hp_df_reduced_before_2018["category"].isin(top_12_before_2018["category"])]

# top_4_after_2018 = category_counts_after_2018.head(4)
# df_top_4_after_2018 = hp_df_reduced_after_2018[hp_df_reduced_after_2018["category"].isin(top_4_after_2018["category"])]

# percentage_before_2018 = df_top_12_before_2018.shape[0] / hp_df_reduced_before_2018.shape[0]
# percentage_after_2018 = df_top_4_after_2018.shape[0] / hp_df_reduced_after_2018.shape[0]
# percentage_total = (df_top_12_before_2018.shape[0] + df_top_4_after_2018.shape[0]) / hp_df_reduced.shape[0]

# print(percentage_before_2018, percentage_after_2018, percentage_total)

In [None]:
# Export for thesis table
from analytics.plotting.common.save import save_csv_df

# Keep the 8 most frequent categories and express their ratio in percent, rounded to one decimal.
export_csv = (
    category_counts.head(8)
    .loc[:, ["category", "count", "ratio"]]
    .assign(ratio=lambda frame: frame["ratio"].apply(lambda share: round(share * 100, 1)))
)
print(export_csv)

save_csv_df(export_csv, "hp_kaggle_category_ratios")

In [None]:
# Cumulative dataset coverage (in percent) when taking the first k rows of category_counts.
plotting_threshold = pd.concat(
    [
        # leading row: zero categories cover 0 % of the dataset
        pd.DataFrame({"index": [0], "ratio": [0]}),
        # row labels of category_counts shifted by one, paired with that row's ratio
        category_counts.reset_index()[["index", "ratio"]].assign(index=lambda frame: frame["index"] + 1),
    ]
).assign(ratio=lambda frame: frame["ratio"].cumsum() * 100)
plotting_threshold.head(n=10)

In [None]:
# Plot coverage of categories: the cumulative "ratio" column (labelled "% of Dataset")
# against the "index" column (labelled "Categories") of plotting_threshold.
label_hist = build_cum_barplot(
    plotting_threshold,
    x="index",
    y="ratio",
    x_label="Categories",
    y_label="% of Dataset",
    height_factor=0.4,
    width_factor=0.4,
    y_ticks_bins=3,
    x_ticks_bins=4,
)
# Persist the figure under the given name via the project's save helper.
save_plot(label_hist, "huffpost_kaggle_category_coverage")

In [None]:
# Absolute number of samples over time, split by category.
if interactive:
    # Overview with all categories in one histogram. It has to be shown explicitly:
    # an expression inside an `if` body is not auto-displayed by the notebook, so the
    # figure was previously built and thrown away.
    px.histogram(hp_df_reduced, x="date", color="category").show()

    # One facet per category, four facets per row, ordered like category_counts.
    fig = px.histogram(
        hp_df_reduced,
        x="date",
        color="category",
        facet_col="category",
        facet_col_wrap=4,
        height=2000,
        facet_row_spacing=0.05,
        category_orders={"category": category_counts["category"].tolist()},
        color_discrete_sequence=px.colors.qualitative.Safe,
    )
    # Independent y axes per facet, with tick labels on every facet.
    fig.update_yaxes(matches=None, showticklabels=True)
    fig.update_xaxes(showticklabels=True)
    fig.show()
else:
    fig_all = build_histogram_multicategory_facets(
        hp_df_reduced,
        x="date",
        label="category",
        sorted_categories=sorted_categories,
        height_factor=2.25,
        width_factor=1.5,
        # legend_labels=list(merged),
        x_label="Sample Time",
        y_label="Number of Samples",
        x_ticks=[pd.to_datetime(d) for d in ["2014-05-01", "2018-06-01"]],
        sharey=False,
    )

    save_plot(fig_all, "huffpost_kaggle_label_distribution_over_time")

In [None]:
# Relative label distribution over time for the whole dataset.
if interactive:
    fig = px.histogram(
        hp_df_reduced,
        x="date",
        color="category",
        height=500,
        barnorm="percent",
        category_orders={"category": category_counts["category"].tolist()},
        color_discrete_sequence=px.colors.qualitative.Safe,
    )
    fig.update_yaxes(matches=None, showticklabels=True)
    fig.update_xaxes(showticklabels=True)
    fig.show()
else:
    # Legend: alternately take the next most frequent label from before 2018 and from
    # 2018 onwards (before, after, before, ...) until 8 distinct labels are collected.
    before_labels = deque(category_counts_before_2018["category"].tolist())
    after_labels = deque(category_counts_after_2018["category"].tolist())

    # A list (rather than a set) keeps the legend order reproducible between runs.
    # The loop also stops once both queues are exhausted, so a dataset with fewer than
    # 8 distinct categories cannot cause an endless loop.
    merged = []
    while len(merged) < 8 and (before_labels or after_labels):
        for label_queue in (before_labels, after_labels):
            if label_queue and len(merged) < 8:
                candidate = label_queue.popleft()
                if candidate not in merged:
                    merged.append(candidate)

    fig_labels_distribution = build_histogram_multicategory_barnorm(
        hp_df_reduced,
        x="date",
        label="category",
        sorted_coloring_categories=sorted_categories,
        height_factor=0.55,
        width_factor=1.0,
        legend_labels=merged,
        x_label="Sample Time",
        y_label="Label Distribution",
        y_ticks=[1.0, 0.75, 0.5, 0.25, 0.0],
        y_ticks_bins=4,
        x_ticks=[pd.to_datetime(d) for d in ["2014-05-01", "2015-07-01", "2018-06-01", "2021-01-01"]],
        legend_title="Article Category",
    )

    save_plot(fig_labels_distribution, "huffpost_kaggle_label_distribution_over_time_relative")

In [None]:
# Relative label distribution over time, separately for the periods before 2018 and from 2018 onwards.
if interactive:
    # Same percent-normalised histogram for both periods, each ordered by that period's
    # own category frequencies.
    for period_samples, period_counts in (
        (hp_df_reduced_before_2018, category_counts_before_2018),
        (hp_df_reduced_after_2018, category_counts_after_2018),
    ):
        fig = px.histogram(
            period_samples,
            x="date",
            color="category",
            height=500,
            barnorm="percent",
            category_orders={"category": period_counts["category"].tolist()},
            # color palette
            color_discrete_sequence=px.colors.qualitative.Safe,
        )
        fig.update_yaxes(matches=None, showticklabels=True)
        fig.update_xaxes(showticklabels=True)
        fig.show()

else:
    # -------------------------------------------------- Before 2018 ------------------------------------------------- #
    categories_before_2018 = category_counts_before_2018["category"].tolist()

    # NOTE(review): this tags and sorts the per-category counts table, whereas the
    # "After 2018" section below tags and sorts the samples frame. Sorting the counts
    # table by its own current order leaves the row order unchanged - confirm whether
    # hp_df_reduced_before_2018 was meant here. Behaviour is kept as it was.
    category_counts_before_2018 = category_counts_before_2018.assign(
        sort_idx=pd.Categorical(categories_before_2018, categories=categories_before_2018, ordered=True)
    ).sort_values("sort_idx", ascending=True)
    categories_before_2018 = category_counts_before_2018["category"].tolist()

    fig_before_2018 = build_histogram_multicategory_barnorm(
        hp_df_reduced_before_2018,
        x="date",
        label="category",
        sorted_coloring_categories=sorted_categories,
        sorted_ordering_categories=categories_before_2018,
        height_factor=0.55,
        width_factor=1.0,
        legend_labels=categories_before_2018[:8],  # 8 most frequent categories of this period
        x_label="Sample Time",
        y_label="Label Distribution",
        y_ticks=[1.0, 0.75, 0.5, 0.25, 0.0],
        legend_title="Article Category",
        nbins=60,
    )
    save_plot(fig_before_2018, "huffpost_kaggle_label_distribution_over_time_relative_before_2018")

    # -------------------------------------------------- After 2018 -------------------------------------------------- #
    categories_after_2018 = category_counts_after_2018["category"].tolist()

    # Build a new, sorted frame instead of adding the column to / sorting the sliced
    # frame in place: in-place mutation of a slice raises pandas' SettingWithCopyWarning.
    hp_df_reduced_after_2018 = hp_df_reduced_after_2018.assign(
        sort_idx=pd.Categorical(
            hp_df_reduced_after_2018["category"], categories=categories_after_2018, ordered=True
        )
    ).sort_values("sort_idx", ascending=True)

    # we want the legend to have different sorting
    fig_after_2018 = build_histogram_multicategory_barnorm(
        hp_df_reduced_after_2018,
        x="date",
        label="category",
        sorted_coloring_categories=sorted_categories,
        sorted_ordering_categories=categories_after_2018,
        height_factor=0.55,
        width_factor=1.0,
        legend_labels=categories_after_2018[:8],  # 8 most frequent categories of this period
        x_ticks=[pd.to_datetime(d) for d in ["2019-01-01", "2020-01-01", "2021-01-01", "2022-01-01"]],
        x_label="Sample Time",
        y_label="Label Distribution",
        y_ticks=[1.0, 0.75, 0.5, 0.25, 0.0],
        legend_title="Article Category",
        nbins=60,
    )
    save_plot(fig_after_2018, "huffpost_kaggle_label_distribution_over_time_relative_after_2018")