In [None]:
import pandas as pd
import altair as alt
import numpy as np
from pathlib import Path

# Experimental setup behind the results loaded below:
#   frame     = all languages in GB
#   k         = sample sizes from 5 to 195, in steps of 5
#   distances = all languages in GB
EVAL_NAME = "tokenizer"  # or 'intrinsic'

# Alternative input file: f"experiments/{EVAL_NAME}-eval.csv"
results_csv = f"experiments/{EVAL_NAME}-eval-equal-lang.csv"

# Raw method identifiers in the CSV -> labels shown in the figures.
# Series.map turns any identifier that is not listed here into NaN.
method_labels = {
    "mmdp": "MaxMin",
    "mdp": "MaxSum",
    "random_genus": "RandomGenus*",
    "random_family": "RandomFamily*",
    "convenience": "Convenience*",
    "random": "Random*",
}

# The column is renamed so that the legend title reads "Method".
df = pd.read_csv(results_csv).rename(columns={"method": "Method"})
df["Method"] = df["Method"].map(method_labels)

# Methods that get error bars in the metric plots and their own panels in the
# variance plots. The order matters: later cells iterate over this list.
rand_methods = ["Convenience*", "RandomFamily*", "RandomGenus*", "Random*"]

# Order of the legend entries. COLORS is paired with this list by position.
legend_order = ["MaxMin", "MaxSum"] + [
    "Convenience*",
    "Random*",
    "RandomFamily*",
    "RandomGenus*",
]

OPACITY = 0.7  # mark opacity used by every chart
COLORS = ["steelblue", "#7D3C98", "chartreuse", "#F4D03F", "red", "#D35400"]

# Metric column name -> y-axis title
Y_LABELS = dict(
    entropy_with_missing="Entropy (H)",
    entropy_without_missing="Entropy (H)",
    fvi="FVI",
    fvo="FVO",
    mpd="MPD",
)

# Metric columns that are plotted, one chart each.
# "entropy_with_missing" has a label above but is currently not plotted.
METRICS = ["entropy_without_missing", "fvi", "fvo", "mpd"]

def _sample_size_axis():
    """Return a fresh x encoding for the sample size column `k`."""
    return alt.X("k", title="Sample size")


def _method_color():
    """Return a fresh colour encoding for `Method`.

    Uses the fixed COLORS palette in legend_order and a horizontal legend that
    is positioned manually through legendX / legendY.
    """
    method_legend = alt.Legend(
        orient="none",
        legendX=130,
        legendY=-40,
        direction="horizontal",
        titleAnchor="middle",
    )
    return alt.Color("Method", legend=method_legend, sort=legend_order).scale(
        range=COLORS
    )


# One layered chart per metric: error bars for the methods in rand_methods,
# with the per-k mean of every method drawn on top as points.
plots = []
for metric in METRICS:
    y_title = Y_LABELS[metric]
    random_rows = df[df["Method"].isin(rand_methods)]

    # Standard-deviation error bars
    # (alternative mark: .mark_boxplot(extent="min-max")).
    err_bars = (
        alt.Chart(random_rows)
        .mark_errorbar(extent="stdev", opacity=OPACITY)
        .encode(
            x=_sample_size_axis(),
            y=alt.Y(metric, title=y_title),
            color=_method_color(),
        )
    )

    # Mean of the metric at each sample size, for all methods in df.
    points = (
        alt.Chart(df)
        .mark_point(filled=True, opacity=OPACITY)
        .encode(
            x=_sample_size_axis(),
            y=alt.Y(f"mean({metric})", title=y_title),
            color=_method_color(),
        )
    )

    plots.append(err_bars + points)

In [None]:
# Arrange the metric charts in a 2x2 grid.
#
# The charts are read by index instead of plots.pop(): pop() empties `plots`,
# so running this cell a second time raised "IndexError: pop from empty list"
# unless the chart-building cell was re-run first. Negative indices reproduce
# the layout pop() gave (charts taken from the end of the list) while leaving
# `plots` untouched.
top = plots[-1] | plots[-2]
bottom = plots[-3] | plots[-4]

combined = alt.vconcat(top, bottom)

In [None]:
# Write the current `combined` chart to disk; the output format follows the
# file extension.
metric_plots_path = f"plots/{EVAL_NAME}-eval-plots-equal-lang.pdf"
combined.save(metric_plots_path)

In [None]:
# the remainder of this notebook is just for the tokenizer results

# Frame file: one entry per line, matched against the `glottocode` column below.
# splitlines() plus the blank filter avoids the empty-string entry that
# split('\n') leaves behind for a trailing newline; strip() guards against
# stray whitespace around an entry.
frame_text = Path('../data/frames/equal_lang_frame.txt').read_text()
frame = [entry.strip() for entry in frame_text.splitlines() if entry.strip()]

# Alternative input file: "../data/avg_subwords_mBERT.csv"
tok_df = pd.read_csv("../data/avg_subwords_XLM-R-base.csv", sep=";")

# Keep only the rows whose glottocode is listed in the frame.
tok_df = tok_df[tok_df['glottocode'].isin(frame)]

# glottocode -> avg_subwords as a float. Built column-wise rather than with
# iterrows(); if a glottocode occurs more than once the last row wins, as before.
glot2score = dict(zip(tok_df["glottocode"], tok_df["avg_subwords"].astype(float)))

In [None]:
def calc_var(sample):
    """Return the variance (np.var defaults) of the glot2score values of `sample`.

    `sample` is an iterable of keys of the module-level `glot2score` mapping;
    a key that is missing from the mapping raises KeyError.
    """
    sample_scores = [glot2score[code] for code in sample]
    return np.var(sample_scores)

# Split the comma-separated `sample` string of every row into a list of codes.
sample_lists = df['sample'].str.split(',')
df['sample_split'] = sample_lists

# Per row: the glot2score value of every code in the sample ...
df['scores'] = sample_lists.apply(lambda codes: [glot2score[code] for code in codes])
# ... and the variance of those values.
df['variance'] = sample_lists.apply(calc_var)

In [None]:
# Spread of the per-sample variance, one box-plot panel per method in
# rand_methods. Every panel uses the same y-domain (min/max of the variance
# column over all of df) so the panels can be compared directly.
domain = [df['variance'].min(), df['variance'].max()]
tok_plots = []

for method in rand_methods:
    # The palette is paired with legend_order by position everywhere else in
    # this notebook. Indexing COLORS by the position in rand_methods gave
    # three of the four methods a different colour than in the other figures,
    # so the colour is looked up through legend_order instead.
    method_color = COLORS[legend_order.index(method)]
    box_plot = (
        alt.Chart(df[(df['Method'] == method)], title=method)
        .mark_boxplot(extent="min-max", opacity=OPACITY, color=method_color)
        .encode(
            x=alt.X("k", title="Sample size"),
            y=alt.Y("variance", title="Variance").scale(domain=domain),
        )
    )
    tok_plots.append(box_plot)

# 2x2 grid. pop() takes charts from the end of the list, so the panels appear
# in reverse rand_methods order (last method top-left).
top = tok_plots.pop() | tok_plots.pop()
bottom = tok_plots.pop() | tok_plots.pop()

combined = alt.vconcat(top, bottom)
combined

In [None]:
# Keep every MaxMin / MaxSum row, plus the rows of any method whose run is 0,
# then unpack the `scores` lists into one row per score.
is_maxmin_or_maxsum = df["Method"].isin(["MaxMin", "MaxSum"])
is_run_zero = df["run"] == 0
plot_df = df[is_maxmin_or_maxsum | is_run_zero].explode("scores")

In [None]:
# Lift Altair's row limit before building charts from plot_df.
alt.data_transformers.disable_max_rows()


def _scores_boxplot(method, color):
    """Box plot of the `scores` column of one method against sample size `k`."""
    method_rows = plot_df[plot_df['Method'] == method]
    return (
        alt.Chart(method_rows, title=method)
        # Default whisker extent; alternative:
        #   .mark_boxplot(extent='min-max', opacity=OPACITY, color=color)
        .mark_boxplot(opacity=OPACITY, color=color)
        .encode(
            x=alt.X("k", title="Sample size").scale(domain=[0, 100]),
            # Optional shared y-domain: append .scale(domain=domain)
            y=alt.Y("scores", title="Average subwords"),
        )
    )


# One panel per method, coloured by its position in legend_order.
tok_plots = [
    _scores_boxplot(name, COLORS[idx]) for idx, name in enumerate(legend_order)
]

# 3x2 grid. pop() takes charts from the end of the list, so the panels appear
# in reverse legend_order (last method top-left).
top = tok_plots.pop() | tok_plots.pop()
mid = tok_plots.pop() | tok_plots.pop()
bottom = tok_plots.pop() | tok_plots.pop()

upper_rows = alt.vconcat(top, mid)
combined = alt.vconcat(upper_rows, bottom)
combined

In [None]:
# Single box plot of avg_subwords over every row of tok_df, i.e. the
# frame-filtered tokenizer table, as a reference for the per-method plots.
frame_chart = alt.Chart(tok_df, title="Entire Frame")
frame_y = alt.Y("avg_subwords", title="Average subwords")
entire_frame_plot = frame_chart.mark_boxplot(opacity=OPACITY, color="green").encode(
    y=frame_y
)
entire_frame_plot

In [None]:
# One box plot per method for a single configuration: k == 10 and run == 0.
box_plots = []
for idx, name in enumerate(legend_order):
    keep = (
        (plot_df["Method"] == name)
        & (plot_df["k"] == 10)
        & (plot_df["run"] == 0)
    )
    panel = alt.Chart(plot_df[keep], title=name).mark_boxplot(
        opacity=OPACITY, color=COLORS[idx]
    )
    # Fixed y-domain so that all method panels share the same scale.
    box_plots.append(
        panel.encode(y=alt.Y("scores", title=None).scale(domain=[0, 120]))
    )

# Put the whole-frame reference first, then append the method panels from the
# end of the list, so they appear in reverse legend_order.
combined = entire_frame_plot
while box_plots:
    combined = combined | box_plots.pop()
combined

In [None]:
# Write the current `combined` chart to disk; the output format follows the
# file extension.
k10_scores_path = f"plots/xlmr-{EVAL_NAME}-scores-for-k-10-equal-frame.pdf"
combined.save(k10_scores_path)