In [53]:
# imports
import numpy as np
import pandas as pd
from pathlib import Path
import json
import altair as alt

In [54]:
# paths and candidate tokenizer models
RESULTS = Path("results")
DATA = Path("../../data")

# Hugging Face style "<organisation>/<model>" identifiers; the part after the
# slash is what the per-model result files are named after.
TOK_MODELS = [
    "google-bert/bert-base-multilingual-cased",
    "FacebookAI/xlm-roberta-large",
    "openai-community/gpt2",
    "intfloat/multilingual-e5-large",
    "microsoft/Phi-3-medium-128k-instruct",
]

In [55]:
# choose which tokenizer to evaluate: index into TOK_MODELS, keeping only the
# model name after the organisation prefix
model = TOK_MODELS[4].rsplit('/', 1)[-1]

In [56]:
# load grambank info
gb_df = pd.read_csv(DATA.parent / "grambank/cldf/languages.csv")

# Uniqueness has to be checked on the raw column: once the values are in a
# set, comparing its length with set(...) of itself can never fail.
assert gb_df["Glottocode"].is_unique, "duplicate Glottocode entries in Grambank languages.csv"

# set of all Glottocodes listed in the Grambank language table
gltc = set(gb_df["Glottocode"].tolist())

In [57]:
# retrieve results for the selected model, keeping only rows whose glottocode
# is present in the Grambank set
res_df = pd.read_csv(RESULTS / f'avg_subwords_{model}.csv', sep=';')
in_grambank = res_df['glottocode'].isin(gltc)
res_df = res_df[in_grambank]

# glottocode -> avg_subwords value (as a plain Python float)
glot2score = dict(
    zip(
        res_df["glottocode"],
        res_df["avg_subwords"].astype(float).tolist(),
    )
)

In [58]:
# make langs_tok: one glottocode per line, in the order they appear in res_df
with open('data/langs_tok.txt', 'w') as langs_file:
    langs_file.writelines(code + "\n" for code in res_df["glottocode"])

In [59]:
# calculate distances with adapted frame
# Invokes the project's compute_all_distances.py, passing the language list
# data/langs_tok.txt (written by the cell above) via -l. The files given to -o and -d
# (data/gb_lang_dists-bnrc75-tok.csv and data/gb_processed_tok.csv) are the same
# paths that the experiment.py call in the next cell receives.
# NOTE(review): the meaning of -b -n -r -c 0.75 is defined inside the script, not here;
# the "bnrc75" part of the output filename presumably mirrors these flags - confirm.
!python ../../data/compute_all_distances.py -b -n -r -c 0.75 \
                             -g ../../data/gb_lang_feat_vals.csv \
                             -o data/gb_lang_dists-bnrc75-tok.csv \
                             -l data/langs_tok.txt \
                             -d data/gb_processed_tok.csv

In [60]:
# Run the project's evaluation script on the distance file and processed feature file
# produced by the previous cell. The --results_path file
# (../../evaluation/results/tokenizer-eval.csv) is what the next cell loads.
# NOTE(review): -s 20 / -e 20 / -st 1 presumably set start, end and step of the sample
# size k (the loaded results only contain k = 20), and -rand_runs 1 presumably limits the
# random baselines to a single run - confirm against experiment.py.
!python ../../evaluation/experiment.py \
    --results_path ../../evaluation/results/tokenizer-eval.csv \
    --dist_path data/gb_lang_dists-bnrc75-tok.csv \
    -gb_path ../../grambank/cldf/languages.csv \
    -wals_path ../../data/wals_dedup.csv \
    -gb_features_path data/gb_processed_tok.csv \
    -counts_path ../../data/convenience/convenience_counts.json \
    -rand_runs 1 \
    -s 20 \
    -e 20 \
    -st 1

Processing: 100%|█████████████████████████████████| 6/6 [00:00<00:00,  6.48it/s]


In [61]:
# load the evaluation results from the path given to experiment.py as --results_path
eval_csv = "../../evaluation/results/tokenizer-eval.csv"
df = pd.read_csv(eval_csv)
df

Unnamed: 0,method,run,entropy_with_missing,entropy_without_missing,fvi,mpd,fvo,k,sample
0,convenience,0,0.714888,0.600079,0.925373,0.711041,0.709697,20,"czec1258,dani1285,dutc1256,fili1244,finn1318,h..."
1,mdp,0,1.081476,0.823408,0.987562,0.861946,0.577802,20,"auuu1241,bali1278,barg1252,bemb1257,brib1243,c..."
2,mmdp,0,1.044921,0.792702,0.982587,0.842348,0.604404,20,"akha1245,apur1254,bali1278,bemb1257,cent2292,c..."
3,random,0,1.012428,0.691058,0.960199,0.779386,0.648249,20,"aona1235,bord1248,cent1992,cent2084,chot1239,d..."
4,random_family,0,1.003857,0.707236,0.960199,0.781943,0.643261,20,"acol1236,amap1240,bian1252,cent2127,cher1273,c..."
5,random_genus,0,0.977653,0.689049,0.942786,0.780111,0.652832,20,"acha1249,apin1244,bata1289,bugi1244,busa1253,c..."


In [62]:
# rename columns and relabel the sampling methods for display
METHOD_LABELS = {
    "mmdp": "MaxMin",
    "mdp": "MaxSum",
    "random_genus": "RandomGenus*",
    "random_family": "RandomFamily*",
    "convenience": "Convenience",
    "random": "Random*",
}

df = df.rename(columns={"method": "Method"})
# Fall back to the existing value when there is no mapping: a plain
# .map(METHOD_LABELS) turns every already-relabelled value into NaN, so
# re-running this cell without reloading df used to wipe the Method column.
df["Method"] = df["Method"].map(lambda name: METHOD_LABELS.get(name, name))

# look up the per-language score of every glottocode in each sample
df["sample_split"] = df["sample"].str.split(",")
df["scores"] = df["sample_split"].apply(
    lambda codes: [glot2score[code] for code in codes]
)

# select one run (we compare deterministic methods only), k = 20:
# MaxMin / MaxSum rows are always kept, every other method only for run 0
selected = (df["Method"].isin(["MaxMin", "MaxSum"]) | (df["run"] == 0)) & (df["k"] == 20)

# one row per (sample, language score); explode leaves an object column, so
# cast back to float to give the plot a numeric column
plot_df = df[selected].explode("scores")
plot_df["scores"] = plot_df["scores"].astype(float)

plot_df

Unnamed: 0,Method,run,entropy_with_missing,entropy_without_missing,fvi,mpd,fvo,k,sample,sample_split,scores
0,Convenience,0,0.714888,0.600079,0.925373,0.711041,0.709697,20,"czec1258,dani1285,dutc1256,fili1244,finn1318,h...","[czec1258, dani1285, dutc1256, fili1244, finn1...",45.534
0,Convenience,0,0.714888,0.600079,0.925373,0.711041,0.709697,20,"czec1258,dani1285,dutc1256,fili1244,finn1318,h...","[czec1258, dani1285, dutc1256, fili1244, finn1...",41.921
0,Convenience,0,0.714888,0.600079,0.925373,0.711041,0.709697,20,"czec1258,dani1285,dutc1256,fili1244,finn1318,h...","[czec1258, dani1285, dutc1256, fili1244, finn1...",42.692
0,Convenience,0,0.714888,0.600079,0.925373,0.711041,0.709697,20,"czec1258,dani1285,dutc1256,fili1244,finn1318,h...","[czec1258, dani1285, dutc1256, fili1244, finn1...",58.303
0,Convenience,0,0.714888,0.600079,0.925373,0.711041,0.709697,20,"czec1258,dani1285,dutc1256,fili1244,finn1318,h...","[czec1258, dani1285, dutc1256, fili1244, finn1...",53.127
...,...,...,...,...,...,...,...,...,...,...,...
5,RandomGenus*,0,0.977653,0.689049,0.942786,0.780111,0.652832,20,"acha1249,apin1244,bata1289,bugi1244,busa1253,c...","[acha1249, apin1244, bata1289, bugi1244, busa1...",55.292
5,RandomGenus*,0,0.977653,0.689049,0.942786,0.780111,0.652832,20,"acha1249,apin1244,bata1289,bugi1244,busa1253,c...","[acha1249, apin1244, bata1289, bugi1244, busa1...",86.593
5,RandomGenus*,0,0.977653,0.689049,0.942786,0.780111,0.652832,20,"acha1249,apin1244,bata1289,bugi1244,busa1253,c...","[acha1249, apin1244, bata1289, bugi1244, busa1...",69.527
5,RandomGenus*,0,0.977653,0.689049,0.942786,0.780111,0.652832,20,"acha1249,apin1244,bata1289,bugi1244,busa1253,c...","[acha1249, apin1244, bata1289, bugi1244, busa1...",67.681


In [63]:
# add 'entire frame' to df: one 'All' row per language in res_df, so the whole
# frame can be shown next to the individual samples
avg_subwords = res_df['avg_subwords']
entire_frame_rows = [{'Method': 'All', 'scores': x} for x in avg_subwords]

# keep only the methods that are compared against the entire frame
plot_df = plot_df.loc[plot_df['Method'].isin(["Convenience", "MaxMin", "MaxSum"])]

# pd.concat is the public replacement for DataFrame.append (removed in
# pandas 2.0); the private DataFrame._append should not be relied on.
plot_df = pd.concat([plot_df, pd.DataFrame(entire_frame_rows)])

In [64]:
# plot relevant comparison: one box of scores per method
colors = ['grey','chartreuse','steelblue',"#7D3C98"]

# y axis: scores, untitled, pinned to a fixed 0-350 range
score_axis = alt.Y("scores", title="").scale(domain=[0,350])
# colour by method without a legend (the x axis already names the methods)
# NOTE(review): colours are presumably paired with methods in the scale's
# default category order - confirm if the set of methods changes
method_color = alt.Color("Method", legend=None).scale(range=colors)

boxes = alt.Chart(plot_df).mark_boxplot()
plot = boxes.encode(
    y=score_axis,
    x="Method",
    color=method_color,
).properties(title=model)
plot

In [65]:
# export the chart as a PDF named after the selected tokenizer model
pdf_path = f"plots/{model}.pdf"
plot.save(pdf_path)