In [None]:
import os
from os.path import join, isfile
from typing import List, Dict, Any

import sys
sys.path.insert(0,'..')
from src.test_data import TEST_EXAMPLES


from ipywidgets import interact, Checkbox

from plots import get_models_in_output_dir
from plots import tokenize_hf
from plots import tokenize_sp
from plots import display
from plots import decode_hack

from plots import get_models_multilinguality
from plots import split_models_multilinguality
from plots import get_intersection
from plots import get_intersections

from plots import plot_histogram, compare_vocab, plot_overview, plot_timelines, plot_overview_data, plot_vocab_size

from plots import get_list_of_results
from plots import read_results
from plots import retrieve_bf_cc_from_results
from plots import retrieve_parameters_from_results

import numpy as np
import seaborn as sns
from itertools import product

In [None]:
%%javascript
// Replace the output area's scroll check with one that always answers "false".
// NOTE(review): presumably aimed at the classic Notebook front end, to keep long
// outputs from being placed in a scroll box - confirm; the `IPython` global may
// not exist in other front ends.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [None]:
# Root folder (relative path) containing one sub-folder per model;
# show_example_model looks for OUTPUT_DIR/<model>/tokenizer.json or model.model.
OUTPUT_DIR = "../output"

In [None]:
# Names handed to get_models_in_output_dir; each one is also used below as a key
# into its result (models[subdir]).
SUBDIRS = ["multilinguality", "p_w", "bf_cc", "min_frequency"]

# 0. Models

In [None]:
# Collect the available models. The result is indexed by sub-directory name here
# and by 'all' in later cells.
models = get_models_in_output_dir(SUBDIRS)
# Print every model, grouped in SUBDIRS order, as a quick inventory.
for subdir in SUBDIRS:
    for model in models[subdir]:
        print(model)

In [None]:
# STOP

# 1. Tokenization examples

In [None]:
def show_example_model(example: str,
                       model: str,
                       show_tokenization: bool = True,
                       verbose: bool = False) -> None:
    """Tokenize `example` with `model` and print/display the result.

    The tokenizer backend is picked from the files present in
    OUTPUT_DIR/<model>: `tokenizer.json` selects `tokenize_hf`, otherwise
    `model.model` selects `tokenize_sp`.

    Args:
        example: text to tokenize.
        model: name of the model folder below OUTPUT_DIR.
        show_tokenization: if True, render the tokens and the element-wise
            de-tokenization with `display`.
        verbose: if True, additionally print the raw fields of the
            tokenization result.

    Raises:
        FileNotFoundError: neither tokenizer file exists for `model`.
        AssertionError: the element-wise de-tokenization does not join up to
            the full de-tokenized string.
    """
    model_dir = join(OUTPUT_DIR, model)

    if isfile(join(model_dir, "tokenizer.json")):
        texample = tokenize_hf(model, example)
    elif isfile(join(model_dir, "model.model")):
        texample = tokenize_sp(model, example)
    else:
        # FileNotFoundError is still an Exception, so existing handlers keep working.
        raise FileNotFoundError(f"ERROR! could not find tokenizer for model = {model}.")

    # Sanity check: the per-token pieces must concatenate to the full decoding.
    assert "".join(texample['de-tokenized_elementwise']) == texample['de-tokenized'], "ERROR de-tokenized elementwise!"

    print(f"\n============ {model}")
    if verbose:
        print(f"example: '{example}'")
        print(f"\nencoded: {texample['encoded']}")
        print(f"\ntokenized: {texample['tokenized']} --- {len(texample['tokenized'])}")
        print(f"\nde-tokenized: '{texample['de-tokenized']}'")
        print(f"\nde-tokenized elementwise: {texample['de-tokenized_elementwise']}")
        print()

    if show_tokenization:
        print("\ntokenized:")
        display(texample['tokenized'])
        print("\nde-tokenized:")
        display(texample['de-tokenized_elementwise'],
                show_linebreak=True,
                equal_to_original=example == texample['de-tokenized'])
        # The "hack" variant is optional; only some tokenizer results carry it.
        if 'de-tokenized_elementwise_hack' in texample:
            print("\ndecoded + hack:")
            display(texample['de-tokenized_elementwise_hack'], show_linebreak=True)

In [None]:
@interact
def show_examples(example=TEST_EXAMPLES,
                  model=models['all'],
                  show_tokenization=True,
                  verbose=False):
    """Interactive front end for `show_example_model`.

    `interact` derives the controls from the defaults above: the example and
    model are chosen from TEST_EXAMPLES and models['all'], the two flags are
    booleans.
    """
    show_example_model(
        example=example,
        model=model,
        show_tokenization=show_tokenization,
        verbose=verbose,
    )

In [None]:
# NOTE(review): `STOP` is not defined anywhere in the visible notebook, so running
# this cell raises NameError and interrupts a "Run All" here. The other STOP cells
# are commented out - presumably this one is a deliberate halt; confirm.
STOP

# 2. Vocabulary

### 2a. Subword Length Histograms

In [None]:
@interact
def show_histogram(model_1=models['all'], model_2=[None] + models['all'], xlim=20, ylim=15000):
    """Interactive wrapper around `plot_histogram`.

    `model_1` is chosen from models['all']; `model_2` offers the same choices
    plus a leading None option. `xlim` and `ylim` are forwarded unchanged.
    """
    plot_histogram(model_1, model_2, xlim, ylim)

### 2b. Overlap

In [None]:
@interact
def show_compare_vocab(model_1=models['all'], model_2=models['all'], nr=30):
    """Compare the vocabularies of two models and print the outcome.

    Prints the first value returned by `compare_vocab`, followed by the first
    `nr` entries of each of the two "only in model N" sequences it returns.
    """
    # The same number is passed twice to compare_vocab; its meaning is defined
    # there (presumably a per-model cap - TODO confirm).
    cap = 1000000
    summary, only_in_1, only_in_2 = compare_vocab(model_1, model_2, cap, cap)

    print(summary)
    sections = (
        ("=== only model 1 ===", only_in_1),
        ("=== only model 2 ===", only_in_2),
    )
    for heading, examples in sections:
        print()
        print(heading)
        print(examples[:nr])

### 2c. Min Frequency: Vocabulary Size & Subword Length Mean

In [None]:
@interact
def show_vocab_size(model=models['min_frequency']):
    """Plot the vocabulary size for the selected min_frequency model, if any."""
    # Nothing selected (e.g. no min_frequency models available): draw nothing.
    if not model:
        return
    plot_vocab_size(model)

In [None]:
# STOP

# 3. Vocab Size & Multilinguality

In [None]:
# Derive the multilinguality view from the models found in the "multilinguality"
# sub-directory; the bare name on the last line shows the result as cell output.
models_multilinguality = get_models_multilinguality(models['multilinguality'], verbose=False)
models_multilinguality

In [None]:
# Split the multilinguality models; later cells read the keys "models_pure",
# "lang_all" and "lang_pure" from this result.
ml = split_models_multilinguality(models_multilinguality)
ml

In [None]:
# overview_corpus(models_multilinguality)

### 3a. Time

In [None]:
# Only plot when at least one multilinguality model was found.
if len(models_multilinguality):
    plot_overview_data(ml["models_pure"].values(), verbose=False)

In [None]:
# Only plot when at least one multilinguality model was found.
if len(models_multilinguality):
    plot_overview(ml["models_pure"].values(), verbose=False)

### 3b. Evaluation #1: Vocabulary Intersection

In [None]:
# Vocabulary sizes used for the intersection evaluation. vocabs_1 is later passed
# to plot_timelines, vocabs_2 supplies the vocab_size choices of show_evaluation_1;
# both currently alias the same list.
vocabs = [10000, 20000, 30000, 40000, 51200, 64000, 80000, 96000, 112000, 128000]
vocabs_1 = vocabs
vocabs_2 = vocabs

In [None]:
# Computed once up front; show_evaluation_1 indexes the result as
# timelines['abs' | 'rel'][tokenizer][vocab_size].
timelines = get_intersections(models_multilinguality, ml, vocabs_1, vocabs_2)

# Quick inspection of the container type and its top-level keys.
print(type(timelines))
print(list(timelines.keys()))

In [None]:
@interact
def show_evaluation_1(tokenizer=ml['lang_all'], vocab_size=vocabs_2, absolute=[False, True]):
    """Plot vocabulary-intersection timelines for one tokenizer / vocab size.

    Always plots the relative series; when `absolute` is truthy the absolute
    series is plotted as well. If no tokenizer is selected, only a short
    message is printed.
    """
    if tokenizer is None:
        print("> lang_all is []")
        return

    # Both series are looked up regardless of `absolute`, so a missing entry
    # surfaces as a KeyError in either mode.
    series_abs = timelines['abs'][tokenizer][vocab_size]
    series_rel = timelines['rel'][tokenizer][vocab_size]

    heading = "Coverage of single-language tokenizer vocabulary"
    if absolute:
        series = [series_rel, series_abs]
        limits = [1.1, 1.1*100000]
        labels = ["relative", "absolute"]
    else:
        series = [series_rel]
        limits = [1.1]
        labels = ["relative"]

    plot_timelines(
        vocabs_1,
        vocab_size,
        series,
        ml['lang_pure'],
        ylim=limits,
        ylabel=labels,
        title=[heading]*len(series),
    )

### 3c. Evaluation #2: unk_rate & closeness_to_character_level

In [None]:
def plot_evaluation_2(_unk_rate, _ctcl, _vocabs, _languages, _ymin, _ymax):
    """Draw unknown rate and closeness-to-character-level side by side.

    Args:
        _unk_rate: mapping language -> sequence of values, one per entry of `_vocabs`.
        _ctcl: mapping language -> sequence of values, one per entry of `_vocabs`.
        _vocabs: x values shared by both panels.
        _languages: languages to draw; each gets one series per panel.
        _ymin, _ymax: y-axis limits applied to both panels.

    Languages without an entry in the fixed palette below fall back to
    matplotlib's colour cycle instead of raising KeyError.
    """
    # Local import: matplotlib is not imported in the notebook's import cell.
    import matplotlib.pyplot as plt

    colors = {"da": "r", "en": "g", "is": "b", "no": "purple", "sv": "orange"}
    panels = [
        (_unk_rate, "unknown rate (lower = better)"),
        (_ctcl, "closeness to character level (lower = better)"),
    ]

    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    for language in _languages:
        # NOTE(review): linestyle=None selects matplotlib's default line style, so
        # the markers are still joined by a line; use "None" if markers only
        # were intended - confirm.
        style = {"linestyle": None, "marker": "s", "label": language}
        if language in colors:
            style["color"] = colors[language]
        for axis, (series, _title) in zip(axes, panels):
            axis.plot(_vocabs, series[language], **style)

    for axis, (_series, title) in zip(axes, panels):
        axis.set_xlim([0, 150000])
        axis.set_ylim([_ymin, _ymax])
        axis.set_xlabel("vocabulary size")
        axis.legend()
        axis.set_title(title)

In [None]:
@interact
def show_evaluation_2(result=get_list_of_results()):
    """Pick a results file, then interactively plot its evaluation metrics.

    The selected results are read once; a nested interactive control then lets
    the user choose `bf`, `cc` and the y-axis range, and plots "unk_rate" and
    "closeness_to_character_level" per language via `plot_evaluation_2`.
    """
    results = read_results(result)
    bf_choices, cc_choices = retrieve_bf_cc_from_results(results)

    @interact
    def show_evaluation_2_detail(bf=bf_choices, cc=cc_choices, ymin=0.0, ymax=1.0):
        vocabs, vocabs_models, _files, languages, languages_files = retrieve_parameters_from_results(
            bf, cc, results, verbose=False
        )

        # Keep only the result entries whose key mentions this bf/cc combination.
        tag = f"-bf{bf}-cc{cc}"
        selected = {name: entry for name, entry in results.items() if tag in name}

        def per_language(metric):
            # One list per language, ordered like `vocabs`.
            return {
                language: [
                    selected[vocabs_models[vocab]][languages_files[language]][metric]
                    for vocab in vocabs
                ]
                for language in languages
            }

        plot_evaluation_2(
            per_language("unk_rate"),
            per_language("closeness_to_character_level"),
            vocabs,
            languages,
            ymin,
            ymax,
        )