# Inspecting normalizations

Note: This notebook should only be used to generate predictions on git branches created from a DVC experiment. On the main/dev branch, this notebook is only updated when changes to the code are necessary; it is not used to generate predictions there.

## Prepare data and get normalizations

In [None]:
import os
import random

import datasets
import numpy as np
import pandas as pd
import tomli
import torch
import transformers
from IPython.display import HTML, display

from transnormer.evaluation.analysis import get_spans_of_unknown_tokens
from transnormer.models.train_model import tokenize_datasets
from transnormer.visualization.formatting import markup_spans

In [None]:
# Number of examples to generate predictions for.
# This many examples are sampled from each dataset split, and the same
# number of rows is shown per split in the result tables.
N = 20

In [None]:
# Load configs
# ROOT is the relative path that is prepended to the config, data and model paths
ROOT = "../../"
CONFIGFILE = os.path.join(ROOT, "training_config.toml")
# The TOML file is opened in binary mode and parsed into the CONFIGS dictionary
with open(CONFIGFILE, mode="rb") as fp:
    CONFIGS = tomli.load(fp)

# OR: Use custom configs (if so: uncomment the following)
# CONFIGS = {
#     "gpu": "cuda:0",
#     "random_seed": 42,
#     "tokenizer": {
#         "max_length_input": 128,
#         "max_length_output": 128,
#         "input_transliterator": "Transliterator1",
#     },
#     "language_models": {
#         "checkpoint_encoder": "prajjwal1/bert-tiny",
#         "checkpoint_decoder": "prajjwal1/bert-tiny",
#     },
#     "beam_search_decoding": {
#         "no_repeat_ngram_size": 3,
#         "early_stopping": True,
#         "length_penalty": 2.0,
#         "num_beams": 4,
#     },
# }

In [None]:
# Seed every random number generator in use (random, numpy, torch) for reproducibility
_seed = CONFIGS["random_seed"]
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(_seed)

# GPU set-up: use the configured GPU if CUDA is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device(CONFIGS["gpu"])
else:
    device = torch.device("cpu")

In [None]:
# Load data: one validation file per century, each becoming its own dataset split
data_files = {
    "1600to1699": os.path.join(ROOT, "data/interim/dtak-1600-1699/dtak-1600-1699-validation.jsonl"),
    "1700to1799": os.path.join(ROOT, "data/interim/dtak-1700-1799/dtak-1700-1799-validation.jsonl"),
    "1800to1899": os.path.join(ROOT, "data/interim/dtak-1800-1899/dtak-1800-1899-validation.jsonl"),
}
ds = datasets.load_dataset("json", data_files=data_files)

# Draw a random sample of (at most) N examples from every split.
# The seed is passed explicitly so that re-running this cell yields the same
# sample, independent of the current state of the global random generators.
# The sample size is capped so that splits with fewer than N examples do not
# raise an error in `select`.
for split_name in data_files:
    split = ds[split_name]
    sample_size = min(N, len(split))
    ds[split_name] = split.shuffle(seed=CONFIGS["random_seed"]).select(range(sample_size))

In [None]:
# Tokenize data

# In case we use locally saved models for the tokenizers
# the relative path must be completed. Uncomment the respective line.
# CONFIGS["language_models"]["checkpoint_encoder"] = os.path.join(ROOT, CONFIGS["language_models"]["checkpoint_encoder"])
# CONFIGS["language_models"]["checkpoint_decoder"] = os.path.join(ROOT, CONFIGS["language_models"]["checkpoint_decoder"])

# tokenize_datasets returns three objects: presumably the tokenized dataset
# (TODO confirm) and the tokenizers for the input and output side.
# The two tokenizers are reused below for generation, decoding and token markup.
prepared_dataset, tokenizer_input, tokenizer_output = tokenize_datasets(ds, CONFIGS)

In [None]:
# Load model
# The checkpoint path is built relative to ROOT
checkpoint = os.path.join(ROOT, "models/model/model_final") # TODO
# Load the encoder-decoder model from the checkpoint and move it to the selected device
model = transformers.EncoderDecoderModel.from_pretrained(checkpoint).to(device)

In [None]:
# Generate normalizations
# TODO: Do we have to include a configuration for beam search decoding here?

# Take the maximum input length from the configs (the same configs that are
# passed to tokenize_datasets) instead of hard-coding it; fall back to the
# previously hard-coded value of 128 if the configs do not define it
MAX_LENGTH_INPUT = CONFIGS.get("tokenizer", {}).get("max_length_input", 128)


def generate_normalization(batch):
    """Add model-generated normalizations to a batch of examples.

    Args:
        batch: Batch of examples as passed by ``map`` with ``batched=True``.
            The original texts are read from ``batch["orig"]``.

    Returns:
        The same batch with an additional ``"norm_pred_str"`` entry that
        holds the decoded predictions (special tokens removed).
    """
    inputs = tokenizer_input(
        batch["orig"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH_INPUT,
        return_tensors="pt",
    )
    # Inputs must live on the same device as the model
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    outputs = model.generate(input_ids, attention_mask=attention_mask)
    output_str = tokenizer_output.batch_decode(outputs, skip_special_tokens=True)

    batch["norm_pred_str"] = output_str

    return batch


ds = ds.map(
    generate_normalization,
    batched=True,
    batch_size=8,
    load_from_cache_file=False,
)


### Apply visual modifications 



1. Mark up unknown tokens in red
2. Add token-separator ("|") in input text
3. Add token-separator ("|") in output text

In [None]:
# Wrap tokens that are unknown to the input tokenizer in red HTML markup
def markup_unknown_tokens(batch):
    """Store a marked-up copy of the original text in ``batch["orig_marked_up"]``.

    The spans returned by ``get_spans_of_unknown_tokens`` for ``batch["orig"]``
    are wrapped in a red ``<span>`` tag by ``markup_spans``.
    """
    original_text = batch["orig"]
    unknown_spans = get_spans_of_unknown_tokens(original_text, tokenizer_input)
    batch["orig_marked_up"] = markup_spans(
        original_text,
        unknown_spans,
        opening_tag="<span style='color:#FF0000'>",
    )
    return batch


ds = ds.map(markup_unknown_tokens, batched=False)

In [None]:
# Add token-separator
def separate_tokens(batch, column, tokenizer):
    """Insert a visible separator after every token of ``batch[column]``.

    Args:
        batch: A single example (``map`` is called with ``batched=False``).
        column: Name of the text column to tokenize.
        tokenizer: Tokenizer used to normalize and tokenize the text.

    Returns:
        The example with an additional ``f"{column}_xlit_tok"`` entry: the
        normalized text with an orange "|" inserted after each token.
    """
    # We have to do the normalization explicitly before getting the encoding
    # to avoid mismatches in case the normalization changes the string length, e.g. "æ -> ae"
    normalizer = tokenizer.backend_tokenizer.normalizer
    # A tokenizer without a normalization step has no normalizer object (None);
    # in that case the text is used unchanged
    norm_str = batch[column] if normalizer is None else normalizer.normalize_str(batch[column])
    encoding = tokenizer(norm_str, add_special_tokens=False)
    spans = [
        # map a token index to a pair of character indices
        encoding.token_to_chars(token_index)[:]
        for token_index in range(len(encoding["input_ids"]))
    ]
    # The spans refer to the normalized string, so mark up exactly that string
    # (reusing it instead of normalizing the text a second time)
    text_marked_up = markup_spans(
        norm_str,
        spans,
        opening_tag="",
        closing_tag="<span style='color:#FFA500'>|</span>",
    )

    batch[f"{column}_xlit_tok"] = text_marked_up
    return batch


ds = ds.map(separate_tokens, fn_kwargs={"tokenizer": tokenizer_input, "column": "orig"}, batched=False, load_from_cache_file=False)
ds = ds.map(separate_tokens, fn_kwargs={"tokenizer": tokenizer_output, "column": "norm_pred_str"}, batched=False, load_from_cache_file=False)

In [None]:
# Create pandas dataframes from predictions

# Do not truncate cells with long text
pd.set_option('display.max_colwidth', None)


def _predictions_frame(split):
    """Build a dataframe with the token-separated input and prediction of one split.

    Args:
        split: One split of ``ds`` that contains the columns
            ``"orig_xlit_tok"`` and ``"norm_pred_str_xlit_tok"``.

    Returns:
        A dataframe with the columns ``"orig_xlit"`` and ``"norm_tok"``.
    """
    return pd.DataFrame(
        data={
            "orig_xlit": split["orig_xlit_tok"],
            "norm_tok": split["norm_pred_str_xlit_tok"],
        }
    )


part = "1600to1699"
df1600to1699 = _predictions_frame(ds[part])

part = "1700to1799"
df1700to1799 = _predictions_frame(ds[part])

part = "1800to1899"
df1800to1899 = _predictions_frame(ds[part])

---

## Look at the dataframes

### 1600 to 1699

In [None]:
# escape=False keeps the HTML markup inside the cells so that it is rendered
# instead of being shown as text
display(HTML(df1600to1699.head(N).to_html(escape=False)))

### 1700 to 1799

In [None]:
# escape=False keeps the HTML markup inside the cells so that it is rendered
# instead of being shown as text
display(HTML(df1700to1799.head(N).to_html(escape=False)))

### 1800 to 1899

In [None]:
# escape=False keeps the HTML markup inside the cells so that it is rendered
# instead of being shown as text
display(HTML(df1800to1899.head(N).to_html(escape=False)))

---