In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path

from typing import List

# Matplotlib rc overrides applied on top of seaborn's "ticks" style:
# grid enabled with "--" lines, serif font family set to Times New Roman.
params = {
    "axes.grid": True,
    "grid.linestyle": "--",
    "font.family": "serif",
    "font.serif": "Times New Roman",
}
sns.set_style("ticks", rc=params)
sns.set_context("paper", font_scale=1.5)
sns.set_palette("Set2")

In [None]:
"""This file was copied from https://github.com/Lightning-AI/lit-llama/blob/main/lit_llama/tokenizer.py."""
import os
from pathlib import Path
from typing import Optional

import torch
from sentencepiece import SentencePieceProcessor, SentencePieceTrainer


class Tokenizer:
    """SentencePiece-backed tokenizer for LLaMA."""

    def __init__(self, model_path: Path) -> None:
        """Load the SentencePiece model at ``model_path`` and cache its bos/eos/pad ids."""
        sp = SentencePieceProcessor(model_file=str(model_path))
        self.processor = sp
        self.bos_id = sp.bos_id()
        self.eos_id = sp.eos_id()
        self.pad_id = sp.pad_id()

    @property
    def vocab_size(self) -> int:
        """Vocabulary size as reported by the underlying processor."""
        return self.processor.vocab_size()

    def encode(
        self,
        string: str,
        bos: bool = True,
        eos: bool = False,
        max_length: int = -1,
        pad: bool = False,
        device: Optional[torch.device] = None
    ) -> torch.Tensor:
        """Encode ``string`` into a 1-D ``torch.int`` tensor of token ids.

        Steps are applied in this order:
          1. prepend ``bos_id`` when ``bos`` is true;
          2. append ``eos_id`` when ``eos`` is true;
          3. truncate to ``max_length`` ids when ``max_length > 0``
             (truncation happens after step 2, so it can remove the eos id);
          4. when ``pad`` is true, right-pad with ``pad_id`` up to ``max_length``.

        Args:
            string: Text to encode.
            bos: Whether to prepend the beginning-of-sequence id.
            eos: Whether to append the end-of-sequence id.
            max_length: Maximum number of ids; non-positive disables truncation
                (and therefore padding as well).
            pad: Whether to pad short sequences up to ``max_length``.
            device: Device on which the resulting tensor is created.

        Returns:
            Tensor of token ids with dtype ``torch.int``.
        """
        ids = self.processor.encode(string)
        if bos:
            ids = [self.bos_id, *ids]
        if eos:
            ids = [*ids, self.eos_id]
        if max_length > 0:
            ids = ids[:max_length]

        n_missing = max_length - len(ids)
        if pad and n_missing > 0:
            ids = ids + [self.pad_id] * n_missing

        return torch.tensor(ids, dtype=torch.int, device=device)

    def decode(self, tokens: torch.Tensor) -> str:
        """Turn a tensor of token ids back into text via the processor."""
        id_list = tokens.tolist()
        return self.processor.decode(id_list)

    @staticmethod
    def train(input: str, destination: str, vocab_size=32000) -> None:
        """Train a SentencePiece model on ``input``.

        The trainer is given ``<destination>/tokenizer`` as its model prefix.
        """
        prefix = os.path.join(destination, "tokenizer")
        SentencePieceTrainer.Train(input=input, model_prefix=prefix, vocab_size=vocab_size)

# Module-level tokenizer instance used by `tokenize`.
# The "<...>" string is a placeholder for the SentencePiece model path.
tokenizer = Tokenizer(Path("<path to sentence piece tokenizer>"))


def tokenize(text: str) -> List[str]:
    """Split ``text`` into SentencePiece pieces using the module-level tokenizer."""
    pieces = tokenizer.processor.encode_as_pieces(text)
    return pieces

In [None]:
# Input locations. The "<...>" strings are placeholders and need to be replaced
# with real paths before the loading cells can run.
# Root directory of the datasets (passed to load_results as `data_root`).
data_root = Path("<path to data>")
# Result directories, one per model (each passed to load_results as `model_root`).
pretrained_7B_path = Path("<path to pretrained 7B results>")
finetuned_lora_path = Path("<path to best adapters zero-shot model results>")
icl_finetuned_lora_path = Path("<path to best adapters few-shot model results>")

In [None]:
def read_lines(path: Path, unescape_newline: bool = False) -> List[str]:
    """Read a text file into a list of lines without their trailing newline.

    Args:
        path: File to read. It is decoded as UTF-8.
        unescape_newline: When true, every literal two-character sequence
            ``\\n`` inside a line is replaced by a real newline character.

    Returns:
        One string per line of the file, in file order.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    with open(path, encoding="utf-8") as f:
        # Only strip an actual line terminator: the last line of a file may not
        # end with "\n", and blindly dropping the final character would corrupt it.
        lines = [line[:-1] if line.endswith("\n") else line for line in f]
    if unescape_newline:
        lines = [line.replace("\\n", "\n") for line in lines]
    return lines

def load_scores(scores_file: Path) -> dict:
    """Parse a ``key: value`` scores file into a dict of floats.

    Each non-blank line must have the form ``<key>: <number>``. Blank lines are
    skipped. The line is split on the last ``": "`` so a key that itself
    contains ``": "`` is still parsed.

    Args:
        scores_file: Path of the scores file.

    Returns:
        Mapping from each key to its value converted with ``float``.

    Raises:
        ValueError: If a non-blank line has no ``": "`` separator or its value
            cannot be converted to ``float``.
    """
    scores = {}
    for line in scores_file.read_text().splitlines():
        if not line.strip():
            # Tolerate empty lines (e.g. a trailing blank line).
            continue
        key, value = line.rsplit(": ", 1)
        scores[key] = float(value)
    return scores

def load_lp(dataset_root: Path, model_dataset_root: Path, lp: str, ckpt: str, instructions: str):
    """Collect per-example records for one language pair.

    Reads, with escaped newlines expanded:
      * ``<dataset_root>/<lp>/train_eval.input.txt``   -> sources
      * ``<dataset_root>/<lp>/train_eval.output.txt``  -> references
      * ``<dataset_root>/<lp>/<instructions>.txt``     -> instructions
      * ``<model_dataset_root>/<lp>/<ckpt>/<instructions>/translations.txt`` -> translations

    Returns:
        A list of dicts with keys ``lp``, ``source``, ``reference``,
        ``translation`` and ``instruction``. The four files are combined with
        ``zip``, so the list is as long as the shortest of them.
    """
    lp_data_dir = dataset_root / lp
    sources = read_lines(lp_data_dir / "train_eval.input.txt", unescape_newline=True)
    references = read_lines(lp_data_dir / "train_eval.output.txt", unescape_newline=True)
    instruction_lines = read_lines(lp_data_dir / f"{instructions}.txt", unescape_newline=True)

    translations_file = model_dataset_root / lp / ckpt / instructions / "translations.txt"
    translations = read_lines(translations_file, unescape_newline=True)

    records = []
    for src, ref, hyp, instr in zip(sources, references, translations, instruction_lines):
        records.append(
            {
                "lp": lp,
                "source": src,
                "reference": ref,
                "translation": hyp,
                "instruction": instr,
            }
        )
    return records

def longest_common_substring(s1, s2):
    """Return the longest contiguous run shared by ``s1`` and ``s2``.

    Works on any pair of sliceable sequences whose items can be compared
    (e.g. two strings or two token lists). The result is a slice of ``s1``.
    On ties, the run starting earliest in ``s1`` wins. When nothing matches,
    the empty string ``""`` is returned regardless of the input types.
    """
    n1, n2 = len(s1), len(s2)
    best_start, best_len = 0, 0
    for start1 in range(n1):
        for start2 in range(n2):
            # Longest possible run from these two start positions.
            limit = min(n1 - start1, n2 - start2)
            run = 0
            while run < limit and s1[start1 + run] == s2[start2 + run]:
                run += 1
            # Strict comparison keeps the first-found run on ties.
            if run > best_len:
                best_start, best_len = start1, run
    if best_len == 0:
        return ""
    return s1[best_start:best_start + best_len]

def compute_features(df):
    """Add tokenisation and length features to ``df`` in place and return it.

    Expects the string columns ``translation``, ``reference`` and
    ``instruction``. Adds:
      * ``tok-translation`` / ``tok-reference`` / ``tok-instruction``:
        output of ``tokenize`` for the respective column;
      * ``translation-len``: number of items in ``tok-translation``;
      * ``newline-count``: number of ``"\\n"`` characters in ``translation``.
    """
    tokenised_columns = (
        ("tok-translation", "translation"),
        ("tok-reference", "reference"),
        ("tok-instruction", "instruction"),
    )
    for target, source in tokenised_columns:
        df[target] = df[source].apply(tokenize)

    # Longest-common-substring features, currently disabled:
    #df["tok-lcs"] = df.apply(lambda x: longest_common_substring(x["tok-translation"], x["tok-instruction"]), axis=1)
    #df["len-lcs"] = df["tok-lcs"].apply(lambda x: len(x))
    #df["lcs-instruction"] = df.apply(lambda x: longest_common_substring(x["translation"], x["instruction"]), axis=1)

    df["translation-len"] = df["tok-translation"].apply(len)
    df["newline-count"] = df["translation"].apply(lambda text: text.count("\n"))
    return df

def load_results(data_root: Path, model_root: Path, dataset: str, ckpt: str, instructions: str):
    """Load all language pairs of one model/dataset into a feature DataFrame.

    Every sub-directory of ``<model_root>/<dataset>`` is treated as a language
    pair. Its records are loaded with ``load_lp`` and the combined frame is
    passed through ``compute_features``.

    Args:
        data_root: Root directory of the datasets.
        model_root: Root directory of the model's results.
        dataset: Dataset name, used as a sub-directory of both roots.
        ckpt: Checkpoint directory name, forwarded to ``load_lp``.
        instructions: Instruction-set name, forwarded to ``load_lp``.

    Returns:
        DataFrame with one row per example, language pairs in sorted order.
    """
    dataset_root = data_root / dataset
    model_dataset_root = model_root / dataset

    # iterdir() yields entries in arbitrary, filesystem-dependent order; sort the
    # language pairs so the resulting row order is the same on every run.
    lp_names = sorted(d.name for d in model_dataset_root.iterdir() if d.is_dir())

    records = []
    for lp in lp_names:
        records.extend(load_lp(dataset_root, model_dataset_root, lp, ckpt, instructions))

    df = pd.DataFrame(records)
    df = compute_features(df)
    return df

In [None]:
# Results stored under `pretrained_7B_path` (flores, ckpt "0"), labelled "Pretrained".
pretrained_results = load_results(
    data_root=data_root,
    model_root=pretrained_7B_path,
    dataset="flores",
    ckpt="0",
    instructions="zero_shot_instructions",
).assign(Model="Pretrained")
pretrained_results

In [None]:
# Results stored under `finetuned_lora_path` (flores, ckpt "20000").
# The label contains a newline so it wraps when used as a plot tick label.
finetuned_results = load_results(
    data_root=data_root,
    model_root=finetuned_lora_path,
    dataset="flores",
    ckpt="20000",
    instructions="zero_shot_instructions",
).assign(Model="FT w/o\nfew-shot")
finetuned_results

In [None]:
# Results stored under `icl_finetuned_lora_path` (flores, ckpt "20000"),
# loaded with the same "zero_shot_instructions" set as the other models.
icl_results = load_results(
    data_root=data_root,
    model_root=icl_finetuned_lora_path,
    dataset="flores",
    ckpt="20000",
    instructions="zero_shot_instructions",
).assign(Model="FT w\nfew-shot")
icl_results

In [None]:
# "Reference" pseudo-model: same rows as `icl_results`, but with the reference
# text standing in for the model translation.
oracle = icl_results.copy()
oracle["translation"] = oracle["reference"]
oracle["tok-translation"] = oracle["tok-reference"]
# The derived columns were computed from the model output by compute_features;
# recompute them from the reference, otherwise the "Reference" rows would keep
# the few-shot model's lengths and newline counts.
oracle["translation-len"] = oracle["tok-translation"].apply(len)
oracle["newline-count"] = oracle["translation"].apply(lambda text: text.count("\n"))
oracle["Model"] = "Reference"

In [None]:
# Stack all models into one frame. ignore_index=True gives every row a unique
# label; without it each of the four inputs contributes its own 0..N-1 index
# and the combined frame has duplicated index labels.
results = pd.concat(
    [pretrained_results, finetuned_results, icl_results, oracle],
    ignore_index=True,
)
# The CSV is written before filtering, so it contains every language pair.
results.to_csv("translation_length.csv", index=False)
# Drop language pairs whose name contains "zh" from the frame used below.
results = results[~results["lp"].str.contains("zh")]

In [None]:
# Boxen plot of tokenised translation length, one box per model.
fig, ax = plt.subplots(figsize=(5, 3))
sns.boxenplot(data=results, x="Model", y="translation-len", ax=ax)
ax.set(xlabel="", ylim=(0, 210))
ax.set_ylabel("Translation Length")
#plt.savefig("figures/translation_length.pdf", bbox_inches="tight", dpi=200)