In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from itertools import chain
from pathlib import Path

from typing import List, Optional

# Matplotlib rc overrides passed to seaborn's style: dashed grid lines on every axes
# and a serif (Times New Roman) font.
params = {
    "axes.grid": True,
    "grid.linestyle": "--",
    "font.family": "serif",
    "font.serif": "Times New Roman",
}

# Global plotting defaults for the notebook: "ticks" style with the overrides above,
# "paper" context with fonts scaled by 1.5, and the "Set2" colour palette.
sns.set_style("ticks", rc=params)
sns.set_context(context="paper", font_scale=1.5)
sns.set_palette(palette="Set2")

In [None]:
# Notebook configuration. The values below are placeholders and must be replaced
# with real locations before the notebook is run.

# Root directory of the evaluation data; the loaders read
# <data_root>/<dataset>/<lp>/train_eval.input.txt, train_eval.output.txt and the instruction files.
data_root = Path("<path to data>")
# Root directory of the model outputs; the loaders read
# <model_path>/<dataset>/<lp>/<step>/<instructions>/seg_scores.txt and translations.txt.
model_path = Path("<path to model results>")
# Name of the checkpoint sub-directory whose outputs are analysed.
step = "<checkpoint to evaluate>"

In [None]:
def read_lines(path: Path, unescape_newline: bool = False, encoding: str = "utf-8") -> List[str]:
    """Read a text file and return its lines without their trailing newline.

    Only an actual line terminator is removed, so a final line that is not
    newline-terminated is returned in full instead of losing its last character.

    Args:
        path: File to read.
        unescape_newline: When True, every literal two-character sequence ``\\n``
            (backslash followed by ``n``) inside a line is replaced by a real
            newline character.
        encoding: Text encoding used to decode the file.

    Returns:
        One string per line of the file, in file order. An empty file yields
        an empty list.
    """
    with open(path, encoding=encoding) as f:
        # Text mode normalises line endings to "\n"; strip exactly that one
        # character and nothing else.
        lines = [line[:-1] if line.endswith("\n") else line for line in f]
    if unescape_newline:
        lines = [line.replace("\\n", "\n") for line in lines]
    return lines

def load_scores(scores_file: Path) -> dict:
    """Parse a file of ``name: value`` lines into a mapping of floats.

    Blank lines are ignored. Each remaining line is split on its *last*
    ``": "`` so that a name which itself contains ``": "`` is kept whole.

    Args:
        scores_file: Path of the score file to read.

    Returns:
        Dictionary mapping each name to its value converted with ``float``.

    Raises:
        ValueError: If a non-blank line has no ``": "`` separator or its value
            cannot be converted to ``float``.
    """
    scores = {}
    for line in scores_file.read_text().splitlines():
        if not line.strip():
            # Tolerate empty lines (e.g. a trailing blank line at the end of the file).
            continue
        key, separator, value = line.rpartition(": ")
        if not separator:
            raise ValueError(f"Malformed score line (expected 'name: value'): {line!r}")
        scores[key] = float(value)
    return scores

def load_lp(dataset_root: Path, model_dataset_root: Path, lp: str, ckpt: str, instructions: str):
    """Collect one record per evaluated segment for a single ``lp`` sub-directory.

    Inputs are read from ``dataset_root / lp`` (``train_eval.input.txt``,
    ``train_eval.output.txt`` and ``<instructions>.txt``) and model outputs from
    ``model_dataset_root / lp / ckpt / instructions`` (``seg_scores.txt``, parsed
    as CSV, and ``translations.txt``). All text files are read with escaped
    newlines expanded.

    Args:
        dataset_root: Directory holding the per-``lp`` input data.
        model_dataset_root: Directory holding the per-``lp`` model outputs.
        lp: Name of the sub-directory to load (presumably a language pair -- confirm).
        ckpt: Name of the checkpoint sub-directory.
        instructions: Name of the instruction set; used both as the stem of the
            instruction file and as the output sub-directory name.

    Returns:
        A list of dicts with keys ``lp``, ``source``, ``reference``,
        ``translation``, ``instruction`` and ``score`` (taken from the ``BLEU``
        column). The inputs are combined with ``zip``, so the list is as long as
        the shortest of them.
    """
    data_dir = dataset_root / lp
    sources = read_lines(data_dir / "train_eval.input.txt", unescape_newline=True)
    references = read_lines(data_dir / "train_eval.output.txt", unescape_newline=True)
    prompts = read_lines(data_dir / f"{instructions}.txt", unescape_newline=True)

    run_dir = model_dataset_root / lp / ckpt / instructions
    segment_bleu = pd.read_csv(run_dir / "seg_scores.txt")["BLEU"]
    translations = read_lines(run_dir / "translations.txt", unescape_newline=True)

    records = []
    for source, reference, translation, prompt, bleu in zip(
        sources, references, translations, prompts, segment_bleu
    ):
        records.append(
            {
                "lp": lp,
                "source": source,
                "reference": reference,
                "translation": translation,
                "instruction": prompt,
                "score": bleu,
            }
        )
    return records

def load_results(data_root: Path, model_root: Path, dataset: str, ckpt: str, instructions: str):
    """Load the per-segment records of every ``lp`` of one dataset into a DataFrame.

    The ``lp`` names are the sub-directories found under ``model_root / dataset``.
    They are processed in sorted order: ``Path.iterdir`` gives no ordering
    guarantee, and a stable row order is needed when the frames of two separate
    calls are later combined position by position.

    Args:
        data_root: Root directory of the input data.
        model_root: Root directory of the model outputs.
        dataset: Name of the dataset sub-directory under both roots.
        ckpt: Name of the checkpoint sub-directory.
        instructions: Name of the instruction set to load.

    Returns:
        A DataFrame with one row per record returned by ``load_lp``, ordered by
        ``lp`` name and then by position within that ``lp``.
    """
    dataset_root = data_root / dataset
    model_dataset_root = model_root / dataset

    lp_names = sorted(entry.name for entry in model_dataset_root.iterdir() if entry.is_dir())

    records = []
    for lp in lp_names:
        records.extend(load_lp(dataset_root, model_dataset_root, lp, ckpt, instructions))
    return pd.DataFrame(records)

In [None]:
# Datasets included in the analysis. The NLLB sets ("nllb_md_chat", "nllb_md_health",
# "nllb_md_news") are currently left out of the run; their display labels are kept below.
domains = ["flores", "medical", "law", "tico", "chat_wmt"]
domain2label = {
    "flores": "Flores",
    "medical": "Medical",
    "law": "Law",
    "nllb_md_chat": "NLLB Chat",
    "nllb_md_health": "NLLB Health",
    "nllb_md_news": "NLLB News",
    "tico": "Tico",
    "chat_wmt": "Chat",
}

# Score thresholds (applied to the per-segment "score" column, i.e. the BLEU column
# of seg_scores.txt). They do not depend on the domain, so they are set once.
upper_threshold = 30
lower_threshold = 3

domain_frames = []
for domain in domains:
    zero_shot = load_results(data_root, model_path, domain, step, "zero_shot_instructions").rename(
        columns={
            "instruction": "zero_shot_instruction",
            "translation": "zero_shot_translation",
            "score": "zero_shot_score",
        }
    )
    few_shot = load_results(data_root, model_path, domain, step, "few_shot_instructions2").rename(
        columns={
            "instruction": "few_shot_instruction",
            "translation": "few_shot_translation",
            "score": "few_shot_score",
        }
    )

    # The two frames are joined side by side purely by row position, so they must
    # describe the same segments in the same order.
    shared_columns = ["lp", "source", "reference"]
    assert zero_shot[shared_columns].equals(few_shot[shared_columns]), (
        f"zero-shot and few-shot rows are not aligned for domain {domain!r}"
    )

    df = pd.concat([zero_shot, few_shot.drop(columns=["source", "reference", "lp"])], axis=1)
    # Drop every lp whose name contains "zh". The explicit copy makes the column
    # assignments below write to an independent frame instead of a slice of the
    # concatenated one (which would trigger pandas' SettingWithCopyWarning).
    df = df[~df["lp"].str.contains("zh")].copy()
    df["Domain"] = domain2label[domain]

    # Candidate: zero-shot scored above the upper threshold.
    # Hallucination: a candidate whose few-shot score fell below the lower threshold.
    df["is_candidate"] = (df["zero_shot_score"] > upper_threshold)
    df["is_hallucination"] = df["is_candidate"] & (df["few_shot_score"] < lower_threshold)
    # Same two flags with the roles of zero-shot and few-shot swapped.
    df["is_reverse_candidate"] = (df["few_shot_score"] > upper_threshold)
    df["is_reverse_hallucination"] = df["is_reverse_candidate"] & (df["zero_shot_score"] < lower_threshold)
    domain_frames.append(df)

results = pd.concat(domain_frames)
results

In [None]:
# Keep only the rows flagged as candidates, then report for each domain the
# percentage of those rows that are also flagged as hallucinations.
candidates = results.loc[results["is_candidate"]]
candidates.groupby("Domain")[["is_hallucination"]].mean().mul(100)

In [None]:
# Same report for the reverse direction: keep the rows flagged as reverse
# candidates and compute, per domain, the percentage flagged as reverse
# hallucinations. Note that this rebinds `candidates`.
candidates = results.loc[results["is_reverse_candidate"]]
candidates.groupby("Domain")[["is_reverse_hallucination"]].mean().mul(100)

In [None]:
# Print every row flagged as a hallucination for manual inspection: the few-shot
# instruction, the reference, and both translations, separated by a rule.
hallucinations = results.loc[results["is_hallucination"]].copy()
divider = "-" * 80
for row_label, example in hallucinations.iterrows():
    print(divider)
    print("Instruction:")
    print(example["few_shot_instruction"])
    print("Reference:", example["reference"])
    print("Zero-shot:", example["zero_shot_translation"])
    print("Few-shot:", example["few_shot_translation"])
    print()

In [None]:
# Flag hallucinations whose few-shot translation appears verbatim (as a substring)
# inside the few-shot instruction, then print those rows for inspection.
is_copy_df = hallucinations.copy()
# A plain comprehension is used instead of DataFrame.apply(axis=1): on a frame with
# no rows, apply returns a DataFrame rather than a boolean Series, which makes the
# column assignment fail when there are no hallucinations. The explicit bool dtype
# keeps the column usable as a mask in that case too.
is_copy_flags = [
    translation in instruction
    for translation, instruction in zip(
        is_copy_df["few_shot_translation"], is_copy_df["few_shot_instruction"]
    )
]
is_copy_df["is_copy"] = pd.Series(is_copy_flags, dtype=bool).to_numpy()
copies = is_copy_df[is_copy_df["is_copy"]]
for row_label, example in copies.iterrows():
    print("-" * 80)
    print("Instruction:")
    print(example["few_shot_instruction"])
    print("Reference:", example["reference"])
    print("Zero-shot:", example["zero_shot_translation"])
    print("Few-shot:", example["few_shot_translation"])
    print()