In [None]:
import os
from pathlib import Path
from typing import List, Tuple

import numpy as np
import pandas as pd
from tqdm import tqdm, trange

from templates import *

## Load all the data

In [None]:
# Load the processed training data from a pickle file.
# NOTE(review): unpickling can execute arbitrary code — only point this at a trusted file.
df_all_w_instruction = pd.read_pickle("<path to processed train data>")

In [None]:
# Two-letter language code -> full language name, for every supported language.
lang_map = {
    "en": "English",
    "fr": "French",
    "de": "German",
    "pt": "Portuguese",
    "nl": "Dutch",
    "ru": "Russian",
    "zh": "Chinese",
}
# Pair every non-English language with English: all "xx-en" pairs first, then all "en-xx" pairs.
lang_pairs = [
    *(f"{code}-en" for code in lang_map if code != "en"),
    *(f"en-{code}" for code in lang_map if code != "en"),
]

Keep only the examples whose filtering `score` is at least 0.8 (the default threshold).

In [None]:
# Keep the rows whose score is at least 0.8, then preview the first few of them.
df_filtered = df_all_w_instruction[df_all_w_instruction["score"] >= 0.8]
df_filtered.head()

In [None]:
# Root of the validation data; load_data() reads <val_data_root>/<language pair>/dev.tsv.
val_data_root = Path("<path to processed val data>")

def load_data():
    """Read each language pair's ``dev.tsv`` under ``val_data_root`` and stack them.

    Returns:
        One DataFrame holding the rows of all twelve tab-separated files,
        concatenated in the order listed below. The per-file indexes are kept
        as they are (no re-indexing is done).
    """
    into_english = ["fr-en", "de-en", "pt-en", "nl-en", "ru-en", "zh-en"]
    from_english = ["en-fr", "en-de", "en-pt", "en-nl", "en-ru", "en-zh"]

    frames = [
        pd.read_csv(val_data_root / pair / "dev.tsv", sep="\t")
        for pair in into_english + from_english
    ]
    return pd.concat(frames)

# Load the validation rows for all language pairs and display the resulting frame.
df_val = load_data()
df_val

In [None]:
# Output root: the dataset builders below write into sub-directories of this path.
root_dir = Path("<path to root dir>")

### Create data by sampling N examples for each language pair

In [None]:
N_per_LP = 250000

# Per language pair: a fixed-size random training sample, plus everything that
# was not sampled, which serves as the pool of few-shot examples.
train_records = {}
few_shot_records = {}
for lang_pair in df_filtered["lp"].unique():
    pair_rows = df_filtered[df_filtered["lp"] == lang_pair]
    sampled = pair_rows.sample(n=N_per_LP, random_state=42)
    # Rows whose index label was not drawn for training.
    in_train = pair_rows.index.isin(sampled.index)
    held_out = pair_rows[~in_train]

    train_records[lang_pair], few_shot_records[lang_pair] = sampled, held_out

In [None]:
def choose_k(n: int, strategy: str, seed: int = 42) -> List[int]:
    """Draw a number of few-shot examples (0 to 5) for each of ``n`` items.

    Args:
        n: How many counts to draw.
        strategy: ``"balanced"`` draws every count uniformly from 0..5;
            ``"unbalanced"`` gives 0 a probability of 0.5 and each of 1..5
            a probability of 0.1.
        seed: Seed for the NumPy random generator, making the draw reproducible.

    Returns:
        A list with ``n`` counts.

    Raises:
        ValueError: If ``strategy`` is not one of the two names above.
    """
    generator = np.random.default_rng(seed)
    if strategy not in ("balanced", "unbalanced"):
        raise ValueError(f"Unknown strategy {strategy}")

    if strategy == "balanced":
        counts = generator.integers(low=0, high=6, size=n)
    else:
        shot_values = [0, 1, 2, 3, 4, 5]
        shot_probs = [0.5, 0.1, 0.1, 0.1, 0.1, 0.1]
        counts = generator.choice(shot_values, size=n, p=shot_probs)
    return list(counts)

def uniform_sample_examples(
    few_shot_df: pd.DataFrame, lp: str, ks: List[int], remove_self_idx: bool = False, seed: int = 42,
) -> List[List[Tuple[str, str]]]:
    """Sample few-shot ``(src, ref)`` pairs uniformly, without replacement.

    Args:
        few_shot_df: Candidate rows; must have ``lp``, ``src`` and ``ref`` columns.
        lp: Language pair; only rows whose ``lp`` equals this value are candidates.
        ks: Number of examples to draw for each target item, in order.
        remove_self_idx: When True, item ``i`` is taken to be the ``i``-th row of
            the filtered candidates (the case where the pool is also the set
            being prompted) and that row is excluded from its own examples.
        seed: Seed for the NumPy random generator, making the draw reproducible.

    Returns:
        One list per entry of ``ks``; the ``i``-th list holds ``ks[i]``
        ``(src, ref)`` tuples.

    Raises:
        ValueError: Raised by NumPy when a requested ``k`` is larger than the
            number of available candidates.
    """
    rng = np.random.default_rng(seed)
    pool = few_shot_df[few_shot_df["lp"] == lp]

    # Sample row *positions* instead of index labels. ``i`` from enumerate() is
    # a position, so comparing it against labels only excludes the right row
    # when labels happen to equal positions; positions are also unique even if
    # the frame's index contains repeated labels.
    positions = np.arange(len(pool))
    sources = pool["src"].tolist()
    references = pool["ref"].tolist()

    examples = []
    for i, k in enumerate(ks):
        candidates = positions[positions != i] if remove_self_idx else positions
        chosen = rng.choice(candidates, size=k, replace=False)
        examples.append([(sources[pos], references[pos]) for pos in chosen])
    return examples

def write_escaped_lines(lines, path):
    r"""Write ``lines`` to ``path`` with exactly one item per physical line.

    A newline character inside an item is replaced by the two-character
    sequence ``\n`` so that multi-line items stay on a single line. Items are
    joined with newlines and no trailing newline is written.

    Args:
        lines: Iterable of strings to write.
        path: Destination file path; an existing file is overwritten.
    """
    # Escape everything before opening, so a bad item cannot leave a truncated file.
    escaped = [line.replace("\n", "\\n") for line in lines]
    # The data is multilingual; write UTF-8 explicitly instead of relying on
    # the platform's default encoding.
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(escaped))

### Zero shot Train Data

In [None]:
def make_zero_shot_train_data():
    """Write the zero-shot training set to ``root_dir / "zero_shot_train_data"``.

    For every language pair in ``df_filtered`` the sampled training rows in
    ``train_records`` get a zero-shot instruction built by
    ``instruction_template(lang_pair, src)``. Three line-aligned files are
    written: ``zero_shot_instructions.txt``, ``sources.txt`` and
    ``references.txt``.
    """
    data_dir = root_dir / "zero_shot_train_data"
    # parents=True so a missing root_dir does not abort the run.
    data_dir.mkdir(parents=True, exist_ok=True)

    lp_train_dfs = []
    for lang_pair in tqdm(df_filtered["lp"].unique()):
        lp_train_df = train_records[lang_pair]
        instructions = [instruction_template(lang_pair, src) for src in lp_train_df["src"]]
        # assign() returns a new frame, so the shared frames in train_records
        # are left untouched and re-running the cell starts from the same state.
        lp_train_dfs.append(lp_train_df.assign(zero_shot_instruction=instructions))

    zero_shot_train_df = pd.concat(lp_train_dfs)

    for column, filename in (
        ("zero_shot_instruction", "zero_shot_instructions.txt"),
        ("src", "sources.txt"),
        ("ref", "references.txt"),
    ):
        write_escaped_lines(zero_shot_train_df[column].tolist(), data_dir / filename)

# Write the zero-shot training files to disk.
make_zero_shot_train_data()

### Zero Shot Validation Data

In [None]:
def make_zero_shot_val_data():
    """Write the zero-shot validation set to ``root_dir / "zero_shot_val_data"``.

    Each language pair found in ``df_val`` is processed on its own copy of the
    matching rows: a ``zero_shot_instruction`` column is built with
    ``instruction_template(lang_pair, src)``. The per-pair frames are then
    concatenated and written as three line-aligned files:
    ``zero_shot_instructions.txt``, ``sources.txt`` and ``references.txt``.
    """
    out_dir = root_dir / "zero_shot_val_data"
    out_dir.mkdir(exist_ok=True)

    per_pair_frames = []
    for lang_pair in tqdm(df_val["lp"].unique()):
        pair_df = df_val[df_val["lp"] == lang_pair].copy()

        def build_instruction(row):
            return instruction_template(lang_pair, row["src"])

        pair_df["zero_shot_instruction"] = pair_df.apply(build_instruction, axis=1)
        per_pair_frames.append(pair_df)

    combined = pd.concat(per_pair_frames)

    # Output file name -> column written to it (written in this order).
    outputs = {
        "zero_shot_instructions.txt": "zero_shot_instruction",
        "sources.txt": "src",
        "references.txt": "ref",
    }
    for filename, column in outputs.items():
        write_escaped_lines(combined[column].tolist(), out_dir / filename)

# Write the zero-shot validation files to disk.
make_zero_shot_val_data()

### Few-shot train data

In [None]:
def make_few_shot_train_data(data_dir: Path, k_strategy: str, select_strategy: str = "uniform"):
    """Write the few-shot training set, in three prompt formats, into ``data_dir``.

    For every language pair in ``df_filtered`` a number of examples is drawn
    per training row with ``choose_k`` and the examples themselves are sampled
    from that pair's held-out pool in ``few_shot_records``. Five line-aligned
    files are written: ``few_shot_instructions_{1,2,3}.txt``, ``sources.txt``
    and ``references.txt``.

    Args:
        data_dir: Output directory; created (with parents) if missing.
        k_strategy: Strategy name forwarded to ``choose_k``.
        select_strategy: How examples are selected. Only ``"uniform"``
            (``uniform_sample_examples``) is implemented; the parameter exists
            so that calls passing ``select_strategy="uniform"`` are accepted.

    Raises:
        ValueError: If ``select_strategy`` is not ``"uniform"``.
    """
    if select_strategy != "uniform":
        raise ValueError(f"Unknown select strategy {select_strategy}")
    data_dir.mkdir(parents=True, exist_ok=True)

    templates_by_column = {
        "few_shot_1": format1_few_shot_instruction_template,
        "few_shot_2": format2_few_shot_instruction_template,
        "few_shot_3": format3_few_shot_instruction_template,
    }

    lp_train_dfs = []
    for lang_pair in df_filtered["lp"].unique():
        lp_train_df = train_records[lang_pair]
        lp_few_shot_df = few_shot_records[lang_pair]

        ks = choose_k(len(lp_train_df), k_strategy)
        few_shot_examples = uniform_sample_examples(lp_few_shot_df, lp=lang_pair, ks=ks)

        sources = lp_train_df["src"].tolist()
        prompt_columns = {
            column: [
                template(lang_pair, src, examples)
                for src, examples in zip(sources, few_shot_examples)
            ]
            for column, template in templates_by_column.items()
        }
        # assign() returns a new frame, so the shared frames in train_records
        # are not modified and repeated calls do not overwrite each other.
        lp_train_dfs.append(lp_train_df.assign(**prompt_columns))

    few_shot_train_df = pd.concat(lp_train_dfs)

    for column, filename in (
        ("few_shot_1", "few_shot_instructions_1.txt"),
        ("few_shot_2", "few_shot_instructions_2.txt"),
        ("few_shot_3", "few_shot_instructions_3.txt"),
        ("src", "sources.txt"),
        ("ref", "references.txt"),
    ):
        write_escaped_lines(few_shot_train_df[column].tolist(), data_dir / filename)

def make_few_shot_val_data(data_dir: Path, k_strategy: str, select_strategy: str = "uniform"):
    """Write the few-shot validation set, in three prompt formats, into ``data_dir``.

    For every language pair in ``df_val`` a number of examples is drawn per
    validation row with ``choose_k``. The examples are sampled from the same
    pair's validation rows, with ``remove_self_idx=True`` so that a row is not
    meant to serve as its own example. Five line-aligned files are written:
    ``few_shot_instructions_{1,2,3}.txt``, ``sources.txt`` and
    ``references.txt``.

    Args:
        data_dir: Output directory; created (with parents) if missing.
        k_strategy: Strategy name forwarded to ``choose_k``.
        select_strategy: How examples are selected. Only ``"uniform"``
            (``uniform_sample_examples``) is implemented; the parameter exists
            so that calls passing ``select_strategy="uniform"`` are accepted.

    Raises:
        ValueError: If ``select_strategy`` is not ``"uniform"``.
    """
    if select_strategy != "uniform":
        raise ValueError(f"Unknown select strategy {select_strategy}")
    data_dir.mkdir(parents=True, exist_ok=True)

    templates_by_column = {
        "few_shot_1": format1_few_shot_instruction_template,
        "few_shot_2": format2_few_shot_instruction_template,
        "few_shot_3": format3_few_shot_instruction_template,
    }

    lp_val_dfs = []
    for lang_pair in tqdm(df_val["lp"].unique()):
        lp_val_df = df_val.loc[df_val.lp == lang_pair]

        ks = choose_k(len(lp_val_df), k_strategy)
        few_shot_examples = uniform_sample_examples(lp_val_df, lp=lang_pair, ks=ks, remove_self_idx=True)

        sources = lp_val_df["src"].tolist()
        prompt_columns = {
            column: [
                template(lang_pair, src, examples)
                for src, examples in zip(sources, few_shot_examples)
            ]
            for column, template in templates_by_column.items()
        }
        # assign() builds a new frame, so df_val itself is never written to.
        lp_val_dfs.append(lp_val_df.assign(**prompt_columns))

    few_shot_val_df = pd.concat(lp_val_dfs)

    for column, filename in (
        ("few_shot_1", "few_shot_instructions_1.txt"),
        ("few_shot_2", "few_shot_instructions_2.txt"),
        ("few_shot_3", "few_shot_instructions_3.txt"),
        ("src", "sources.txt"),
        ("ref", "references.txt"),
    ):
        write_escaped_lines(few_shot_val_df[column].tolist(), data_dir / filename)

In [None]:
# Build the few-shot train and validation sets with the "balanced" k strategy.
make_few_shot_train_data(
    data_dir=root_dir / "few_shot_balanced_uniform_train_data",
    k_strategy="balanced",
    select_strategy="uniform",
)
make_few_shot_val_data(
    data_dir=root_dir / "few_shot_balanced_uniform_val_data",
    k_strategy="balanced",
    select_strategy="uniform",
)

In [None]:
# Build the few-shot train and validation sets with the "unbalanced" k strategy.
make_few_shot_train_data(
    data_dir=root_dir / "few_shot_unbalanced_uniform_train_data",
    k_strategy="unbalanced",
    select_strategy="uniform",
)
make_few_shot_val_data(
    data_dir=root_dir / "few_shot_unbalanced_uniform_val_data",
    k_strategy="unbalanced",
    select_strategy="uniform",
)