In [14]:
import pandas as pd
from scipy import stats
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from sentence_transformers import SentenceTransformer
import torch
from torch import nn
from collections import defaultdict
from pprint import pprint
import pickle
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from collections import defaultdict
from pathlib import Path
import os

# Seed handed to every seeded random step in this notebook (the train/val split).
RANDOM_SEED = 42

# Prefer the first CUDA device when PyTorch can see one; otherwise run on CPU.
DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [11]:
# Dataset language folder name -> language code passed to the NLLB translation
# pipeline as `src_lang`.
lang_codes = dict(
    amh="amh_Ethi",
    ary="ary_Arab",
    eng="eng_Latn",
    esp="spa_Latn",
    hau="hau_Latn",
    kin="kin_Latn",
    mar="mar_Deva",
    tel="tel_Telu",
)

# (checkpoint name, pipeline batch size) pairs; every checkpoint listed here is
# run over each split by the translation cells.
TRANSLATION_MODELS = [
    ("facebook/nllb-200-3.3B", 8),
    ("facebook/nllb-200-1.3B", 16),
    ("facebook/nllb-200-distilled-600M", 16),  # bsz fixed
    ("facebook/nllb-200-distilled-1.3B", 16),  # bsz fixed
]

In [9]:
# Languages processed by the cells below, in processing order.
# 'amh' and 'arq' are intentionally left out for now.
LANGS = ["ary", "eng", "esp", "hau", "kin", "mar", "tel"]

# Make train, val, dev data

In [None]:
# Running row counts of the train / validation splits over all processed languages.
tot_train = 0
tot_val = 0

for lang in LANGS:
    # Languages without a data directory are skipped silently.
    if not os.path.isdir(f"./data/Track A/{lang}"): continue

    # Seeded 90/10 split of the training file. Both halves are written next to
    # the source file because write_translation() reads them back from disk.
    df = pd.read_csv(f"./data/Track A/{lang}/{lang}_train.csv")
    train_df, val_df = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)
    train_df.to_csv(f"./data/Track A/{lang}/{lang}_train_split.csv")
    val_df.to_csv(f"./data/Track A/{lang}/{lang}_val_split.csv")
    tot_train += len(train_df)
    tot_val += len(val_df)

    def write_translation(mode):
        """Translate one split of the current `lang` into English and save it.

        Reads ``./data/Track A/{lang}/{lang}_{mode}.csv`` (with ``_split``
        appended to the stem unless `mode` is "dev" or "test"), splits the
        "Text" column into its two sentences, runs every checkpoint in
        TRANSLATION_MODELS over both sentences, and writes one row per
        (pair, checkpoint) to ``./data/Track A/{lang}/{mode}_translation.csv``.

        Output columns: text1, text2, PairID, model and, for every mode except
        "dev", Score.

        Args:
            mode: Split name; this cell calls it with "train", "val" and "dev".

        Raises:
            IndexError: if a "Text" value contains no newline separator.
            KeyError: if `lang` has no entry in `lang_codes`.
        """
        file_name = f"{lang}_{mode}"
        if mode not in ["dev", "test"]: file_name += '_split'

        df = pd.read_csv(f"./data/Track A/{lang}/{file_name}.csv")
        # "Text" stores both sentences separated by a newline; surrounding
        # double quotes are stripped from each half.
        df["text1"] = df["Text"].map(lambda x: x.split("\n")[0].strip('"'))
        df["text2"] = df["Text"].map(lambda x: x.split("\n")[1].strip('"'))

        print(lang, mode, len(df))

        # Pull the columns out once; they are identical for every checkpoint.
        texts1 = df["text1"].tolist()
        texts2 = df["text2"].tolist()
        pair_ids = df["PairID"].tolist()
        # "dev" is the only mode handled here that is written without scores.
        scores = df["Score"].tolist() if mode != 'dev' else None

        all_translations = defaultdict(list)
        for tmodel_name, batch_size in tqdm(TRANSLATION_MODELS):
            tmodel = AutoModelForSeq2SeqLM.from_pretrained(tmodel_name)
            ttokenizer = AutoTokenizer.from_pretrained(tmodel_name)
            translator = pipeline(
                'translation',
                model=tmodel,
                tokenizer=ttokenizer,
                src_lang=lang_codes[lang],
                tgt_lang="eng_Latn",
                batch_size=batch_size,
                device=DEVICE,
            )

            translations1 = [x['translation_text'] for x in translator(texts1, max_length=800)]
            translations2 = [x['translation_text'] for x in translator(texts2, max_length=800)]

            all_translations['text1'].extend(translations1)
            all_translations['text2'].extend(translations2)
            all_translations['PairID'].extend(pair_ids)
            all_translations['model'].extend([tmodel_name] * len(pair_ids))
            if scores is not None:
                all_translations['Score'].extend(scores)

            # Drop every reference to this checkpoint before the next one is
            # loaded; otherwise the old pipeline stays alive until `translator`
            # is rebound, i.e. after the new model has already been moved to
            # DEVICE. empty_cache() then returns PyTorch's cached CUDA blocks.
            del translator, tmodel, ttokenizer
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            # English input is only passed through the first checkpoint.
            if lang == 'eng': break

        out_df = pd.DataFrame(all_translations)
        out_df.to_csv(f"./data/Track A/{lang}/{mode}_translation.csv")

    for mode in ['train', 'val', 'dev']:
        write_translation(mode)

In [5]:
# Languages to aggregate: the entries of LANGS that have a Track A data
# directory, which is the same condition the translation cell uses before it
# writes the per-language *_translation.csv files.
data_langs = [lang for lang in LANGS if os.path.isdir(f"./data/Track A/{lang}")]


def _read_translations(split, langs):
    """Load ``{split}_translation.csv`` for each language and tag it.

    Args:
        split: File prefix, one of "train", "val" or "dev".
        langs: Language folder names under ``./data/Track A``.

    Returns:
        A list with one DataFrame per language, each with an added "lang"
        column holding the language folder name.
    """
    frames = []
    for lang_name in langs:
        frame = pd.read_csv(f"./data/Track A/{lang_name}/{split}_translation.csv")
        frame['lang'] = lang_name
        frames.append(frame)
    return frames


all_trains = _read_translations("train", data_langs)
all_vals = _read_translations("val", data_langs)
all_devs = _read_translations("dev", data_langs)

# Stack the per-language frames; the original per-file row index is kept.
train = pd.concat(all_trains)
val = pd.concat(all_vals)
dev = pd.concat(all_devs)

train.to_csv("./data/Track A/train_all.csv")
val.to_csv("./data/Track A/val_all.csv")
dev.to_csv("./data/Track A/dev_all.csv")

# Make test data

In [17]:
# The test split is translated with the 3.3B checkpoint only. This rebinds the
# module-level TRANSLATION_MODELS, so any cell executed after this one sees the
# single-entry list.
TRANSLATION_MODELS = [("facebook/nllb-200-3.3B", 8)]

In [19]:
for lang in LANGS:
    # Languages without a test-data directory are skipped silently.
    if not os.path.isdir(f"./test_data/Track A/{lang}"): continue

    def write_translation(mode):
        """Translate one test-data split of the current `lang` into English.

        Reads ``./test_data/Track A/{lang}/{lang}_{mode}.csv``, splits the
        "Text" column into its two sentences, runs every checkpoint in
        TRANSLATION_MODELS over both sentences, and writes one row per
        (pair, checkpoint) to ``./test_data/Track A/{lang}/{mode}_translation.csv``.

        Output columns: text1, text2, PairID, model (no Score column).

        Args:
            mode: Split name; this cell only calls it with "test".

        Raises:
            IndexError: if a "Text" value contains no newline separator.
            KeyError: if `lang` has no entry in `lang_codes`.
        """
        file_name = f"{lang}_{mode}"

        df = pd.read_csv(f"./test_data/Track A/{lang}/{file_name}.csv")
        # "Text" stores both sentences separated by a newline; surrounding
        # double quotes are stripped from each half.
        df["text1"] = df["Text"].map(lambda x: x.split("\n")[0].strip('"'))
        df["text2"] = df["Text"].map(lambda x: x.split("\n")[1].strip('"'))

        print(lang, mode, len(df))

        # Pull the columns out once; they are identical for every checkpoint.
        texts1 = df["text1"].tolist()
        texts2 = df["text2"].tolist()
        pair_ids = df["PairID"].tolist()

        all_translations = defaultdict(list)
        for tmodel_name, batch_size in tqdm(TRANSLATION_MODELS):
            tmodel = AutoModelForSeq2SeqLM.from_pretrained(tmodel_name)
            ttokenizer = AutoTokenizer.from_pretrained(tmodel_name)
            translator = pipeline(
                'translation',
                model=tmodel,
                tokenizer=ttokenizer,
                src_lang=lang_codes[lang],
                tgt_lang="eng_Latn",
                batch_size=batch_size,
                device=DEVICE,
            )

            translations1 = [x['translation_text'] for x in translator(texts1, max_length=800)]
            translations2 = [x['translation_text'] for x in translator(texts2, max_length=800)]

            all_translations['text1'].extend(translations1)
            all_translations['text2'].extend(translations2)
            all_translations['PairID'].extend(pair_ids)
            all_translations['model'].extend([tmodel_name] * len(pair_ids))

            # Drop every reference to this checkpoint before another one is
            # loaded; otherwise the old pipeline stays alive until `translator`
            # is rebound, i.e. after the new model has already been moved to
            # DEVICE. empty_cache() then returns PyTorch's cached CUDA blocks.
            del translator, tmodel, ttokenizer
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

            # English input is only passed through the first checkpoint.
            if lang == 'eng': break

        out_df = pd.DataFrame(all_translations)
        out_df.to_csv(f"./test_data/Track A/{lang}/{mode}_translation.csv")

    for mode in ['test']:
        write_translation(mode)

ary test 426


  0%|          | 0/1 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

eng test 2600


  0%|          | 0/1 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

esp test 600


  0%|          | 0/1 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

hau test 603


  0%|          | 0/1 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

kin test 222


  0%|          | 0/1 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

mar test 298


  0%|          | 0/1 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

tel test 297


  0%|          | 0/1 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [20]:
# Languages to aggregate: the entries of LANGS that have a Track A test-data
# directory, which is the same condition the test translation cell uses before
# it writes test_translation.csv.
test_langs = [lang for lang in LANGS if os.path.isdir(f"./test_data/Track A/{lang}")]

all_tests = []
for lang_name in test_langs:
    lang_df = pd.read_csv(f"./test_data/Track A/{lang_name}/test_translation.csv")
    lang_df['lang'] = lang_name
    all_tests.append(lang_df)

# Stack the per-language frames; the original per-file row index is kept.
test = pd.concat(all_tests)

# Write the concatenated *test* frame (not `train`) to test_all.csv.
test.to_csv("./test_data/Track A/test_all.csv")