In [None]:
from google.colab import drive
import os
# Mount Google Drive at /gd unless that path already exists, so re-running
# this cell does not attempt a second mount.
if not os.path.exists('/gd'):
    drive.mount('/gd')

In [None]:
# Only transformers is pinned (4.33). NOTE(review): presumably because
# fix_tokenizer further down edits tokenizer internals (lang_code_to_id,
# fairseq_tokens_to_ids, ...) that may differ in other releases — confirm
# before changing the pin.
!pip install sentencepiece transformers==4.33 datasets sacremoses sacrebleu -q


In [None]:
import locale
def gpe(x=None):
    """Stand-in for ``locale.getpreferredencoding`` that always reports UTF-8.

    The argument is accepted only for signature compatibility and is ignored.
    """
    forced_encoding = "UTF-8"
    return forced_encoding


# Monkey-patch the locale module so every later lookup goes through gpe.
setattr(locale, "getpreferredencoding", gpe)

In [None]:
import pandas as pd

In [None]:

# Read the parallel corpus from the Excel sheet; later cells use its
# `std`, `sau` and `split` columns. Then print its shape and column index.
trans_df = pd.read_excel('/content/sau_std.xlsx')
print(trans_df.shape, trans_df.columns, sep='\n')

In [None]:
# Show up to 100 characters per cell when frames are displayed.
pd.set_option('display.max_colwidth', 100)

In [None]:
# Display ten randomly chosen rows of the corpus (unseeded, so the rows
# differ between runs).
trans_df.sample(10)

In [None]:
# Number of missing values in each column.
trans_df.isnull().sum()

In [None]:
# Number of rows assigned to each value of the `split` column.
trans_df['split'].value_counts()

In [None]:
# Partition the corpus by its `split` column; the explicit .copy() makes each
# subset an independent frame.
df_train = trans_df.loc[trans_df['split'] == 'train'].copy()
df_dev = trans_df.loc[trans_df['split'] == 'dev'].copy()
df_test = trans_df.loc[trans_df['split'] == 'test'].copy()

In [None]:
from transformers import NllbTokenizer
from tqdm.auto import tqdm, trange

In [None]:
# Load the tokenizer that belongs to the checkpoint fine-tuned further down.
# The previous value pointed NllbTokenizer at an M2M100 checkpoint, which is a
# tokenizer-class/checkpoint mismatch and made the unknown-token checks in the
# following cells run against a vocabulary the trained model never uses.
tokenizer = NllbTokenizer.from_pretrained('facebook/nllb-200-distilled-600M')

In [None]:
import re

def word_tokenize(text):
    """Split text into words and standalone punctuation marks.

    Args:
        text: the string to split.

    Returns:
        A list of strings: every run of word characters and every single
        character that is neither a word character nor whitespace, in order
        of appearance. Whitespace is dropped.
    """
    # Raw string: in a plain literal "\w" and "\s" are invalid escape
    # sequences, which newer Python versions warn about.
    return re.findall(r'\w+|[^\w\s]', text)

In [None]:
import nltk
# NOTE(review): this import rebinds the name `word_tokenize`, replacing the
# regex-based function of the same name defined in the previous cell.
from nltk.tokenize import word_tokenize

# Download the 'punkt' resource through NLTK's downloader.
nltk.download('punkt')

def tokenize_text(text):
    """Split text into the subword pieces produced by the model tokenizer.

    Uses the notebook-level `tokenizer`. These pieces are what the `*_toks`
    columns hold, as opposed to the word-level split stored in the `*_words`
    columns.

    Args:
        text: the sentence to tokenize.

    Returns:
        The list of subword token strings.
    """
    # This used to delegate to word_tokenize, which made the token and word
    # columns identical and the tokens-per-word ratio always 1.0.
    return tokenizer.tokenize(text)

# Reproducible random subset of the training split. The size is capped at the
# number of available rows because DataFrame.sample raises when asked for more
# rows than the frame has.
smpl = df_train.sample(min(len(df_train), 400), random_state=1)

# Tokenizer-level split of each side of the pair.
smpl['std_toks'] = smpl['std'].apply(tokenize_text)
smpl['sau_toks'] = smpl['sau'].apply(tokenize_text)

# Word-level split of each side of the pair.
smpl['std_words'] = smpl['std'].apply(word_tokenize)
smpl['sau_words'] = smpl['sau'].apply(word_tokenize)

In [None]:
# Display five random rows with each text next to its word and token splits.
smpl.sample(5)[['sau', 'sau_words', 'sau_toks', 'std', 'std_words', 'std_toks']]

In [None]:
# Replace every token/word list by its length, then summarise each column.
# Series.map is applied column by column because DataFrame.applymap is
# deprecated in recent pandas releases; the resulting frame is the same.
stats = (
    smpl[['std_toks', 'sau_toks', 'std_words', 'sau_words']]
    .apply(lambda column: column.map(len))
    .describe()
)
stats

In [None]:
# Ratio of the mean `*_toks` length to the mean `*_words` length, per side.
print(stats.loc['mean', 'std_toks'] / stats.loc['mean', 'std_words'])
print(stats.loc['mean', 'sau_toks'] / stats.loc['mean', 'sau_words'])

In [None]:
# Show the tokenizer's unknown-token string and its id.
print(tokenizer.unk_token, tokenizer.unk_token_id)

In [None]:
# Keep every `sau` sentence whose encoded ids contain the unknown-token id,
# then report how many there are.
texts_with_unk = [
    sentence
    for sentence in tqdm(trans_df['sau'])
    if tokenizer.unk_token_id in tokenizer(sentence).input_ids
]
print(len(texts_with_unk))

In [None]:
import random
# Display up to five randomly chosen sentences that produced an unknown token.
s = random.sample(texts_with_unk, k=min(5, len(texts_with_unk)))
s

In [None]:


import re
import sys
import typing as tp
import unicodedata
from sacremoses import MosesPunctNormalizer


# Moses punctuation normalizer. Its (pattern, replacement) pairs are rewritten
# so that each pattern is a compiled regex object instead of a pattern string.
mpn = MosesPunctNormalizer(lang="en")
mpn.substitutions = [
    (re.compile(pattern), replacement)
    for pattern, replacement in mpn.substitutions
]


def get_non_printing_char_replacer(replace_by: str = " ") -> tp.Callable[[str], str]:
    """Build a function that substitutes non-printing characters in a string.

    Every code point up to ``sys.maxunicode`` is inspected once; those whose
    Unicode general category is one of the listed "Other" categories are
    mapped to ``replace_by``.

    Args:
        replace_by: the replacement text for each matching character.

    Returns:
        A callable taking a string and returning it with the matching
        characters replaced.
    """
    other_categories = frozenset(("C", "Cc", "Cf", "Cs", "Co", "Cn"))

    # Translation table for str.translate, keyed by code point.
    translation_table = {}
    for codepoint in range(sys.maxunicode + 1):
        if unicodedata.category(chr(codepoint)) in other_categories:
            translation_table[codepoint] = replace_by

    def replace_non_printing_char(line) -> str:
        return line.translate(translation_table)

    return replace_non_printing_char

# Build the replacer once; it substitutes a single space for each
# non-printing character.
replace_nonprint = get_non_printing_char_replacer(" ")

def preproc(text):
    """Normalize one sentence before it is tokenized.

    Applies, in order: Moses punctuation normalization, replacement of
    non-printing characters, and Unicode NFKC normalization.

    Args:
        text: the raw sentence.

    Returns:
        The normalized string.
    """
    punct_normalized = mpn.normalize(text)
    printable_only = replace_nonprint(punct_normalized)
    return unicodedata.normalize("NFKC", printable_only)

In [None]:
# Of the sentences that produced an unknown token, keep those that still do
# after preproc() has been applied, then report how many remain.
texts_with_unk_normed = [
    sentence
    for sentence in tqdm(texts_with_unk)
    if tokenizer.unk_token_id in tokenizer(preproc(sentence)).input_ids
]
print(len(texts_with_unk_normed))

In [None]:
from transformers import AutoModelForSeq2SeqLM
from transformers import NllbTokenizer

In [None]:
# Size reported by the tokenizer that is currently loaded.
len(tokenizer)

In [None]:
# (Re)load the tokenizer from the NLLB checkpoint; this replaces whatever
# `tokenizer` was bound to before.
tokenizer = NllbTokenizer.from_pretrained('facebook/nllb-200-distilled-600M')
print(len(tokenizer))
# Inspect which tokens sit at ids 256202 and 256203.
print(tokenizer.convert_ids_to_tokens([256202, 256203]))

In [None]:
def fix_tokenizer(tokenizer, new_lang='sau_Cyrl'):
    """Register `new_lang` as a language code on the given tokenizer, in place.

    The tokenizer's lookup tables are edited directly: the new code receives
    the id ``old_len - 1``, ``<mask>`` is reassigned to the id that follows
    all language codes, and the fairseq token<->id maps are rebuilt.

    Args:
        tokenizer: the tokenizer object to modify.
        new_lang: the language-code token to add.

    Returns:
        None; the tokenizer is mutated.

    NOTE(review): relies on tokenizer internals (`lang_code_to_id`,
    `fairseq_tokens_to_ids`, `_additional_special_tokens`, ...) — presumably
    tied to the pinned transformers version; confirm before upgrading.
    NOTE(review): `old_len` is derived from `len(tokenizer)`, so calling this
    twice on the same instance may assign a different id — confirm.
    """

    # Tokenizer length, minus one if new_lang is already counted among the
    # added tokens.
    old_len = len(tokenizer) - int(new_lang in tokenizer.added_tokens_encoder)
    # The new code takes id old_len-1 in both lookup directions.
    tokenizer.lang_code_to_id[new_lang] = old_len-1
    tokenizer.id_to_lang_code[old_len-1] = new_lang

    # <mask> is given the id after the sentencepiece pieces, the fairseq
    # offset and all language codes (which now include new_lang).
    tokenizer.fairseq_tokens_to_ids["<mask>"] = len(tokenizer.sp_model) + len(tokenizer.lang_code_to_id) + tokenizer.fairseq_offset

    # Merge the language codes into the token->id map and rebuild its inverse.
    tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
    tokenizer.fairseq_ids_to_tokens = {v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()}
    if new_lang not in tokenizer._additional_special_tokens:
        tokenizer._additional_special_tokens.append(new_lang)

    # Empty both added-token tables, so new_lang is no longer listed there.
    tokenizer.added_tokens_encoder = {}
    tokenizer.added_tokens_decoder = {}

In [None]:
# Register the new language code on the training tokenizer before any id
# lookups: without this call 'sau_Cyrl' is not in the vocabulary when the
# following cells resolve its id and copy embeddings.
# The guard keeps a re-run of this cell from applying fix_tokenizer a second
# time to the same tokenizer instance.
if 'sau_Cyrl' not in tokenizer.lang_code_to_id:
    fix_tokenizer(tokenizer)

# Both directions should now agree on the ids of the last NLLB language code,
# the new code and <mask>.
print(tokenizer.convert_ids_to_tokens([256202, 256203, 256204]))
print(tokenizer.convert_tokens_to_ids(['zul_Latn', 'sau_Cyrl', '<mask>']))


In [None]:
# Ids of the new language code and of kir_Cyrl; a later cell copies the
# kir_Cyrl embedding row into the new code's row.
added_token_id = tokenizer.convert_tokens_to_ids('sau_Cyrl')
similar_lang_id = tokenizer.convert_tokens_to_ids('kir_Cyrl')
# NOTE(review): if 'sau_Cyrl' has not been registered via fix_tokenizer, this
# lookup presumably falls back to the unknown-token id — check the printed value.
print(added_token_id, similar_lang_id)

In [None]:
# Load the pretrained checkpoint and resize its token embeddings to the
# tokenizer's current length.
model = AutoModelForSeq2SeqLM.from_pretrained('facebook/nllb-200-distilled-600M')
model.resize_token_embeddings(len(tokenizer))

In [None]:

# Order matters: first copy the row currently at added_token_id one slot up,
# then overwrite added_token_id itself.
# NOTE(review): presumably the first copy carries the <mask> embedding to the
# id that fix_tokenizer assigns to <mask> — confirm.
model.model.shared.weight.data[added_token_id+1] = model.model.shared.weight.data[added_token_id]
# Initialise the new language code's embedding from the kir_Cyrl row.
model.model.shared.weight.data[added_token_id] = model.model.shared.weight.data[similar_lang_id]

In [None]:
import gc
import random
import numpy as np
import torch
from tqdm.auto import tqdm, trange
from transformers.optimization import Adafactor
from transformers import get_constant_schedule_with_warmup

def cleanup():
    """Best-effort release of memory that is no longer referenced.

    Runs Python's garbage collector first, then asks PyTorch to empty its
    CUDA cache.
    """
    for release in (gc.collect, torch.cuda.empty_cache):
        release()


# Start from a clean slate before the model is moved to the GPU.
cleanup()

In [None]:
# Move the model to the GPU; the trailing semicolon suppresses the cell output.
model.cuda();

In [None]:
# Adafactor over all parameters that require gradients. The learning rate is
# given explicitly here (relative_step and scale_parameter are switched off);
# the scheduler created in a later cell wraps this optimizer.
optimizer = Adafactor(
    [p for p in model.parameters() if p.requires_grad],
    scale_parameter=False,
    relative_step=False,
    lr=1e-4,
    clip_threshold=1.0,
    weight_decay=1e-3,
)

In [None]:
batch_size = 16  # sentence pairs drawn per training step
max_length = 32  # tokenizer truncation limit applied to both sides of a pair
warmup_steps = 1_000  # passed to the scheduler as num_warmup_steps
training_steps = 5700  # upper bound of the training loop's step counter

In [None]:
# One loss value is appended per successful training step; the training loop
# also uses len(losses) as its starting step.
losses = []
# Learning-rate schedule with warmup_steps warmup steps, wrapping the optimizer.
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)

In [None]:
# (DataFrame column, tokenizer language code) for each side of the corpus.
LANGS = [('std', 'guj_Gujr'), ('sau', 'sau_Cyrl')]

def get_batch_pairs(batch_size, data=df_train):
    """Draw a random batch of preprocessed sentence pairs.

    The translation direction is chosen at random for the whole batch by
    shuffling the two entries of LANGS; rows are then drawn from `data`
    with replacement.

    Args:
        batch_size: number of pairs to draw.
        data: frame holding one column per entry of LANGS.

    Returns:
        A tuple ``(sources, targets, source_code, target_code)`` where the
        first two are lists of preprocessed strings and the last two are the
        language codes of the chosen direction.
    """
    (src_col, src_code), (tgt_col, tgt_code) = random.sample(LANGS, 2)
    rows = [
        data.iloc[random.randint(0, len(data) - 1)]
        for _ in range(batch_size)
    ]
    sources = [preproc(row[src_col]) for row in rows]
    targets = [preproc(row[tgt_col]) for row in rows]
    return sources, targets, src_code, tgt_code

# Smoke test: draw and print a single pair.
print(get_batch_pairs(1))


In [None]:
# Directory (under the /gd Drive mount) where model and tokenizer checkpoints
# are written during training.
MODEL_SAVE_PATH = '/gd/MyDrive/models/nllb-sau-guj-v1'

In [None]:
model.train()
x, y, loss = None, None, None
cleanup()

# The start index is the number of losses already recorded, so re-running the
# cell continues from there instead of restarting at step 0.
tq = trange(len(losses), training_steps)
for i in tq:
    xx, yy, lang1, lang2 = get_batch_pairs(batch_size)
    try:
        # Encode the source side with its language code ...
        tokenizer.src_lang = lang1
        x = tokenizer(xx, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(model.device)
        # ... and the target side with the other code.
        tokenizer.src_lang = lang2
        y = tokenizer(yy, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(model.device)
        # Padding positions in the labels are set to -100 so they are excluded
        # from the loss.
        y.input_ids[y.input_ids == tokenizer.pad_token_id] = -100

        loss = model(**x, labels=y.input_ids).loss
        loss.backward()
        losses.append(loss.item())

        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
        scheduler.step()

    except RuntimeError as e:
        # Drop the failed batch, release what it allocated, and move on.
        optimizer.zero_grad(set_to_none=True)
        x, y, loss = None, None, None
        cleanup()
        print('error', max(len(s) for s in xx + yy), e)
        continue

    if i % 1000 == 0:
        # Running mean over (up to) the last 1000 recorded losses.
        print(i, np.mean(losses[-1000:]))

    if i % 1000 == 0 and i > 0:
        model.save_pretrained(MODEL_SAVE_PATH)
        tokenizer.save_pretrained(MODEL_SAVE_PATH)

# Save the final state as well: periodic checkpoints only happen on multiples
# of 1000, so when training_steps is not such a multiple the last steps would
# otherwise never reach disk.
model.save_pretrained(MODEL_SAVE_PATH)
tokenizer.save_pretrained(MODEL_SAVE_PATH)

In [None]:
# Plot the exponentially weighted moving average of the recorded losses;
# the trailing semicolon hides the Axes repr.
pd.Series(losses).ewm(100).mean().plot();

In [None]:
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM

In [None]:
# Reload model and tokenizer from the directory the training cell saved to.
model_load_name = '/gd/MyDrive/models/nllb-sau-guj-v1'
model = AutoModelForSeq2SeqLM.from_pretrained(model_load_name).cuda()
tokenizer = NllbTokenizer.from_pretrained(model_load_name)
# The freshly loaded tokenizer gets the new language code registered again.
fix_tokenizer(tokenizer)

In [None]:
def translate(
    text, src_lang='guj_Gujr', tgt_lang='eng_Latn',
    a=32, b=3, max_input_length=1024, num_beams=4, **kwargs
):
    """Translate a single text or a list of texts; the result is always a list.

    Args:
        text: a string or a list of strings to translate.
        src_lang: language code of the input.
        tgt_lang: language code to generate; its token id is forced as the
            first generated token.
        a, b: the generation budget is ``int(a + b * input_length)`` new
            tokens, where input_length is the padded length of the batch.
        max_input_length: truncation limit for the encoded input.
        num_beams: beam width passed to ``generate``.
        **kwargs: forwarded unchanged to ``model.generate``.

    Returns:
        The decoded translations with special tokens removed.
    """
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang
    encoded = tokenizer(
        text, return_tensors='pt', padding=True, truncation=True,
        max_length=max_input_length
    )
    model.eval()  # switch the model out of training mode
    encoded = encoded.to(model.device)

    target_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang)
    token_budget = int(a + b * encoded.input_ids.shape[1])
    generated = model.generate(
        **encoded,
        forced_bos_token_id=target_lang_id,
        max_new_tokens=token_budget,
        num_beams=num_beams, **kwargs
    )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

# Example usage: translate one sentence with 'sau_Cyrl' as the source code
# and 'guj_Gujr' as the target code, and print the resulting list.
t = "ખાડા માં ખાબકીને કેમ તારો ખીસ્સો ખાલી થઇ ગયો  "
print(translate(t, 'sau_Cyrl', 'guj_Gujr'))
