# Install 

In [None]:
! pip install transformers datasets evaluate
! pip install sacrebleu
! pip install googletrans==3.1.0a0
# ! pip install huggingface_hub

In [None]:
# from huggingface_hub import notebook_login

# notebook_login()

In [15]:
from datasets import load_dataset, DatasetDict, Dataset
import evaluate
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import DataCollatorForSeq2Seq
from transformers import pipeline
from googletrans import Translator

# Back Translation

## Loading Dataset

Once the augmented data file has been produced, this notebook only loads it and re-trains the model to measure the BLEU score after augmentation. The data is loaded as follows:
1. Import the file and read the data.
2. Convert the data into the format expected by the model.
3. Re-train the model and calculate the BLEU score.

In [4]:
import pathlib
# Corpus file read by load_data (one tab-separated pair per line); the path is
# relative to the notebook's working directory.
path_to_file = pathlib.Path('../data/gloss-text-augmented.txt')
def load_data(path):
  """Read a tab-separated parallel corpus and return its two lower-cased columns.

  Every non-blank line must hold exactly two tab-separated fields. Blank
  (or whitespace-only) lines are skipped so that a stray empty line does not
  abort the whole load with an opaque unpacking error.

  Args:
    path: A ``pathlib.Path``-like object exposing ``read_text``.

  Returns:
    A ``(target, context)`` tuple of NumPy arrays holding the lower-cased
    first and second column of every kept line, in file order.

  Raises:
    ValueError: If a non-blank line does not contain exactly two fields.
  """
  text = path.read_text(encoding='utf-8')

  first_column = []
  second_column = []
  # Single pass over the file instead of building the pair list and walking it twice.
  for line_number, line in enumerate(text.splitlines(), start=1):
    fields = line.split('\t')
    if len(fields) == 1 and not fields[0].strip():
      # Blank line: nothing to load.
      continue
    if len(fields) != 2:
      raise ValueError(
          f"line {line_number}: expected 2 tab-separated fields, got {len(fields)}")
    first_column.append(fields[0].lower())
    second_column.append(fields[1].lower())

  target = np.array(first_column)
  context = np.array(second_column)

  return target, context

# The first column is bound to `gloss`, the second to `text`
# (assumes the file is laid out as gloss<TAB>sentence — confirm against the data).
gloss, text = load_data(path_to_file)

In [58]:
# Convert both loaded columns into plain Python lists.
gloss, text = list(gloss), list(text)

In [75]:
from datasets.dataset_dict import DatasetDict
from datasets import Dataset

# Wrap the whole corpus in a single "train" split; the held-out test split is
# carved out of it by train_test_split in the next cell.
dataset = {'train': Dataset.from_dict({'gloss': gloss, 'text': text})}

In [76]:
# Hold out 20% of the pairs for evaluation. The fixed seed makes the shuffle
# reproducible, so BLEU scores from different runs are measured on the same split.
# Note: this rebinds `dataset`, so re-running the cell re-splits the already
# reduced train split — rebuild `dataset` first when re-running.
dataset = dataset["train"].train_test_split(train_size=0.8, seed=42)

In [73]:
# `dataset` must stay the train/test DatasetDict produced by train_test_split;
# rebinding it to the DatasetDict class here would break dataset.map() below.
assert isinstance(dataset, DatasetDict), "run the train_test_split cell before continuing"

In [77]:
# Show the splits together with their features and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['gloss', 'text'],
        num_rows: 73696
    })
    test: Dataset({
        features: ['gloss', 'text'],
        num_rows: 18424
    })
})


# Training Model

This section re-trains the model. Because the dataset has changed, the evaluation results may change as well. The BLEU score obtained after back-translation is reported by the training run below.

In [None]:
from transformers import AutoTokenizer
# Tokenizer for the same "t5-small" checkpoint that the model is loaded from below.
tokenizer = AutoTokenizer.from_pretrained("t5-small")

In [80]:
# Column names of the source (input) and target (label) side of each pair.
source_lang = "gloss"
target_lang = "text"


def preprocess_function(examples):
    """Tokenize a batch of examples for sequence-to-sequence training.

    Args:
        examples: Mapping that holds the source texts under ``source_lang``
            and the target texts under ``target_lang``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the source texts
        with the targets passed as ``text_target``; both sides are truncated
        to at most 128 tokens.
    """
    inputs = examples[source_lang]
    targets = examples[target_lang]
    # No debug printing here: this function runs once per batch inside Dataset.map.
    return tokenizer(inputs, text_target=targets, max_length=128, truncation=True)

In [None]:
# Tokenize the dataset; batched=True hands preprocess_function batches of rows
# rather than one row at a time.
tokenized_data = dataset.map(preprocess_function, batched=True)

In [82]:
# Metric object used by compute_metrics to score decoded predictions against references.
sacrebleu = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and labels.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple: the stripped predictions as a flat list,
        and the stripped labels each wrapped in a one-element list.
    """
    cleaned_preds = []
    for prediction in preds:
        cleaned_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute the BLEU score and mean generated length for an evaluation pass.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays. When
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        A dict with ``"bleu"`` (the sacreBLEU score) and ``"gen_len"`` (the mean
        number of non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored positions and cannot be decoded by the tokenizer, so it
    # is swapped for the pad token id in the predictions as well as the labels.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = number of tokens that are not padding.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
# Start from the pretrained "t5-small" sequence-to-sequence checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

In [84]:
# Collator handed to the trainer to assemble training and evaluation batches.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [85]:
# Training configuration, grouped by purpose.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    # where checkpoints go and how many are kept
    output_dir="transformer_model",
    save_total_limit=3,
    push_to_hub=True,
    # optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluation: once per epoch, on generated sequences
    evaluation_strategy="epoch",
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    compute_metrics=compute_metrics,
)

trainer.train()

  1%|          | 93/13818 [2:12:06<380:56:28, 99.92s/it] 

KeyboardInterrupt: 

In [None]:
# Upload the trained model to the Hugging Face Hub
# (presumably requires being logged in — see the commented-out notebook_login cell).
trainer.push_to_hub()

# Predict

This is the prediction section. Because the purpose of this notebook is to back-translate the data and measure the resulting change in BLEU, this section is left here unused for now. If you would like to run it, the trained model is pushed to the Hugging Face Hub for your convenience.

In [4]:
# Tokenizer and model are both restored from the same published checkpoint.
hub_checkpoint = "junowhite/transformer_model"
tokenizer = AutoTokenizer.from_pretrained(hub_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(hub_checkpoint)

In [5]:
# googletrans client; testing_data() passes each model output through its translate().
translators = Translator()

In [6]:
# Wrap the loaded model and tokenizer in a translation pipeline.
translator = pipeline("translation", model=model, tokenizer = tokenizer)

# Filled by testing_data(): processed model outputs and their reference sentences.
prediction = []
answer = []

def testing_data():
  for i in range(7):
    text = dataset["train"][i]["gloss"]
    answer.append(dataset["train"][i]["text"])
    prediction.append(translators.translate(translator(text)[0]["translation_text"]).text)



In [7]:
# Run the sample translations, then show the model outputs followed by their references.
testing_data()
print(prediction)
print(answer)

['c we have ended the entire day discussing a decent strategic framework for the european union .', 'this is a great difference .', 'i will therefore be speaking on him about this incident .', 'i particularly recall your resolution of july 2006 and the written declaration of the september 200ORIGIN mark should therefore be reflected in that spirit .', 'they cannot .', 'it is undoubtedly why .', 'SWIFT debate']
['cs we have spent the entire day discussing a desirable strategic focus for the european union .\n', 'this is the great difference .\n', 'i will therefore be speaking on his behalf this evening .\n', 'i particularly recall your resolution of july 2006 and the written declaration of september 200origin marking should therefore be considered in that spirit .\n', 'they cannot .\n', 'it is understandable why .\n', 'swift debate \n']
