# BART-Base - Machine Translation

In [None]:
!pip install transformers datasets -qq
!pip install evaluate sacrebleu rouge_score jiwer -qq
!pip install --upgrade accelerate -qq
!pip install wandb -Uqq

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [4]:
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import load_dataset, load_from_disk
import numpy as np
import pandas as pd

## Configuration

In [5]:
class cfc:
  """Central configuration for this notebook (paths, model, hyperparameters)."""

  # --- Pre-trained model and tokenization limits -------------------------
  checkpoint = "facebook/bart-base"
  max_input = 512    # max token length applied to the input formulas
  max_target = 256   # max token length applied to the target texts

  # --- Data locations ----------------------------------------------------
  data_files_dir = "/data/datafiles/"
  test_file_path = "/data/datafiles/test_data.json"

  # --- Output naming -----------------------------------------------------
  # Naming convention: {model_name}-finetuned-{source_lang}-to-{target_lang}
  model_name = "Bart-Base_latex-to-text_finetuned-hyper"
  model_dir = f"/content/drive/MyDrive/models/{model_name}"

  # --- Weights & Biases --------------------------------------------------
  wandb_project = "NLG"
  run_name = model_name

  # --- Training hyperparameters (tuned values) ---------------------------
  # Untuned values, kept for reference:
  #   lr_rate = 4e-5, batch_size = 32, epochs = 10, weight_decay = 0.01
  lr_rate = 2.47e-5
  batch_size = 8
  epochs = 5
  weight_decay = 0.2

## Data preprocessing

In [None]:
# Map each split name to its CSV file inside cfc.data_files_dir.
data_files = {
    "train": cfc.data_files_dir + "train_data.csv",
    "valid": cfc.data_files_dir + "valid_data.csv",
    "test" : cfc.data_files_dir + "test_data.csv"
    }

# Load all three splits with the `datasets` CSV loader.
data = load_dataset("csv", data_files=data_files)
print(data)
# Show the first training row as a quick sanity check.
print("Example training data:\n", data["train"][0])

In [None]:
# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(cfc.checkpoint)

In [8]:
def preprocess_data(data_to_process):
  """Tokenize a batch of formulas (inputs) and their texts (targets).

  Written for use with ``Dataset.map(..., batched=True)``.

  Args:
    data_to_process: Batch mapping with a ``'formula'`` column (source
      strings) and a ``'label'`` column (target strings).

  Returns:
    The tokenizer output for the formulas, extended with a ``'labels'``
    entry holding the target token ids. Padding positions in ``'labels'``
    are set to -100 so the loss ignores them.
  """
  # get all the formulas
  inputs = list(data_to_process['formula'])

  # tokenize the formulas
  model_inputs = tokenizer(inputs, max_length=cfc.max_input, padding='max_length', truncation=True)

  # tokenize the texts; `text_target` replaces the deprecated
  # `as_target_tokenizer()` context manager
  targets = tokenizer(text_target=data_to_process['label'], max_length=cfc.max_target, padding='max_length', truncation=True)

  # Targets are padded to a fixed length, so the padding must be masked with
  # -100; otherwise the loss would also be computed on the pad tokens.
  pad_id = tokenizer.pad_token_id
  model_inputs['labels'] = [
      [token_id if token_id != pad_id else -100 for token_id in label_ids]
      for label_ids in targets['input_ids']
  ]
  return model_inputs

In [None]:
# Apply preprocess_data to every split in batches and drop the listed raw
# columns from the result.
tokenized_data = data.map(preprocess_data, batched = True, remove_columns=["image_name","formula","label","label_list"])
tokenized_data

## Metrics for evaluation

In [None]:
import numpy as np
import evaluate

bleu = evaluate.load("bleu")
ter = evaluate.load("ter")
rouge = evaluate.load("rouge")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from predictions and references.

    Each reference is additionally wrapped in its own one-element list.

    Returns:
        Tuple ``(cleaned_predictions, wrapped_references)``.
    """
    cleaned_predictions = [text.strip() for text in preds]
    wrapped_references = [[text.strip()] for text in labels]
    return cleaned_predictions, wrapped_references

def compute_metrics(eval_preds):
    """Decode generated token ids and score them against the references.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case only its
            first element is used.

    Returns:
        Dict with the keys ``BLEU``, ``TER``, ``TER-ACC`` (computed as
        ``1 - TER / 100``), ``ROUGE-1``, ``ROUGE-2`` and ``ROUGE-L``.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and cannot be decoded. It can show
    # up in the predictions as well as in the labels, so map it back to the
    # pad token in both before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu_res = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    ter_res = ter.compute(predictions=decoded_preds, references=decoded_labels)
    rouge_res = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    # Turn the TER score into a "higher is better" value.
    ter_acc = (1-(ter_res["score"]/100))

    metrics = {
        "BLEU": bleu_res["bleu"],
        "TER" : ter_res["score"],
        "TER-ACC" : ter_acc,
        "ROUGE-1" : rouge_res["rouge1"],
        "ROUGE-2" : rouge_res["rouge2"],
        "ROUGE-L" : rouge_res["rougeL"],
        }

    return metrics

# Fine Tuning the model

#### Configure Weights and Biases for Hyperparameter Tuning

In [None]:
import wandb
wandb.login()

In [None]:
wandb.init(
    project=cfc.wandb_project,
    name = cfc.run_name,

    config={
        "architecture": "Bart-Base",
        "dataset": "Formula2Text-4k",
        }
    )

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained(cfc.checkpoint)

In [16]:
# Training arguments for the fine-tuning run; the hyperparameters come from cfc.
args = Seq2SeqTrainingArguments(
    cfc.model_dir,
    report_to = "wandb",
    # Evaluate, log and checkpoint on the same 200-step cadence.
    evaluation_strategy='steps',
    eval_steps=200,
    logging_strategy="steps",
    logging_steps=200,
    save_strategy="steps",
    save_steps=200,
    per_device_train_batch_size=cfc.batch_size,
    per_device_eval_batch_size= cfc.batch_size,
    # Use generate() during evaluation so compute_metrics receives generated token ids.
    predict_with_generate=True,
    fp16=False,
    num_train_epochs=cfc.epochs,
    learning_rate=cfc.lr_rate,
    weight_decay=cfc.weight_decay,
    # Limit how many checkpoints are kept on disk.
    save_total_limit=1,
    load_best_model_at_end=True,
    )

In [17]:
# Define data_collator
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [None]:
def model_init():
    """Return a newly loaded seq2seq model for the ``cfc.checkpoint`` weights."""
    fresh_model = AutoModelForSeq2SeqLM.from_pretrained(cfc.checkpoint)
    return fresh_model

# Build the trainer. `model_init` is passed instead of a model instance, so
# the trainer obtains its model by calling that function.
trainer = Seq2SeqTrainer(
    model_init = model_init,
    args=args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['valid'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune, then save the resulting model.
trainer.train()
trainer.save_model()

In [None]:
wandb.finish()

# Hyperparameter Tuning

#### Weights and Biases - Sweep configuration

In [None]:
import wandb
wandb.login()

In [None]:
%env WANDB_PROJECT=NLG_Sweeps
%env WANDB_LOG_MODEL=true

In [None]:
import pprint

# Objective of the sweep.
metric = {'name': 'BLEU', 'goal': 'maximize'}

# Search space: discrete choices, plus a log-uniform range for the learning rate.
parameters_dict = {
    "epochs": {"values": [2, 5, 10, 15, 20]},
    "batch_size": {"values": [4, 8, 16, 32]},
    "learning_rate": {
        "distribution": "log_uniform_values",
        "min": 1e-5,
        "max": 1e-3,
    },
    "weight_decay": {"values": [0.0, 0.01, 0.1, 0.2]},
}

# Assemble the full sweep definition (random search) and show it.
sweep_config = {
    'method': 'random',
    'metric': metric,
    'parameters': parameters_dict,
}
pprint.pprint(sweep_config)

In [None]:
# Initialize the sweep
sweep_id = wandb.sweep(sweep_config, project="NLG_Sweeps")

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained(cfc.checkpoint)

In [18]:
data_collator = DataCollatorForSeq2Seq(tokenizer)

### Training Function for hyperparameter tuning

In [19]:
def train(config=None):
  """Run one sweep trial: fine-tune with the hyperparameters of this run.

  Passed to ``wandb.agent`` as the trial function.

  Args:
    config: Optional config forwarded to ``wandb.init``. The values actually
      used for training are read back from ``wandb.config``.
  """
  with wandb.init(config=config):
    config = wandb.config

    # NOTE(review): every trial writes to the same output_dir; confirm that
    # checkpoints left by an earlier trial cannot interfere with this one.
    args = Seq2SeqTrainingArguments(
      output_dir = "/content/drive/MyDrive/sweeps/BART-Base-sweeps",
      report_to = "wandb",
      run_name="BART-Base",
      evaluation_strategy='steps',
      eval_steps=200,
      logging_strategy="steps",
      logging_steps=200,
      save_strategy="steps",
      save_steps=400,
      per_device_train_batch_size=config.batch_size,
      per_device_eval_batch_size= config.batch_size,
      predict_with_generate=True,
      fp16=False,
      num_train_epochs=config.epochs,
      learning_rate=config.learning_rate,
      weight_decay=config.weight_decay,
      save_total_limit=1,
      load_best_model_at_end=True,
    )

    def model_init():
      # Every trial starts from the untouched pre-trained checkpoint.
      return AutoModelForSeq2SeqLM.from_pretrained(cfc.checkpoint)

    trainer = Seq2SeqTrainer(
      model_init = model_init,
      args=args,
      train_dataset=tokenized_data['train'],
      eval_dataset=tokenized_data['valid'],
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,
    )

    trainer.train()

    # Report the numeric BLEU of the final model under the name the sweep
    # optimizes. compute_metrics returns "BLEU", which evaluate() exposes
    # with the "eval_" prefix.
    final_metrics = trainer.evaluate()
    wandb.log({"BLEU": final_metrics["eval_BLEU"]})
    trainer.save_model()

In [None]:
wandb.agent(sweep_id, train, count=5)

In [21]:
wandb.finish()

# Evaluation on the Test Set

In [21]:
from google.colab import files

In [23]:
!cp /utils/cf_custom_functions.py /content

In [24]:
import pandas as pd
from transformers import AutoModelForSeq2SeqLM
from datasets import load_dataset, load_metric
import evaluate
import cf_custom_functions as cf

### Generate Test Predictions with the Pre-trained Model

In [None]:
pt_tokenizer = AutoTokenizer.from_pretrained(cfc.checkpoint)
pt_model = AutoModelForSeq2SeqLM.from_pretrained(cfc.checkpoint)

In [None]:
prefix = ""

pt_metrics, pt_preds = cf.model_evaluation_on_testset(cfc.test_file_path, pt_model, pt_tokenizer, prefix)
print(pt_metrics)

In [None]:
# NOTE(review): the pre-trained model's metrics are stored under cfc.model_name,
# the same name the fine-tuned metrics are saved under further down. Confirm
# save_evaluation_metrics does not overwrite one entry with the other.
cf.save_evaluation_metrics(cfc.model_name,pt_metrics,"../metrics/NLG_metrics_new.json")

### Generate Test Predictions with the Fine-tuned Model


In [None]:
ft_tokenizer = AutoTokenizer.from_pretrained(cfc.model_dir)
ft_model = AutoModelForSeq2SeqLM.from_pretrained(cfc.model_dir)

In [None]:
prefix = ""

ft_metrics, df_preds_ft = cf.model_evaluation_on_testset(cfc.test_file_path, ft_model, ft_tokenizer, prefix)
print(ft_metrics)

In [None]:
cf.save_evaluation_metrics(cfc.model_name,ft_metrics,"../metrics/NLG_metrics_new.json")

### Generate Test Predictions with the Fine-tuned, Hyperparameter-Tuned Model

In [26]:
# NOTE(review): this loads from cfc.model_dir, the same directory the
# fine-tuned model cell reads. Presumably cfc.model_name is changed between
# runs; verify that the intended checkpoint is loaded here.
ht_tokenizer = AutoTokenizer.from_pretrained(cfc.model_dir)
ht_model = AutoModelForSeq2SeqLM.from_pretrained(cfc.model_dir)

In [None]:
prefix = ""

ht_metrics, df_preds_ht = cf.model_evaluation_on_testset(cfc.test_file_path, ht_model, ht_tokenizer, prefix)
print(ht_metrics)

In [None]:
cf.save_evaluation_metrics(f"{cfc.model_name}+hyper",ht_metrics,"../metrics/NLG_metrics_new.json")