In [None]:
!pip install -q transformers datasets
!pip install -q evaluate
!pip install -q sacrebleu rouge_score jiwer
!pip install --upgrade accelerate -q
!pip install wandb -Uqq

In [2]:
from datasets import load_dataset, Image
from PIL import Image
import torch

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Pick the compute device once: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report which device was selected.
if device.type == "cuda":
  print("There are %d GPU(s) available." % torch.cuda.device_count())
  print("We will use the GPU:", torch.cuda.get_device_name(0))
else:
  print("No GPU available, using the CPU instead.")

There are 1 GPU(s) available.
We will use the GPU: Tesla V100-SXM2-16GB


## ***Configuration Class***

In [4]:
class cfc:
  """Central configuration for the notebook: data locations, model naming,
  experiment tracking and training hyperparameters."""

  # --- Data locations ---
  data_files_dir = "/data/datafiles/"
  img_dir = "/data/images_formulas/"
  test_file_path = data_files_dir + "test_data.json"

  # --- Model identity and output directory ---
  model_name = "TrOCR-Base_image-to-text-pad-16"
  model_dir = "/content/drive/MyDrive/models/" + model_name

  # --- Experiment tracking (Weights & Biases) ---
  wandb_project = "VLM"
  run_name = model_name  # the run is named after the model

  # --- Hyperparameters ---
  num_epochs = 6
  batch_size = 16
  learning_rate = 5e-5
  weight_decay = 0.01

# ***Data preprocessing***

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; cfc.model_dir points below this mount,
# so it must be mounted before any model is written or read.
drive.mount('/content/drive')

In [None]:
# One CSV per split, all named "<split>_data.csv" inside cfc.data_files_dir.
data_files = {
    split: f"{cfc.data_files_dir}{split}_data.csv"
    for split in ("train", "valid", "test")
}

# Load the three CSV files as a single dataset dict keyed by split name.
data = load_dataset("csv", data_files=data_files)

In [None]:
import torch
from torch.utils.data import Dataset
from PIL import Image

In [None]:
from transformers import TrOCRProcessor
# Processor used throughout the notebook: it is called on PIL images to obtain
# `pixel_values` and exposes the `.tokenizer` used to encode/decode label text.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")

In [None]:
class IAMDataset(Dataset):
    """Map-style dataset that turns (image file, label text) rows into model inputs.

    Args:
        root_dir: Prefix that is string-concatenated with each ``image_name``
            to form the image path (so it should end with a path separator).
        df: Table-like object supporting ``len(df)`` and ``df[column][idx]``
            with the columns ``image_name`` and ``label``.
        processor: Callable image processor exposing a ``.tokenizer``
            (e.g. a ``TrOCRProcessor``).
        max_target_length: Exact length of every returned label sequence;
            shorter texts are padded, longer ones are truncated.
    """

    def __init__(self, root_dir, df, processor, max_target_length=128):
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{"pixel_values": Tensor, "labels": LongTensor}`` for row ``idx``."""
        # get file name + text
        file_name = self.df['image_name'][idx]
        text = self.df['label'][idx]

        # prepare image (i.e. resize + normalize); the context manager closes
        # the underlying file handle as soon as the RGB copy has been made
        with Image.open(self.root_dir + file_name) as raw_image:
            image = raw_image.convert("RGB")
        pixel_values = self.processor(image, return_tensors="pt").pixel_values

        # add labels (input_ids) by encoding the text.
        # truncation=True is required: without it a text longer than
        # max_target_length yields a longer id list, and the resulting ragged
        # label tensors cannot be stacked into a batch by the collator.
        labels = self.processor.tokenizer(text,
                                          padding="max_length",
                                          truncation=True,
                                          max_length=self.max_target_length).input_ids

        # important: make sure that PAD tokens are ignored by the loss function
        pad_token_id = self.processor.tokenizer.pad_token_id
        labels = [-100 if token_id == pad_token_id else token_id for token_id in labels]

        # squeeze(0) drops only the leading batch axis added by return_tensors="pt"
        return {"pixel_values": pixel_values.squeeze(0), "labels": torch.tensor(labels)}

In [None]:
# Build one IAMDataset per split; all share the image directory and processor.
train_dataset, eval_dataset, test_dataset = (
    IAMDataset(root_dir=cfc.img_dir, df=data[split], processor=processor)
    for split in ("train", "valid", "test")
)

### ***Define the evaluation metrics***

In [None]:
import numpy as np
import evaluate

# Load the three text-similarity metrics used by compute_metrics.
bleu, ter, rouge = map(evaluate.load, ("bleu", "ter", "rouge"))

def compute_metrics(eval_preds):
  """Decode predictions/labels and score them with BLEU, TER and ROUGE.

  Args:
    eval_preds: Object with ``predictions`` and ``label_ids`` attributes
      holding token-id arrays (``predictions`` may also be a tuple whose
      first element is the token-id array).

  Returns:
    dict with the keys ``BLEU``, ``TER``, ``TER-ACC`` (``1 - TER/100``),
    ``ROUGE-1``, ``ROUGE-2`` and ``ROUGE-L``.
  """
  pad_token_id = processor.tokenizer.pad_token_id

  pred_ids = eval_preds.predictions
  if isinstance(pred_ids, tuple):
    pred_ids = pred_ids[0]
  pred_ids = np.asarray(pred_ids)
  labels_ids = np.asarray(eval_preds.label_ids)

  # -100 cannot be decoded. It marks ignored label positions and can also
  # appear in predictions that were padded while being gathered across
  # batches. np.where builds new arrays, so the caller's data is not mutated.
  pred_ids = np.where(pred_ids == -100, pad_token_id, pred_ids)
  labels_ids = np.where(labels_ids == -100, pad_token_id, labels_ids)

  pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
  label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)

  bleu_res = bleu.compute(predictions=pred_str, references=label_str)
  ter_res = ter.compute(predictions=pred_str, references=label_str)
  rouge_res = rouge.compute(predictions=pred_str, references=label_str)

  # TER is reported on a 0-100 scale here; convert it to an accuracy-like value.
  ter_acc = 1 - (ter_res["score"] / 100)

  return {
      "BLEU": bleu_res["bleu"],
      "TER" : ter_res["score"],
      "TER-ACC" : ter_acc,
      "ROUGE-1" : rouge_res["rouge1"],
      "ROUGE-2" : rouge_res["rouge2"],
      "ROUGE-L" : rouge_res["rougeL"],
  }

# ***Fine-tuning of the model***

### ***Weights and Biases Configuration***

In [None]:
import wandb
# Authenticate with Weights & Biases before a run is started.
wandb.login()

In [None]:
# Start the W&B run for this experiment; project and run name come from cfc,
# the config entries are free-form tags describing the setup.
wandb.init(
    name=cfc.run_name,
    project=cfc.wandb_project,
    config=dict(architecture="TrOCR-Base-pad", dataset="Formula2Text-4k"),
)

In [None]:
from transformers import VisionEncoderDecoderModel
# Load the stage-1 TrOCR checkpoint; this is the model that gets fine-tuned below.
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-stage1")

In [None]:
# Special-token ids and vocabulary size, taken from the processor's tokenizer
# and from the decoder's own config.
model.config.update({
    "decoder_start_token_id": processor.tokenizer.cls_token_id,
    "pad_token_id": processor.tokenizer.pad_token_id,
    "eos_token_id": processor.tokenizer.sep_token_id,
    "vocab_size": model.config.decoder.vocab_size,
})

# Beam search parameters.
# NOTE(review): max_length here (64) is shorter than IAMDataset's default
# max_target_length (128) -- confirm this is intended.
model.config.update({
    "num_beams": 4,
    "max_length": 64,
    "early_stopping": True,
    "no_repeat_ngram_size": 3,
    "length_penalty": 2.0,
})

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir=cfc.model_dir,
    # optimisation
    num_train_epochs=cfc.num_epochs,
    learning_rate=cfc.learning_rate,
    weight_decay=cfc.weight_decay,
    per_device_train_batch_size=cfc.batch_size,
    per_device_eval_batch_size=cfc.batch_size,
    fp16=True,
    # evaluation: step-based, with generated sequences passed to the metrics
    predict_with_generate=True,
    evaluation_strategy="steps",
    eval_steps=200,
    # checkpointing: step-based; save_steps is a multiple of eval_steps
    save_strategy="steps",
    save_steps=400,
    save_total_limit=1,
    load_best_model_at_end=True,
    # logging
    logging_steps=200,
    report_to="wandb",
)

In [None]:
from transformers import default_data_collator

# Wire the model, training arguments, datasets, collator and metric callback
# into a single trainer object.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=processor.feature_extractor,
    data_collator=default_data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

# Run fine-tuning, then save the resulting model.
trainer.train()
trainer.save_model()

In [None]:
# Close the W&B run opened with wandb.init() now that training is done.
wandb.finish()

# ***Evaluation on the Test Set***

In [None]:
from google.colab import files

In [None]:
# Copy the project's helper module into /content so that the
# `import cf_custom_functions` in the next cell can find it
# (presumably /content is on the import path in this environment -- confirm).
!cp /utils/cf_custom_functions.py /content

In [None]:
import cf_custom_functions as cf
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# Load the test split through the project helper. It is later passed to
# generate_VLM_predictions, which reads its "image_name" column.
# NOTE(review): this reads test_data.json, while the training pipeline reads
# test_data.csv -- confirm both files describe the same samples.
df_test = cf.load_test_data(cfc.test_file_path)

### ***Generate Test Predictions with the Pre-trained Model***



In [None]:
from transformers import TrOCRProcessor
from transformers import VisionEncoderDecoderModel

# Baseline: the same processor and stage-1 checkpoint that training started
# from, loaded fresh so that no fine-tuned weights are involved.
processor_pt = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
model_pt = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-stage1')

In [None]:
# Special-token ids and vocabulary size for the baseline model, mirroring the
# configuration applied to the model that was fine-tuned.
model_pt.config.update({
    "decoder_start_token_id": processor_pt.tokenizer.cls_token_id,
    "pad_token_id": processor_pt.tokenizer.pad_token_id,
    "eos_token_id": processor_pt.tokenizer.sep_token_id,
    "vocab_size": model_pt.config.decoder.vocab_size,
})

# Beam search parameters.
model_pt.config.update({
    "num_beams": 4,
    "max_length": 64,
    "early_stopping": True,
    "no_repeat_ngram_size": 3,
    "length_penalty": 2.0,
})

In [None]:
def generate_VLM_predictions(test_data:pd.DataFrame, model:object, processor:object, IMG_DIR:str,
                             max_length:int=50, verbose:bool=True) -> pd.DataFrame:
  """Generate one text prediction per row of ``test_data``.

  Args:
    test_data: Frame with an ``image_name`` column; it is not modified.
    model: Object exposing ``generate(pixel_values=..., max_length=...)``.
      If it has a ``device`` attribute, inputs are moved to that device.
    processor: Callable returning an object with ``pixel_values`` for an
      image, and providing ``batch_decode`` for generated ids.
    IMG_DIR: Prefix that is string-concatenated with each ``image_name``.
    max_length: Maximum generation length forwarded to ``model.generate``
      (defaults to 50, the previously hard-coded value).
    verbose: When True (default), print each prediction as it is produced.

  Returns:
    A copy of ``test_data`` with an added ``prediction`` column.
  """
  df = test_data.copy()
  # Inputs must live on the same device as the model's weights.
  model_device = getattr(model, "device", None)
  y_preds = []

  for image_name in df["image_name"]:
    # Close the file handle as soon as the RGB copy exists.
    with Image.open(IMG_DIR + image_name) as raw_image:
      image = raw_image.convert('RGB')

    pixel_values = processor(images=image, return_tensors="pt").pixel_values
    if model_device is not None:
      pixel_values = pixel_values.to(model_device)

    generated_ids = model.generate(pixel_values=pixel_values, max_length=max_length)
    generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    if verbose:
      print(generated_caption)
    y_preds.append(generated_caption)

  # A plain list keeps the column well-typed even when there are no rows.
  df["prediction"] = y_preds
  return df

In [None]:
# Predict on the test set with the baseline (not fine-tuned) model, then run
# the project's post-processing helper on the raw predictions.
# NOTE(review): the next cell reads a "clean_prediction" column, presumably
# added by post_processing_multi_predictions -- confirm in cf_custom_functions.
df_preds_pt = generate_VLM_predictions(df_test,model_pt,processor_pt,cfc.img_dir)
df_preds_pt_clean = cf.post_processing_multi_predictions(df_preds_pt)

In [None]:
# Score the cleaned baseline predictions and store them under the key
# "<model_name>_pretrained" via the project helper.
# NOTE(review): the metrics path is relative to the current working directory.
metrics_pt = cf.compute_evaluation_metrics(df_preds_pt_clean,"clean_prediction")
cf.save_evaluation_metrics(f"{cfc.model_name}_pretrained",metrics_pt,"../metrics/VLM_metrics_new.json")

### ***Generate Test Predictions with the Fine-tuned Model***

In [None]:
from transformers import TrOCRProcessor
from transformers import VisionEncoderDecoderModel

# Fine-tuned model: weights are read back from cfc.model_dir (the trainer's
# output directory); the processor is the same hub checkpoint used in training.
processor_ft = TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed')
model_ft = VisionEncoderDecoderModel.from_pretrained(cfc.model_dir)

In [None]:
# Special-token ids and vocabulary size for the fine-tuned model, mirroring
# the configuration used during training.
model_ft.config.update({
    "decoder_start_token_id": processor_ft.tokenizer.cls_token_id,
    "pad_token_id": processor_ft.tokenizer.pad_token_id,
    "eos_token_id": processor_ft.tokenizer.sep_token_id,
    "vocab_size": model_ft.config.decoder.vocab_size,
})

# Beam search parameters.
model_ft.config.update({
    "num_beams": 4,
    "max_length": 64,
    "early_stopping": True,
    "no_repeat_ngram_size": 3,
    "length_penalty": 2.0,
})

In [None]:
# Predict on the test set with the fine-tuned model, then run the same
# project post-processing helper that was applied to the baseline predictions.
df_preds_ft = generate_VLM_predictions(df_test,model_ft,processor_ft,cfc.img_dir)
df_preds_ft_clean = cf.post_processing_multi_predictions(df_preds_ft)

In [None]:
# Score the cleaned fine-tuned predictions and store them under the key
# "<model_name>_finetuned" in the same metrics file as the baseline.
# NOTE(review): the metrics path is relative to the current working directory.
metrics_ft = cf.compute_evaluation_metrics(df_preds_ft_clean,"clean_prediction")
cf.save_evaluation_metrics(f"{cfc.model_name}_finetuned",metrics_ft,"../metrics/VLM_metrics_new.json")