<a href="https://colab.research.google.com/github/Vamsi995/Paraphrase-Generator/blob/master/Paraphrase.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Inferential Transformer: Semantic Alignment Generation from Pre-trained Generative Model**

# **What models can be used**


# **Building the BART model for fine tuning**

## **Install Requirements**

In [None]:
!pip install pytorch-lightning
!pip install transformers
!pip install rouge-scorer

## **Set Up Environemnt and Seed**

In [2]:
import argparse
import os
import random
import pandas as pd
import numpy as np
import torch
import pytorch_lightning as pl

from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BartForConditionalGeneration, BartTokenizer

In [3]:
def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

set_seed(42)

## **Prepare Few-shot Dataset**

In [4]:
train_alignment_pairs = []
val_alignment_pairs = [
  {
    "premise": "A man , a woman , and two dogs are walking on the beach .",
    "source": "Some people and two animals",
    "target": "A man , a woman , and two dogs",
  },
  {
    "premise": "A man with a helmet is riding a bike down the road",
    "source": "along a roadway",
    "target": "down the road",
  },
  {
    "premise": "A student broke a screen today in class",
    "source": "a screen was broken",
    "target": "student broke a screen",
  },
]

In [5]:
import pandas as pd

df = pd.read_csv("sick_alignment.csv")
df.head()

Unnamed: 0,premise,hypothesis,chunks1,chunks2,labels
0,A group of kids is playing in a yard and an ol...,A group of boys in a yard is playing and a man...,"[A group of kids, A group of kids, is playing,...","[and a man, A group of boys, in a yard, is pla...","[unaligned, entailAlign, unaligned, entailAlig..."
1,A group of children is playing in the house an...,A group of kids is playing in a yard and an ol...,"[A group of children, A group of children, is ...","[and an old man, A group of kids, in a yard, i...","[unaligned, entailAlign, unaligned, entailAlig..."
2,The young boys are playing outdoors and the ma...,The kids are playing outdoors near a man with ...,"[The young boys, The young boys, are playing o...","[are playing outdoors, The kids, near a man, a...","[unaligned, entailAlign, unaligned, entailAlig..."
3,The kids are playing outdoors near a man with ...,A group of kids is playing in a yard and an ol...,"[The kids, The kids, outdoors, are playing out...","[and an old man, A group of kids, in a yard, i...","[unaligned, entailAlign, entailAlign, entailAl..."
4,The young boys are playing outdoors and the ma...,A group of kids is playing in a yard and an ol...,"[The young boys, The young boys, are playing o...","[in a yard, A group of kids, in a yard, is pla...","[unaligned, entailAlign, unaligned, entailAlig..."


In [6]:
counter = 0
for row in df.itertuples(index=False):
  premise = row.premise
  sources = row.chunks2.replace("[","").replace("]","").split(", ")
  targets = row.chunks1.replace("[","").replace("]","").split(", ")
  relations = row.labels.replace("[","").replace("]","").split(", ")

  for i in range(len(sources)):
    if relations[i] == "entailAlign":
      example = {
        "premise": premise,
        "source": sources[i],
        "target": targets[i]
      }
      if example["source"] == example["target"]:
        continue
      if counter < 240:
        train_alignment_pairs.append(example)
      else:
        val_alignment_pairs.append(example)
  counter += 1

print(len(train_alignment_pairs))
print(len(val_alignment_pairs))

201
53


In [26]:
import copy

class AlignmentGenerationDataset(Dataset):
    def __init__(self, tokenizer, examples, max_len_inp=96,max_len_out=96):

        self.alignment_pairs = examples

        self.max_len_input = max_len_inp
        self.max_len_output = max_len_out
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []
        self.skippedcount =0
        self._build()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        source_ids = self.inputs[index]["input_ids"].squeeze()
        target_ids = self.targets[index]["input_ids"].squeeze()

        src_mask = self.inputs[index]["attention_mask"].squeeze()
        target_mask = self.targets[index]["attention_mask"].squeeze()

        labels = copy.deepcopy(target_ids)
        labels [labels==0] = -100

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask,"labels":labels}

    def _build(self):
        for inputs in self.alignment_pairs:
            premise = inputs["premise"]
            source = inputs['source']

            input_sent = f"Premise: {premise} <extra_id_0> Source Phrase: {source} <extra_id_1> In Premise , Source Phrase aligns to: "
            ouput_sent = inputs['target']

            # tokenize inputs
            tokenized_inputs = self.tokenizer.batch_encode_plus(
                [input_sent], max_length=self.max_len_input, pad_to_max_length=True, return_tensors="pt"
            )
            # tokenize targets
            tokenized_targets = self.tokenizer.batch_encode_plus(
                [ouput_sent], max_length=self.max_len_output, pad_to_max_length=True,return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)

## **Create Fine-tuner and Model**

In [34]:
from pytorch_lightning.loggers import TensorBoardLogger
from rouge_score import rouge_scorer
from tqdm import tqdm

class MetricsCallback(pl.Callback):
    def __init__(self):
        super().__init__()
        self.metrics = []

    def on_validation_end(self, trainer, pl_module):
        self.metrics.append(trainer.callback_metrics)

class BartFineTuner(pl.LightningModule):
    def __init__(self, args, train_ds, val_ds):
        super(BartFineTuner, self).__init__()
        self.save_hyperparameters(args)
        self.args = args
        self.model = BartForConditionalGeneration.from_pretrained(
            args.model_name_or_path)
        self.tokenizer = BartTokenizer.from_pretrained(
            args.model_name_or_path)
        self.scorer = rouge_scorer.RougeScorer(
            ['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        self.build_dataset(train_ds, val_ds)

    def build_dataset(self, train_ds, val_ds):
        self.train_dataset = AlignmentGenerationDataset(self.tokenizer, train_ds)
        self.validation_dataset = AlignmentGenerationDataset(self.tokenizer, val_ds)

    def forward(self, input_ids,
                attention_mask=None,
                decoder_input_ids=None,
                decoder_attention_mask=None,
                lm_labels=None):
        outputs = self.model(
            input_ids=input_ids,
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
            labels=lm_labels,
        )

        return outputs


    def training_step(self, batch, batch_idx):
        outputs = self.forward(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            decoder_input_ids = batch["target_ids"],
            decoder_attention_mask=batch['target_mask'],
            lm_labels=batch['labels']
        )

        loss = outputs[0]
        self.log('train_loss',loss)
        return loss

    def validation_step(self, batch, batch_idx):
        outputs = self.forward(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            decoder_input_ids = batch["target_ids"],
            decoder_attention_mask=batch['target_mask'],
            lm_labels=batch['labels']
        )

        loss = outputs[0]
        self.log("val_loss",loss)
        return loss

    def save_core_model(self):
        store_path = os.path.join(
            self.args.output_dir,
            self.args.name)
        self.model.save_pretrained(store_path)
        self.tokenizer.save_pretrained(store_path)

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=self.hparams.lr, eps=1e-8)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.hparams.max_epochs * len(self.train_dataset))
        return {'optimizer': optimizer, 'lr_scheduler': scheduler}

    def generate_alignment(self, test_data):
        results = []
        self.model.eval()

        for _, test_example in tqdm(enumerate(test_data)):
            premise = test_example['premise']
            source = test_example['source']
            target = test_example["target"]

            test_sent = f"Premise: {premise} <extra_id_0> Source Phrase: {source} <extra_id_1> In Premise , Source Phrase aligns to: </s>"
            test_tokenized = self.tokenizer.encode_plus(test_sent, return_tensors="pt")

            test_input_ids  = test_tokenized["input_ids"]
            test_attention_mask = test_tokenized["attention_mask"]

            beam_outputs = model.model.generate(
                input_ids=test_input_ids,
                attention_mask=test_attention_mask,
                max_length=64,
                early_stopping=True,
                num_beams=10,
                num_return_sequences=5,
                no_repeat_ngram_size=2
            )

            for beam_output in beam_outputs:
                prediction = self.tokenizer.decode(
                    beam_output,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True)
                scores = self.scorer.score(target, prediction)
                result = {
                    "premise": premise,
                    "source": source,
                    "prediction": prediction,
                    "target": target,
                    "scores": scores
                }
                results.append(result)

        return results

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.hparams.train_batch_size,
            num_workers=0)

    def val_dataloader(self):
        return DataLoader(
            self.validation_dataset,
            batch_size=self.hparams.eval_batch_size,
            num_workers=0)

# **Fine-tune Your Own Inferential T5**

## **Initialize Hyperparameters and Training Arguments**

In [35]:
import argparse

trial_number = 1

args_dict = dict(
    name="inferential-bart-base",
    data_dir="",
    output_dir="./runs",
    model_name_or_path='facebook/bart-base',
    max_seq_length=512,
    lr=3e-4,
    weight_decay=0.0,
    warmup_steps=0,
    train_batch_size=1,
    eval_batch_size=2,
    max_epochs=5,
    gradient_accumulation_steps=16,
    n_gpu=1,
    fp_16=False,
    opt_level='O1',
    max_grad_norm=1.0,
    seed=42,
)

args = argparse.Namespace(**args_dict)

checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=os.path.join(args.output_dir, args.name),
    filename="checkpoint",
    monitor="val_loss",
    mode="min",
    save_top_k=5
)

metrics_callback = MetricsCallback()

train_params = dict(
    #accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    max_epochs=args.max_epochs,
    progress_bar_refresh_rate=10,
    #precision= 16 if args.fp_16 else 32,
    #amp_level=args.opt_level,
    #gradient_clip_val=args.max_grad_norm,
    checkpoint_callback=True,
    callbacks=[metrics_callback, checkpoint_callback],
    logger=TensorBoardLogger(
        os.path.join(args.output_dir, 'logs'),
        name=args.name,
        version=f'trial_{trial_number}')
)

## **Run Training and Validation Loop**

In [36]:
import warnings
warnings.filterwarnings('ignore')

model = BartFineTuner(args, train_alignment_pairs, val_alignment_pairs)
trainer = pl.Trainer(**train_params)

print (" Training model")
trainer.fit(model)
#trainer.test(model)

print ("Saving model ... ")
model.save_core_model()

print("Model Saved")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


 Training model



  | Name  | Type                         | Params
-------------------------------------------------------
0 | model | BartForConditionalGeneration | 139 M 
-------------------------------------------------------
139 M     Trainable params
0         Non-trainable params
139 M     Total params
557.682   Total estimated model params size (MB)


Epoch 4: 100%|██████████| 228/228 [00:14<00:00, 15.46it/s, loss=0.387, v_num=al_1]
Saving model ... 
Model Saved


## **Test On Validation Set**

In [22]:
from utils.py_io import write_json

results = model.generate_alignment(val_alignment_pairs)
write_json(results, "./results_bart.json")

53it [00:52,  1.01it/s]


In [33]:
premise = val_alignment_pairs[0]['premise']
source = val_alignment_pairs[0]['source']
target = val_alignment_pairs[0]["target"]

test_sent = f"Premise: {premise} <extra_id_0> Source Phrase: {source} <extra_id_1> In Premise , Source Phrase aligns to: </s>"
test_tokenized = model.tokenizer.encode_plus(test_sent, return_tensors="pt")

test_input_ids  = test_tokenized["input_ids"]
test_attention_mask = test_tokenized["attention_mask"]

model.model.eval()
beam_outputs = model.model.generate(
    input_ids=test_input_ids,
    attention_mask=test_attention_mask,
    max_length=64,
    early_stopping=True,
    num_beams=10,
    num_return_sequences=5,
    no_repeat_ngram_size=2
)

print(source)
print(target)

for beam_output in beam_outputs:
    sent = model.tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
    print (sent)

Some people and two animals
A man , a woman , and two dogs
in front
A woman
in a woman
in the woman
is
