# Generate clarifying questions based on a Huggingface-BART Model


### Sources:
The following implementation is based on the BART model used in these tutorials: <br>
https://colab.research.google.com/drive/1Cy27V-7qqYatqMA7fEqG2kgMySZXw9I4?usp=sharing&pli=1 <br>
https://towardsdatascience.com/teaching-bart-to-rap-fine-tuning-hugging-faces-bart-model-41749d38f3ef

In [1]:
import transformers
from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler, Dataset
import pandas as pd
import numpy as np
import torch.nn.functional as F
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
import time
import math
import random
import re
import argparse
from rouge import Rouge
from nltk.translate.bleu_score import sentence_bleu
from bert_score import score

In [3]:
# Prefix prepended to the relative model/data paths used in later cells (empty = paths are relative to the working directory)
base_dir = ''

# Initialize the PyTorch Lightning model

In [4]:
class LitModel(pl.LightningModule):
  ''' LightningModule wrapping a seq2seq model (a BartForConditionalGeneration in this notebook) and its tokenizer.

  Args:
    learning_rate: learning rate handed to the Adam optimizer.
    tokenizer: tokenizer whose pad token id is used for the decoder shift, the loss mask and generation.
    model: the wrapped model; must expose get_encoder(), generate() and the
      model.shared / model.encoder / model.decoder attributes used by freeze_embeds().
    hparams: object stored via save_hyperparameters(); freeze_encoder and freeze_embeds are read from it.
  '''

  # Instantiate the model
  def __init__(self, learning_rate, tokenizer, model, hparams):
    super().__init__()
    self.tokenizer = tokenizer
    self.model = model
    self.learning_rate = learning_rate
    self.save_hyperparameters(hparams)

    if self.hparams.freeze_encoder:
      freeze_params(self.model.get_encoder())

    if self.hparams.freeze_embeds:
      self.freeze_embeds()

  def freeze_embeds(self):
    ''' Freeze the shared embedding plus the positional and token embeddings of the encoder and decoder; adapted from finetune.py '''
    freeze_params(self.model.model.shared)
    for d in [self.model.model.encoder, self.model.model.decoder]:
      freeze_params(d.embed_positions)
      freeze_params(d.embed_tokens)

  # Do a forward pass through the model
  def forward(self, input_ids, **kwargs):
    ''' Delegate to the wrapped model, forwarding all keyword arguments unchanged. '''
    return self.model(input_ids, **kwargs)

  def configure_optimizers(self):
    ''' Return an Adam optimizer over all parameters of this module. '''
    optimizer = torch.optim.Adam(self.parameters(), lr = self.learning_rate)
    return optimizer

  def _compute_loss(self, batch):
    ''' Cross-entropy loss for one batch; shared by training_step and validation_step.

    batch is indexed positionally: batch[0] source ids, batch[1] source attention mask, batch[2] target ids.
    '''
    src_ids, src_mask = batch[0], batch[1]
    tgt_ids = batch[2]
    # Use the tokenizer owned by this module rather than a notebook-level global,
    # so the class does not depend on cell execution order.
    pad_token_id = self.tokenizer.pad_token_id

    # Shift the decoder tokens right (but NOT the tgt_ids)
    # NOTE(review): shift_tokens_right is not imported anywhere in the visible notebook; it presumably
    # comes from the transformers BART modeling module - confirm and import it before training.
    decoder_input_ids = shift_tokens_right(tgt_ids, pad_token_id)

    # Run the model and get the logits
    outputs = self(src_ids, attention_mask=src_mask, decoder_input_ids=decoder_input_ids, use_cache=False)
    lm_logits = outputs[0]

    # Padding positions in the target are excluded from the loss
    ce_loss_fct = torch.nn.CrossEntropyLoss(ignore_index=pad_token_id)
    # Calculate the loss on the un-shifted tokens
    return ce_loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), tgt_ids.view(-1))

  def training_step(self, batch, batch_idx):
    ''' Compute the training loss for one batch and return it under the 'loss' key. '''
    return {'loss': self._compute_loss(batch)}

  def validation_step(self, batch, batch_idx):
    ''' Compute the validation loss for one batch and return it under the 'loss' key. '''
    return {'loss': self._compute_loss(batch)}

  # Method that generates text using the BartForConditionalGeneration's generate() method
  def generate_text(self, text, eval_beams, early_stopping = True, max_len = 40):
    ''' Generate text with beam search and decode it.

    Args:
      text: mapping with "input_ids" and "attention_mask" tensors.
      eval_beams: number of beams passed to generate().
      early_stopping: forwarded to generate().
      max_len: maximum generated length, forwarded to generate() as max_length.

    Returns:
      A list with one decoded string per generated sequence (special tokens stripped).
    '''
    # Inputs must live on the same device as the weights, otherwise generate() fails
    # as soon as the module has been moved to a GPU.
    input_ids = text["input_ids"].to(self.device)
    attention_mask = text["attention_mask"].to(self.device)

    generated_ids = self.model.generate(
        input_ids,
        attention_mask=attention_mask,
        use_cache=True,
        decoder_start_token_id = self.tokenizer.pad_token_id,
        num_beams= eval_beams,
        max_length = max_len,
        early_stopping = early_stopping
    )
    return [self.tokenizer.decode(w, skip_special_tokens=True, clean_up_tokenization_spaces=True) for w in generated_ids]

def freeze_params(model):
  ''' Freeze every parameter of a model (or part of a model) so it is excluded from gradient updates.

  Adapted from finetune.py.

  Args:
    model: any object exposing parameters(), e.g. a torch.nn.Module.
  '''
  for param in model.parameters():
    # The attribute is `requires_grad`; a misspelled name would only attach an unused
    # attribute to the tensor and leave the parameter trainable.
    param.requires_grad = False


In [5]:
# Hyperparameters handed to the model.
# A Namespace is not really meant to be filled by hand like this, but keeping the
# values in one visible place makes them easy to edit when needed.
hparams = argparse.Namespace(
    freeze_encoder = True,
    freeze_embeds = True,
    eval_beams = 4,
)

In [6]:
# Load the pretrained tokenizer and model weights
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, BartConfig

# Tokenizer and model are loaded from the same pretrained checkpoint id
bart_checkpoint = 'facebook/bart-base'

tokenizer = BartTokenizer.from_pretrained(bart_checkpoint, add_prefix_space=True)
bart_model = BartForConditionalGeneration.from_pretrained(bart_checkpoint)

# Load the trained Model

In [8]:
# Restore the fine-tuned weights; the constructor arguments are passed again as keyword arguments
checkpoint_path = base_dir + "../Models/hf_v2_6_Model.ckpt"
model_loaded = LitModel.load_from_checkpoint(
    checkpoint_path,
    learning_rate = 2e-5,
    tokenizer = tokenizer,
    model = bart_model,
    hparams = hparams,
)

# Getting BART to predict some questions


In [7]:
def generate_prediction(seed_line, model_, eval_beams = 8):
  ''' Generate text for one prompt line with the given model.

  Args:
    seed_line: the prompt string to tokenize (truncated to 192 tokens).
    model_: a model exposing to(), eval() and generate_text().
    eval_beams: number of beams passed to generate_text(); defaults to 8.

  Returns:
    Whatever model_.generate_text() returns for the tokenized prompt.
  '''
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  # Put the model on eval mode
  model_.to(device)
  model_.eval()

  prompt_line_tokens = tokenizer(seed_line, max_length = 192, return_tensors = "pt", truncation = True)
  # The model was just moved to `device`; the inputs have to follow it, otherwise
  # generation fails with a device mismatch whenever CUDA is available.
  prompt_line_tokens = {name: tensor.to(device) for name, tensor in prompt_line_tokens.items()}

  return model_.generate_text(prompt_line_tokens, eval_beams = eval_beams)

### Some example predictions

In [9]:
seed_line = "Samsung | Stockmarket, CEO, Devices, Headquarter"
line_pred = generate_prediction(seed_line, model_loaded)

# Echo the prompt, then the generated output, one per line
print(seed_line, line_pred, sep = "\n")

Samsung | Stockmarket, CEO, Devices, Headquarter
[' What would you like to know about Samsung?']


In [10]:
seed_line = "Samsung | Television , Smartphone, Soundbox , Computer , Vaccum"
line_pred = generate_prediction(seed_line, model_loaded)

# Echo the prompt, then the generated output, one per line
print(seed_line, line_pred, sep = "\n")

Samsung | Television , Smartphone, Soundbox , Computer , Vaccum
[' Which "Samsung" do you mean?']


In [11]:
seed_line = "mercedes cla class convertible | exterior , interior , engine , prices , competition"
line_pred = generate_prediction(seed_line, model_loaded)

# Echo the prompt, then the generated output, one per line
print(seed_line, line_pred, sep = "\n")

mercedes cla class convertible | exterior , interior , engine , prices , competition
[' What do you want to know about this vehicle?']


In [12]:
seed_line = "Selena Gomez | Age , Birthday , Albums , Livingplace"
line_pred = generate_prediction(seed_line, model_loaded)

# Echo the prompt, then the generated output, one per line
print(seed_line, line_pred, sep = "\n")

Selena Gomez | Age , Birthday , Albums , Livingplace
[' What do you want to know about the singer?']


### Predict the test dataset
And calculate some metrics for evaluation

In [13]:
# Load the test dataset
test_csv_path = base_dir + '../Data/Dataset_v2_6_20f_test.csv'
df = pd.read_csv(test_csv_path)

In [16]:
# Add empty-string placeholder columns for the prediction and every metric
prediction_and_bleu_columns = ['predicted', 'Blue', 'Blue_1gram', 'Blue_2gram', 'Blue_3gram']
rouge_columns = ['rouge_1_r', 'rouge_1_p', 'rouge_1_f',
                 'rouge_2_r', 'rouge_2_p', 'rouge_2_f',
                 'rouge_l_r', 'rouge_l_p', 'rouge_l_f']
bert_columns = ['bert_p', 'bert_r', 'bert_f1']

for column_name in prediction_and_bleu_columns + rouge_columns + bert_columns:
    df[column_name] = ''

In [17]:
start_time = time.time()

# Build the ROUGE scorer once instead of once per row.
rouge = Rouge()

# Results are collected per row and written to the frame in one go after the loop.
# Writing through chained indexing (df.loc[i][col] = value) may assign to a temporary
# copy of the row and silently leave df unchanged.
predictions = []
metric_rows = []

for i in range(len(df)):
    line = df.iloc[i]['source']
    question_true = df.iloc[i]['target']
    question_pred = generate_prediction(seed_line = line, model_ = model_loaded)

    # sentence_bleu takes (references, hypothesis): the ground-truth question is the
    # single reference and the generated question is the hypothesis being scored.
    # No smoothing is applied, so a missing n-gram order yields 0 (NLTK warns about it).
    bleu_references = [question_true.split()]
    bleu_hypothesis = question_pred[0].split()

    # get_scores is called with the prediction first and the target second;
    # the scores for this single pair are the first list entry.
    rouge_res = rouge.get_scores(question_pred[0], question_true)[0]

    predictions.append(question_pred)
    metric_rows.append({
        'Blue': sentence_bleu(bleu_references, bleu_hypothesis),
        'Blue_1gram': sentence_bleu(bleu_references, bleu_hypothesis, weights=(1, 0, 0, 0)),
        'Blue_2gram': sentence_bleu(bleu_references, bleu_hypothesis, weights=(0, 1, 0, 0)),
        'Blue_3gram': sentence_bleu(bleu_references, bleu_hypothesis, weights=(0, 0, 1, 0)),
        'rouge_1_r': rouge_res["rouge-1"]['r'],
        'rouge_1_p': rouge_res["rouge-1"]['p'],
        'rouge_1_f': rouge_res["rouge-1"]['f'],
        'rouge_2_r': rouge_res["rouge-2"]['r'],
        'rouge_2_p': rouge_res["rouge-2"]['p'],
        'rouge_2_f': rouge_res["rouge-2"]['f'],
        'rouge_l_r': rouge_res["rouge-l"]['r'],
        'rouge_l_p': rouge_res["rouge-l"]['p'],
        'rouge_l_f': rouge_res["rouge-l"]['f'],
    })

# 'predicted' keeps the list returned by generate_prediction, one list per row.
# Rows were read positionally, so results are aligned to df.index explicitly.
df['predicted'] = pd.Series(predictions, index = df.index, dtype = object)
metrics = pd.DataFrame(metric_rows, index = df.index)
for column_name in metrics.columns:
    df[column_name] = metrics[column_name]

print("--- %s seconds ---" % (time.time() - start_time))

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


--- 1801.9053971767426 seconds ---


In [18]:
# A 'predicted' cell may hold the one-element list returned by generate_prediction;
# unwrap it so every candidate is a plain string.
bert_candidates = [pred[0] if isinstance(pred, (list, tuple)) else pred
                   for pred in df['predicted'].values.tolist()]
bert_references = df['target'].values.tolist()

# score() takes (candidates, references): the generated questions are the candidates and
# the ground-truth questions the references. Swapping them swaps precision and recall.
P, R, F1 = score(bert_candidates, bert_references, lang="en", verbose=True)

# Store plain Python floats rather than tensor objects in the frame
df['bert_p'] = P.tolist()
df['bert_r'] = R.tolist()
df['bert_f1'] = F1.tolist()

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


calculating scores...
computing bert embedding.


  0%|          | 0/35 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/30 [00:00<?, ?it/s]

done in 87.96 seconds, 21.67 sentences/sec


In [19]:
# Write the evaluation results next to the input data; the path is prefixed with
# base_dir like the other model/data paths in this notebook.
df.to_csv(base_dir + "../Data/hf_v2_6_evalResults_20f.csv", sep=',', index=False)