Installing dependencies

In [None]:
#!pip install -q datasets
!pip install -q transformers 
!pip install -q rouge_score
!pip install -q evaluate
     

Importing required modules

In [2]:
import pandas as pd
import nltk
import evaluate
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_metric, Dataset
from nltk.tokenize import sent_tokenize
from sklearn.model_selection import train_test_split
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
rouge_score = evaluate.load("rouge")

Loading dataset for training. Since our dataset is small, we use dev, train and test for training

In [4]:
df = pd.read_json('dataset/nt_train.json')
df = df.append(pd.read_json('dataset/nt_dev.json'))
df = df.append(pd.read_json('dataset/t_train.json'))
df = df.append(pd.read_json('dataset/nt_test.json'))
df = df.append(pd.read_json('dataset/t_test.json'))
df = df.append(pd.read_json('dataset/nt_wh_test.json'))
df = df.append(pd.read_json('dataset/nt_wh_train.json'))
df = df.append(pd.read_json('dataset/nt_wh_dev.json'))
df = df[['query','question']]
sp_df = pd.read_json('dataset/train_spider.json')
sp_df = sp_df[['query','question']]

df = df.append(sp_df)

#val_df = pd.read_json('dataset/nt_test.json')
#val_df = val_df.append(pd.read_json('dataset/t_test.json'))
df.info()

  df = df.append(pd.read_json('dataset/nt_dev.json'))
  df = df.append(pd.read_json('dataset/t_train.json'))
  df = df.append(pd.read_json('dataset/nt_test.json'))
  df = df.append(pd.read_json('dataset/t_test.json'))
  df = df.append(pd.read_json('dataset/nt_wh_test.json'))
  df = df.append(pd.read_json('dataset/nt_wh_train.json'))
  df = df.append(pd.read_json('dataset/nt_wh_dev.json'))


<class 'pandas.core.frame.DataFrame'>
Int64Index: 7456 entries, 0 to 6999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   query     7456 non-null   object
 1   question  7456 non-null   object
dtypes: object(2)
memory usage: 174.8+ KB


  df = df.append(sp_df)


Splitting the dataset for training and validation

In [5]:
train_df, val_df = train_test_split(df,test_size = 0.1, random_state = 21)
print(train_df.shape)
print(val_df.shape)

(6710, 2)
(746, 2)


In [6]:
# Converting the pandas dataframe to huggingface datasets and drooping the index columns generated
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
train_dataset = train_dataset.remove_columns(['__index_level_0__'])
val_dataset = val_dataset.remove_columns(['__index_level_0__'])
print(train_dataset)
print(val_dataset)

Dataset({
    features: ['query', 'question'],
    num_rows: 6710
})
Dataset({
    features: ['query', 'question'],
    num_rows: 746
})


In [7]:
# downloading model
from transformers import AutoTokenizer
model_checkpoint = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [8]:
# Preprocessing and defining the sequence length for training the models
max_input_length = 768
max_target_length = 768


def preprocess_function(examples):
    model_inputs = tokenizer(
        examples['question'],
        max_length=max_input_length,
        truncation=False, padding = 'max_length'
    )
    labels = tokenizer(text = examples['query'], max_length=max_target_length, truncation=True,padding = 'max_length')
    model_inputs["labels"] = labels["input_ids"]
    model_inputs["labels_mask"] = labels["attention_mask"]
    return model_inputs

In [9]:
tokenized_train_datasets = train_dataset.map(preprocess_function, batched=True)



  0%|          | 0/7 [00:00<?, ?ba/s]

In [10]:
tokenized_val_datasets = val_dataset.map(preprocess_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [11]:
tokenized_train_datasets = tokenized_train_datasets.remove_columns(['query','question'])
tokenized_val_datasets = tokenized_val_datasets.remove_columns(['query','question'])

In [12]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [13]:
data_collator = DataCollatorForSeq2Seq(model = model,tokenizer = tokenizer,label_pad_token_id=-100)
     


Defining the training arguments for training the model

In [14]:
batch_size = 4
learning_rate = 1e-5
args = Seq2SeqTrainingArguments(
    "test-summary",
    evaluation_strategy = "epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=False,
    push_to_hub = False
)

In [15]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    result = {key: value * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

In [16]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_val_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,

)

Training the model

In [17]:
import torch,os

device = torch.device("cuda")
model.cuda()
torch.cuda.empty_cache()
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
trainer.train()

The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: labels_mask. If labels_mask are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 6710
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 5034


RuntimeError: CUDA out of memory. Tried to allocate 108.00 MiB (GPU 0; 7.80 GiB total capacity; 6.67 GiB already allocated; 90.44 MiB free; 6.79 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

Saving the model

In [None]:

model.save_pretrained('test-summary',push_to_hub=False)
tokenizer.save_pretrained('test-summary',push_to_hub=False)

Loading the saved model and generating predictions for NL Query

In [None]:
trained_model = AutoModelForSeq2SeqLM.from_pretrained("test-summary")
trained_tokenizer = AutoTokenizer.from_pretrained("test-summary")
#trained_model = model
#trained_tokenizer = tokenizer

#query = "<s> which employee works for Sales or Marketing? </s>"
query = "<s> What are the names of customers who bought IFONE? </s>"

encoding = trained_tokenizer(query, return_tensors="pt")


device = torch.device("cuda")

trained_model.to(device)

output = trained_model.generate(
            input_ids=encoding["input_ids"].to(device), attention_mask=encoding["attention_mask"].to(device)
        )
trained_tokenizer.decode(output[0]).replace("<pad> ", "", 1).replace("</s>", "").replace("<s>", "")