In [1]:
!nvidia-smi

Sun Nov 28 17:27:54 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  On   | 00000000:05:00.0 Off |                    0 |
| N/A   53C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [2]:
#!pip install --quiet transformers==4.1.1

In [3]:
#!pip install --quiet https://github.com/PyTorchLightning/pytorch-lightning/releases/download/1.2.6/pytorch-lightning-1.2.6.tar.gz

In [4]:
#!pip install --quiet tokenizers==0.9.4

In [5]:
#!pip install --quiet sentencepiece==0.1.94

In [6]:
#!pip install gdown

In [7]:
import gdown as gdown
import numpy as np
import pandas as pd
import math
import glob
import argparse
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import tensorflow as tf
from termcolor import colored
import textwrap

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

In [8]:
import pytorch_lightning as pl

In [9]:
physical_devices = tf.config.experimental.list_physical_devices('GPU')
print("Number of GPU's available: ", len(physical_devices))

if(len(physical_devices)>0):
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

Number of GPU's available:  0


In [10]:
pl.seed_everything(0)

Global seed set to 0


0

In [11]:
#! unzip - bio-QA.zip

In [12]:
with Path("BioASQ/BioASQ-train-factoid-4b.json").open() as json_file:
    data = json.load(json_file)

In [13]:
data.keys()

dict_keys(['data', 'version'])

In [14]:
data['version']

'BioASQ6b'

In [15]:
len(data['data'])

1

In [16]:
data['data'][0].keys()

dict_keys(['paragraphs', 'title'])

In [17]:
data['data'][0]['title']

'BioASQ6b'

In [18]:
len(data['data'][0]['paragraphs'])

3266

In [19]:
questions = data['data'][0]['paragraphs']

In [20]:
questions[0]

{'qas': [{'id': '52bf208003868f1b06000019_002',
   'question': 'What is the inheritance pattern of Li–Fraumeni syndrome?',
   'answers': [{'text': 'autosomal dominant', 'answer_start': 213}]}],
 'context': 'Balanced t(11;15)(q23;q15) in a TP53+/+ breast cancer patient from a Li-Fraumeni syndrome family. Li-Fraumeni Syndrome (LFS) is characterized by early-onset carcinogenesis involving multiple tumor types and shows autosomal dominant inheritance. Approximately 70% of LFS cases are due to germline mutations in the TP53 gene on chromosome 17p13.1. Mutations have also been found in the CHEK2 gene on chromosome 22q11, and others have been mapped to chromosome 11q23. While characterizing an LFS family with a documented defect in TP53, we found one family member who developed bilateral breast cancer at age 37 yet was homozygous for wild-type TP53. Her mother also developed early-onset primary bilateral breast cancer, and a sister had unilateral breast cancer and a soft tissue sarcoma. Cytog

In [21]:
 def extract_questions_and_answers(path = Path):
    with path.open() as json_file:
        data = json.load(json_file)
        
    questions = data['data'][0]['paragraphs']
     
    data_rows = []
    
    for question in questions:
        context = question['context']

        for question_and_answers in question['qas']:
            question = question_and_answers['question']
            answers = question_and_answers['answers']

            for answer in answers:
                answer_text = answer['text']
                answer_start = answer['answer_start']
                answer_end = answer['answer_start'] + len(answer_text)
                    
                data_rows.append({
                     "question" : question,
                     "context"  : context,
                     "answer_text" : answer_text,
                     "answer_start" : answer_start,
                     "answer_end" : answer_end
                     })
    return pd.DataFrame(data_rows)

In [22]:
extract_questions_and_answers(Path("BioASQ/BioASQ-train-factoid-4b.json")).head()

Unnamed: 0,question,context,answer_text,answer_start,answer_end
0,What is the inheritance pattern of Li–Fraumeni...,Balanced t(11;15)(q23;q15) in a TP53+/+ breast...,autosomal dominant,213,231
1,What is the inheritance pattern of Li–Fraumeni...,Genetic modeling of Li-Fraumeni syndrome in ze...,autosomal dominant,105,123
2,Which type of lung cancer is afatinib used for?,Clinical perspective of afatinib in non-small ...,EGFR-mutant NSCLC,1203,1220
3,Which hormone abnormalities are characteristic...,"DOCA sensitive pendrin expression in kidney, h...",thyroid,419,426
4,Which hormone abnormalities are characteristic...,Clinical and molecular characteristics of Pend...,thyroid,705,712


In [23]:
paths = sorted(list(Path("BioASQ/").glob("BioASQ-train-*")))
paths

[PosixPath('BioASQ/BioASQ-train-factoid-4b.json'),
 PosixPath('BioASQ/BioASQ-train-factoid-5b.json'),
 PosixPath('BioASQ/BioASQ-train-factoid-6b.json')]

In [24]:
df_new = []

for path in paths:
    df_new.append(extract_questions_and_answers(path))
    
df_bioasq = pd.concat(df_new)

In [25]:
df_bioasq.head()

Unnamed: 0,question,context,answer_text,answer_start,answer_end
0,What is the inheritance pattern of Li–Fraumeni...,Balanced t(11;15)(q23;q15) in a TP53+/+ breast...,autosomal dominant,213,231
1,What is the inheritance pattern of Li–Fraumeni...,Genetic modeling of Li-Fraumeni syndrome in ze...,autosomal dominant,105,123
2,Which type of lung cancer is afatinib used for?,Clinical perspective of afatinib in non-small ...,EGFR-mutant NSCLC,1203,1220
3,Which hormone abnormalities are characteristic...,"DOCA sensitive pendrin expression in kidney, h...",thyroid,419,426
4,Which hormone abnormalities are characteristic...,Clinical and molecular characteristics of Pend...,thyroid,705,712


In [26]:
df_bioasq.shape

(12988, 5)

In [27]:
df_bioasq = df_bioasq.drop_duplicates(subset = ["context"]).reset_index(drop=True)

In [28]:
df_bioasq.shape

(2582, 5)

In [29]:
len(df_bioasq.question.unique())

441

In [30]:
len(df_bioasq.context.unique())

2582

In [31]:
sample_question = df_bioasq.iloc[240]
sample_question

question        What is the characteristic feature of the Dyke...
context         Left hemisphere and male sex dominance of cere...
answer_text                                  cerebral hemiatrophy
answer_start                                                  130
answer_end                                                    150
Name: 240, dtype: object

In [32]:
MODEL_NAME = 't5-base'

In [33]:
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

In [34]:
encoding = tokenizer(sample_question["question"],sample_question["context"], max_length = 396, padding = "max_length", 
          truncation = "only_second", return_attention_mask=True, add_special_tokens=True, return_tensors="pt")

In [35]:
encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [36]:
tokenizer.special_tokens_map

{'eos_token': '</s>',
 'unk_token': '<unk>',
 'pad_token': '<pad>',
 'additional_special_tokens': ['<extra_id_0>',
  '<extra_id_1>',
  '<extra_id_2>',
  '<extra_id_3>',
  '<extra_id_4>',
  '<extra_id_5>',
  '<extra_id_6>',
  '<extra_id_7>',
  '<extra_id_8>',
  '<extra_id_9>',
  '<extra_id_10>',
  '<extra_id_11>',
  '<extra_id_12>',
  '<extra_id_13>',
  '<extra_id_14>',
  '<extra_id_15>',
  '<extra_id_16>',
  '<extra_id_17>',
  '<extra_id_18>',
  '<extra_id_19>',
  '<extra_id_20>',
  '<extra_id_21>',
  '<extra_id_22>',
  '<extra_id_23>',
  '<extra_id_24>',
  '<extra_id_25>',
  '<extra_id_26>',
  '<extra_id_27>',
  '<extra_id_28>',
  '<extra_id_29>',
  '<extra_id_30>',
  '<extra_id_31>',
  '<extra_id_32>',
  '<extra_id_33>',
  '<extra_id_34>',
  '<extra_id_35>',
  '<extra_id_36>',
  '<extra_id_37>',
  '<extra_id_38>',
  '<extra_id_39>',
  '<extra_id_40>',
  '<extra_id_41>',
  '<extra_id_42>',
  '<extra_id_43>',
  '<extra_id_44>',
  '<extra_id_45>',
  '<extra_id_46>',
  '<extra_id_47>',
 

In [37]:
tokenizer.eos_token, tokenizer.eos_token_id

('</s>', 1)

In [38]:
tokenizer.decode(encoding["input_ids"].squeeze())

'What is the characteristic feature of the Dyke-Davidoff-Masson syndrome.</s> Left hemisphere and male sex dominance of cerebral hemiatrophy (Dyke-Davidoff-Masson Syndrome). Although radiological findings of cerebral hemiatrophy (Dyke-Davidoff-Masson Syndrome) are well known, there is no systematic study about the gender and the affected side in this syndrome. Brain images in 26 patients (mean aged 11) with cerebral hemiatrophy were retrospectively reviewed. Nineteen patients (73.5%) were male and seven patients (26.5%) were female. Left hemisphere involvement was seen in 18 patients (69.2%) and right hemisphere involvement was seen in eight patients (30.8%). We conclude that male gender and left side involvement are frequent in cerebral hemiatrophy disease.</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

In [39]:
answer_encoding = tokenizer(
     sample_question['answer_text'],
     max_length=32,
     padding='max_length',
     truncation=True,
     return_attention_mask=True,
     add_special_tokens=True,
     return_tensors="pt"
     )

In [40]:
tokenizer.decode(answer_encoding['input_ids'].squeeze())

'cerebral hemiatrophy</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>'

In [41]:
labels = answer_encoding["input_ids"]

In [42]:
labels

tensor([[24387,     3,   107, 11658,    17, 29006,     1,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]])

In [43]:
labels[labels == 0] = -100

In [44]:
labels

tensor([[24387,     3,   107, 11658,    17, 29006,     1,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100]])

In [45]:
class TokenizeData(Dataset):
    
    def __init__(self, data:pd.DataFrame, tokenizer:T5Tokenizer, source_max_token_len: int =369, target_max_token_len: int =32):
        
        self.tokenizer = tokenizer
        self.data = data
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index:int):
        
        data_row = self.data.iloc[index]
        
        source_encoding = tokenizer(data_row['question'], data_row['context'], max_length=self.source_max_token_len,
                                        padding='max_length', truncation="only_second", return_attention_mask=True,
                                        add_special_tokens=True, return_tensors="pt")
        
        target_encoding = tokenizer(data_row['answer_text'], max_length=self.target_max_token_len, padding='max_length',
                                        truncation=True, return_attention_mask=True, add_special_tokens=True,
                                        return_tensors="pt")
        
        labels = target_encoding["input_ids"]
        labels[labels == 0] = -100
        
        return dict(question = data_row["question"], context = data_row["context"], answer_text = data_row["answer_text"],
                       input_ids = source_encoding["input_ids"].flatten(), 
                       attention_mask = source_encoding["attention_mask"].flatten(), labels = labels.flatten())

In [46]:
sample_dataset = TokenizeData(df_bioasq,tokenizer)

In [47]:
for data in sample_dataset:
    print("question: ", data["question"])
    print("answers: ",data["answer_text"])
    print("input_ids: ", data["input_ids"][:10])
    print("labels: ", data["labels"][:10])
    break

question:  What is the inheritance pattern of Li–Fraumeni syndrome?
answers:  autosomal dominant
input_ids:  tensor([  363,    19,     8, 28915,  3275,    13,  1414,   104,   371,  6340])
labels:  tensor([ 1510, 10348,   138, 12613,     1,  -100,  -100,  -100,  -100,  -100])


In [103]:
bioasq_train_df, bioasq_val_df = train_test_split(df_bioasq, test_size=0.20)

In [104]:
print(bioasq_train_df.shape)
print(bioasq_val_df.shape)

(2065, 5)
(517, 5)


In [105]:
class CreateData(pl.LightningDataModule):
    def __init__(self, train_df: pd.DataFrame, test_df: pd.DataFrame, tokenizer:T5Tokenizer, batch_size: int = 8,
                    source_max_token_len: int = 396, target_max_token_len: int = 32):
        
        super().__init__()
        self.train_df = train_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len
   
    def setup(self):
        
        self.train_dataset = TokenizeData(self.train_df, self.tokenizer, self.source_max_token_len, self.target_max_token_len)
        
        self.test_dataset = TokenizeData(self.test_df, self.tokenizer, self.source_max_token_len, self.target_max_token_len)
        
    def train_dataloader(self):
        
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=4)
    
    def val_dataloader(self):
        
        return DataLoader(self.test_dataset, batch_size=1, num_workers=4)
    
    def test_dataloader(self):
        
        return DataLoader(self.test_dataset, batch_size=1, num_workers=4)

In [106]:
BATCH_SIZE = 4
NUM_EPOCHS = 6

data_module = CreateData(bioasq_train_df, bioasq_val_df, tokenizer, batch_size=BATCH_SIZE)
data_module.setup()

In [107]:
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict =True)

In [108]:
output = model(input_ids=encoding["input_ids"], attention_mask=encoding["attention_mask"], labels=labels)

In [109]:
print(output.logits.shape)

torch.Size([1, 32, 32128])


In [110]:
training_loss = []
validation_loss = []
test_loss = []

In [111]:
class QAModel(pl.LightningModule):
    
    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)
   
    def forward(self, input_ids, attention_mask, labels=None):
        
        output = self.model(input_ids, attention_mask=attention_mask, labels=labels)
        return output.loss, output.logits
   
    def training_step(self, batch, batch_idx):
        
        input_ids = batch['input_ids']
        attention_mask=batch['attention_mask']
        labels = batch['labels']
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True, on_step=True)
        return {"loss": loss, "predictions":outputs, "labels": labels}
       
    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask=batch['attention_mask']
        labels = batch['labels']
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True, on_step=True)
        return loss
    
    def test_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask=batch['attention_mask']
        labels = batch['labels']
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True, on_step=True)
        return loss
    
    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=0.0001)
        return optimizer      

In [112]:
model = QAModel()

In [113]:
#from keras.callbacks import ModelCheckpoint
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint

In [115]:
checkpoint_callback = ModelCheckpoint(dirpath="BioASQ_checkpoints", filename="BioASQ-best-checkpoint", save_top_k=1,
                                        verbose=True, monitor="val_loss", mode="min")



In [116]:
trainer = pl.Trainer(checkpoint_callback=checkpoint_callback, max_epochs=NUM_EPOCHS, gpus=1, progress_bar_refresh_rate = 10)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [117]:
trainer.fit(model, data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 516: val_loss reached 0.01844 (best 0.01844), saving model to "/data/user/ankitk/NLP/BioASQ_checkpoints/BioASQ-best-checkpoint-v1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 1, step 1033: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 2, step 1550: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 3, step 2067: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 4, global step 2584: val_loss reached 0.01204 (best 0.01204), saving model to "/data/user/ankitk/NLP/BioASQ_checkpoints/BioASQ-best-checkpoint-v1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 5, global step 3101: val_loss reached 0.00423 (best 0.00423), saving model to "/data/user/ankitk/NLP/BioASQ_checkpoints/BioASQ-best-checkpoint-v1.ckpt" as top 1


1

In [118]:
trainer.test()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': 0.004226091783493757, 'test_loss_epoch': 0.23921088874340057}
--------------------------------------------------------------------------------


[{'test_loss_epoch': 0.23921088874340057, 'test_loss': 0.004226091783493757}]

In [119]:
model.eval()

QAModel(
  (model): T5ForConditionalGeneration(
    (shared): Embedding(32128, 768)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 768)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=768, out_features=768, bias=False)
                (k): Linear(in_features=768, out_features=768, bias=False)
                (v): Linear(in_features=768, out_features=768, bias=False)
                (o): Linear(in_features=768, out_features=768, bias=False)
                (relative_attention_bias): Embedding(32, 12)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseReluDense(
                (wi): Linear(in_features=768, out_features=3072, bias=False)
                (wo): Linear(in_features=3

In [120]:
trainer.test(ckpt_path = "BioASQ_checkpoints/BioASQ-best-checkpoint.ckpt")

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': 0.12169206142425537, 'test_loss_epoch': 0.06487704813480377}
--------------------------------------------------------------------------------


[{'test_loss_epoch': 0.06487704813480377, 'test_loss': 0.12169206142425537}]

In [121]:
trained_model = QAModel.load_from_checkpoint("BioASQ_checkpoints/BioASQ-best-checkpoint.ckpt")

In [122]:
trained_model.freeze()

In [123]:
def generate_answer(question):
    source_encoding=tokenizer(question["question"], question['context'], max_length = 396, padding="max_length",
                            truncation="only_second", return_attention_mask=True, add_special_tokens=True, return_tensors="pt")
    
    generated_ids = trained_model.model.generate(input_ids=source_encoding["input_ids"], attention_mask=source_encoding["attention_mask"],
                                                num_beams=1, max_length=80, repetition_penalty=2.5,
                                                length_penalty = 1.0, early_stopping=True, use_cache=True)

    preds = [tokenizer.decode(generated_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
             for generated_id in generated_ids]

    return "".join(preds)

#num_beans = 1 for greedy search

In [124]:
def compute_loss(self, start_positions, end_positions, start_logits, end_logits):
    total_loss = None
    if start_positions is not None and end_positions is not None:
        # If we are on multi-GPU, split add a dimension
        if len(start_positions.size()) > 1:
            start_positions = start_positions.squeeze(-1)
        if len(end_positions.size()) > 1:
            end_positions = end_positions.squeeze(-1)
        # Needed to tackle some edge cases
        # sometimes the start/end positions are outside our model inputs, we ignore these terms
        ignored_index = start_logits.size(1)
        # using clamp_(min, max) to make sure start_positions and end_positions don't go beyond max
        start_positions.clamp_(0, ignored_index)
        end_positions.clamp_(0, ignored_index)

        loss_fct = torch.nn.CrossEntropyLoss(ignore_index=ignored_index)
        start_loss = loss_fct(start_logits, start_positions)
        end_loss = loss_fct(end_logits, end_positions)
        total_loss = (start_loss + end_loss) / 2
    return total_loss 

In [125]:
sample_question = bioasq_val_df.iloc[20]
sample_question["question"]  

'Which gene is involved in Giant Axonal Neuropathy?'

In [126]:
sample_question["answer_text"]  # Label Answer

'GAN gene'

In [127]:
generate_answer(sample_question)  # Predicted answer

'GAN gene'

In [128]:
sample_question = bioasq_val_df.iloc[66]

In [129]:
sample_question["answer_text"]

'xa'

In [130]:
generate_answer(sample_question) 

'Xa'

# <b>SQUAD 2 DATASET</b>

In [None]:
#! mkdir squad
#! wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O squad/train-v2.0.json
#! wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O squad/dev-v2.0.json

In [91]:
with Path("squad/train-v2.0.json").open() as json_file:
    data = json.load(json_file)

In [92]:
data.keys()

dict_keys(['version', 'data'])

In [93]:
len(data['data'][0]['paragraphs'])

66

In [94]:
questions = data['data'][0]['paragraphs']

In [95]:
questions[0]

{'qas': [{'question': 'When did Beyonce start becoming popular?',
   'id': '56be85543aeaaa14008c9063',
   'answers': [{'text': 'in the late 1990s', 'answer_start': 269}],
   'is_impossible': False},
  {'question': 'What areas did Beyonce compete in when she was growing up?',
   'id': '56be85543aeaaa14008c9065',
   'answers': [{'text': 'singing and dancing', 'answer_start': 207}],
   'is_impossible': False},
  {'question': "When did Beyonce leave Destiny's Child and become a solo singer?",
   'id': '56be85543aeaaa14008c9066',
   'answers': [{'text': '2003', 'answer_start': 526}],
   'is_impossible': False},
  {'question': 'In what city and state did Beyonce  grow up? ',
   'id': '56bf6b0f3aeaaa14008c9601',
   'answers': [{'text': 'Houston, Texas', 'answer_start': 166}],
   'is_impossible': False},
  {'question': 'In which decade did Beyonce become famous?',
   'id': '56bf6b0f3aeaaa14008c9602',
   'answers': [{'text': 'late 1990s', 'answer_start': 276}],
   'is_impossible': False},
  {'q

In [96]:
df_squad2 = extract_questions_and_answers(Path("squad/train-v2.0.json"))
df_squad2.head()

Unnamed: 0,question,context,answer_text,answer_start,answer_end
0,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,in the late 1990s,269,286
1,What areas did Beyonce compete in when she was...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,singing and dancing,207,226
2,When did Beyonce leave Destiny's Child and bec...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,2003,526,530
3,In what city and state did Beyonce grow up?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,"Houston, Texas",166,180
4,In which decade did Beyonce become famous?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,late 1990s,276,286


In [97]:
df_squad2.shape

(753, 5)

In [98]:
len(df_squad2.question.unique())

752

In [131]:
squad2_train_df, squad2_val_df = train_test_split(df_squad2, test_size=0.20)

In [132]:
print(squad2_train_df.shape)
print(squad2_val_df.shape)

(602, 5)
(151, 5)


In [133]:
squad_data_module = CreateData(squad2_train_df, squad2_val_df, tokenizer, batch_size=BATCH_SIZE)
squad_data_module.setup()

In [134]:
checkpoint_callback = ModelCheckpoint(
    dirpath="squad_checkpoints",
    filename="squad-best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
)

In [135]:
trainer2 = pl.Trainer(checkpoint_callback=checkpoint_callback, max_epochs=NUM_EPOCHS, gpus=1, progress_bar_refresh_rate = 10)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [136]:
model2 = QAModel()

In [137]:
trainer2.fit(model2, squad_data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 150: val_loss reached 0.30052 (best 0.30052), saving model to "/data/user/ankitk/NLP/squad_checkpoints/squad-best-checkpoint.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 1, step 301: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 2, global step 452: val_loss reached 0.22676 (best 0.22676), saving model to "/data/user/ankitk/NLP/squad_checkpoints/squad-best-checkpoint.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 3, step 603: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 4, step 754: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 5, step 905: val_loss was not in top 1


1

In [138]:
trainer2.test(ckpt_path = "squad_checkpoints/squad-best-checkpoint.ckpt")

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': 0.2267571985721588, 'test_loss_epoch': 0.40972840785980225}
--------------------------------------------------------------------------------


[{'test_loss_epoch': 0.40972840785980225, 'test_loss': 0.2267571985721588}]

In [139]:
trained_model2 = QAModel.load_from_checkpoint("squad_checkpoints/squad-best-checkpoint.ckpt")

In [140]:
trained_model2.freeze()

In [141]:
sample_question_squad = squad2_val_df.iloc[20]
sample_question_squad["question"]  

"Beyoncé's early recordings empowered who?"

In [142]:
sample_question_squad["answer_text"]

'Women'

In [143]:
generate_answer(sample_question_squad)  # Predicted answer

'female empowerment'

In [145]:
sample_question_squad = squad2_val_df.iloc[25]
sample_question_squad["question"]  

'What band supports Beyonce in her tours?'

In [146]:
sample_question_squad["answer_text"]

'Suga Mama'

In [147]:
generate_answer(sample_question_squad)

'Suga Mama'

In [148]:
sample_question_squad = squad2_val_df.iloc[35]
sample_question_squad["question"]  

'Who influenced Beyonce?'

In [149]:
sample_question_squad["answer_text"]

'Michael Jackson'

In [150]:
generate_answer(sample_question_squad)      #Here we can see that model gets confused between same entities.

'Whitney Houston'

In [151]:
sample_question_squad = squad2_val_df.iloc[150]
sample_question_squad["question"]  

'Who gave Beyoncé the Pop Songwriter of the Year award in 2001?'

In [152]:
sample_question_squad["answer_text"]

'the American Society of Composers, Authors, and Publishers Pop Music Awards.'

In [153]:
generate_answer(sample_question_squad)

"American Society of Composers, Authors and Publisher'"

In [158]:
squad2_val_df.to_csv(r'./SQUAD2_VALIDATION.txt', header=None, index=None, sep=' ', mode='a')