In [1]:
!nvidia-smi

Fri Dec  3 19:08:19 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  On   | 00000000:04:00.0 Off |                    0 |
| N/A   27C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

# INSTALLING ALL THE REQUIRED PACKAGES
 <B> We need to install pytorch lightning to build the model which we can do using line command as </b>
 <b> !pip install pytorch-lightning</b>

In [2]:
#!pip install transformers==4.1.1
#!pip install tokenizers==0.9.4
#!pip install sentencepiece==0.1.94

!pip install transformers
!pip install tokenizers
!pip install sentencepiece

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [3]:
#!pip install --quiet https://github.com/PyTorchLightning/pytorch-lightning/releases/download/1.2.6/pytorch-lightning-1.2.6.tar.gz

!pip install pytorch-lightning

Defaulting to user installation because normal site-packages is not writeable


# IMPORTING ALL THE REQUIRED MODULES TO RUN THIS iPynb File

In [81]:
import numpy as np
import pandas as pd
import math
import glob
import os
import json
import time
import logging
import random
from string import punctuation
from collections import Counter
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import tensorflow as tf

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

In [5]:
import pytorch_lightning as pl

In [6]:
physical_devices = tf.config.experimental.list_physical_devices('GPU')
print("Number of GPU's available: ", len(physical_devices))

if(len(physical_devices)>0):
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

Number of GPU's available:  0


<b> We seed everything to 0 using pytorch lightning</b>

In [7]:
pl.seed_everything(0)

Global seed set to 0


0

In [8]:
#! unzip - bio-QA.zip

<b> Importing all the training data from the json file from the file directory</b>

In [9]:
with Path("BioASQ/BioASQ-train-factoid-4b.json").open() as json_file:
    data = json.load(json_file)

<b> We will have a look at our data using pandas and creating a pandas dataframe using some analysis on the data</b>

In [10]:
data.keys()

dict_keys(['data', 'version'])

In [11]:
data['version']

'BioASQ6b'

In [12]:
len(data['data'])

1

In [13]:
data['data'][0].keys()

dict_keys(['paragraphs', 'title'])

In [14]:
data['data'][0]['title']

'BioASQ6b'

In [15]:
len(data['data'][0]['paragraphs'])

3266

In [16]:
questions = data['data'][0]['paragraphs']

In [17]:
questions[0]

{'qas': [{'id': '52bf208003868f1b06000019_002',
   'question': 'What is the inheritance pattern of Li–Fraumeni syndrome?',
   'answers': [{'text': 'autosomal dominant', 'answer_start': 213}]}],
 'context': 'Balanced t(11;15)(q23;q15) in a TP53+/+ breast cancer patient from a Li-Fraumeni syndrome family. Li-Fraumeni Syndrome (LFS) is characterized by early-onset carcinogenesis involving multiple tumor types and shows autosomal dominant inheritance. Approximately 70% of LFS cases are due to germline mutations in the TP53 gene on chromosome 17p13.1. Mutations have also been found in the CHEK2 gene on chromosome 22q11, and others have been mapped to chromosome 11q23. While characterizing an LFS family with a documented defect in TP53, we found one family member who developed bilateral breast cancer at age 37 yet was homozygous for wild-type TP53. Her mother also developed early-onset primary bilateral breast cancer, and a sister had unilateral breast cancer and a soft tissue sarcoma. Cytog

<b> We are creating a function to create the dataframe with these specific columns in our dataframe - <br> i) question <br> (ii) context/ or paragraph named as context <br> (iii) answer text <br> (iv) answer starting index <br> (v) answer end index </b>

In [18]:
def extract_questions_and_answers(path = Path):
    with path.open() as json_file:
        data = json.load(json_file)
        
    questions = data['data'][0]['paragraphs']
     
    data_rows = []
    
    for question in questions:
        context = question['context']

        for question_and_answers in question['qas']:
            question = question_and_answers['question']
            answers = question_and_answers['answers']

            for answer in answers:
                answer_text = answer['text']
                answer_start = answer['answer_start']
                answer_end = answer['answer_start'] + len(answer_text)
                    
                data_rows.append({
                     "question" : question,
                     "context"  : context,
                     "answer_text" : answer_text,
                     "answer_start" : answer_start,
                     "answer_end" : answer_end
                     })
    return pd.DataFrame(data_rows)

In [19]:
extract_questions_and_answers(Path("BioASQ/BioASQ-train-factoid-4b.json")).head()

Unnamed: 0,question,context,answer_text,answer_start,answer_end
0,What is the inheritance pattern of Li–Fraumeni...,Balanced t(11;15)(q23;q15) in a TP53+/+ breast...,autosomal dominant,213,231
1,What is the inheritance pattern of Li–Fraumeni...,Genetic modeling of Li-Fraumeni syndrome in ze...,autosomal dominant,105,123
2,Which type of lung cancer is afatinib used for?,Clinical perspective of afatinib in non-small ...,EGFR-mutant NSCLC,1203,1220
3,Which hormone abnormalities are characteristic...,"DOCA sensitive pendrin expression in kidney, h...",thyroid,419,426
4,Which hormone abnormalities are characteristic...,Clinical and molecular characteristics of Pend...,thyroid,705,712


In [20]:
paths = sorted(list(Path("BioASQ/").glob("BioASQ-train-*")))
paths

[PosixPath('BioASQ/BioASQ-train-factoid-4b.json'),
 PosixPath('BioASQ/BioASQ-train-factoid-5b.json'),
 PosixPath('BioASQ/BioASQ-train-factoid-6b.json')]

<b>Adding all three training files into the dataframe </b>

In [21]:
df_new = []

for path in paths:
    df_new.append(extract_questions_and_answers(path))
    
df_bioasq = pd.concat(df_new)

In [22]:
df_bioasq.head()

Unnamed: 0,question,context,answer_text,answer_start,answer_end
0,What is the inheritance pattern of Li–Fraumeni...,Balanced t(11;15)(q23;q15) in a TP53+/+ breast...,autosomal dominant,213,231
1,What is the inheritance pattern of Li–Fraumeni...,Genetic modeling of Li-Fraumeni syndrome in ze...,autosomal dominant,105,123
2,Which type of lung cancer is afatinib used for?,Clinical perspective of afatinib in non-small ...,EGFR-mutant NSCLC,1203,1220
3,Which hormone abnormalities are characteristic...,"DOCA sensitive pendrin expression in kidney, h...",thyroid,419,426
4,Which hormone abnormalities are characteristic...,Clinical and molecular characteristics of Pend...,thyroid,705,712


<b> We will do some basic data analysis to feed into our model and look for any descepencies into our data</b>

In [23]:
df_bioasq.shape

(12988, 5)

In [24]:
len(df_bioasq.question)

12988

In [25]:
len(df_bioasq.question.unique())

443

<b> we can see that our data have some duplicates in the question so we will drop the duplicate questions from our database </b>

In [26]:
df_bioasq = df_bioasq.drop_duplicates(subset = ["context"]).reset_index(drop=True)

In [27]:
df_bioasq.shape

(2582, 5)

In [28]:
len(df_bioasq.question.unique())

441

In [29]:
len(df_bioasq.context.unique())

2582

<b> Context looks fine </b>

In [30]:
sample_question = df_bioasq.iloc[240]
sample_question

question        What is the characteristic feature of the Dyke...
context         Left hemisphere and male sex dominance of cere...
answer_text                                  cerebral hemiatrophy
answer_start                                                  130
answer_end                                                    150
Name: 240, dtype: object

<b> Initiating model name since we are going to fine tune t5-base pretrained model to train our question answering</b>

In [31]:
MODEL_NAME = 't5-base'

In [32]:
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

<b> Testing encoder and decoder from t5-base pretrained model on sample data</b>

In [33]:
encoding = tokenizer(sample_question["question"],sample_question["context"], max_length = 396, padding = "max_length", 
          truncation = "only_second", return_attention_mask=True, add_special_tokens=True, return_tensors="pt")

In [34]:
encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [35]:
tokenizer.special_tokens_map

{'eos_token': '</s>',
 'unk_token': '<unk>',
 'pad_token': '<pad>',
 'additional_special_tokens': ['<extra_id_0>',
  '<extra_id_1>',
  '<extra_id_2>',
  '<extra_id_3>',
  '<extra_id_4>',
  '<extra_id_5>',
  '<extra_id_6>',
  '<extra_id_7>',
  '<extra_id_8>',
  '<extra_id_9>',
  '<extra_id_10>',
  '<extra_id_11>',
  '<extra_id_12>',
  '<extra_id_13>',
  '<extra_id_14>',
  '<extra_id_15>',
  '<extra_id_16>',
  '<extra_id_17>',
  '<extra_id_18>',
  '<extra_id_19>',
  '<extra_id_20>',
  '<extra_id_21>',
  '<extra_id_22>',
  '<extra_id_23>',
  '<extra_id_24>',
  '<extra_id_25>',
  '<extra_id_26>',
  '<extra_id_27>',
  '<extra_id_28>',
  '<extra_id_29>',
  '<extra_id_30>',
  '<extra_id_31>',
  '<extra_id_32>',
  '<extra_id_33>',
  '<extra_id_34>',
  '<extra_id_35>',
  '<extra_id_36>',
  '<extra_id_37>',
  '<extra_id_38>',
  '<extra_id_39>',
  '<extra_id_40>',
  '<extra_id_41>',
  '<extra_id_42>',
  '<extra_id_43>',
  '<extra_id_44>',
  '<extra_id_45>',
  '<extra_id_46>',
  '<extra_id_47>',
 

In [36]:
tokenizer.eos_token, tokenizer.eos_token_id

('</s>', 1)

In [37]:
tokenizer.decode(encoding["input_ids"].squeeze())

'What is the characteristic feature of the Dyke-Davidoff-Masson syndrome.</s> Left hemisphere and male sex dominance of cerebral hemiatrophy (Dyke-Davidoff-Masson Syndrome). Although radiological findings of cerebral hemiatrophy (Dyke-Davidoff-Masson Syndrome) are well known, there is no systematic study about the gender and the affected side in this syndrome. Brain images in 26 patients (mean aged 11) with cerebral hemiatrophy were retrospectively reviewed. Nineteen patients (73.5%) were male and seven patients (26.5%) were female. Left hemisphere involvement was seen in 18 patients (69.2%) and right hemisphere involvement was seen in eight patients (30.8%). We conclude that male gender and left side involvement are frequent in cerebral hemiatrophy disease.</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

<b> We have to create necessary labels for the answers. We can extract answers according to questions </b>

In [38]:
answer_encoding = tokenizer(
     sample_question['answer_text'],
     max_length=32,
     padding='max_length',
     truncation=True,
     return_attention_mask=True,
     add_special_tokens=True,
     return_tensors="pt"
     )

In [39]:
tokenizer.decode(answer_encoding['input_ids'].squeeze())

'cerebral hemiatrophy</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>'

In [40]:
labels = answer_encoding["input_ids"]

In [41]:
labels

tensor([[24387,     3,   107, 11658,    17, 29006,     1,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]])

In [42]:
labels[labels == 0] = -100

In [43]:
labels

tensor([[24387,     3,   107, 11658,    17, 29006,     1,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100]])

<b> Creating function to tokenize data using tokenization using t5 tokenizer</b>

In [44]:
class TokenizeData(Dataset):
    
    def __init__(self, data:pd.DataFrame, tokenizer:T5Tokenizer, source_max_token_len: int =369, target_max_token_len: int =32):
        
        self.tokenizer = tokenizer
        self.data = data
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index:int):
        
        data_row = self.data.iloc[index]
        
        source_encoding = tokenizer(data_row['question'], data_row['context'], max_length=self.source_max_token_len,
                                        padding='max_length', truncation="only_second", return_attention_mask=True,
                                        add_special_tokens=True, return_tensors="pt")
        
        target_encoding = tokenizer(data_row['answer_text'], max_length=self.target_max_token_len, padding='max_length',
                                        truncation=True, return_attention_mask=True, add_special_tokens=True,
                                        return_tensors="pt")
        
        labels = target_encoding["input_ids"]
        labels[labels == 0] = -100
        
        return dict(question = data_row["question"], context = data_row["context"], answer_text = data_row["answer_text"],
                       input_ids = source_encoding["input_ids"].flatten(), 
                       attention_mask = source_encoding["attention_mask"].flatten(), labels = labels.flatten())

<b> Sample data </b>

In [45]:
sample_dataset = TokenizeData(df_bioasq,tokenizer)

In [46]:
for data in sample_dataset:
    print("question: ", data["question"])
    print("answers: ",data["answer_text"])
    print("input_ids: ", data["input_ids"][:10])
    print("labels: ", data["labels"][:10])
    break

question:  What is the inheritance pattern of Li–Fraumeni syndrome?
answers:  autosomal dominant
input_ids:  tensor([  363,    19,     8, 28915,  3275,    13,  1414,   104,   371,  6340])
labels:  tensor([ 1510, 10348,   138, 12613,     1,  -100,  -100,  -100,  -100,  -100])


<b> Doing a train test split of 80/20 respecticely for our model</b>

In [47]:
bioasq_train_df, bioasq_val_df = train_test_split(df_bioasq, test_size=0.20)

In [48]:
print(bioasq_train_df.shape)
print(bioasq_val_df.shape)

(2065, 5)
(517, 5)


<b>Creating a function to create data to feed into our model. We did padding and tokenization from this function</b>

In [49]:
class CreateData(pl.LightningDataModule):
    def __init__(self, train_df: pd.DataFrame, test_df: pd.DataFrame, tokenizer:T5Tokenizer, batch_size: int = 8,
                    source_max_token_len: int = 396, target_max_token_len: int = 32):
        
        super().__init__()
        self.train_df = train_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len
   
    def setup(self):
        
        self.train_dataset = TokenizeData(self.train_df, self.tokenizer, self.source_max_token_len, self.target_max_token_len)
        
        self.test_dataset = TokenizeData(self.test_df, self.tokenizer, self.source_max_token_len, self.target_max_token_len)
        
    def train_dataloader(self):
        
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=4)
    
    def val_dataloader(self):
        
        return DataLoader(self.test_dataset, batch_size=1, num_workers=4)
    
    def test_dataloader(self):
        
        return DataLoader(self.test_dataset, batch_size=1, num_workers=4)

<b>Setting up our data and model to start training. We will run total of 6 epocks with smaller batch of 4 because of the dataset available for training </b>

In [50]:
BATCH_SIZE = 4
NUM_EPOCHS = 6

data_module = CreateData(bioasq_train_df, bioasq_val_df, tokenizer, batch_size=BATCH_SIZE)
data_module.setup()

<b> MODEL</b>

In [51]:
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict =True)

In [52]:
output = model(input_ids=encoding["input_ids"], attention_mask=encoding["attention_mask"], labels=labels)

In [53]:
print(output.logits.shape)

torch.Size([1, 32, 32128])


# CREATE PYTORCH LIGHTNING MODULE

<b> We are creating the function QAModel for using pytorch lightning module. Reference link - https://pytorch-lightning.readthedocs.io/en/latest/starter/lightning_lite.html to create the method</b>

In [54]:
class QAModel(pl.LightningModule):
    
    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)
   
    def forward(self, input_ids, attention_mask, labels=None):
        
        output = self.model(input_ids, attention_mask=attention_mask, labels=labels)
        return output.loss, output.logits
   
    def training_step(self, batch, batch_idx):
        
        input_ids = batch['input_ids']
        attention_mask=batch['attention_mask']
        labels = batch['labels']
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True, on_step=True)
        return {"loss": loss, "predictions":outputs, "labels": labels}
       
    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask=batch['attention_mask']
        labels = batch['labels']
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True, on_step=True)
        return loss
    
    def test_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask=batch['attention_mask']
        labels = batch['labels']
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True, on_step=True)
        return loss
    
    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=0.0001)
        return optimizer      

In [55]:
model = QAModel()

<b> Using pytorchlightning function callback to save the best model from our epochs </b>

In [56]:
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint

In [57]:
checkpoint_callback = ModelCheckpoint(dirpath="BioASQ_checkpoints", filename="BioASQ-best-checkpoint", save_top_k=1,
                                        verbose=True, monitor="val_loss", mode="min")



<b> Start training our model </b>

In [58]:
trainer = pl.Trainer(checkpoint_callback=checkpoint_callback, max_epochs=NUM_EPOCHS, gpus=1, progress_bar_refresh_rate = 10)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [59]:
trainer.fit(model, data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 516: val_loss reached 0.01468 (best 0.01468), saving model to "/data/user/ankitk/NLP/BioASQ_checkpoints/BioASQ-best-checkpoint-v6.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 1033: val_loss reached 0.00967 (best 0.00967), saving model to "/data/user/ankitk/NLP/BioASQ_checkpoints/BioASQ-best-checkpoint-v6.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 2, global step 1550: val_loss reached 0.00202 (best 0.00202), saving model to "/data/user/ankitk/NLP/BioASQ_checkpoints/BioASQ-best-checkpoint-v6.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 3, global step 2067: val_loss reached 0.00098 (best 0.00098), saving model to "/data/user/ankitk/NLP/BioASQ_checkpoints/BioASQ-best-checkpoint-v6.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 4, global step 2584: val_loss reached 0.00041 (best 0.00041), saving model to "/data/user/ankitk/NLP/BioASQ_checkpoints/BioASQ-best-checkpoint-v6.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 5, global step 3101: val_loss reached 0.00018 (best 0.00018), saving model to "/data/user/ankitk/NLP/BioASQ_checkpoints/BioASQ-best-checkpoint-v6.ckpt" as top 1


1

<b> Testing and evaluating the object we just trained.</b>

In [60]:
trainer.test()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': 0.0001760245067998767, 'test_loss_epoch': 0.2404516637325287}
--------------------------------------------------------------------------------


[{'test_loss_epoch': 0.2404516637325287, 'test_loss': 0.0001760245067998767}]

In [61]:
model.eval()

QAModel(
  (model): T5ForConditionalGeneration(
    (shared): Embedding(32128, 768)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 768)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=768, out_features=768, bias=False)
                (k): Linear(in_features=768, out_features=768, bias=False)
                (v): Linear(in_features=768, out_features=768, bias=False)
                (o): Linear(in_features=768, out_features=768, bias=False)
                (relative_attention_bias): Embedding(32, 12)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseReluDense(
                (wi): Linear(in_features=768, out_features=3072, bias=False)
                (wo): Linear(in_features=3

In [62]:
trainer.test(ckpt_path = "BioASQ_checkpoints/BioASQ-best-checkpoint.ckpt")

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': 0.0001973738835658878, 'test_loss_epoch': 0.11475042998790741}
--------------------------------------------------------------------------------


[{'test_loss_epoch': 0.11475042998790741, 'test_loss': 0.0001973738835658878}]

<b> Loading the best saved model</b>

In [63]:
trained_model = QAModel.load_from_checkpoint("BioASQ_checkpoints/BioASQ-best-checkpoint.ckpt")

In [64]:
trained_model.freeze()

<b> Creating a function to generate answers on our validation set</b>

In [65]:
def generate_answer(question):
    source_encoding=tokenizer(question["question"], question['context'], max_length = 396, padding="max_length",
                            truncation="only_second", return_attention_mask=True, add_special_tokens=True, return_tensors="pt")
    
    generated_ids = trained_model.model.generate(input_ids=source_encoding["input_ids"], attention_mask=source_encoding["attention_mask"],
                                                num_beams=1, max_length=80, repetition_penalty=2.5,
                                                length_penalty = 1.0, early_stopping=True, use_cache=True)

    preds = [tokenizer.decode(generated_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
             for generated_id in generated_ids]

    return "".join(preds)

<b> Manually checking a few samples from our dataset to evaluate how our model is performing</b>

In [66]:
sample_question = bioasq_val_df.iloc[20]
sample_question  

question         Which calcium channels does ethosuximide target?
context         Inhibition of T-type calcium channels and hydr...
answer_text                               T-type calcium channels
answer_start                                                  459
answer_end                                                    482
Name: 1732, dtype: object

In [67]:
sample_question["answer_text"]  # Label Answer

'T-type calcium channels'

In [68]:
generate_answer(sample_question)  # Predicted answer

'T-type calcium channels'

In [69]:
sample_question = bioasq_val_df.iloc[66]

In [70]:
sample_question["answer_text"]

'transcription factor EB (TFEB)'

In [71]:
generate_answer(sample_question) 

'transcription factor EB (TFEB)'

# EVALUATING OUR MODEL USING F1 Score
<b> Function to evaluate f1 score</b>

In [72]:
def f1_score(prediction, ground_truth):
    """Calculates F1 score.
    Args:
        prediction: Predicted answer span (string).
        ground_truth: True answer span (string).
    Returns:
        F1 score.
    """
  
    common = Counter(prediction) & Counter(ground_truth)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction)
    recall = 1.0 * num_same / len(ground_truth)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

In [73]:
predicted_words = []
ground_truth = []
    
for i in range(len(bioasq_val_df)) :
    predicted_words.append(generate_answer(bioasq_val_df.iloc[i]))
    ground_truth.append(bioasq_val_df.iloc[i]['answer_text'])

In [74]:
predicted_words

['FBN1',
 'Hyaluronidase',
 'experimental autoimmune',
 'LHCI',
 'TYR',
 '7',
 'tyrosinase',
 'multiple myeloma',
 'BCR-ABL',
 "Alzheimer's disease",
 'spinal muscular atrophy',
 'post-traumatic stress disorder',
 'Hypertrophic cardiomyopathy',
 'lamp2a',
 'PD-L1',
 'ataxin-3',
 'SECIS',
 'Cystic Fibro',
 'NEDD8-activating enzyme',
 'posttraumatic stress disorder',
 'T-type calcium channels',
 'SUMO-conjugating enzyme',
 'MRSA',
 'H3K4',
 'liver',
 'repressor',
 'BCR-ABL',
 'Notch3 gene',
 'SECIS',
 'RET',
 'tofacitinib',
 'SLC9A6',
 'RANKL',
 'EGFR',
 'riocitentan',
 'lung',
 'diabetes mellitus',
 'Autosomal recessesive',
 'selenoprotein P-like protein',
 'Anorexia Athletica',
 'JNK',
 'ZEB2',
 'medicarpin',
 'AAUAAA',
 'autosomal recessesive',
 'medicarpin',
 'MRSA',
 'connective tissue',
 'Hypertrophic cardiomyopathy',
 'Retto syndrome',
 'L-Dopa',
 'poly(ADP-Ribose) polymerase',
 'Xist',
 'CaSR function is a G-protein coupled receptor',
 'thyroid',
 'multiple',
 'malaria',
 'X',
 '

In [75]:
ground_truth

['FBN1',
 'GKT136901',
 'experimental autoimmune encephalomyelitis (EAE)',
 'LHCII',
 'TYR',
 '7',
 'tyrosinase',
 'multiple myeloma',
 'BCR-ABL',
 "Alzheimer's disease",
 'Spinal Muscular Atrophy',
 'post-traumatic stress disorder',
 'Hypertrophic cardiomyopathy',
 'lamp2a',
 'PD-L1',
 'ataxin-3',
 'SECIS',
 'DeltaF508',
 'NEDD8-activating enzyme',
 'PTSD',
 'T-type calcium channels',
 'SUMO-conjugating enzyme',
 'MRSA',
 'H3K4',
 'liver',
 'repressor',
 'BCR-ABL',
 'Notch3 gene',
 'SECIS',
 'RET',
 'tofacitinib',
 'SLC9A6',
 'RANKL',
 'EGFR',
 'riociguat',
 'lung',
 'diabetes mellitus',
 'autosomal recessive',
 'selenoprotein P',
 'Anorexia Athletica',
 'JNK',
 'ZEB2',
 'medicarpin',
 'AAUAAA',
 'autosomal recessive',
 'medicarpin',
 'MRSA',
 'connective tissue',
 'Hypertrophic cardiomyopathy',
 'Rett syndrome',
 'L-Dopa',
 'poly(ADP-Ribose) polymerase',
 'Xist',
 'The calcium-sensing receptor (CaSR) is a G-protein-coupled receptor that plays an essential role in maintaining calcium 

<b> Converting answers to strings to evaluate </b>

In [76]:
eval_predicted_words = ",".join(predicted_words)
eval_ground_truth = ",".join(ground_truth)

<b> Checking our model's F1 score</b>

In [82]:
print("F1 Score of our model -> ",100*f1_score(eval_predicted_words, eval_ground_truth))

F1 Score of our model ->  95.3475863923625


# <b>SQUAD 2 DATASET</b>

<b> We will create data similarly for SqUAD2.0 dataset and feed it into the same model we trained for BioASQ dataset </b>

In [83]:
#! mkdir squad
#! wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O squad/train-v2.0.json
#! wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O squad/dev-v2.0.json

In [84]:
with Path("squad/train-v2.0.json").open() as json_file:
    data = json.load(json_file)

In [85]:
data.keys()

dict_keys(['version', 'data'])

In [86]:
len(data['data'][0]['paragraphs'])

66

In [87]:
questions = data['data'][0]['paragraphs']

In [88]:
questions[0]

{'qas': [{'question': 'When did Beyonce start becoming popular?',
   'id': '56be85543aeaaa14008c9063',
   'answers': [{'text': 'in the late 1990s', 'answer_start': 269}],
   'is_impossible': False},
  {'question': 'What areas did Beyonce compete in when she was growing up?',
   'id': '56be85543aeaaa14008c9065',
   'answers': [{'text': 'singing and dancing', 'answer_start': 207}],
   'is_impossible': False},
  {'question': "When did Beyonce leave Destiny's Child and become a solo singer?",
   'id': '56be85543aeaaa14008c9066',
   'answers': [{'text': '2003', 'answer_start': 526}],
   'is_impossible': False},
  {'question': 'In what city and state did Beyonce  grow up? ',
   'id': '56bf6b0f3aeaaa14008c9601',
   'answers': [{'text': 'Houston, Texas', 'answer_start': 166}],
   'is_impossible': False},
  {'question': 'In which decade did Beyonce become famous?',
   'id': '56bf6b0f3aeaaa14008c9602',
   'answers': [{'text': 'late 1990s', 'answer_start': 276}],
   'is_impossible': False},
  {'q

In [89]:
df_squad2 = extract_questions_and_answers(Path("squad/train-v2.0.json"))
df_squad2.head()

Unnamed: 0,question,context,answer_text,answer_start,answer_end
0,When did Beyonce start becoming popular?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,in the late 1990s,269,286
1,What areas did Beyonce compete in when she was...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,singing and dancing,207,226
2,When did Beyonce leave Destiny's Child and bec...,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,2003,526,530
3,In what city and state did Beyonce grow up?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,"Houston, Texas",166,180
4,In which decade did Beyonce become famous?,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,late 1990s,276,286


In [90]:
df_squad2.shape

(753, 5)

In [91]:
len(df_squad2.question.unique())

752

In [92]:
squad2_train_df, squad2_val_df = train_test_split(df_squad2, test_size=0.20)

In [93]:
print(squad2_train_df.shape)
print(squad2_val_df.shape)

(602, 5)
(151, 5)


In [94]:
squad_data_module = CreateData(squad2_train_df, squad2_val_df, tokenizer, batch_size=BATCH_SIZE)
squad_data_module.setup()

In [95]:
checkpoint_callback = ModelCheckpoint(
    dirpath="squad_checkpoints",
    filename="squad-best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min"
)



<b> F1 Score for SqUAD2.0 from previously trained object</b>

In [96]:
predicted_words_squad2_oldmodel = []
ground_truth_squad2_oldmodel = []
    
for i in range(len(squad2_val_df)) :
    predicted_words_squad2_oldmodel.append(generate_answer(squad2_val_df.iloc[i]))
    ground_truth_squad2_oldmodel.append(squad2_val_df.iloc[i]['answer_text'])

In [97]:
eval_predicted_words_squad2_old = ",".join(predicted_words_squad2_oldmodel)
eval_ground_truth_squad2_old = ",".join(ground_truth_squad2_oldmodel)

In [98]:
print("F1 Score of our model -> ",100*f1_score(eval_predicted_words_squad2_old, eval_ground_truth_squad2_old))

F1 Score of our model ->  95.62794684954994


<b> We can see the F1 Score we are getting for this model is actually slightly better than the one on BioASQ dataset </b>

# USING THE SAME MODEL WE USED TO GENERATE ANSWERS FOR SQUAD 2.00 DATASET.

<b> WE WILL USE THE SAME GENERATE METHOD AS DEFINED PREVIOUSLY FOR BIOASQ DATASET </b>


In [99]:
sample_question_squad = squad2_val_df.iloc[20]
sample_question_squad["question"]  

'For what movie did Beyonce receive  her first Golden Globe nomination?'

In [100]:
sample_question_squad["answer_text"]

'Dreamgirls'

In [101]:
generate_answer(sample_question_squad)

"B'Day"

In [102]:
sample_question_squad = squad2_val_df.iloc[25]
sample_question_squad["question"]  

"What city was Beyoncé's elementary school located in?"

In [103]:
sample_question_squad["answer_text"]

'Fredericksburg'

In [104]:
generate_answer(sample_question_squad)

'Fredericksburg'

In [105]:
sample_question_squad = squad2_val_df.iloc[35]
sample_question_squad["question"]  

'When did The Mamas make their debut?'

In [106]:
sample_question_squad["answer_text"]

'the 2006 BET Awards'

In [107]:
generate_answer(sample_question_squad)      #Here we can see that model gets confused between same entities.

'2006'

In [108]:
sample_question_squad = squad2_val_df.iloc[150]
sample_question_squad["question"]  

"What percentage of people were positive about Beyonce's endorsement of Pepsi?"

In [109]:
sample_question_squad["answer_text"]

'70'

In [110]:
generate_answer(sample_question_squad)

'70 per cent'

<b> We can see that we are getting expected results but the model can get confused sometimes and does nor predict the exact same answer but even though it is not the exact answer, it is still the correct answer except for a few cases </b>

# TRAINING NEW MODEL FOR SQUAD2.0 DATASET

In [111]:
trainer2 = pl.Trainer(checkpoint_callback=checkpoint_callback, max_epochs=NUM_EPOCHS, gpus=1, progress_bar_refresh_rate = 10)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [112]:
model2 = QAModel()

In [113]:
trainer2.fit(model2, squad_data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Set SLURM handle signals.

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 150: val_loss reached 0.65332 (best 0.65332), saving model to "/data/user/ankitk/NLP/squad_checkpoints/squad-best-checkpoint-v4.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 1, step 301: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 2, global step 452: val_loss reached 0.16294 (best 0.16294), saving model to "/data/user/ankitk/NLP/squad_checkpoints/squad-best-checkpoint-v4.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 3, step 603: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 4, step 754: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 5, step 905: val_loss was not in top 1


1

In [114]:
trainer2.test(ckpt_path = "squad_checkpoints/squad-best-checkpoint.ckpt")

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_loss': 0.05630814656615257, 'test_loss_epoch': 0.17532028257846832}
--------------------------------------------------------------------------------


[{'test_loss_epoch': 0.17532028257846832, 'test_loss': 0.05630814656615257}]

In [115]:
trained_model2 = QAModel.load_from_checkpoint("squad_checkpoints/squad-best-checkpoint.ckpt")

In [116]:
trained_model2.freeze()

In [117]:
def generate_answer(question):
    source_encoding=tokenizer(question["question"], question['context'], max_length = 396, padding="max_length",
                            truncation="only_second", return_attention_mask=True, add_special_tokens=True, return_tensors="pt")
    
    generated_ids = trained_model2.model.generate(input_ids=source_encoding["input_ids"], attention_mask=source_encoding["attention_mask"],
                                                num_beams=1, max_length=80, repetition_penalty=2.5,
                                                length_penalty = 1.0, early_stopping=True, use_cache=True)

    preds = [tokenizer.decode(generated_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
             for generated_id in generated_ids]

    return "".join(preds)

#num_beans = 1 for greedy search

<b> We will again check our answer text for some of the random questions manually which we tested above </b>

In [118]:
sample_question_squad = squad2_val_df.iloc[20]
sample_question_squad["question"]  

'For what movie did Beyonce receive  her first Golden Globe nomination?'

In [119]:
sample_question_squad["answer_text"]

'Dreamgirls'

In [120]:
generate_answer(sample_question_squad)  # Predicted answer

'Dreamgirls'

In [121]:
sample_question_squad = squad2_val_df.iloc[25]
sample_question_squad["question"]  

"What city was Beyoncé's elementary school located in?"

In [122]:
sample_question_squad["answer_text"]

'Fredericksburg'

In [123]:
generate_answer(sample_question_squad)

'Fredericksburg'

In [124]:
sample_question_squad = squad2_val_df.iloc[35]
sample_question_squad["question"]  

'When did The Mamas make their debut?'

In [125]:
sample_question_squad["answer_text"]

'the 2006 BET Awards'

In [126]:
generate_answer(sample_question_squad)      #Here we can see that model gets confused between same entities.

'2006 BET Awards'

In [127]:
sample_question_squad = squad2_val_df.iloc[150]
sample_question_squad["question"]  

"What percentage of people were positive about Beyonce's endorsement of Pepsi?"

In [128]:
sample_question_squad["answer_text"]

'70'

In [129]:
generate_answer(sample_question_squad)

'70'

In [130]:
squad2_val_df.to_csv(r'./SQUAD2_VALIDATION.txt', header=None, index=None, sep=' ', mode='a')

<b> F1 Score for newly trained SqUAD2.0 Model </b>

In [131]:
predicted_words_squad2_newmodel = []
ground_truth_squad2_newmodel = []
    
for i in range(len(squad2_val_df)) :
    predicted_words_squad2_newmodel.append(generate_answer(squad2_val_df.iloc[i]))
    ground_truth_squad2_newmodel.append(squad2_val_df.iloc[i]['answer_text'])

In [132]:
eval_predicted_words_squad_new = ",".join(predicted_words_squad2_newmodel)
eval_ground_truth_squad_new = ",".join(ground_truth_squad2_newmodel)

In [133]:
print("F1 Score of our model -> ",100*f1_score(eval_predicted_words_squad_new, eval_ground_truth_squad_new))

F1 Score of our model ->  97.29139362166885


<b> Reference for Evaluate function - https://github.com/vibalcam/nlp-qa-squad-bioasq/blob/master/evaluate.py </b>