### Install Necessary packages

In [47]:
!pip install pytorch_pretrained_bert



### Import Libraries

In [48]:
import json
from pathlib import Path
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, BertForQuestionAnswering
from transformers import AdamW
import string, re

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# 1. Load the SQUAD 2.0

**Note: I have downloaded the train and dev files from https://rajpurkar.github.io/SQuAD-explorer/**

In [49]:
def load_data(path, size):
    with open(path, 'rb') as f:
        squad_dict = json.load(f)
    texts = []
    questions = []
    answers = []
    count = 0
    for data in squad_dict['data']:
        for para in data['paragraphs']:
            text = para['context']
            for qas in para['qas']:
                question = qas['question']
                for answer in qas['answers']:
                    texts.append(text)
                    questions.append(question)
                    answers.append(answer)
                    count += 1
                    if count >= size:
                        return texts, questions, answers

In [50]:
train_texts, train_questions, train_answers = load_data("train-v2.0.json", size=800)
val_texts, val_questions, val_answers = load_data("dev-v2.0.json", size=200)

In [51]:
print(len(train_texts))
print(len(train_questions))
print(len(train_answers))

800
800
800


In [52]:
print(len(val_texts))
print(len(val_questions))
print(len(val_answers))

200
200
200


# 2. Display a few raw QnA data samples.

In [53]:
print("TRAIN DATA")
for idx in range(0, 3):
    print("Text: ", train_texts[idx])
    print("Question: ", train_questions[idx])
    print("Answer: ", train_answers[idx])
    print("\n")

TRAIN DATA
Text:  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Question:  When did Beyonce start becoming popular?
Answer:  {'text': 'in the late 1990s', 'answer_start': 269}


Text:  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in 

In [54]:
print("VAL DATA")
for idx in range(3, 6):
    print("Text: ", val_texts[idx])
    print("Question: ", val_questions[idx])
    print("Answer: ", val_answers[idx])
    print("\n")

VAL DATA
Text:  The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse ("Norman" comes from "Norseman") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.
Question:  In what country is Normandy located?
Answer:  {'text': 'France', 'answer_start': 159}


Text:  The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name t

# 3. Preprocess/Clean dataset as per BERT format.

In [55]:
def process_dataset(answers, texts):
    for answer, text in zip(answers, texts):
        real_answer = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(real_answer)

        # Exact match
        if text[start_idx:end_idx] == real_answer:
            answer['answer_end'] = end_idx
        # answer is one character longer
        elif text[start_idx - 1:end_idx - 1] == real_answer:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1
        # answer is two characters longer
        elif text[start_idx - 2:end_idx - 2] == real_answer:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2

In [56]:
process_dataset(train_answers, train_texts)
process_dataset(val_answers, val_texts)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

train_encodings = tokenizer(train_texts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, val_questions, truncation=True, padding=True)

In [57]:
def add_token_positions(encodings, answers):
    start = []
    end = []

    for i, answer in enumerate(answers):
        start_token = encodings.char_to_token(i, answer['answer_start'])
        end_token = encodings.char_to_token(i, answer['answer_end'] - 1)

        if start_token is None:
            start_token = encodings.model_max_length
        if end_token is None:
            end_token = encodings.model_max_length

        start.append(start_token)
        end.append(end_token)

    encodings.update({'start': start, 'end': end})

add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [58]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        encoded_tensors = {}
        for key, val in self.encodings.items():
            tensor_at_idx = torch.tensor(val[idx])
            encoded_tensors[key] = tensor_at_idx
        return encoded_tensors

    def __len__(self):
        return len(self.encodings.input_ids)

In [59]:
train_dataset = Dataset(train_encodings)
val_dataset = Dataset(val_encodings)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=True)

## 4. Train the BERT QnA model. Evaluate the model.

In [60]:
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased').to(device)

optim = AdamW(model.parameters(), lr=5e-5)
epochs = 20

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [61]:
for epoch in range(epochs):
    train_loss, val_loss = 0, 0

    model.train()
    for batch_idx, batch in enumerate(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start = batch['start'].to(device)
        end = batch['end'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start, end_positions=end)
        loss = outputs[0]
        loss.backward()
        optim.step()
        train_loss += loss.item()
    train_loss /= len(train_loader)

    model.eval()
    for batch_idx, batch in enumerate(val_loader):
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start = batch['start'].to(device)
            end = batch['end'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start, end_positions=end)
            loss = outputs[0]
            val_loss += loss.item()
    val_loss /= len(val_loader)

    print(f"Epoch {epoch} Training loss {train_loss} and Validation Loss {val_loss}")

Epoch 0 Training loss 3.9835909605026245 and Validation Loss 3.306875057220459
Epoch 1 Training loss 2.4406928277015685 and Validation Loss 1.9043096494674683
Epoch 2 Training loss 1.25600115776062 and Validation Loss 1.6564743208885193
Epoch 3 Training loss 0.7162760701775551 and Validation Loss 1.6332973337173462
Epoch 4 Training loss 0.5061268813163042 and Validation Loss 1.6989549314975738
Epoch 5 Training loss 0.3689735359326005 and Validation Loss 1.7415593361854553
Epoch 6 Training loss 0.30243203073740005 and Validation Loss 1.5848237800598144
Epoch 7 Training loss 0.20973721192218364 and Validation Loss 1.7167938137054444
Epoch 8 Training loss 0.21440503989346327 and Validation Loss 2.1902471113204958
Epoch 9 Training loss 0.15795133426785468 and Validation Loss 1.9124277901649476
Epoch 10 Training loss 0.19473949896171688 and Validation Loss 2.0082615661621093
Epoch 11 Training loss 0.15191162386909127 and Validation Loss 1.8077738332748412
Epoch 12 Training loss 0.1257058657

## 5. Perform an Inference and show the predicted vs ground truth answers.

In [62]:
def clean_text(s):
    regex_articles = re.compile(r"\b(a|an|the)\b", re.UNICODE)
    s = re.sub(regex_articles, " ", s)
    s = "".join(ch for ch in s if ch not in string.punctuation)
    s = " ".join(s.split())
    s = s.lower()
    return s

In [63]:
model.eval()
model.to(device)

def compute_f1(prediction, truth):
    pred_tokens = clean_text(prediction).split()
    truth_tokens = clean_text(truth).split()
    if not pred_tokens or not truth_tokens:
        return int(pred_tokens == truth_tokens)
    common_tokens = set(pred_tokens) & set(truth_tokens)
    if not common_tokens:
        return 0

    precision = len(common_tokens) / len(pred_tokens)
    recall = len(common_tokens) / len(truth_tokens)
    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score

In [64]:
def predict(text, question, answer):
    inputs = tokenizer.encode_plus(question, text, return_tensors='pt')
    inputs = {key: tensor.to(device) for key, tensor in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    answer_start = torch.argmax(outputs[0])
    answer_end = torch.argmax(outputs[1]) + 1
    prediction = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))

    exact_match = int(clean_text(prediction) == clean_text(answer))
    f1_score = compute_f1(prediction, answer)
    print(f"Question: {question}")
    print(f"Prediction: {prediction}")
    print(f"True Answer: {answer}")
    print(f"Exact Match: {exact_match}")
    print(f"F1 Score: {f1_score}")
    print("\n")

In [65]:
text = "A girl named Cinderella was very pretty. She lived in a village named Everest. She was 18 years old. It was Cinderella who had to wake up each morning when it was still dark and cold to start the fire.  Cinderella who cooked the meals. Cinderella who kept the fire going"

questions = ["What is the name of the girl?",
           "Where did Cindrella live?",
           "How old was she?"

          ]
answers = ["cindrella",
           "everest",
           "18"
          ]

for q,a in zip(questions, answers):
    predict(text, q, a)

Question: What is the name of the girl?
Prediction: cinderella
True Answer: cindrella
Exact Match: 0
F1 Score: 0


Question: Where did Cindrella live?
Prediction: everest
True Answer: everest
Exact Match: 1
F1 Score: 1.0


Question: How old was she?
Prediction: 18 years old
True Answer: 18
Exact Match: 0
F1 Score: 0.5




Github Link: https://github.com/daminivichare66/Transformers_Bert_QnA