In [128]:
import json
from pathlib import Path

def read_squad(path):
    path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []
    for group in squad_dict['data']:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                question = qa['question']
                for answer in qa['answers']:
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)

    return contexts, questions, answers

train_contexts, train_questions, train_answers = read_squad('train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('dev-v2.0.json')

In [35]:
train_contexts = train_contexts[:20]
train_questions = train_questions[:20]
train_answers = train_answers[:20]

print(train_contexts[1])
print(train_questions[1])
print(train_answers[1])

Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
What areas did Beyonce compete in when she was growing up?
{'text': 'singing and dancing', 'answer_start': 207, 'answer_end': 226}


In [190]:
import yaml
import random

with open(r'full_text.yaml') as file:
    # The FullLoader parameter handles the conversion from YAML
    # scalar values to Python the dictionary format
    question_answer_data = yaml.load(file, Loader=yaml.FullLoader)


data = []

for i in question_answer_data:
    dat = []
    answer = {}
    dat.append(question_answer_data[i]['context'])
    dat.append(question_answer_data[i]['question'])
    answer['text'] = question_answer_data[i]['answer']
    answer_start = question_answer_data[i]['context'].find(answer['text']);
    answer["answer_start"] = answer_start
    answer["answer_end"] = answer_start + len(question_answer_data[i]['answer'])
    dat.append(answer)
    data.append(dat)

random.shuffle(data)

val_contexts = []
val_questions = []
val_answers = []

train_contexts = []
train_questions = []
train_answers = []

n = 6

for i in range(len(data)):
    if i < n:
        val_contexts.append(data[i][0])
        val_questions.append(data[i][1])
        val_answers.append(data[i][2])
    else:
        train_contexts.append(data[i][0])
        train_questions.append(data[i][1])
        train_answers.append(data[i][2])

In [192]:
def add_end_idx(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # sometimes squad answers are off by a character or two – fix this
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx-1:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1     # When the gold label is off by one character
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2     # When the gold label is off by two characters

add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [193]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [194]:
def add_token_positions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [195]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [196]:
from transformers import DistilBertForQuestionAnswering
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-cased")
model2 = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-cased")

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on

In [197]:
from torch.utils.data import DataLoader
from transformers import AdamW

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    print(epoch)
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()

0
1
2


DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            

In [199]:
from transformers import pipeline

myModel = pipeline(task = 'question-answering', model = model, tokenizer = tokenizer)
baseModel = pipeline(task = 'question-answering', model = model2, tokenizer = tokenizer)
otherModel = pipeline(task = 'question-answering')


for i in range(n):
    print(myModel(question = val_questions[i], context = val_contexts[i]))
    print(baseModel(question = val_questions[i], context = val_contexts[i]))
    print(otherModel(question = val_questions[i], context = val_contexts[i]))
    print(val_answers[i]['text'])

{'score': 0.005373006220906973, 'start': 151, 'end': 162, 'answer': 'janissaries'}
{'score': 0.0004604584537446499, 'start': 162, 'end': 174, 'answer': '.  Recruited'}
{'score': 0.21019193530082703, 'start': 190, 'end': 211, 'answer': 'Chris-tian population'}
an elite guard
{'score': 0.0025214729830622673, 'start': 217, 'end': 226, 'answer': 'Hand axes'}
{'score': 0.0002752182481344789, 'start': 310, 'end': 344, 'answer': '. Hand axes eventually were set in'}
{'score': 0.9793450832366943, 'start': 477, 'end': 483, 'answer': 'spears'}
spears
{'score': 0.015940193086862564, 'start': 89, 'end': 96, 'answer': 'Assyria'}
{'score': 0.00109293672721833, 'start': 114, 'end': 173, 'answer': '. The Sumerians were the creators of the first Mesopotamian'}
{'score': 0.7701165676116943, 'start': 116, 'end': 129, 'answer': 'The Sumerians'}
Sumerians
{'score': 0.0008520749979652464, 'start': 285, 'end': 324, 'answer': 'Wellington and suffered a bloody defeat'}
{'score': 0.0001992827601497993, 'start':