In [1]:
import requests
import json
import os
import torch
from tqdm import tqdm

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
if not os.path.exists('/content/drive/MyDrive/BERT-SQuAD'):
  os.mkdir('/content/drive/MyDrive/BERT-SQuAD')

In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
!wget -nc https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
!wget -nc https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json

File ‘train-v2.0.json’ already there; not retrieving.

File ‘dev-v2.0.json’ already there; not retrieving.



In [5]:
#load the training data
with open('train-v2.0.json', 'rb') as f:
  squad = json.load(f)

#data dict keys are title and paragraph
print(squad['data'][1])
squad['data'][1]['paragraphs'][0]['context']
print(squad['data'][213])
squad['data'][213]['paragraphs'][0]['context']

{'title': 'Frédéric_Chopin', 'paragraphs': [{'qas': [{'question': "What was Frédéric's nationalities?", 'id': '56cbd2356d243a140015ed66', 'answers': [{'text': 'Polish and French', 'answer_start': 182}], 'is_impossible': False}, {'question': 'In what era was Frédéric active in?', 'id': '56cbd2356d243a140015ed67', 'answers': [{'text': 'Romantic era', 'answer_start': 276}], 'is_impossible': False}, {'question': 'For what instrument did Frédéric write primarily for?', 'id': '56cbd2356d243a140015ed68', 'answers': [{'text': 'solo piano', 'answer_start': 318}], 'is_impossible': False}, {'question': 'In what area was Frédéric born in?', 'id': '56cbd2356d243a140015ed69', 'answers': [{'text': 'Duchy of Warsaw', 'answer_start': 559}], 'is_impossible': False}, {'question': 'At what age did Frédéric depart from Poland?', 'id': '56cbd2356d243a140015ed6a', 'answers': [{'text': '20', 'answer_start': 777}], 'is_impossible': False}, {'question': 'What year was Chopin born?', 'id': '56ce0a3762d2951400fa6

'In signal processing, data compression, source coding, or bit-rate reduction involves encoding information using fewer bits than the original representation. Compression can be either lossy or lossless. Lossless compression reduces bits by identifying and eliminating statistical redundancy. No information is lost in lossless compression. Lossy compression reduces bits by identifying unnecessary information and removing it. The process of reducing the size of a data file is referred to as data compression. In the context of data transmission, it is called source coding (encoding done at the source of the data before it is stored or transmitted) in opposition to channel coding.'

In [6]:
def load_data(file_path):  

  with open(file_path, 'rb') as file:
    squad = json.load(file)

  contexts = []
  questions = []
  answers = []

  for sentence in squad['data']:
    for passage in sentence['paragraphs']:
      context = passage['context']
      for question_answer in passage['qas']:
        question = question_answer['question']
        for answer in question_answer['answers']:
          contexts.append(context)
          questions.append(question)
          answers.append(answer)

  return contexts, questions, answers

In [7]:
train_contexts, train_questions, train_answers = load_data('train-v2.0.json')
valid_contexts, valid_questions, valid_answers = load_data('dev-v2.0.json')

train_contexts_batch = train_contexts[10000:]
train_questions_batch = train_questions[10000:]
train_answers_batch = train_answers[10000:]

val_contexts_batch = valid_contexts[1000:]
val_questions_batch = valid_questions[1000:]
val_answers_batch = valid_answers[1000:]

In [8]:
print(f'Num train questions: {len(train_questions_batch)}')
print(f'Num train answers: {len(train_answers_batch)}')
print(f'Num train contexts: {len(train_contexts_batch)}')

print(f'Num valid questions: {len(val_questions_batch)}')
print(f'Num valid answers: {len(val_answers_batch)}')
print(f'Num valid contexts: {len(val_contexts_batch)}')

print(train_questions_batch[-100])
print(train_answers_batch[-100])

Num train questions: 76821
Num train answers: 76821
Num train contexts: 76821
Num valid questions: 19302
Num valid answers: 19302
Num valid contexts: 19302
When was the National Museum founded?
{'text': '1928', 'answer_start': 382}


The answers are a dictionary with the textual answer and also the start index of the answer in the context, but the end_index is not provided, so we have to find it ourselves. Some of the answers of the SQuAD data are also off by a few characters, so we need to adjust for that.

In [9]:
def add_end_index(answers, contexts):
  for answer, context in zip(answers, contexts):
    text = answer['text']
    starting_index = answer['answer_start']
    answer_len = len(text)
    ending_index = starting_index + answer_len

    #adjust the answers being off in the context by a couple characters

    #if answer is in right spot, set ending index
    if context[starting_index:ending_index] == text:
      answer['answer_end'] = ending_index

    #when text is off by one character
    elif context[starting_index-1:ending_index-1] == text:
      answer['answer_start'] = starting_index - 1
      answer['answer_end'] = ending_index - 1 

    #when text is off by two characters
    elif context[starting_index-2:ending_index-2] == text:
      answer['answer_start'] = starting_index - 2
      answer['answer_end'] = ending_index - 2    

    #when text is off by three characters
    elif context[starting_index-3:ending_index-3] == text:
      answer['answer_start'] = starting_index - 3
      answer['answer_end'] = ending_index - 3    


In [10]:
add_end_index(train_answers, train_contexts)
add_end_index(valid_answers, valid_contexts)

In [11]:
add_end_index(train_answers_batch, train_contexts_batch)
add_end_index(val_answers_batch, val_contexts_batch)

In [12]:
#print answer_end
print(train_questions[-100])
print(train_contexts[-100])
print(train_answers[-100])

print(train_questions_batch[100])
print(train_contexts_batch[100])
print(train_answers_batch[100])

When was the National Museum founded?
The National Museum is located in the western part of Kathmandu, near the Swayambhunath stupa in an historical building. This building was constructed in the early 19th century by General Bhimsen Thapa. It is the most important museum in the country, housing an extensive collection of weapons, art and antiquities of historic and cultural importance. The museum was established in 1928 as a collection house of war trophies and weapons, and the initial name of this museum was Chhauni Silkhana, meaning "the stone house of arms and ammunition". Given its focus, the museum contains many weapons, including locally made firearms used in wars, leather cannons from the 18th–19th century, and medieval and modern works in wood, bronze, stone and paintings.
{'text': '1928', 'answer_start': 382, 'answer_end': 386}
What group arrangement is usual in family authority?
Anthropologists maintain that hunter/gatherers don't have permanent leaders; instead, the person 

In [13]:
print(valid_questions[100])
print(valid_contexts[100])
print(valid_answers[100])

print(val_questions_batch[-100])
print(val_contexts_batch[-100])
print(val_answers_batch[-100])

When did the Normans attack Dyrrachium?
The further decline of Byzantine state-of-affairs paved the road to a third attack in 1185, when a large Norman army invaded Dyrrachium, owing to the betrayal of high Byzantine officials. Some time later, Dyrrachium—one of the most important naval bases of the Adriatic—fell again to Byzantine hands.
{'text': '1185', 'answer_start': 86, 'answer_end': 90}
Where does centripetal force go?
where  is the mass of the object,  is the velocity of the object and  is the distance to the center of the circular path and  is the unit vector pointing in the radial direction outwards from the center. This means that the unbalanced centripetal force felt by any object is always directed toward the center of the curving path. Such forces act perpendicular to the velocity vector associated with the motion of an object, and therefore do not change the speed of the object (magnitude of the velocity), but only the direction of the velocity vector. The unbalanced forc

# Tokenization

We will be using the BertTokenizerFast because it is much faster than the regular BertTokenizer and will set the padding to True so that we can train the model in batches. 

In [14]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
valid_encodings = tokenizer(valid_contexts, valid_questions, truncation=True, padding=True)

In [15]:
print(train_encodings.keys())
print(valid_encodings.keys())

print(train_encodings[1])
print(valid_encodings[0])

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [16]:
num_encodings = len(train_encodings['input_ids'])
print(f'Number of context-question pairs: {num_encodings}')

Number of context-question pairs: 86821


In [17]:
print(train_encodings['input_ids'][0])
print(train_encodings['token_type_ids'][0])
print(train_encodings['attention_mask'][0])

tokenizer.decode(train_encodings['input_ids'][0])
#tokenizer.decode(train_encodings['token_type_ids'][0])
#tokenizer.decode(train_encodings['attention_mask'][0])

[101, 20773, 21025, 19358, 22815, 1011, 5708, 1006, 1013, 12170, 23432, 29715, 3501, 29678, 12325, 29685, 1013, 10506, 1011, 10930, 2078, 1011, 2360, 1007, 1006, 2141, 2244, 1018, 1010, 3261, 1007, 2003, 2019, 2137, 3220, 1010, 6009, 1010, 2501, 3135, 1998, 3883, 1012, 2141, 1998, 2992, 1999, 5395, 1010, 3146, 1010, 2016, 2864, 1999, 2536, 4823, 1998, 5613, 6479, 2004, 1037, 2775, 1010, 1998, 3123, 2000, 4476, 1999, 1996, 2397, 4134, 2004, 2599, 3220, 1997, 1054, 1004, 1038, 2611, 1011, 2177, 10461, 1005, 1055, 2775, 1012, 3266, 2011, 2014, 2269, 1010, 25436, 22815, 1010, 1996, 2177, 2150, 2028, 1997, 1996, 2088, 1005, 1055, 2190, 1011, 4855, 2611, 2967, 1997, 2035, 2051, 1012, 2037, 14221, 2387, 1996, 2713, 1997, 20773, 1005, 1055, 2834, 2201, 1010, 20754, 1999, 2293, 1006, 2494, 1007, 1010, 2029, 2511, 2014, 2004, 1037, 3948, 3063, 4969, 1010, 3687, 2274, 8922, 2982, 1998, 2956, 1996, 4908, 2980, 2531, 2193, 1011, 2028, 3895, 1000, 4689, 1999, 2293, 1000, 1998, 1000, 3336, 2879, 1000

'[CLS] beyonce giselle knowles - carter ( / biːˈjɒnseɪ / bee - yon - say ) ( born september 4, 1981 ) is an american singer, songwriter, record producer and actress. born and raised in houston, texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of r & b girl - group destiny\'s child. managed by her father, mathew knowles, the group became one of the world\'s best - selling girl groups of all time. their hiatus saw the release of beyonce\'s debut album, dangerously in love ( 2003 ), which established her as a solo artist worldwide, earned five grammy awards and featured the billboard hot 100 number - one singles " crazy in love " and " baby boy ". [SEP] when did beyonce start becoming popular? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

Convert our character start/end positions to token start/end positions. Since our words were converted into tokens, the start/end of the answer needs to show the index of the start/end token that contains the answer (and not the specific characters in context)

In [18]:
def add_token_index(encodings, answers):
  start_positions = []
  end_positions = []

  for i in range(len(answers)):

    #store index and tokenized encoding to start/end position arrays
    start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
    end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))

    #if start position is none, then answer was truncated
    if start_positions[-1] is None:
      #if start position none, set it to model_max_length
      start_positions[-1] = tokenizer.model_max_length
    if end_positions[-1] is None:
      end_positions[-1] = tokenizer.model_max_length

  encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_index(train_encodings, train_answers)
add_token_index(valid_encodings, valid_answers)

In [19]:
train_encodings['start_positions'][:10]

[67, 55, 128, 47, 69, 81, 124, 91, 69, 72]

In [20]:
class SQuAD_Dataset(torch.utils.data.Dataset):
  def __init__(self, encodings):
    self.encodings = encodings
  def __getitem__(self, idx):
    return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  def __len__(self):
    return len(self.encodings.input_ids)

In [21]:
train_dataset = SQuAD_Dataset(train_encodings)
valid_dataset = SQuAD_Dataset(valid_encodings)

In [22]:
from torch.utils.data import DataLoader

#define data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32)

In [23]:
from transformers import default_data_collator

model_checkpoint = "bert-base-uncased"
batch_size = 16
data_collator = default_data_collator

In [24]:
# from keras_lamb import LAMBOptimizer

# optimizer = LAMBOptimizer(0.001, weight_decay=0.01)
# model.compile(optimizer, ...)

In [25]:
#check if we need to use cpu or use available device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Working on {device}')

Working on cpu


In [26]:
from transformers import BertForQuestionAnswering, BertConfig, TrainingArguments, Trainer, DataCollatorWithPadding
import tensorflow as tf
from transformers import AdamW


#opt = tf.keras.optimizers.Adam(learning_rate=0.001)

model = BertForQuestionAnswering.from_pretrained('bert-base-cased',config=BertConfig(vocab_size=30522),ignore_mismatched_sizes=True).to(device)

optim = AdamW(model.parameters(), lr=5e-5)
# optim = AdamW(model.parameters(), lr=3e-5)
# optim = AdamW(model.parameters(), lr=2e-5)

# epochs = 2
epochs = 3


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and a

In [27]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-SQuAD",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01
)

In [28]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    tokenizer=tokenizer
)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 86821
  Num Epochs = 1
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 1357


In [None]:
from transformers import AdamW

N_EPOCHS = 1
optim = AdamW(model.parameters(), lr=5e-5)
                    
model.to(device)
model.train()

for epoch in range(N_EPOCHS):
  loop = tqdm(train_loader, leave=True)
  for batch in loop:
    optim.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_positions = batch['start_positions'].to(device)
    end_positions = batch['end_positions'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
    loss = outputs[0]
    loss.backward()
    optim.step()

    loop.set_description(f'Epoch {epoch+1}')
    loop.set_postfix(loss=loss.item())

  0%|          | 0/5427 [00:00<?, ?it/s]

In [None]:
import time

whole_train_eval_time = time.time()

train_losses = []
val_losses = []

print_every = 1000

for epoch in range(epochs):
  epoch_time = time.time()

  # Set model in train mode
  model.train()
    
  loss_of_epoch = 0

  print("############Train############")

  for batch_idx,batch in enumerate(train_loader): 
    
    optim.zero_grad()

    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    start_positions = batch['start_positions'].to(device)
    end_positions = batch['end_positions'].to(device)
    
    outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
    loss = outputs[0]
    # do a backwards pass 
    loss.backward()
    # update the weights
    optim.step()
    # Find the total loss
    loss_of_epoch += loss.item()

    if (batch_idx+1) % print_every == 0:
      print("Batch {:} / {:}".format(batch_idx+1,len(train_loader)),"\nLoss:", round(loss.item(),1),"\n")

  loss_of_epoch /= len(train_loader)
  train_losses.append(loss_of_epoch)

    ##########Evaluation##################

  # Set model in evaluation mode
  model.eval()

  print("############Evaluate############")

  loss_of_epoch = 0

  for batch_idx,batch in enumerate(valid_loader):
    
    with torch.no_grad():

      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      start_positions = batch['start_positions'].to(device)
      end_positions = batch['end_positions'].to(device)
      
      outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
      loss = outputs[0]
      # Find the total loss
      loss_of_epoch += loss.item()

    if (batch_idx+1) % print_every == 0:
       print("Batch {:} / {:}".format(batch_idx+1,len(valid_loader)),"\nLoss:", round(loss.item(),1),"\n")

  loss_of_epoch /= len(valid_loader)
  val_losses.append(loss_of_epoch)

  # Print each epoch's time and train/val loss 
  print("\n-------Epoch ", epoch+1,
        "-------"
        "\nTraining Loss:", train_losses[-1],
        "\nValidation Loss:", val_losses[-1],
        "\nTime: ",(time.time() - epoch_time),
        "\n-----------------------",
        "\n\n")

print("Total training and evaluation time: ", (time.time() - whole_train_eval_time))

############Train############
Batch 1000 / 5427 
Loss: 1.3 



KeyboardInterrupt: ignored