<h1>Step 0: import module for bert_QA project</h1>

In [2]:
#step 0
import json
import os
from dataset import read_jsonl
import re
# step 1
from sklearn.model_selection import train_test_split
# step 4
from transformers import AutoTokenizer, BertForQuestionAnswering
from transformers import AdamW
# step 6
import torch as th
# step 7
from torch.utils.data import DataLoader
# step 10
import time

# Work from the project root so the relative "data/..." paths used below
# resolve. Set the GSM8K_ROOT environment variable to point at another
# checkout instead of editing this hard-coded default.
os.chdir(os.environ.get('GSM8K_ROOT', '/home/sangmin/grade-school-math/grade_school_math/'))



<h1>Step 1: Retrieve and Store the data</h1>
Here I read the passages (texts), queries and answers from the training .jsonl file and store this information in lists. The examples are then split into a training set and a validation set.

In [13]:
path = os.path.join("data", "train.jsonl")
objs = read_jsonl(path)

# Fixed seed so the train/validation split is identical on every run.
SPLIT_SEED = 42


def extract_example(record):
    """Build a (context, question, answer_info) triple from one record.

    The context is the line of the worked solution that directly precedes
    the "#..." marker line (the trailing "\n#" is kept), and the answer text
    is the run of digits that ends the record's 'answer' string.
    `answer_info` holds the answer under 'text' and its character offset
    inside the context under 'answer_start'.

    Returns None when the context or trailing digits cannot be found, or
    when the answer text does not occur anywhere in the context.
    """
    solution = record['answer']
    context_lines = re.findall(r'.+\n#', solution)
    final_numbers = re.findall(r'\d+$', solution)
    if not context_lines or not final_numbers:
        return None

    context = context_lines[0]
    answer_text = final_numbers[0]

    # Preferred location: right after the last "<<...>>" annotation.
    marker = context.rfind('>>')
    answer_start = marker + 2 if marker != -1 else -1
    if answer_start == -1 or not context.startswith(answer_text, answer_start):
        # No annotation, or the text after it is not the answer: fall back to
        # the last literal occurrence of the answer in the context.
        answer_start = context.rfind(answer_text)
    if answer_start == -1:
        return None

    answer_info = {'text': answer_text, 'answer_start': answer_start}
    return context, record['question'], answer_info


texts = []
queries = []
answers = []
skipped = 0

for record in objs:
    example = extract_example(record)
    if example is None:
        # A span that cannot be located would only yield a wrong label.
        skipped += 1
        continue

    context, question, answer_info = example
    texts.append(context)
    answers.append(answer_info)
    queries.append(question)

if skipped:
    print("Skipped {} records whose answer span could not be located.".format(skipped))

train_texts, val_texts, train_queries, val_queries, train_answers, val_answers = train_test_split(
    texts, queries, answers, test_size=0.15, shuffle=True, random_state=SPLIT_SEED)

#{'question': 'Nina enjoys keeping insects as pets. She has 3 spiders and 50 ants. Each spider has 8 eyes. Each ant has 2 eyes. How many eyes are there in total among Nina’s pet insects?', 
#'answer': 'The number of eyes from the spiders is 3 * 8 = <<3*8=24>>24 eyes\nThe number of eyes from the ants is 50 * 2 = <<50*2=100>>100 eyes\nThe total number of eyes among Nina’s insects is 24 + 100 = <<24+100=124>>124 eyes\n#### 124'}


In [4]:
# this is test dataset, so keep this from training until test!
# path = os.path.join("data/test.jsonl")
# objs = read_jsonl(path)

# texts = []
# queries = []
# answers = []

# for i in objs:

#     answer_info = {}

#     an = i['answer']
#     co = re.findall('.+\n#', an)
#     ans = re.findall('\d+$', an)

#     context = co[0]
#     answer_info['text'] = ans[0]
#     answer_info['answer_start'] = context.rfind('>>') + 2
#     question = i['question']
        
#     texts.append(context)
#     answers.append(answer_info)
#     queries.append(question)

# val_texts, val_queries, val_answers = texts, queries, answers

<H1>Step 2: Check the data</H1>

The training file contains 7473 passages, queries and answers in total; after the split above, 5978 of them are used for training and 1495 for validation. Each answer is stored in a dictionary, with the answer string under the "text" key and the character index at which the answer starts under the "answer_start" key. As we can observe, we still need to add the exact index of the character at which the answer ends.

In [11]:
# Print the size of each training list, one per line.
for split_part in (train_texts, train_queries, train_answers):
    print(len(split_part))

5978
5978
5978


In [6]:
# Show the first two training examples: passage, query and answer record.
preview = slice(0, 2)
print("Passage: ", train_texts[preview])
print("Query: ", train_queries[preview])
print("Answer: ", train_answers[preview])

Passage:  ['Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#', 'Working 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.\n#']
Query:  ['Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?']
Answer:  [{'text': '72', 'answer_start': 33}, {'text': '10', 'answer_start': 56}]


In [12]:
# Print the size of each validation list, one per line.
for split_part in (val_texts, val_queries, val_answers):
    print(len(split_part))

1495
1495
1495


In [16]:
# Show one validation example (index 1): passage, query and answer record.
sample_idx = 1
print("Passage: ", val_texts[sample_idx])
print("Query: ", val_queries[sample_idx])
print("Answer: ", val_answers[sample_idx])

Passage:  Together, they have 34+17 = <<34+17=51>>51 tickets.
#
Query:  Tate has 32 tickets for a particular game. He then buys two more tickets for the game. His friend Peyton, who has half as many total tickets, drives with him to the game. How many tickets do Tate and Peyton have together?
Answer:  {'text': '51', 'answer_start': 40}


<h1>Step 3: Find the end position character</h1>
Because the BERT model needs both the start and the end character positions of the answer, I have to compute the end position and store it for later.

In [17]:
#Find end position character in train data.
# Add the inclusive end character index to every training answer record:
# start offset plus answer length, minus one.
for span, _passage in zip(train_answers, train_texts):
    span['answer_end'] = span['answer_start'] + len(span['text']) - 1

In [18]:
#Find end position character in valid data.
# Add the inclusive end character index to every validation answer record:
# start offset plus answer length, minus one.
for span, _passage in zip(val_answers, val_texts):
    span['answer_end'] = span['answer_start'] + len(span['text']) - 1

In [19]:
#double check for data including answer_end index
# Re-display the first two training examples; the answer records now carry
# the 'answer_end' key as well.
for label, sample in (("Passage: ", train_texts[0:2]),
                      ("Query: ", train_queries[0:2]),
                      ("Answer: ", train_answers[0:2])):
    print(label, sample)

Passage:  ['After the children buy the balloons, the clown is holding this many in his hand 36 - 15 = <<36-15=21>>21 balloons.\n#', 'The average weight of all the dogs is 20 / 4 = <<20/4=5>>5 pounds.\n#']
Query:  ['At the circus, the clown has 3 dozen balloons on a string in his hand.  3 boys and 12 girls buy a balloon each.  How many balloons is the clown still holding?', "Terry's mom brought home 4 different colored dogs from the shelter. The brown dog weighs 4 pounds. The black dog weighs 1 pound more than the brown dog. The white dog weighs twice as much as the brown dog. The grey dog weighs 2 pounds less than the black dog. What's the average weight of all the dogs?"]
Answer:  [{'text': '21', 'answer_start': 102, 'answer_end': 103}, {'text': '5', 'answer_start': 57, 'answer_end': 57}]


<h1>Step 4: Tokenize passages and queries</h1>
This task asks us to use the pretrained BERT-base model “bert-base-uncased” for the tokenization.

In [20]:
# Tokenizer matching the pretrained checkpoint used for the model.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Encode each (passage, query) pair with truncation and padding enabled;
# both splits share the same options.
encode_options = {"truncation": True, "padding": True}
train_encodings = tokenizer(train_texts, train_queries, **encode_options)
val_encodings = tokenizer(val_texts, val_queries, **encode_options)

<h1>Step 5: Convert the start-end positions to tokens start-end positions</h1>
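The model is trained on token positions rather than character positions, so here the character-level start and end indices of each answer are converted into the indices of the corresponding tokens in the tokenized passage.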

In [28]:
def add_token_positions(encodings, answers, max_length=None):
  """Convert character-level answer spans to token indices and store them.

  For example `i`, the 'answer_start' and 'answer_end' character offsets in
  `answers[i]` are mapped to token indices with `encodings.char_to_token`.
  The resulting lists are written back into `encodings` under the
  'start_positions' and 'end_positions' keys (in-place update).

  Args:
    encodings: tokenizer output providing `char_to_token(i, char_index)`
      and `update(dict)`.
    answers: sequence of dicts with 'answer_start' and 'answer_end'.
    max_length: sentinel stored for a position that cannot be mapped to a
      token; defaults to the module-level `tokenizer.model_max_length`.

  Prints the number of examples for which the start or the end position
  could not be resolved. Returns None.
  """
  if max_length is None:
    max_length = tokenizer.model_max_length

  start_positions = []
  end_positions = []

  count = 0
  for i, answer in enumerate(answers):
    start = encodings.char_to_token(i, answer['answer_start'])
    end = encodings.char_to_token(i, answer['answer_end'])

    # char_to_token may yield None for the end character (the original note
    # attributes this to pointing just past the token), so retry one
    # character earlier before giving up.
    if end is None:
      end = encodings.char_to_token(i, answer['answer_end'] - 1)

    # A position that is still None is treated as truncated. Count the
    # example whether it was the start or the end that was lost, so the
    # printed diagnostic does not under-report.
    if start is None or end is None:
      count += 1

    start_positions.append(max_length if start is None else start)
    end_positions.append(max_length if end is None else end)

  print(count)

  # Update the data in dictionary
  encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Attach token-level start/end positions to both splits (training first,
# then validation); each call prints its own count.
for split_encodings, split_answers in ((train_encodings, train_answers),
                                       (val_encodings, val_answers)):
    add_token_positions(split_encodings, split_answers)

0
0


In [29]:
# Display the encoding object of the first training example.
train_encodings[0]

Encoding(num_tokens=226, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

<h1>Step 6: Create a Dataset class</h1>
Create a gsm8kDataset class for BERT (it inherits from torch.utils.data.Dataset), which makes it easier to train and validate on the data prepared above by converting the encodings into datasets.

In [30]:
class gsm8kDataset(th.utils.data.Dataset):
    """Dataset view over tokenizer encodings.

    Indexing returns a dict with one tensor per field of the wrapped
    encodings; the length is the number of entries in `input_ids`.
    """

    def __init__(self, encodings):
        # Keep a reference only; tensors are built per item on access.
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        item = {}
        for field, column in self.encodings.items():
            item[field] = th.tensor(column[idx])
        return item

In [31]:
# Wrap the encodings of both splits in the dataset class defined above.
train_dataset, val_dataset = (
    gsm8kDataset(split_encodings)
    for split_encodings in (train_encodings, val_encodings)
)

<h1>Step 7: Use of DataLoader</h1>
I pass the datasets to DataLoader in order to split them into batches of size 4. I will explain the choice of this batch size later.

In [32]:
# Both loaders use the same batch size and shuffle their dataset.
loader_options = {"batch_size": 4, "shuffle": True}
train_loader = DataLoader(train_dataset, **loader_options)
val_loader = DataLoader(val_dataset, **loader_options)

<h1>Step 8: Use GPU</h1>

In [33]:
# Use the first CUDA GPU when CUDA is available, otherwise the CPU.
if th.cuda.is_available():
    device = th.device('cuda:0')
else:
    device = th.device('cpu')

<h1>Step 9: Build the Bert model</h1>

In [34]:
# Load the pretrained checkpoint into a question-answering model and move
# it to the selected device.
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
model = model.to(device)

# Learning rate in use; 3e-5 and 2e-5 were the alternatives tried.
learning_rate = 5e-5
optim = AdamW(model.parameters(), lr=learning_rate)

# Number of passes over the training set; 2 and 4 were the alternatives tried.
epochs = 3

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

<h1>Step 10: Train and Evaluate Model</h1>


In [39]:
def _batch_loss(batch):
  """Run the model on one collated batch and return its loss tensor.

  Moves the input ids, attention mask and gold start/end positions to
  `device`. The passages and queries were tokenized as pairs, so the
  segment ids ('token_type_ids') are forwarded as well when the batch
  contains them; without them the model is called with no segment ids.
  """
  model_inputs = {
      name: batch[name].to(device)
      for name in ('input_ids', 'attention_mask', 'start_positions', 'end_positions')
  }
  segment_ids = batch.get('token_type_ids')
  if segment_ids is not None:
    model_inputs['token_type_ids'] = segment_ids.to(device)
  # The loss is the first element of the model output.
  return model(**model_inputs)[0]


whole_train_eval_time = time.time()

# Mean loss per epoch, one entry appended per epoch.
train_losses = []
val_losses = []

# Print the current batch loss every `print_every` batches.
print_every = 1000

for epoch in range(epochs):
  epoch_time = time.time()

  # Set model in train mode
  model.train()

  loss_of_epoch = 0

  print("############Train############")

  for batch_idx, batch in enumerate(train_loader):

    optim.zero_grad()

    loss = _batch_loss(batch)
    # do a backwards pass
    loss.backward()
    # update the weights
    optim.step()
    # Find the total loss
    loss_of_epoch += loss.item()

    if (batch_idx+1) % print_every == 0:
      print("Batch {:} / {:}".format(batch_idx+1,len(train_loader)),"\nLoss:", round(loss.item(),1),"\n")

  # Average over the number of batches
  loss_of_epoch /= len(train_loader)
  train_losses.append(loss_of_epoch)

  ##########Evaluation##################

  # Set model in evaluation mode
  model.eval()

  print("############Evaluate############")

  loss_of_epoch = 0

  for batch_idx, batch in enumerate(val_loader):

    # No gradients are needed for evaluation
    with th.no_grad():
      loss = _batch_loss(batch)
      # Find the total loss
      loss_of_epoch += loss.item()

    if (batch_idx+1) % print_every == 0:
      print("Batch {:} / {:}".format(batch_idx+1,len(val_loader)),"\nLoss:", round(loss.item(),1),"\n")

  loss_of_epoch /= len(val_loader)
  val_losses.append(loss_of_epoch)

  # Print each epoch's time and train/val loss
  print("\n-------Epoch ", epoch+1,
        "-------"
        "\nTraining Loss:", train_losses[-1],
        "\nValidation Loss:", val_losses[-1],
        "\nTime: ",(time.time() - epoch_time),
        "\n-----------------------",
        "\n\n")

print("Total training and evaluation time: ", (time.time() - whole_train_eval_time))

############Train############


RuntimeError: CUDA out of memory. Tried to allocate 22.00 MiB (GPU 0; 23.70 GiB total capacity; 1.99 GiB already allocated; 12.81 MiB free; 2.16 GiB reserved in total by PyTorch)

In [63]:
# Scratch check: pick the longest string in a list (first one wins on ties).
s_list = ['abc', 'bcd', 'bcdefg', 'abba', 'cddc', 'opq']

longest = max(s_list, key=len)
print(longest)

bcdefg
