<h1>Step 0: import module for bert_QA project</h1>

In [72]:
#step 0
import json
import os
from dataset import read_jsonl
import re
# step 4
from transformers import AutoTokenizer, BertForQuestionAnswering
from transformers import AdamW
# step 6
import torch as th

# Work from the grade_school_math package directory so that relative paths
# such as "data/train.jsonl" resolve. Set the GRADE_SCHOOL_MATH_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original hardcoded location is used.
os.chdir(os.environ.get('GRADE_SCHOOL_MATH_DIR',
                        '/home/sangmin/grade-school-math/grade_school_math/'))

<h1>Step 1: Retrieve and Store the data</h1>
Here I read the passages (texts), queries, and answers from the train and validation `.jsonl` files and store this information in lists.

In [55]:
# Build three parallel lists (passages, questions, answer records) from the
# training split. Each record is indexed by its 'question' and 'answer' keys.
path = os.path.join("data/train.jsonl")
objs = read_jsonl(path)

texts = []
queries = []
answers = []

for record in objs:
    solution = record['answer']

    # Raw strings keep the regex escapes (\d, \n) out of Python's own string
    # escaping, which otherwise warns about the invalid escape sequence "\d".
    # Every newline-terminated line of the worked solution:
    step_lines = re.findall(r'.+\n', solution)
    # Run of digits at the very end of the solution string:
    final_numbers = re.findall(r'\d+$', solution)

    # The passage is the first solution line only.
    context = step_lines[0]

    # NOTE(review): 'text' is taken from the end of the whole solution, while
    # 'answer_start' points just past the last '>>' of the FIRST line. These
    # describe the same span only for single-step solutions -- confirm that
    # this pairing is intended.
    answer_info = {
        'text': final_numbers[0],
        'answer_start': context.rfind('>>') + 2,
    }
    question = record['question']

    texts.append(context)
    answers.append(answer_info)
    queries.append(question)

train_texts, train_queries, train_answers = texts, queries, answers

#{'question': 'Nina enjoys keeping insects as pets. She has 3 spiders and 50 ants. Each spider has 8 eyes. Each ant has 2 eyes. How many eyes are there in total among Nina’s pet insects?', 
#'answer': 'The number of eyes from the spiders is 3 * 8 = <<3*8=24>>24 eyes\nThe number of eyes from the ants is 50 * 2 = <<50*2=100>>100 eyes\nThe total number of eyes among Nina’s insects is 24 + 100 = <<24+100=124>>124 eyes\n#### 124'}


In [58]:
# Build three parallel lists (passages, questions, answer records) from the
# test split, which this notebook uses as its validation data.
path = os.path.join("data/test.jsonl")
objs = read_jsonl(path)

texts = []
queries = []
answers = []

for record in objs:
    solution = record['answer']

    # Raw strings keep the regex escapes (\d, \n) out of Python's own string
    # escaping, which otherwise warns about the invalid escape sequence "\d".
    # Every newline-terminated line of the worked solution:
    step_lines = re.findall(r'.+\n', solution)
    # Run of digits at the very end of the solution string:
    final_numbers = re.findall(r'\d+$', solution)

    # The passage is the first solution line only.
    context = step_lines[0]

    # NOTE(review): 'text' is taken from the end of the whole solution, while
    # 'answer_start' points just past the last '>>' of the FIRST line. These
    # describe the same span only for single-step solutions -- confirm that
    # this pairing is intended.
    answer_info = {
        'text': final_numbers[0],
        'answer_start': context.rfind('>>') + 2,
    }
    question = record['question']

    texts.append(context)
    answers.append(answer_info)
    queries.append(question)

val_texts, val_queries, val_answers = texts, queries, answers

<H1>Step 2: Check the data</H1>

As you can see, we have 7473 passages, queries, and answers in the training data. Each answer is stored as a dictionary that holds the answer string under the key "text" and the character index at which the answer starts under the key "answer_start". We still need to add the character index at which the answer ends in the passage.

In [59]:
# Report the number of entries in each of the three training lists.
for train_part in (train_texts, train_queries, train_answers):
    print(len(train_part))

7473
7473
7473


In [57]:
# Preview the first two training examples, one list per line.
for label, column in (
    ("Passage: ", train_texts),
    ("Query: ", train_queries),
    ("Answer: ", train_answers),
):
    print(label, column[:2])

Passage:  ['Natalia sold 48/2 = <<48/2=24>>24 clips in May.\n', 'Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.\n']
Query:  ['Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?']
Answer:  [{'text': '72', 'answer_start': 31}, {'text': '10', 'answer_start': 33}]


In [60]:
# Preview the first validation example.
first_val_example = (
    ("Passage: ", val_texts[0]),
    ("Query: ", val_queries[0]),
    ("Answer: ", val_answers[0]),
)
for label, value in first_val_example:
    print(label, value)

Passage:  Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.

Query:  Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
Answer:  {'text': '18', 'answer_start': 37}


<h1>Step 3: Find the end position character</h1>
Because the BERT model needs both the start and the end character position of the answer, I compute the end position here and store it for later.

In [64]:
# Add the end character index (start index plus answer length, i.e. one past
# the last answer character) to every training answer record.
for answer_record, _passage in zip(train_answers, train_texts):
    answer_record['answer_end'] = (
        answer_record['answer_start'] + len(answer_record['text'])
    )



In [65]:
# Add the end character index (start index plus answer length, i.e. one past
# the last answer character) to every validation answer record.
for val_answer, _ in zip(val_answers, val_texts):
    span_length = len(val_answer['text'])
    val_answer['answer_end'] = val_answer['answer_start'] + span_length

In [67]:
# Double-check the first two training examples, whose answer records should
# now include the 'answer_end' index.
labels = ("Passage: ", "Query: ", "Answer: ")
columns = (train_texts, train_queries, train_answers)
for label, column in zip(labels, columns):
    print(label, column[0:2])

Passage:  ['Natalia sold 48/2 = <<48/2=24>>24 clips in May.\n', 'Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.\n']
Query:  ['Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?']
Answer:  [{'text': '72', 'answer_start': 31, 'answer_end': 33}, {'text': '10', 'answer_start': 33, 'answer_end': 35}]


<h1>Step 4: Tokenize passages and queries</h1>
This task asks for the pretrained BERT-base model “bert-base-uncased” to be used for tokenization.

In [70]:
# Load the pretrained tokenizer named in the task description.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Encode each split as (passage, query) pairs: the passage is passed as the
# first sequence and the query as the second. Both splits use the same
# truncation and padding options.
encode_options = {"truncation": True, "padding": True}
train_encodings = tokenizer(train_texts, train_queries, **encode_options)
val_encodings = tokenizer(val_texts, val_queries, **encode_options)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

<h1>Step 5: Convert the start-end positions to tokens start-end positions</h1>

In [71]:
def add_token_positions(encodings, answers, truncated_position=None):
    """Attach token-level answer spans to ``encodings`` in place.

    For every example ``i`` the character offsets ``answers[i]['answer_start']``
    and ``answers[i]['answer_end']`` are mapped to token indices through
    ``encodings.char_to_token`` and stored under the keys ``'start_positions'``
    and ``'end_positions'``.

    Args:
        encodings: object providing ``char_to_token(batch_index, char_index)``
            and ``update(mapping)``.
        answers: sequence of dicts holding the character indices
            ``'answer_start'`` and ``'answer_end'``; ``answer_end`` is treated
            as exclusive (one past the last answer character).
        truncated_position: token index recorded when an offset maps to no
            token. When omitted, ``tokenizer.model_max_length`` of the
            notebook-level ``tokenizer`` is used, as before.

    Returns:
        None. The number of end positions that could not be resolved is
        printed.
    """
    if truncated_position is None:
        truncated_position = tokenizer.model_max_length

    start_positions = []
    end_positions = []
    unresolved_ends = 0

    for i, answer in enumerate(answers):
        start = encodings.char_to_token(i, answer['answer_start'])
        # No token for the start offset: the answer passage is assumed to have
        # been truncated away.
        if start is None:
            start = truncated_position

        # 'answer_end' is exclusive, so the last answer character sits at
        # answer_end - 1. Looking that character up first keeps the end token
        # inside the answer; looking up answer_end itself returns the NEXT
        # token whenever the answer is directly followed by a non-space
        # character (e.g. "24." or "24)").
        end = encodings.char_to_token(i, answer['answer_end'] - 1)
        if end is None:
            # Fall back to the character at answer_end so that every span the
            # previous lookup order resolved is still resolved.
            end = encodings.char_to_token(i, answer['answer_end'])
        if end is None:
            # Still nothing: treat the answer passage as truncated.
            unresolved_ends += 1
            end = truncated_position

        start_positions.append(start)
        end_positions.append(end)

    print(unresolved_ends)

    # Store the token spans alongside the other encoding fields.
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Add token-level start/end positions to both splits, training split first.
for split_encodings, split_answers in (
    (train_encodings, train_answers),
    (val_encodings, val_answers),
):
    add_token_positions(split_encodings, split_answers)

223
47


<h1>Step 6: Create a Dataset class</h1>
Create a SquadDataset class (inheriting from torch.utils.data.Dataset) that wraps the encodings as datasets, which makes it easier to train and validate on the data prepared above.

In [None]:
class SquadDataset(th.utils.data.Dataset):
    """Dataset that serves tokenizer encodings one example at a time.

    Args:
        encodings: mapping from field name (it must include ``'input_ids'``)
            to a per-example sequence of values.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per field."""
        # torch is imported under the alias `th` in this notebook; the bare
        # name `torch` is undefined and raised NameError here.
        return {key: th.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of examples, i.e. the number of input-id rows."""
        # Key access works for plain dicts as well as tokenizer outputs.
        return len(self.encodings['input_ids'])

# Wrap the encodings of each split in a SquadDataset.
train_dataset, val_dataset = (
    SquadDataset(split_encodings)
    for split_encodings in (train_encodings, val_encodings)
)

<h1>Step 7: Use of DataLoader</h1>
I wrap the datasets prepared above in a DataLoader to split them into batches of size 8. I will explain the choice of this batch size later.

In [63]:
# Scratch check: print the longest string in a list (the first one wins ties).
s_list = ['abc', 'bcd', 'bcdefg', 'abba', 'cddc', 'opq']

print(max(s_list, key=len))

bcdefg
