<h1>Step 0: Import modules for the DistilBERT QA project</h1>

In [23]:
#step 0
import json
import os
from dataset import read_jsonl
import re
# step 1
from sklearn.model_selection import train_test_split
# step 4
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering
from transformers import AdamW
# step 6
import torch as th
# step 7
from torch.utils.data import DataLoader
# step 10
import time

# Relative paths used later in the notebook (e.g. "data/train.jsonl") are
# resolved against this directory. Set the GSM8K_PROJECT_DIR environment
# variable to point at another checkout instead of editing this cell; the
# original location is kept as the default.
os.chdir(os.environ.get('GSM8K_PROJECT_DIR', '/home/sangmin/grade-school-math/grade_school_math/'))

<h1>Step 1: Retrieve and Store the data</h1>
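Here I read the questions and answers from the training .jsonl file and store them in three parallel lists: texts (the final reasoning line of each solution, used as the passage), queries and answers. The lists are then split into training and validation sets.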

In [13]:
# Patterns are raw strings so that `\d` is a regex escape rather than an
# invalid Python string escape (which newer Python versions warn about).
_LAST_STEP_RE = re.compile(r'.+\n#')    # a line immediately followed by a newline and '#'
_FINAL_NUMBER_RE = re.compile(r'\d+$')  # digits at the very end of the answer string


def _find_answer_start(context, answer_text):
    """Return the index in `context` where `answer_text` begins, or None if absent.

    The position right after the last '>>' is preferred. When there is no '>>',
    or the text found there does not begin with the answer, the last
    stand-alone occurrence of the number in the passage is used instead.
    """
    marker = context.rfind('>>')
    if marker != -1 and context.startswith(answer_text, marker + 2):
        return marker + 2
    # Digit look-arounds stop "5" from matching inside "15" or "50".
    hits = list(re.finditer(r'(?<!\d)' + re.escape(answer_text) + r'(?!\d)', context))
    return hits[-1].start() if hits else None


path = os.path.join("data", "train.jsonl")
objs = read_jsonl(path)

texts = []
queries = []
answers = []
skipped = 0  # examples for which no answer span could be located in the passage

for example in objs:
    solution = example['answer']
    step_match = _LAST_STEP_RE.search(solution)
    number_match = _FINAL_NUMBER_RE.search(solution)
    if step_match is None or number_match is None:
        skipped += 1
        continue

    context = step_match.group()
    answer_text = number_match.group()
    answer_start = _find_answer_start(context, answer_text)
    if answer_start is None:
        # A span label that does not point at the answer would only add noise.
        skipped += 1
        continue

    texts.append(context)
    queries.append(example['question'])
    answers.append({'text': answer_text, 'answer_start': answer_start})

if skipped:
    print("Skipped {} examples whose answer could not be located in the passage".format(skipped))

# A fixed random_state makes the train/validation split reproducible across runs.
train_texts, val_texts, train_queries, val_queries, train_answers, val_answers = train_test_split(
    texts, queries, answers, test_size=0.15, shuffle=True, random_state=42)


<H1>Step 2: Check the data</H1>

As you can see, the 7473 examples are split into 6352 passages, queries and answers for training and 1121 for validation. Each answer is stored in a dictionary: the answer string is under the "text" key and the character index at which the answer starts in the passage is under the "answer_start" key. The model also needs the character index at which the answer ends, so that value ("answer_end") is computed from the passage in Step 3.

In [24]:
# Report the size of each of the three training lists, one per line.
for split in (train_texts, train_queries, train_answers):
    print(len(split))

6352
6352
6352


In [25]:
# Show the first two training examples: passage, query and answer record.
for label, items in (("Passage: ", train_texts),
                     ("Query: ", train_queries),
                     ("Answer: ", train_answers)):
    print(label, items[0:2])

Passage:  ['Hence, each part is 64 inches / 4 parts = <<64/4=16>>16 inches/part.\n#', "It's 2021 and Julia is 42 years old so she was born in 2021-42 = <<2021-42=1979>>1979\n#"]
Query:  ['A piece of wire 5 feet 4 inches long was divided into 4 equal parts. How long was each part in inches if 1 foot is equal to 12 inches?', 'In 2021, Wayne is 37 years old.  His brother Peter is 3 years older than him and their sister Julia is 2 years older than Peter.  What year was Julia born in?']
Answer:  [{'text': '16', 'answer_start': 53, 'answer_end': 54}, {'text': '1979', 'answer_start': 81, 'answer_end': 84}]


In [26]:
# Report the size of each of the three validation lists, one per line.
for split in (val_texts, val_queries, val_answers):
    print(len(split))

1121
1121
1121


In [27]:
# Show the second validation example: passage, query and answer record.
for label, items in (("Passage: ", val_texts),
                     ("Query: ", val_queries),
                     ("Answer: ", val_answers)):
    print(label, items[1])

Passage:  His original order was $25 and they added $10.00 so his new order is 25+10 = $<<25+10=35.00>>35.00
#
Query:  Ian used a grocery delivery app to have his groceries delivered.  His original order was $25 before delivery and tip.  He noticed that 3 items changed on his order.  A $0.99 can of tomatoes was replaced by a $2.20 can of tomatoes, his $1.00 lettuce was replaced with $1.75 head of lettuce and his $1.96 celery was replaced with celery that cost $2.00.  Delivery and tip came to a total of $8.00.  How much is his new bill now, with the food substitutes and delivery/tip?
Answer:  {'text': '35', 'answer_start': 93, 'answer_end': 94}


<h1>Step 3: Find the end position character</h1>
Because the DistilBERT question-answering model needs both the start and the end character position of the answer, I compute the end position here and store it for later.

In [28]:
# Add the inclusive end character index to every training answer record:
# the index of the last character of the answer text.
for record, _passage in zip(train_answers, train_texts):
    record['answer_end'] = record['answer_start'] + len(record['text']) - 1

In [29]:
# Add the inclusive end character index to every validation answer record:
# the index of the last character of the answer text.
for record, _passage in zip(val_answers, val_texts):
    record['answer_end'] = record['answer_start'] + len(record['text']) - 1

In [30]:
# Double check: the answer records should now carry an 'answer_end' index.
head = slice(0, 2)
print("Passage: ", train_texts[head])
print("Query: ", train_queries[head])
print("Answer: ", train_answers[head])

Passage:  ['Hence, each part is 64 inches / 4 parts = <<64/4=16>>16 inches/part.\n#', "It's 2021 and Julia is 42 years old so she was born in 2021-42 = <<2021-42=1979>>1979\n#"]
Query:  ['A piece of wire 5 feet 4 inches long was divided into 4 equal parts. How long was each part in inches if 1 foot is equal to 12 inches?', 'In 2021, Wayne is 37 years old.  His brother Peter is 3 years older than him and their sister Julia is 2 years older than Peter.  What year was Julia born in?']
Answer:  [{'text': '16', 'answer_start': 53, 'answer_end': 54}, {'text': '1979', 'answer_start': 81, 'answer_end': 84}]


<h1>Step 4: Tokenize passages and queries</h1>
For this task, the pretrained “distilbert-base-uncased” checkpoint is used for tokenization.

In [32]:
# Fast tokenizer for the DistilBERT checkpoint used throughout the notebook.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Both splits are encoded as (passage, query) pairs with identical options.
encode_options = dict(truncation=True, padding=True)
train_encodings = tokenizer(train_texts, train_queries, **encode_options)
val_encodings = tokenizer(val_texts, val_queries, **encode_options)

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

<h1>Step 5: Convert the start-end positions to tokens start-end positions</h1>
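The model is trained on token indices rather than character indices, so here the character-level start/end positions of each answer are converted to token positions using the encodings produced by the “distilbert-base-uncased” tokenizer.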

In [33]:
def add_token_positions(encodings, answers, truncated_position=None):
  """Convert character-level answer spans to token positions and store them.

  Args:
    encodings: tokenizer output offering `char_to_token(batch_index, char_index)`
      and `update(dict)`; it is modified in place.
    answers: sequence of dicts with 'answer_start' and 'answer_end' character
      indices, aligned one-to-one with the examples in `encodings`.
    truncated_position: value stored when a character cannot be mapped to a
      token. Defaults to the notebook-level `tokenizer.model_max_length`,
      which is what earlier versions of this function always used.

  Side effects:
    Prints the number of answers whose start or end could not be mapped, and
    adds 'start_positions' and 'end_positions' lists to `encodings`.
  """
  def fallback_position():
    # Resolved lazily so the global tokenizer is only needed when no explicit
    # value was passed AND a position really is unmapped.
    if truncated_position is None:
      return tokenizer.model_max_length
    return truncated_position

  start_positions = []
  end_positions = []
  unmapped = 0

  for i, answer in enumerate(answers):
    start = encodings.char_to_token(i, answer['answer_start'])
    end = encodings.char_to_token(i, answer['answer_end'])

    # char_to_token returned None for the end character: retry one character
    # earlier before treating the answer as truncated.
    if end is None:
      end = encodings.char_to_token(i, answer['answer_end'] - 1)

    # Count every answer with an unmapped start or end, not only unmapped ends.
    if start is None or end is None:
      unmapped += 1

    start_positions.append(fallback_position() if start is None else start)
    end_positions.append(fallback_position() if end is None else end)

  print(unmapped)

  # Update the data in dictionary
  encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Attach token-level answer positions to both splits (training first, then validation).
for split_encodings, split_answers in ((train_encodings, train_answers),
                                       (val_encodings, val_answers)):
  add_token_positions(split_encodings, split_answers)

0
0


<h1>Step 6: Create a Dataset class</h1>
I create a gsm8kDataset class for DistilBERT (it inherits from torch.utils.data.Dataset) that wraps the encodings as datasets, which makes it easier to train and validate on the data prepared above.

In [34]:
class gsm8kDataset(th.utils.data.Dataset):
    """Dataset view over tokenizer encodings for question answering.

    `encodings` is any mapping from field name to a per-example sequence that
    has an 'input_ids' entry, e.g. a tokenizer's output or a plain dict.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example `idx` as a dict with one tensor per encoding field."""
        return {key: th.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of examples, i.e. the number of 'input_ids' rows."""
        # Key lookup (rather than attribute access) matches the mapping
        # interface used in __getitem__ and also works for plain dicts.
        return len(self.encodings['input_ids'])

In [36]:
# Wrap each split's encodings in the Dataset class defined above.
train_dataset, val_dataset = (
    gsm8kDataset(split_encodings)
    for split_encodings in (train_encodings, val_encodings)
)

<h1>Step 7: Use of DataLoader</h1>
I pass the datasets to DataLoader so that they are served in batches of 4 examples. I will explain the choice of this batch size later.

In [37]:
BATCH_SIZE = 4

# Training batches are reshuffled every epoch. Validation keeps a fixed order:
# shuffling does not help evaluation and makes per-batch logs irreproducible.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

<h1>Step 7: Use of DataLoader</h1>
I pass the datasets to DataLoader so that they are served in batches of 4 examples. I will explain the choice of this batch size later.

In [38]:
BATCH_SIZE = 4

# Training batches are reshuffled every epoch. Validation keeps a fixed order:
# shuffling does not help evaluation and makes per-batch logs irreproducible.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

<h1>Step 8: Use GPU</h1>

In [39]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if th.cuda.is_available():
    device = th.device('cuda:0')
else:
    device = th.device('cpu')

In [40]:
# Pretrained DistilBERT with a question-answering head, moved to the selected device.
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased').to(device)

# torch.optim.AdamW is used in place of the deprecated transformers.AdamW.
# eps and weight_decay are set explicitly to the values the transformers
# implementation used by default, so the optimizer settings stay the same.
optim = th.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Number of passes over the training set.
epochs = 5


Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

<h1>Step 10: Train and Evaluate Model</h1>


In [42]:
def _forward_loss(batch):
  """Move one batch to `device`, run the model with its answer positions and return the loss."""
  outputs = model(batch['input_ids'].to(device),
                  attention_mask=batch['attention_mask'].to(device),
                  start_positions=batch['start_positions'].to(device),
                  end_positions=batch['end_positions'].to(device))
  # The first output is taken as the loss, as in the per-batch logging below.
  return outputs[0]


whole_train_eval_time = time.time()

# Mean loss per epoch for each phase.
train_losses = []
val_losses = []

# A progress line is printed every `print_every` batches.
print_every = 1000

for epoch in range(epochs):
  epoch_time = time.time()

  # ---------- training phase ----------
  model.train()
  running_loss = 0

  print("############Train############")

  n_train_batches = len(train_loader)
  for batch_idx, batch in enumerate(train_loader):
    optim.zero_grad()

    loss = _forward_loss(batch)
    loss.backward()   # backward pass
    optim.step()      # weight update
    running_loss += loss.item()

    if (batch_idx+1) % print_every == 0:
      print("Batch {:} / {:}".format(batch_idx+1,n_train_batches),"\nLoss:", round(loss.item(),1),"\n")

  train_losses.append(running_loss / n_train_batches)

  # ---------- evaluation phase ----------
  model.eval()

  print("############Evaluate############")

  running_loss = 0
  n_val_batches = len(val_loader)

  # No gradients are needed while evaluating.
  with th.no_grad():
    for batch_idx, batch in enumerate(val_loader):
      loss = _forward_loss(batch)
      running_loss += loss.item()

      if (batch_idx+1) % print_every == 0:
        print("Batch {:} / {:}".format(batch_idx+1,n_val_batches),"\nLoss:", round(loss.item(),1),"\n")

  val_losses.append(running_loss / n_val_batches)

  # Per-epoch summary: mean train/validation loss and wall-clock time.
  print("\n-------Epoch ", epoch+1,
        "-------\nTraining Loss:", train_losses[-1],
        "\nValidation Loss:", val_losses[-1],
        "\nTime: ",(time.time() - epoch_time),
        "\n-----------------------",
        "\n\n")

print("Total training and evaluation time: ", (time.time() - whole_train_eval_time))

############Train############
Batch 1000 / 1588 
Loss: 0.0 

############Evaluate############

-------Epoch  1 -------
Training Loss: 0.024229531170967378 
Validation Loss: 0.0645959948257034 
Time:  58.82606482505798 
----------------------- 


############Train############
Batch 1000 / 1588 
Loss: 0.0 

############Evaluate############

-------Epoch  2 -------
Training Loss: 0.021380648441208617 
Validation Loss: 0.032404276124595954 
Time:  59.08175182342529 
----------------------- 


############Train############
Batch 1000 / 1588 
Loss: 0.0 

############Evaluate############

-------Epoch  3 -------
Training Loss: 0.011682988612927867 
Validation Loss: 0.044793760267119884 
Time:  59.06923294067383 
----------------------- 


Total training and evaluation time:  176.97829008102417
