In [None]:
!mkdir squad
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O squad/train-v2.0.json
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O squad/dev-v2.0.json
!pip install transformers
!pip install git+https://github.com/d2l-ai/d2l-zh@release  # installing d2l

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
from d2l import torch as d2l

In [None]:
from transformers import XLNetTokenizerFast, XLNetForQuestionAnsweringSimple
import torch
# Tokenizer and question-answering model are loaded from the same pretrained checkpoint.
checkpoint = 'xlnet-base-cased'
tokenizer = XLNetTokenizerFast.from_pretrained(checkpoint)
model = XLNetForQuestionAnsweringSimple.from_pretrained(checkpoint)

In [None]:
# Data preprocessing: read a SQuAD JSON file into parallel context / question / answer lists
from pathlib import Path
def preprocessing_and_read_dataset(path):
  '''
  Flatten a SQuAD-format JSON file into three parallel lists.

  path: location of the JSON file (str or path-like); its top-level 'data'
        entry must hold articles -> 'paragraphs' -> 'qas' -> 'answers'.

  Returns (contexts, questions, answers), all of the same length, with one
  row per annotated answer:
    * a question with several reference answers yields one row per answer;
    * a question whose 'answers' list is empty yields no row at all.
  The answer entries are the raw dicts taken from the file.

  Only the keys that are actually needed are read, so files that lack
  'title', 'id' or 'is_impossible' are accepted as well.
  '''
  import json  # local import: nothing else in this function's cell needs it

  # Plain json is enough here: we only walk the nested structure once.
  with Path(path).open(encoding='utf-8') as handle:
    articles = json.load(handle)['data']

  contexts = []
  questions = []
  answers = []
  for article in articles:
    for paragraph in article['paragraphs']:
      context = paragraph['context']
      for qa in paragraph['qas']:
        question = qa['question']
        for ans in qa['answers']:
          contexts.append(context)
          questions.append(question)
          answers.append(ans)
  return contexts, questions, answers

In [None]:
# Flatten both SQuAD splits (paths match the files downloaded into squad/ above).
# Each call returns three parallel lists: contexts, questions and answer dicts.
train_contexts, train_questions, train_answers = preprocessing_and_read_dataset('squad/train-v2.0.json')
valid_contexts, valid_questions, valid_answers = preprocessing_and_read_dataset('squad/dev-v2.0.json')

In [None]:
# Encode (context, question) pairs; both splits share the same padding / truncation settings.
# The context is passed first, so it is sequence 0 of every encoded pair.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=512)
training_encoding = tokenizer(train_contexts, train_questions, **tokenize_kwargs)
validing_encoding = tokenizer(valid_contexts, valid_questions, **tokenize_kwargs)

In [None]:
def add_end_indices(answer, context, max_shift=2):
  '''
  Add an exclusive 'answer_end' character offset to every answer dict.

  answer: list of answer dicts with 'text' and 'answer_start'
  context: list of context strings, aligned with `answer`
  max_shift: how many characters to the left the annotated start may be
             moved while looking for the answer text (default 2)

  The dicts are modified in place and nothing is returned. The annotated
  start can be off by a character or two, so the text is looked for at
  answer_start, answer_start-1, ... answer_start-max_shift, and
  'answer_start' is corrected to the first offset that matches.

  If no offset matches, the annotated start is kept and 'answer_end' is still
  set (start + len(text)), so every dict has both keys afterwards.
  '''
  for a, ctext in zip(answer, context):
    answer_text = a['text']
    start_idx = a['answer_start']
    span_len = len(answer_text)
    for shift in range(max_shift + 1):
      candidate = start_idx - shift
      if candidate < 0:
        # A negative slice start would wrap around to the end of the string
        # and could "match" there, yielding negative offsets.
        break
      if ctext[candidate:candidate + span_len] == answer_text:
        start_idx = candidate
        break
    a['answer_start'] = start_idx
    a['answer_end'] = start_idx + span_len
# Annotate both splits; the answer dicts are modified in place (no return value).
add_end_indices(train_answers,train_contexts)
add_end_indices(valid_answers,valid_contexts)

In [None]:
def token_positions(encoding, answer, ignore_index=None):
  '''
  Convert character-level answer spans into token-level start/end labels.

  encoding: batch encoding that offers char_to_token(batch_index, char_index),
            item access by key ('input_ids') and update()
  answer: list of answer dicts, aligned with the rows of `encoding`; each has
          'answer_start' and an exclusive 'answer_end' (when 'answer_end' is
          absent it is derived as answer_start + len(text))
  ignore_index: label used when char_to_token returns None for a position,
                e.g. because the answer was truncated away. Defaults to the
                number of input_ids of that row, i.e. one past the last
                valid token index.

  Adds 'start_positions' and 'end_positions' to `encoding` in place and
  returns None.
  '''
  start_position = []
  end_position = []
  for i, ans in enumerate(answer):
    start_char = ans['answer_start']
    if 'answer_end' in ans:
      end_char = ans['answer_end']
    else:
      end_char = start_char + len(ans['text'])

    # The fallback is tied to the actual encoded length rather than to
    # tokenizer.model_max_length: that attribute is independent of the
    # max_length used when the text was tokenized.
    # NOTE(review): for tokenizers without a fixed limit it can be a huge
    # sentinel value; and HF QA heads are expected to ignore labels equal to
    # the sequence length -- confirm against the installed transformers.
    if ignore_index is not None:
      fallback = ignore_index
    else:
      fallback = len(encoding['input_ids'][i])

    # char_to_token returns the token index covering a character, or None
    start_tok = encoding.char_to_token(i, start_char)
    # answer_end is exclusive, so the last answer character sits at end - 1
    end_tok = encoding.char_to_token(i, end_char - 1)
    start_position.append(fallback if start_tok is None else start_tok)
    end_position.append(fallback if end_tok is None else end_tok)
  encoding.update({"start_positions": start_position, "end_positions": end_position})
# Attach 'start_positions' / 'end_positions' labels to both encodings (updated in place).
token_positions(training_encoding, train_answers)
token_positions(validing_encoding, valid_answers)

In [None]:
class Squad_v2_Dataset(torch.utils.data.Dataset):
  '''
  Map-style dataset over a tokenized batch encoding.

  Implements the three methods a map-style Dataset needs:
  __init__, __getitem__ and __len__.

  encoding: mapping from field name ('input_ids', 'attention_mask',
            'start_positions', ...) to a per-example sequence; it must
            contain 'input_ids' and support .items()
  dtype: dtype of the tensors returned by __getitem__. The default stays
         torch.float for backward compatibility (the training loop in this
         notebook casts every field with .long() itself); pass torch.long to
         get integer ids and labels directly.
  '''
  def __init__(self, encoding, dtype=torch.float):
    self.encoding = encoding
    self.dtype = dtype
  def __getitem__(self, idx):
    '''
    Return {field name: tensor} for the example at position idx.
    '''
    return {
        key: torch.tensor(value[idx], dtype=self.dtype)
        for key, value in self.encoding.items()
    }
  def __len__(self):
    # Key access works for plain dicts as well as for batch encodings.
    return len(self.encoding['input_ids'])
# Wrap both encodings (labels already attached) so they can be fed to a DataLoader.
training_dataset = Squad_v2_Dataset(training_encoding)
validing_dataset = Squad_v2_Dataset(validing_encoding)

In [None]:
# Build the loaders and pull one batch from each as a quick smoke test.
# NOTE: training_iter is rebuilt with batch_size=16 in the training cell below;
# validing_iter (batch_size=4, unshuffled) is the one used for validation.
training_iter = DataLoader(training_dataset, batch_size=4, shuffle=True)
validing_iter = DataLoader(validing_dataset, batch_size=4, shuffle=False)
next(iter(training_iter)), next(iter(validing_iter))

In [None]:
from transformers import AdamW
def evaluate_accuracy_gpu(net, data_iter, device=None):
    '''
    Evaluate a question-answering model on a data iterator.

    net: module called as net(input_ids, attention_mask=..., start_positions=...,
         end_positions=...); its result must expose the loss at index 0 and
         the attributes `start_logits` / `end_logits`
    data_iter: iterable of dict batches holding 'input_ids', 'attention_mask',
               'start_positions' and 'end_positions' tensors
    device: device to evaluate on; defaults to the device of the first
            parameter of `net`

    Returns (mean loss, start-position accuracy, end-position accuracy), each
    averaged over the number of examples seen. If data_iter yields no batch,
    three NaNs are returned instead of dividing by zero.

    Side effect: `net` is switched to eval mode and left there.
    '''
    # set to evaluation
    net.eval()
    if device is None:
        device = next(iter(net.parameters())).device
    total_loss = 0.0
    start_correct = 0.0
    end_correct = 0.0
    num_examples = 0
    # No gradients are needed for validation; skipping them saves memory.
    with torch.no_grad():
        for batch in data_iter:
            # Cast to long exactly as the training loop does: the dataset
            # yields float tensors by default.
            input_ids = batch['input_ids'].to(device).long()
            attention_mask = batch['attention_mask'].to(device).long()
            start_pi = batch['start_positions'].to(device).long()
            end_pi = batch['end_positions'].to(device).long()
            outputs = net(input_ids, attention_mask=attention_mask, start_positions=start_pi,
                          end_positions=end_pi)
            batch_size = input_ids.shape[0]
            # outputs[0] is the batch loss; weight it by the batch size
            total_loss += float(outputs[0]) * batch_size
            # number of examples whose arg-max position (over dim 1) equals the label
            start_correct += float((outputs.start_logits.argmax(dim=1) == start_pi).sum())
            end_correct += float((outputs.end_logits.argmax(dim=1) == end_pi).sum())
            num_examples += batch_size
    if num_examples == 0:
        nan = float('nan')
        return nan, nan, nan
    return total_loss / num_examples, start_correct / num_examples, end_correct / num_examples

In [None]:
# Fine-tune the model: one optimizer step per batch, running training metrics
# plotted about five times per epoch, and a full validation pass after each epoch.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
num_epoches = 3
print("training on ", device)
model.to(device)
training_iter = DataLoader(training_dataset, batch_size=16, shuffle=True)
# torch.optim.AdamW replaces the AdamW class from transformers, which that
# library deprecated in favour of the PyTorch optimizer. eps and weight_decay
# are set explicitly to the values the transformers class used by default
# (NOTE(review): 1e-6 / 0.0 -- confirm if exact reproducibility matters).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
# do the visualization
animator = d2l.Animator(xlabel='epoch', xlim=[1,num_epoches],
                            legend=['train loss','train_start_acc','train_end_acc','valid_loss', 'valid_start_acc', 'valid_end_acc'])
num_batches = len(training_iter)
# Plot roughly five points per epoch; max(1, ...) avoids a modulo by zero
# when an epoch has fewer than five batches.
plot_every = max(1, num_batches // 5)
for epoch in range(num_epoches):
    model.train()
    # running sums: weighted loss, correct starts, correct ends, example count
    metric = d2l.Accumulator(4)
    for step, batch in enumerate(training_iter):
        optimizer.zero_grad()
        # the dataset yields float tensors, the model needs integer ids / labels
        input_ids = batch['input_ids'].to(device).long()
        attention_mask = batch['attention_mask'].to(device).long()
        start_pi = batch['start_positions'].to(device).long()
        end_pi = batch['end_positions'].to(device).long()
        outputs = model(input_ids, attention_mask=attention_mask, start_positions = start_pi,
                        end_positions = end_pi)
        loss = outputs[0]
        start_logit = outputs.start_logits
        end_logit = outputs.end_logits
        loss.backward()
        optimizer.step()
        with torch.no_grad():
            metric.add(loss*input_ids.shape[0], d2l.accuracy(start_logit, start_pi),d2l.accuracy(end_logit,end_pi), input_ids.shape[0])
            train_loss = metric[0]/ metric[3]
            start_acc = metric[1]/metric[3]
            end_acc = metric[2]/metric[3]
            if (step + 1) % plot_every == 0 or step == num_batches - 1:
                  animator.add(epoch + (step + 1) / num_batches,
                                 (train_loss, start_acc, end_acc, None, None, None))
    print('/****************** validation part of epoch '+str(epoch)+' ******************/')
    valid_ls, valid_start_acc, valid_end_acc = evaluate_accuracy_gpu(model,validing_iter)
    animator.add(epoch+1, (None, None, None, valid_ls, valid_start_acc, valid_end_acc))
    print(f'epoch {epoch}, loss {train_loss:.3f}, train start acc {start_acc:.3f}, '
              f'train end acc {end_acc:.3f}, valid loss {valid_ls:.3f}, valid start acc {valid_start_acc:.3f} '
         f' valid_end_acc {valid_end_acc:.3f}')

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one directory.
model.save_pretrained("./model_saved_xlnet/")
tokenizer.save_pretrained("./model_saved_xlnet/")