## 設定cloud的權限與train data/test data的路徑

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the paths under 'My Drive' used below resolve.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder on the mounted Drive. The train/test JSON paths and the saved-model
# paths in later cells are built by string concatenation, so keep the trailing slash.
data_dir = '/content/drive/My Drive/squad20/hpv_data/'

## 讀取data

In [None]:
import json

def read_squad(path):
    """Flatten a SQuAD-format JSON file into three parallel lists.

    The file is expected to have the layout
    ``data -> paragraphs -> (context, qas -> (question, answers))``.
    One entry is produced per answer, so a question with several answers
    appears several times and a question with no answers is dropped.

    Args:
        path: Location of the SQuAD-style JSON file.

    Returns:
        A ``(contexts, questions, answers)`` tuple of equally long lists.
        ``answers`` holds the answer dicts exactly as they appear in the file.
    """
    with open(path, 'rb') as handle:
        payload = json.load(handle)

    contexts, questions, answers = [], [], []
    for article in payload['data']:
        for paragraph in article['paragraphs']:
            passage_text = paragraph['context']
            for qa in paragraph['qas']:
                question_text = qa['question']
                # When a 'plausible_answers' field is present it takes priority over 'answers'.
                field = 'plausible_answers' if 'plausible_answers' in qa else 'answers'
                for candidate in qa[field]:
                    contexts.append(passage_text)
                    questions.append(question_text)
                    answers.append(candidate)
    return contexts, questions, answers

# Load the training and validation splits; each call returns three parallel
# lists (contexts, questions, answers) built from the JSON file under data_dir.
train_contexts, train_questions, train_answers = read_squad(data_dir + 'hcp_train.json')
val_contexts, val_questions, val_answers = read_squad(data_dir + 'hcp_test.json')

## 增加end index


In [None]:
def add_end_idx(answers, contexts):
    """Add an exclusive ``answer_end`` character offset to every answer, in place.

    The end offset is ``answer_start + len(text)``. If the annotated start is
    off by one or two characters (the text is found 1 or 2 characters earlier
    in the context), ``answer_start`` is corrected as well.

    ``answer_end`` is always written. When the text cannot be located at the
    annotated offset or at either shifted offset, the annotated offsets are
    kept, so downstream code can rely on the key being present.

    Args:
        answers: List of dicts with ``'text'`` and ``'answer_start'`` keys.
            Each dict is mutated.
        contexts: List of context strings, parallel to ``answers``.

    Returns:
        None. ``answers`` is updated in place.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # Default to the annotated span so 'answer_end' exists even if no
        # alignment below succeeds.
        answer['answer_end'] = end_idx
        if context[start_idx:end_idx] == gold_text:
            continue

        # The annotation may be off by one or two characters; try the smaller
        # shift first and stop at the first match.
        for n in (1, 2):
            shifted_start = start_idx - n
            if shifted_start < 0:
                # A negative slice bound would wrap around to the end of the
                # string and could "match" an unrelated span.
                break
            if context[shifted_start:end_idx - n] == gold_text:
                answer['answer_start'] = shifted_start
                answer['answer_end'] = end_idx - n
                break

# Apply the function to both answer lists; the answer dicts are modified in place.
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

## 設定 BertTokenizerFast


In [None]:
!pip install transformers
from transformers import BertTokenizerFast
# Load the tokenizer that ships with the pretrained QA checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('mrm8488/bert-multi-cased-finetuned-xquadv1')
# Encode (context, question) pairs. The context is passed as the FIRST sequence,
# so character offsets in the answers refer to the first sequence of each encoding.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

Collecting transformers
  Downloading transformers-4.10.2-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 14.8 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 55.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 54.1 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 39.8 MB/s 
Collecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.3 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/657 [00:00<?, ?B/s]

## 將 start_positions 與 end_positions 加入 encodings



In [None]:
def add_token_positions(encodings, answers, max_length=None):
    """Convert character-level answer spans to token indices and store them.

    For every answer, ``answer_start`` / ``answer_end`` (character offsets) are
    mapped to token indices with ``encodings.char_to_token``. The resulting
    lists are written into ``encodings`` under ``'start_positions'`` and
    ``'end_positions'``.

    Args:
        encodings: Tokenizer output exposing ``char_to_token(batch_index,
            char_index)`` and ``update(dict)``. It is mutated in place.
        answers: List of dicts with ``'answer_start'`` and ``'answer_end'``
            keys, parallel to the samples in ``encodings``.
        max_length: Index used to mark an answer as "not locatable". Defaults
            to ``tokenizer.model_max_length`` of the notebook-level tokenizer,
            which is looked up only when actually needed.

    Returns:
        None. ``encodings`` is updated in place.
    """
    def _ignored_index():
        # Resolved lazily so the global tokenizer is only touched when a
        # span really cannot be mapped and no explicit value was given.
        return tokenizer.model_max_length if max_length is None else max_length

    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        start_char = answer['answer_start']
        start_token = encodings.char_to_token(i, start_char)

        if start_token is None:
            # The start cannot be mapped (e.g. the answer was truncated away).
            # Mark BOTH ends as ignored. Searching backwards for an end token
            # here would return an unrelated token of the kept text and
            # teach the model a wrong end label.
            start_positions.append(_ignored_index())
            end_positions.append(_ignored_index())
            continue

        # 'answer_end' may land on a character without a token (whitespace or
        # truncated text). Step back until a token is found, but never past
        # the start character, which is known to map to a token.
        end_char = answer['answer_end']
        end_token = encodings.char_to_token(i, end_char)
        while end_token is None and end_char > start_char:
            end_char -= 1
            end_token = encodings.char_to_token(i, end_char)
        if end_token is None:
            end_token = start_token

        start_positions.append(start_token)
        end_positions.append(end_token)

    # Legacy behaviour kept for an empty answer list: store one placeholder
    # position (512 unless max_length is given).
    if len(start_positions) == 0:
        placeholder = 512 if max_length is None else max_length
        start_positions.append(placeholder)
        end_positions.append(placeholder)
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Attach token-level start/end labels to both encodings (updated in place).
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

## 設定Dataset


In [None]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    """Dataset view over tokenizer encodings.

    ``encodings`` must offer ``items()`` yielding ``(name, per-sample sequence)``
    pairs and an ``input_ids`` attribute whose length is the number of samples.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # One entry of input_ids per sample.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Build one sample: every field of the encodings, sliced at idx,
        # converted to a tensor.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        return sample

# Wrap the training and validation encodings so a DataLoader can index them.
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

## train
* 將dataset轉成dataloader
* 對dataloader進行批次訓練


In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm
from transformers import BertForQuestionAnswering

# Start from the pretrained multilingual QA checkpoint and fine-tune it.
model = BertForQuestionAnswering.from_pretrained('mrm8488/bert-multi-cased-finetuned-xquadv1')

# Use the GPU when available, otherwise fall back to CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()
# AdamW optimizer; no weight_decay is passed, so the library default applies.
optim = AdamW(model.parameters(), lr=1e-5)
# Lowest mean validation loss seen so far; any real loss beats the initial value.
best_val_loss = float('inf')

# Training batches are shuffled; validation order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=24, shuffle=False)

for epoch in range(10):
    # ---- training pass -------------------------------------------------
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # Clear gradients left over from the previous step.
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        # Passing the gold positions makes the model return the loss first.
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

    # ---- validation pass -----------------------------------------------
    # Evaluation only: no backward pass and no optimizer step, otherwise the
    # model would be trained on the validation set and the val loss used for
    # checkpoint selection would be meaningless.
    model.eval()
    val_loop = tqdm(val_loader, leave=True)
    val_loss = 0
    with torch.no_grad():
        for batch in val_loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask,
                            start_positions=start_positions,
                            end_positions=end_positions)
            loss = outputs[0]
            val_loop.set_description(f'Epoch {epoch} val loss')
            val_loop.set_postfix(loss=loss.item())
            val_loss += loss.item()

    # Keep only the checkpoint with the lowest mean validation loss.
    avg_val_loss = val_loss / len(val_loader)
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), data_dir + 'model/jimmy_QA_model_epoch5_noG9.bin')
        # The config is written as JSON even though the file name ends in .bin.
        model.config.to_json_file(data_dir + 'model/jimmy_QA_config_file_epoch5_noG9.bin')
        tokenizer.save_vocabulary(data_dir + 'model')
        print("save this model ---------------> \n")

Downloading:   0%|          | 0.00/711M [00:00<?, ?B/s]

Epoch 0: 100%|██████████| 6/6 [03:24<00:00, 34.07s/it, loss=4.39]
Epoch 0 val loss: 100%|██████████| 2/2 [01:16<00:00, 38.20s/it, loss=4.04]


save this model ---------------> 



Epoch 1: 100%|██████████| 6/6 [03:19<00:00, 33.26s/it, loss=3.11]
Epoch 1 val loss: 100%|██████████| 2/2 [01:16<00:00, 38.19s/it, loss=2.76]


save this model ---------------> 



Epoch 2: 100%|██████████| 6/6 [03:19<00:00, 33.30s/it, loss=2.54]
Epoch 2 val loss: 100%|██████████| 2/2 [01:16<00:00, 38.24s/it, loss=1.99]


save this model ---------------> 



Epoch 3: 100%|██████████| 6/6 [03:19<00:00, 33.17s/it, loss=2.28]
Epoch 3 val loss: 100%|██████████| 2/2 [01:15<00:00, 37.95s/it, loss=1.53]


save this model ---------------> 



Epoch 4: 100%|██████████| 6/6 [03:19<00:00, 33.31s/it, loss=2.02]
Epoch 4 val loss: 100%|██████████| 2/2 [01:16<00:00, 38.03s/it, loss=1.51]
Epoch 5:  67%|██████▋   | 4/6 [02:50<01:25, 42.69s/it, loss=2.27]


KeyboardInterrupt: ignored

In [None]:
# Evaluation: put the model in inference mode before predicting.
model.eval()

#val_sampler = SequentialSampler(val_dataset)
# One sample per batch so each prediction can be paired with its question.
val_loader = DataLoader(val_dataset, batch_size=1)

# Collects one exact-match score for the start and one for the end of every sample.
acc = []
loop = tqdm(val_loader)
# Index of the current sample in val_questions.
counti = 0
for batch in loop:
    # No gradients are needed for inference.
    with torch.no_grad():
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        # Most likely start / end token index for each sample in the batch.
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        print(start_pred, end_pred)

        # Fraction of exact matches, first for the start, then for the end.
        for predicted, expected in ((start_pred, start_true), (end_pred, end_true)):
            acc.append(((predicted == expected).sum()/len(predicted)).item())

        # Rebuild the answer text: the start token followed by every token
        # up to, but not including, the predicted end token.
        tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
        first = int(start_pred)
        last = int(end_pred)
        answer = tokens[first] + ''.join(tokens[first + 1:last])
        print('\nquestion: ' + val_questions[counti])
        print('Answer: '+ answer)
        counti += 1

# Average of all collected start/end scores.
acc = sum(acc)/len(acc)

In [None]:
# Show the mean start/end exact-match score computed by the evaluation cell.
acc

## 保存模型(可自行設定)

In [None]:
# Unused alternative: from transformers import WEIGHTS_NAME, CONFIG_NAME

# Save the current model weights under data_dir/model.
torch.save(model.state_dict(), data_dir + 'model/jimmy_QA_model.bin')
# The config is written as JSON even though the file name ends in .bin.
model.config.to_json_file(data_dir + 'model/jimmy_QA_config_file.bin')
# Unused alternative: tokenizer.save_vocabulary(output_vocab_file)
# Save the tokenizer vocabulary into the same folder.
tokenizer.save_vocabulary(data_dir + 'model')

## 載入model


In [None]:
# !pip install transformers
from transformers import BertConfig
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm
from transformers import BertForQuestionAnswering
import torch
from transformers import BertTokenizerFast

# Reload the saved model for testing / inference.
# Use the GPU when available, otherwise fall back to CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# The config file contains JSON despite its .bin suffix.
jimmy_QA_config = BertConfig.from_json_file(data_dir + 'model/jimmy_QA_config_file.bin')
jimmy_QA_model = BertForQuestionAnswering(jimmy_QA_config).to(device)
# map_location lets weights that were saved from a GPU load on a CPU-only runtime.
state_dict = torch.load(data_dir + 'model/jimmy_QA_model.bin', map_location=device)
jimmy_QA_model.load_state_dict(state_dict)
# A freshly constructed module is in training mode; switch to inference mode so
# dropout is disabled and predictions are deterministic.
jimmy_QA_model.eval()
# The fine-tuned checkpoint is a *cased* model, so lowercasing must be turned
# off explicitly when the tokenizer is rebuilt from a bare vocab file
# (BertTokenizerFast lowercases by default).
jimmy_QA_tokenizer = BertTokenizerFast(data_dir + 'model/vocab.txt', do_lower_case=False)