# 環境準備

In [1]:
!nvidia-smi #check GPU，Colab會自動分配GPU，顯存15G以上比較好，不然建議終止工作階段重連來隨機更換GPU（重啟不會更換GPU）

Sun Aug 15 12:03:39 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# 連結google drive，設定路徑

run_path = "/content/gdrive/MyDrive/tweet_drug_project/code/tweet_drug_search" 

from google.colab import drive
import os
drive.mount('/content/gdrive')
os.chdir(run_path) #更改路徑
os.getcwd()

import sys
sys.path.append(run_path) #添加當前路徑為sys path中，不然無法import

Mounted at /content/gdrive


In [3]:
!pip install datasets transformers accelerate sentencepiece

Collecting datasets
  Downloading datasets-1.11.0-py3-none-any.whl (264 kB)
[K     |████████████████████████████████| 264 kB 7.2 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 50.7 MB/s 
[?25hCollecting accelerate
  Downloading accelerate-0.4.0-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 4.9 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 59.1 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 65.6 MB/s 
Collecting huggingface-hub<0.1.0
  Downloading huggingface_hub-0.0.15-py3-none-any.whl (43 kB)
[K     |████████████████████████████████| 43 kB 2.4 MB/s 
Collecting fsspec>=2021.05.0
  Downloading fsspec-2021.7.0-py3-non

In [4]:
import transformers

print(transformers.__version__) # version check, at least 4.8.1

4.9.2


# 在QA資料集上微調 預訓練Language Model

這個腳本會講解如何微調BERT等預訓練模型來進行QA任務，要注意的是，這個範例中的回答方式不是透過文本生成來回答問題，而是擷取給定Context中的文本片段來進行回答。目前，最常用的QA資料集為[SQuAD](https://rajpurkar.github.io/SQuAD-explorer/)，以下為SQuAD的範例：

<img src="https://i.imgur.com/sOKTl1Z.jpg" width="500"/>

## 超參數設定

In [5]:
squad_v2 = True # Switch between SQUAD v1 or 2
model_checkpoint = "bert-base-cased" # Download Model from HuggingFace Library, 可以跑BERT、ELECTRA、RoBERTa、DeBERTa，其他未測試
batch_size = 16

In [6]:
class arguments:
  def __init__(self,batch_size,model_checkpoint):
    self.dataset_name=None
    self.dataset_config_name=None
    self.train_file=None
    self.preprocessing_num_workers=4
    self.do_predict=False
    self.validation_file=None
    self.test_file=None
    self.max_seq_length=384
    self.pad_to_max_length=True
    self.model_name_or_path=model_checkpoint
    self.config_name=None
    self.tokenizer_name=None
    self.use_slow_tokenizer=False
    self.per_device_train_batch_size=batch_size
    self.per_device_eval_batch_size=batch_size
    self.learning_rate=3e-5
    self.weight_decay=0.01
    self.num_train_epochs=1 
    self.max_train_steps=None
    self.gradient_accumulation_steps=1 #顯存不夠大時設定，建議gradient_accumulation_steps*batch_size >= 16
    self.lr_scheduler_type="linear" 
    self.num_warmup_steps=0
    self.output_dir="./models/"
    self.seed=None
    self.doc_stride=128
    self.n_best_size=20 #輸出後處理，選擇多少「答案」作為候選
    self.null_score_diff_threshold=0.0
    self.version_2_with_negative=False
    self.max_answer_length=30
    self.max_train_samples=3000 #測試時請設定，不然會跑很久；正式訓練改為None
    self.max_eval_samples=None
    self.overwrite_cache=True
    self.max_predict_samples=None
    self.model_type=None

args = arguments(batch_size,model_checkpoint)

## 下載資料集

這部分將使用[Datasets](https://github.com/huggingface/datasets) 提供的 `load_dataset` 來完成資料集準備。當然，直接從其他QA資料集的官方網站下載也可以。`load_metric` 是寫好的評估方法。如果要用自己的json、csv格式的dataset，load_dataset也可以完成。請看官方文檔[Datasets documentation](https://huggingface.co/docs/datasets/loading_datasets.html#from-local-files)。


In [7]:
from datasets import load_dataset, load_metric

datasets = load_dataset("squad_v2" if squad_v2 else "squad")

Downloading:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad_v2/squad_v2 (download: 44.34 MiB, generated: 122.41 MiB, post-processed: Unknown size, total: 166.75 MiB) to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d...


Downloading:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/801k [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset squad_v2 downloaded and prepared to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d. Subsequent calls will reuse this data.


In [8]:
import pprint
pp = pprint.PrettyPrinter(indent=4) #只是為了漂亮印出Dict

print("datasets結構：\n")
pp.pprint(datasets)
print("\n\ntrain資料：\n")
pp.pprint(datasets['train'][0])
print("\n\nvalidation資料：\n")
pp.pprint(datasets['validation'][0])

datasets結構：

{   'train': Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 130319
}),
    'validation': Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 11873
})}


train資料：

{   'answers': {'answer_start': [269], 'text': ['in the late 1990s']},
    'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) '
               '(born September 4, 1981) is an American singer, songwriter, '
               'record producer and actress. Born and raised in Houston, '
               'Texas, she performed in various singing and dancing '
               'competitions as a child, and rose to fame in the late 1990s as '
               "lead singer of R&B girl-group Destiny's Child. Managed by her "
               "father, Mathew Knowles, the group became one of the world's "
               'best-selling girl groups of all time. Their hiatus saw the '
               "release of Beyoncé's debut album, Dangerous

## 訓練資料預處理

通常Transformers提供的 Tokenizer 可以自動將文本Tokenize並轉換為model可以讀取的形式。BERT的輸入格式參考下方圖片：

<img src="https://i.imgur.com/xQIkHWu.png" width="800" />

In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [10]:
#預處理，主要處理的問題是部分passage可能超過設定的max_seq_length，因此需要切段落來製作train,eval並調整答案的index

from src.utils.dataset_preprocess import QAdataset

SQuAD_dataset = QAdataset(datasets,tokenizer,args)

train_dataset = SQuAD_dataset.generate_train_dataset()
eval_dataset = SQuAD_dataset.generate_eval_dataset()

print("train_dataset：\n")
pp.pprint(train_dataset)

print("\n\neval_dataset：\n")
pp.pprint(eval_dataset)

train_dataset：

Dataset({
    features: ['attention_mask', 'end_positions', 'input_ids', 'start_positions', 'token_type_ids'],
    num_rows: 3000
})


eval_dataset：

Dataset({
    features: ['attention_mask', 'example_id', 'input_ids', 'offset_mapping', 'token_type_ids'],
    num_rows: 12199
})


## 模型建構-Output Layer

In [12]:
from torch import nn
from torch.nn import CrossEntropyLoss

class QuestionAnsweringModelOutput: # Output layer輸出的資料格式
    def __init__(self,loss,start_logits,end_logits):
      self.loss = loss
      self.start_logits = start_logits
      self.end_logits = end_logits

class OutputQA(nn.Module): # Output layer，套用於預訓練模型上層

    def __init__(self,hidden_dim):
        super(OutputQA, self).__init__()
        self.num_labels = 2
        self.qa_outputs = nn.Linear(hidden_dim, 2) # Linear Layer, 輸出start, end logits 可視為預測得分

    def forward(
        self,
        outputs,
        start_positions=None,
        end_positions=None
    ):
        sequence_output = outputs[0] # last_hidden_state: batch size * seq_len * 768

        logits = self.qa_outputs(sequence_output) # batch size * seq_len * 2
        start_logits, end_logits = logits.split(1, dim=-1) # split to start and end logits, each shape: batch size * seq_len
        start_logits = start_logits.squeeze(-1).contiguous() # 1D: len = batch size * seq_len
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)

            # 若start/end positions超出模型inputs範圍，直接忽略
            ignored_index = start_logits.size(1) # shape: batch size
            start_positions = start_positions.clamp(0, ignored_index) # clamp = into the range [ min, max ]
            end_positions = end_positions.clamp(0, ignored_index)

            # loss 計算
            loss_fct = CrossEntropyLoss(ignore_index=ignored_index) 
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits
        )


## Fine-tune模型

現在資料就緒，可以使用Transformers預設好的`AutoModelForQuestionAnswering`進行預訓練，也可以自己定義。

In [13]:
from transformers import default_data_collator
from transformers import AdamW
from transformers import get_scheduler
from accelerate import Accelerator
from torch.utils.data.dataloader import DataLoader
import logging
import math
import os
import numpy as np
import torch
from tqdm.auto import tqdm

logger = logging.getLogger(__name__)

In [14]:
#載入模型
from transformers import RobertaModel,BertModel,ElectraModel,DebertaModel, AutoModel
import re

if re.search(r"roberta",args.model_name_or_path.lower()):
  model = RobertaModel.from_pretrained(args.model_name_or_path) 
elif re.search(r"electra",args.model_name_or_path.lower()):
  model = ElectraModel.from_pretrained(args.model_name_or_path) 
elif re.search(r"deberta",args.model_name_or_path.lower()):
  model = DebertaModel.from_pretrained(args.model_name_or_path) 
else:
  print("warning: Using AutoModel but not sure about the type of model_name_or_path.")
  model = AutoModel.from_pretrained(args.model_name_or_path) 

output_layer = OutputQA(model.config.hidden_size)



Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
# Dataloader

data_collator = default_data_collator 

train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
    )

eval_dataset_for_model = eval_dataset.remove_columns(["example_id", "offset_mapping"])
eval_dataloader = DataLoader(
        eval_dataset_for_model, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size
    )

In [16]:
# Optimizer
# 權重分兩組，一個有weight decay，另一組不設。Weight dacay是在loss計算中加入懲罰，用來避免overfit。同時，Output Layer的LR通常要設置大於Bert的LR
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in output_layer.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": args.weight_decay,
        "lr": args.learning_rate*10, # linear的lr設大一些
    },
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": args.weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

In [17]:
# 用`accelerate`自動切換設備/多線程，跟寫.cuda()或.to(device)效果相同，可參考https://pypi.org/project/accelerate/

accelerator = Accelerator()
print(accelerator.state)

output_layer, model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    output_layer, model, optimizer, train_dataloader, eval_dataloader
) #back propagation時要用accelerator.backward(loss)取代loss.backward()

Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda
Use FP16 precision: False



In [18]:
# 根據步數設定學習率調整策略，可以調整args.lr_scheduler_type為其他策略，例如cosine
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) #計算總訓練步數

if args.max_train_steps is None:
    args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
else:
    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

lr_scheduler = get_scheduler(
    name=args.lr_scheduler_type, #學習率調整策略
    optimizer=optimizer, 
    num_warmup_steps=args.num_warmup_steps, #warm up步數通常取總布數的1/10，這邊默認是0
    num_training_steps=args.max_train_steps,
)

In [None]:
# 訓練
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

print("***** Running training *****")
print(f"  Num examples = {len(train_dataset)}")
print(f"  Num Epochs = {args.num_train_epochs}")
print(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
print(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
print(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
print(f"  Total optimization steps = {args.max_train_steps}")

# 進度條設定，disable部分是為了規定多個GPU時只顯示一個進度條，在Colab中有沒有都不影響
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)

completed_steps = 0
for epoch in range(args.num_train_epochs):
    model.train()
    output_layer.train()
    for step, batch in enumerate(train_dataloader):
        input = {key: value for key, value in batch.items() if key not in ['start_positions','end_positions']}
        encoding = model(**input)
        outputs = output_layer(encoding,batch['start_positions'],batch['end_positions'])
        loss = outputs.loss
        loss = loss / args.gradient_accumulation_steps
        accelerator.backward(loss)
        if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            completed_steps += 1

        if completed_steps >= args.max_train_steps:
            break

***** Running training *****
  Num examples = 3000
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 188


  0%|          | 0/188 [00:00<?, ?it/s]

# 在eval集上評估

### 評估方法說明

SQuAD使用exact match (EM)和F1 score。這些分數是根據單個「問題+答案」對計算的。當一個問題可能有多個正確答案時，計算所有可能的正確答案的最大分數。Model的EM和F1所有例子分數的平均分。

#### Exact Match

對於每個問題+答案對，如果模型預測的詞句與（其中一個）真實答案的詞句完全匹配，EM=1，否則EM=0。嚴格的全有或全無指標；偏離一個詞句就會得到0分。

#### F1

通過預測中的token與真實答案中的token進行計算的。預測和真實答案之間的共享token的數量來計算F1分數的基礎：Precision是共享token的數量與預測中的總token數的比率，Recall是共享token的數量與真實答案中的總token數的比率。

In [None]:
# 評估
from src.utils.utils_qa import * #post_processing_function, create_and_fill_np_array

squad_ver = "squad_v2" if args.version_2_with_negative else "squad"

print("***** Running Evaluation *****")
print(f"  Num examples = {len(eval_dataset)}")
print(f"  Batch size = {args.per_device_eval_batch_size}")
print(f"  squad_version = {squad_ver}")

metric = load_metric(squad_ver)

progress_bar = tqdm(range(len(eval_dataloader)), disable=not accelerator.is_local_main_process)

model.eval()
output_layer.eval()

all_start_logits = []
all_end_logits = []
for step, batch in enumerate(eval_dataloader):
    with torch.no_grad():
        encoding = model(**batch)
        outputs = output_layer(encoding)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        if not args.pad_to_max_length:  # 必須為預測做padding才能使用gather（accelerator的要求）
            start_logits = accelerator.pad_across_processes(start_logits, dim=1, pad_index=-100) # pad tensor across processes to max length
            end_logits = accelerator.pad_across_processes(end_logits, dim=1, pad_index=-100)

        all_start_logits.append(accelerator.gather(start_logits).cpu().numpy()) # 分佈式計算時用於集合預測結果
        all_end_logits.append(accelerator.gather(end_logits).cpu().numpy())
    progress_bar.update(1)

max_len = max([x.shape[1] for x in all_start_logits])  # 獲得最大長度

# concatenate array
start_logits_concat = create_and_fill_np_array(all_start_logits, eval_dataset, max_len)
end_logits_concat = create_and_fill_np_array(all_end_logits, eval_dataset, max_len)

# 使用完畢，刪除
del all_start_logits
del all_end_logits

outputs_numpy = (start_logits_concat, end_logits_concat) #預測結果
prediction = post_processing_function(datasets['validation'], eval_dataset, outputs_numpy,args) #後處理原理可以參考：https://qa.fastforwardlabs.com/no%20answer/null%20threshold/bert/distilbert/exact%20match/f1/robust%20predictions/2020/06/09/Evaluating_BERT_on_SQuAD.html#Metrics-for-QA
eval_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)
print(f"Evaluation metrics: {eval_metric}")

***** Running Evaluation *****
  Num examples = 10790
  Batch size = 12
  squad_version = squad


HBox(children=(FloatProgress(value=0.0, max=900.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=10570.0), HTML(value='')))


Evaluation metrics: {'exact_match': 67.14285714285714, 'f1': 77.76019472351693}


In [None]:
# 測試看看簡單選取start_logits的最大值和end_logits的最大值所對應的prediction（outputs_numpy）
# 與經過post_processing_function處理過後的prediction差別

for qs_id in range(10):

  start_idx = outputs_numpy[0][qs_id].argmax()
  end_idx = outputs_numpy[1][qs_id].argmax()
  print(str(start_idx)+" "+str(end_idx))
  print(' '.join(tokenizer.convert_ids_to_tokens(eval_dataset['input_ids'][qs_id][start_idx:end_idx+1])))
  print(prediction.predictions[qs_id]['prediction_text'])
  print('\n')

48 49
ĠDenver ĠBroncos
Denver Broncos


48 49
ĠDenver ĠBroncos
Denver Broncos


86 83

San Francisco Bay Area


45 46
ĠDenver ĠBroncos
Denver Broncos


123 123
Ġgold
gold


117 113

golden anniversary


75 78
ĠFebruary Ġ7 , Ġ2016
February 7, 2016


10 45
Super ĠBowl Ġ50 Ġwas Ġan ĠAmerican Ġfootball Ġgame Ġto Ġdetermine Ġthe Ġchampion Ġof Ġthe ĠNational ĠFootball ĠLeague Ġ( NFL ) Ġfor Ġthe Ġ2015 Ġseason . ĠThe ĠAmerican ĠFootball ĠConference Ġ( A FC ) Ġchampion ĠDenver ĠBroncos
Denver Broncos


111 113
gold en Ġanniversary
golden anniversary


35 37
ĠAmerican ĠFootball ĠConference
American Football Conference




In [None]:
# 人工查看，比較預測結果與和答案

for i in range(10):
  print("====Question====")
  pp.pprint(datasets['validation'][i])
  print("\n====prediction====")
  pp.pprint(prediction.predictions[i])
  print("\n")

====Question====
{   'answers': {   'answer_start': [177, 177, 177],
                   'text': [   'Denver Broncos',
                               'Denver Broncos',
                               'Denver Broncos']},
    'context': 'Super Bowl 50 was an American football game to determine the '
               'champion of the National Football League (NFL) for the 2015 '
               'season. The American Football Conference (AFC) champion Denver '
               'Broncos defeated the National Football Conference (NFC) '
               'champion Carolina Panthers 24–10 to earn their third Super '
               "Bowl title. The game was played on February 7, 2016, at Levi's "
               'Stadium in the San Francisco Bay Area at Santa Clara, '
               'California. As this was the 50th Super Bowl, the league '
               'emphasized the "golden anniversary" with various gold-themed '
               'initiatives, as well as temporarily suspending the tradition '
        

In [None]:
# 儲存微調好的模型

if args.output_dir is not None:
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)