<a href="https://colab.research.google.com/github/epadam/Machine-Learning-Tutorial-Demo-Resources/blob/master/notebooks/nlp/Bert_for_SQuAD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import requests

In [2]:
# Create the local directory that will hold the SQuAD JSON files.
# makedirs(exist_ok=True) is idempotent, so re-running this cell is safe.
os.makedirs('squad', exist_ok=True)

# Base URL of the SQuAD dataset files; the download cell appends the
# file name to it. (The eager request for the training set that used to live
# here was dropped: its response was overwritten before ever being used.)
url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'

In [3]:
for file in ['train-v2.0.json', 'dev-v2.0.json']:
    # Stream the file over HTTP so the payload is written as it arrives
    # instead of being buffered in memory first.
    with requests.get(f'{url}{file}', stream=True) as res:
        # Fail loudly on an HTTP error instead of saving an error page as JSON.
        res.raise_for_status()
        # Write into the relative 'squad' directory: that is the directory
        # created earlier and the one the loading cell reads from. The former
        # absolute '/content/squad/' path only existed on Colab.
        with open(f'squad/{file}', 'wb') as f:
            # 8 KiB chunks; the previous 4-byte chunks meant millions of tiny writes.
            for chunk in res.iter_content(chunk_size=8192):
                f.write(chunk)

In [4]:
# Install the Hugging Face libraries; `transformers` is imported by later cells.
# NOTE(review): versions are unpinned, so the installed release depends on
# when this cell is run.
!pip install transformers
!pip install datasets

Collecting transformers
  Downloading transformers-4.10.2-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 4.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 45.5 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 25.8 MB/s 
[?25hCollecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.5 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 48.7 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: Py

In [5]:
import json

def read_squad(path):
    """Flatten a SQuAD-format JSON file into three parallel lists.

    Args:
        path: location of a JSON file laid out as
            data -> paragraphs -> qas -> answers / plausible_answers.

    Returns:
        A ``(contexts, questions, answers)`` tuple of equal-length lists with
        one entry per answer: the paragraph text, the question text and the
        raw answer dict. When a question carries a 'plausible_answers' key,
        those entries are used instead of 'answers'.
    """
    with open(path, 'rb') as handle:
        payload = json.load(handle)

    contexts, questions, answers = [], [], []

    for article in payload['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                # Pick the field that holds this question's answer entries.
                field = 'plausible_answers' if 'plausible_answers' in qa.keys() else 'answers'
                found = qa[field]
                # One (context, question, answer) triple per answer entry.
                contexts.extend(context for _ in found)
                questions.extend(question for _ in found)
                answers.extend(found)

    return contexts, questions, answers

In [6]:
# Parse the downloaded train and dev files into parallel
# context / question / answer lists.
train_contexts, train_questions, train_answers = read_squad('squad/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('squad/dev-v2.0.json')

In [15]:
# Sanity check: the contexts returned by read_squad are a plain Python list.
type(train_contexts)

list

In [7]:
def add_end_idx(answers, contexts):
    """Add an 'answer_end' character offset to every answer dict, in place.

    SQuAD stores only the answer text and its start offset, and that offset is
    sometimes one or two characters too large. For each answer/context pair
    this checks the annotated position first, then positions shifted left by
    one and by two characters, and records the first one whose slice equals
    the answer text.

    Args:
        answers: list of dicts with 'text' and 'answer_start' keys. Each dict
            gains 'answer_end' (exclusive offset); 'answer_start' is corrected
            when a shifted match is found.
        contexts: list of context strings, aligned with ``answers``.

    Returns:
        None. ``answers`` is mutated.
    """
    for answer, context in zip(answers, contexts):
        # The text we expect to find inside the context.
        gold_text = answer['text']
        # Annotated start offset and the end offset it implies.
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # Always record an end offset, so later code can rely on the key even
        # when no position below reproduces the answer text.
        answer['answer_end'] = end_idx
        if context[start_idx:end_idx] == gold_text:
            continue

        # The annotation may be off by one or two characters: try the closest
        # shift first and stop at the first match.
        for n in (1, 2):
            if start_idx - n < 0:
                # A negative start would slice from the end of the string.
                break
            if context[start_idx - n:end_idx - n] == gold_text:
                answer['answer_start'] = start_idx - n
                answer['answer_end'] = end_idx - n
                break

In [8]:
# Annotate the answer dicts of both splits, in place, with 'answer_end'
# character offsets.
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [9]:
from transformers import DistilBertTokenizerFast
# Load the fast DistilBERT tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Encode every (context, question) pair, with truncation and padding enabled
train_encodings = tokenizer(
    train_contexts,
    train_questions,
    truncation=True,
    padding=True,
)
val_encodings = tokenizer(
    val_contexts,
    val_questions,
    truncation=True,
    padding=True,
)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [10]:
# Decode the first training example back to text to inspect how the context,
# question, special tokens and padding are laid out.
tokenizer.decode(train_encodings['input_ids'][0])

'[CLS] beyonce giselle knowles - carter ( / biːˈjɒnseɪ / bee - yon - say ) ( born september 4, 1981 ) is an american singer, songwriter, record producer and actress. born and raised in houston, texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of r & b girl - group destiny\'s child. managed by her father, mathew knowles, the group became one of the world\'s best - selling girl groups of all time. their hiatus saw the release of beyonce\'s debut album, dangerously in love ( 2003 ), which established her as a solo artist worldwide, earned five grammy awards and featured the billboard hot 100 number - one singles " crazy in love " and " baby boy ". [SEP] when did beyonce start becoming popular? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [11]:
def add_token_positions(encodings, answers):
    """Attach token-level answer labels to ``encodings`` in place.

    Args:
        encodings: tokenizer output for the whole split. It must provide
            ``char_to_token(batch_index, char_index)`` and a dict-style
            ``update``.
        answers: list of answer dicts with 'answer_start' and 'answer_end'
            character offsets ('answer_end' is exclusive); ``answers[i]``
            belongs to batch item ``i`` of ``encodings``.

    Returns:
        None. ``encodings`` gains 'start_positions' and 'end_positions', two
        lists of token indices aligned with ``answers``.

    Note:
        Reads the module-level ``tokenizer``: a position that cannot be mapped
        to a token is stored as ``tokenizer.model_max_length``.
    """
    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        start_char = answer['answer_start']
        end_char = answer['answer_end']

        start_token = encodings.char_to_token(i, start_char)

        # 'answer_end' is exclusive, so the last answer character sits at
        # end_char - 1. Probing end_char itself would either hit whitespace
        # (no token) or label the token that follows the answer.
        floor = max(start_char, 0)
        probe = max(end_char - 1, floor)
        end_token = None
        # Walk backwards over characters without a token (e.g. whitespace),
        # but never past the start of the answer: if nothing inside the answer
        # span maps to a token, the answer was cut off by truncation.
        while end_token is None and probe >= floor:
            end_token = encodings.char_to_token(i, probe)
            probe -= 1

        # Unmappable positions get the out-of-range sentinel.
        if start_token is None:
            start_token = tokenizer.model_max_length
        if end_token is None:
            end_token = tokenizer.model_max_length

        start_positions.append(start_token)
        end_positions.append(end_token)

    # Store the token-based labels next to the tokenizer outputs.
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})


In [12]:
# Add token-level start/end answer positions to both encodings
# (the encodings objects are updated in place).
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [13]:
# Confirm that the label fields now sit next to the tokenizer outputs.
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [14]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    """Map-style dataset over column-oriented encodings.

    ``encodings`` is expected to expose ``items()`` yielding
    ``(field name, per-example sequence)`` pairs and an ``input_ids``
    attribute whose length is the number of examples.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The number of examples equals the number of input_ids rows.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per field."""
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        return sample

In [15]:
# Wrap the encodings of each split in the Dataset class defined in this notebook.
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [16]:
from transformers import DistilBertForQuestionAnswering
# Load the pretrained 'distilbert-base-uncased' checkpoint into a
# question-answering model. The cell output reports the qa_outputs.* weights
# as newly initialised, i.e. the model still has to be trained.
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

In [17]:
# AdamW is taken from PyTorch: the implementation shipped with transformers is
# deprecated upstream in favour of torch.optim.AdamW.
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Move the model to the selected device
model.to(device)
# Put the model in training mode
model.train()
# Initialise the AdamW optimizer. eps and weight_decay are set explicitly to
# the defaults of the former transformers implementation, because PyTorch's
# own defaults differ (eps=1e-8, weight_decay=1e-2).
optim = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Data loader over the training set
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

In [19]:

for epoch in range(1):
    # Put the model in training mode
    model.train()
    # Wrap the loader in tqdm to get a progress bar
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # Reset the gradients accumulated in the previous step
        optim.zero_grad()
        # Move every tensor needed for this step to the training device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        # Forward pass; the label positions are passed to the model as well
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        # The first element of the outputs is used as the loss
        loss = outputs[0]
        # Back-propagate to compute the gradients
        loss.backward()
        # Update the parameters
        optim.step()
        # Show the epoch and the current loss on the progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 16290/16290 [3:35:37<00:00,  1.26it/s, loss=2.13]
