# 1.导包

In [11]:
import torch
import random
import numpy as np
from accelerate import Accelerator
from transformers import BertTokenizerFast, BertForQuestionAnswering
from torch.optim import AdamW
import os
import json
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import get_cosine_schedule_with_warmup


# 2. 固定随机种子，选择设备

In [12]:
# 选择设备
device = "cuda" if torch.cuda.is_available() else "cpu"

# 固定随机种子
def same_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

same_seed(0)

In [13]:
# 选择是否开启自动混合精度
amp_fp16 = True

if amp_fp16:
    accelerator = Accelerator(mixed_precision = 'fp16')
    device = accelerator.device

# 3.加载模型

In [14]:
model_name = 'bert-base-chinese'
model_saved_path = './bert-use-chinese'

if os.path.exists(model_saved_path + '/vocab.txt') == False:
    print("downloading model......")
    os.mkdir(model_saved_path)
    tokenizer = BertTokenizerFast.from_pretrained(model_name)
    tokenizer.save_pretrained(model_saved_path)
    model = BertForQuestionAnswering.from_pretrained(model_name).to(device)
    model.save_pretrained(model_saved_path)
else:
    print("model exist.")
    tokenizer = BertTokenizerFast.from_pretrained(model_saved_path)
    model = BertForQuestionAnswering.from_pretrained(model_saved_path).to(device)


model exist.


# 4.加载数据和数据集构造

In [15]:
# 加载数据
def read_data(file):
    with open(file, "r", encoding = 'utf-8') as reader:
        data = json.load(reader)
    return data['questions'], data['paragraphs']

train_questions, train_paragraphs = read_data('./data/hw7_train.json')
dev_questions, dev_paragraphs = read_data('./data/hw7_dev.json')
test_questions, test_paragraphs = read_data('./data/hw7_test.json')

In [16]:
train_questions_tokenized = tokenizer([train_question["question_text"] for train_question in train_questions], add_special_tokens=False)
dev_questions_tokenized = tokenizer([dev_question["question_text"] for dev_question in dev_questions], add_special_tokens=False)
test_questions_tokenized = tokenizer([test_question["question_text"] for test_question in test_questions], add_special_tokens=False) 

train_paragraphs_tokenized = tokenizer(train_paragraphs, add_special_tokens=False)
dev_paragraphs_tokenized = tokenizer(dev_paragraphs, add_special_tokens=False)
test_paragraphs_tokenized = tokenizer(test_paragraphs, add_special_tokens=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (570 > 512). Running this sequence through the model will result in indexing errors


In [17]:
# 定义数据集
class QA_Dataset(Dataset):
    def __init__(self, split, questions, tokenized_questions, tokenized_paragraphs):
        self.split = split
        self.questions = questions
        self.tokenized_questions = tokenized_questions
        self.tokenized_paragraphs = tokenized_paragraphs
        self.max_question_len = 64
        self.max_paragraph_len = 256

        #####  TODO: Change doc_strides  #####
        self.doc_stride = 128

        self.max_seq_len = 1 + self.max_question_len + 1 + self.max_paragraph_len + 1
        
    def __len__(self):
        return len(self.questions)
    
    def get_random_window_start(self, start, end):
        start_max = start
        start_min = max(0, end - self.max_paragraph_len + 1)
        if start_min > start_max:
            window_start = start_min
        else:
            window_start = np.random.randint(start_min, start_max + 1)
        return window_start

    def __getitem__(self, idx):
        question = self.questions[idx]
        tokenized_question = self.tokenized_questions[idx]
        tokenized_paragraph = self.tokenized_paragraphs[question["paragraph_id"]]

        # ##### TODO: How to prevent model from learning somthing it should not learn #####
        if self.split == "train":
            answer_start_token = tokenized_paragraph.char_to_token(question['answer_start'])
            answer_end_token = tokenized_paragraph.char_to_token(question['answer_end'])
            window_start = self.get_random_window_start(answer_start_token, answer_end_token)
            window_end = window_start + self.max_paragraph_len

            input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102]
            input_ids_paragraph = tokenized_paragraph.ids[window_start:window_end] + [102]

            # Convert answer's start/end positions in tokenized_paragraph to start/end positions in the window  
            answer_start_token += len(input_ids_question) - window_start
            answer_end_token += len(input_ids_question) - window_start
            
            # Pad sequence and obtain inputs to model 
            input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)
            return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), answer_start_token, answer_end_token

        # Validation/Testing
        else:
            input_ids_list, token_type_ids_list, attention_mask_list = [], [], []
            
            # Paragraph is split into several windows, each with start positions separated by step "doc_stride"
            for i in range(0, len(tokenized_paragraph), self.doc_stride):
                
                # Slice question/paragraph and add special tokens (101: CLS, 102: SEP)
                input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102]
                input_ids_paragraph = tokenized_paragraph.ids[i : i + self.max_paragraph_len] + [102]
                
                # Pad sequence and obtain inputs to model
                input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)
                
                input_ids_list.append(input_ids)
                token_type_ids_list.append(token_type_ids)
                attention_mask_list.append(attention_mask)
            
            return torch.tensor(input_ids_list), torch.tensor(token_type_ids_list), torch.tensor(attention_mask_list)

    def padding(self, input_ids_question, input_ids_paragraph):
        padding_len = self.max_seq_len - len(input_ids_question) - len(input_ids_paragraph)
        assert padding_len >= 0, f"padding_len negative! qlen={len(input_ids_question)}, plen={len(input_ids_paragraph)}, max={self.max_seq_len}"
        input_ids = input_ids_question + input_ids_paragraph + [0] * padding_len
        token_type_ids = [0] * len(input_ids_question) + [1] * len(input_ids_paragraph) + [0] * padding_len
        attention_mask = [1] * (len(input_ids_question) + len(input_ids_paragraph)) + [0] * padding_len
        # 确保每个list长度都等于self.max_seq_len
        assert len(input_ids) == self.max_seq_len, f"input_ids: {len(input_ids)} != {self.max_seq_len}"
        assert len(token_type_ids) == self.max_seq_len, f"token_type_ids: {len(token_type_ids)} != {self.max_seq_len}"
        assert len(attention_mask) == self.max_seq_len, f"attention_mask: {len(attention_mask)} != {self.max_seq_len}"
        return input_ids, token_type_ids, attention_mask


train_set = QA_Dataset("train", train_questions, train_questions_tokenized, train_paragraphs_tokenized)
dev_set = QA_Dataset("dev", dev_questions, dev_questions_tokenized, dev_paragraphs_tokenized)
test_set = QA_Dataset("test", test_questions, test_questions_tokenized, test_paragraphs_tokenized)

train_batch_size = 32

# Note: Do NOT change batch size of dev_loader / test_loader !
# Although batch size=1, it is actually a batch consisting of several windows from the same QA pair
train_loader = DataLoader(train_set, batch_size=train_batch_size, shuffle=True, pin_memory=True)
dev_loader = DataLoader(dev_set, batch_size=1, shuffle=False, pin_memory=True)
test_loader = DataLoader(test_set, batch_size=1, shuffle=False, pin_memory=True)

In [18]:
def evaluate(data, output):
    ##### TODO: Postprocessing #####
    # There is a bug and room for improvement in postprocessing 
    # Hint: Open your prediction file to see what is wrong 
    
    answer = ''
    max_prob = float('-inf')
    num_of_windows = data[0].shape[1]
    
    for k in range(num_of_windows):
        # Obtain answer by choosing the most probable start position / end position
        start_prob, start_index = torch.max(output.start_logits[k], dim=0)
        end_prob, end_index = torch.max(output.end_logits[k], dim=0)
        
        # Probability of answer is calculated as sum of start_prob and end_prob
        prob = start_prob + end_prob
        
        # Replace answer if calculated probability is larger than previous windows
        if prob > max_prob:
            max_prob = prob
            # Convert tokens to chars (e.g. [1920, 7032] --> "大 金")
            answer = tokenizer.decode(data[0][0][k][start_index : end_index + 1])
    
    # Remove spaces in answer (e.g. "大 金" --> "大金")
    return answer.replace(' ','')

In [19]:
num_epoch = 10
validation = True
logging_step = 100
learning_rate = 1e-4
optimizer = AdamW(model.parameters(), lr=learning_rate)

best_val_acc = 0

model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) 

model.train()


num_training_steps = num_epoch * len(train_loader)
num_warmup_steps = int(0.1 * num_training_steps)  # 前10%用于warmup
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)


print("Start Training ...")

for epoch in range(num_epoch):
    step = 1
    train_loss = train_acc = 0
    
    for data in tqdm(train_loader):	
        # Load all data into GPU
        data = [i.to(device) for i in data]
        
        # Model inputs: input_ids, token_type_ids, attention_mask, start_positions, end_positions (Note: only "input_ids" is mandatory)
        # Model outputs: start_logits, end_logits, loss (return when start_positions/end_positions are provided)  
        output = model(input_ids=data[0], token_type_ids=data[1], attention_mask=data[2], start_positions=data[3], end_positions=data[4])

        # Choose the most probable start position / end position
        start_index = torch.argmax(output.start_logits, dim=1)
        end_index = torch.argmax(output.end_logits, dim=1)
        
        # Prediction is correct only if both start_index and end_index are correct
        train_acc += ((start_index == data[3]) & (end_index == data[4])).float().mean()
        train_loss += output.loss
        
        
        accelerator.backward(output.loss)
        
        optimizer.step()
        scheduler.step() 
        optimizer.zero_grad()
        step += 1

        ##### TODO: Apply linear learning rate decay #####
        
        
        # Print training loss and accuracy over past logging step
        if step % logging_step == 0:
            print(f"Epoch {epoch + 1} | Step {step} | loss = {train_loss.item() / logging_step:.3f}, acc = {train_acc / logging_step:.3f}")
            train_loss = train_acc = 0

    if validation:
        print("Evaluating Dev Set ...")
        model.eval()
        with torch.no_grad():
            dev_acc = 0
            for i, data in enumerate(tqdm(dev_loader)):
                output = model(input_ids=data[0].squeeze(dim=0).to(device), token_type_ids=data[1].squeeze(dim=0).to(device),
                       attention_mask=data[2].squeeze(dim=0).to(device))
                # prediction is correct only if answer text exactly matches
                dev_acc += evaluate(data, output) == dev_questions[i]["answer_text"]
            print(f"Validation | Epoch {epoch + 1} | acc = {dev_acc / len(dev_loader):.3f}")
            if dev_acc > best_val_acc:
                best_val_acc = dev_acc
                print("Saving Model ...")
                model_save_dir = "saved_model" 
                model.save_pretrained(model_save_dir)
        model.train()

# Save a model and its configuration file to the directory 「saved_model」 
# i.e. there are two files under the direcory 「saved_model」: 「pytorch_model.bin」 and 「config.json」
# Saved model can be re-loaded using 「model = BertForQuestionAnswering.from_pretrained("saved_model")」


Start Training ...


 10%|▉         | 99/991 [00:32<05:05,  2.92it/s]

Epoch 1 | Step 100 | loss = 4.734, acc = 0.030


 20%|██        | 199/991 [01:05<04:37,  2.86it/s]

Epoch 1 | Step 200 | loss = 1.895, acc = 0.371


 30%|███       | 299/991 [01:38<03:54,  2.95it/s]

Epoch 1 | Step 300 | loss = 1.097, acc = 0.597


 40%|████      | 399/991 [02:10<03:22,  2.92it/s]

Epoch 1 | Step 400 | loss = 0.967, acc = 0.634


 50%|█████     | 499/991 [02:43<02:46,  2.95it/s]

Epoch 1 | Step 500 | loss = 0.879, acc = 0.661


 60%|██████    | 599/991 [03:15<02:17,  2.85it/s]

Epoch 1 | Step 600 | loss = 0.845, acc = 0.665


 71%|███████   | 699/991 [03:48<01:39,  2.94it/s]

Epoch 1 | Step 700 | loss = 0.821, acc = 0.680


 81%|████████  | 799/991 [04:20<01:04,  3.00it/s]

Epoch 1 | Step 800 | loss = 0.797, acc = 0.677


 91%|█████████ | 899/991 [04:53<00:30,  2.97it/s]

Epoch 1 | Step 900 | loss = 0.799, acc = 0.698


100%|██████████| 991/991 [05:22<00:00,  3.07it/s]


Evaluating Dev Set ...


100%|██████████| 4131/4131 [00:59<00:00, 69.69it/s]


Validation | Epoch 1 | acc = 0.664
Saving Model ...


 10%|▉         | 99/991 [00:32<05:00,  2.97it/s]

Epoch 2 | Step 100 | loss = 0.718, acc = 0.699


 20%|██        | 199/991 [01:04<04:25,  2.99it/s]

Epoch 2 | Step 200 | loss = 0.692, acc = 0.718


 30%|███       | 299/991 [01:37<03:53,  2.97it/s]

Epoch 2 | Step 300 | loss = 0.671, acc = 0.726


 40%|████      | 399/991 [02:09<03:18,  2.98it/s]

Epoch 2 | Step 400 | loss = 0.691, acc = 0.720


 50%|█████     | 499/991 [02:41<02:44,  3.00it/s]

Epoch 2 | Step 500 | loss = 0.669, acc = 0.728


 60%|██████    | 599/991 [03:13<02:10,  3.00it/s]

Epoch 2 | Step 600 | loss = 0.692, acc = 0.717


 71%|███████   | 699/991 [03:46<01:38,  2.96it/s]

Epoch 2 | Step 700 | loss = 0.687, acc = 0.724


 81%|████████  | 799/991 [04:18<01:04,  2.97it/s]

Epoch 2 | Step 800 | loss = 0.689, acc = 0.725


 91%|█████████ | 899/991 [04:50<00:31,  2.96it/s]

Epoch 2 | Step 900 | loss = 0.703, acc = 0.723


100%|██████████| 991/991 [05:20<00:00,  3.09it/s]


Evaluating Dev Set ...


100%|██████████| 4131/4131 [00:58<00:00, 70.60it/s]


Validation | Epoch 2 | acc = 0.696
Saving Model ...


 10%|▉         | 99/991 [00:32<05:04,  2.93it/s]

Epoch 3 | Step 100 | loss = 0.586, acc = 0.755


 20%|██        | 199/991 [01:05<04:29,  2.94it/s]

Epoch 3 | Step 200 | loss = 1.065, acc = 0.646


 30%|███       | 299/991 [01:37<03:57,  2.92it/s]

Epoch 3 | Step 300 | loss = 0.791, acc = 0.692


 40%|████      | 399/991 [02:10<03:22,  2.93it/s]

Epoch 3 | Step 400 | loss = 0.656, acc = 0.732


 50%|█████     | 499/991 [02:43<02:49,  2.90it/s]

Epoch 3 | Step 500 | loss = 0.590, acc = 0.749


 60%|██████    | 599/991 [03:15<02:10,  3.00it/s]

Epoch 3 | Step 600 | loss = 0.532, acc = 0.772


 71%|███████   | 699/991 [03:48<01:37,  2.98it/s]

Epoch 3 | Step 700 | loss = 0.545, acc = 0.776


 81%|████████  | 799/991 [04:20<01:05,  2.95it/s]

Epoch 3 | Step 800 | loss = 0.536, acc = 0.775


 91%|█████████ | 899/991 [04:53<00:31,  2.93it/s]

Epoch 3 | Step 900 | loss = 0.509, acc = 0.783


100%|██████████| 991/991 [05:22<00:00,  3.07it/s]


Evaluating Dev Set ...


100%|██████████| 4131/4131 [01:01<00:00, 66.80it/s]


Validation | Epoch 3 | acc = 0.682


 10%|▉         | 99/991 [00:31<05:01,  2.96it/s]

Epoch 4 | Step 100 | loss = 0.448, acc = 0.798


 17%|█▋        | 168/991 [00:54<04:26,  3.09it/s]


KeyboardInterrupt: 

In [None]:
print("Evaluating Test Set ...")

result = []

model.eval()
with torch.no_grad():
    for data in tqdm(test_loader):
        output = model(input_ids=data[0].squeeze(dim=0).to(device), token_type_ids=data[1].squeeze(dim=0).to(device),
                       attention_mask=data[2].squeeze(dim=0).to(device))
        result.append(evaluate(data, output))

result_file = "with_learningratedecay_change_stride_improve_pre_epoch_amp.csv"

with open(result_file, 'w') as f:
    f.write('ID,Answer\n')
    for i, test_question in enumerate(test_questions):
        f.write(f"{test_question['id'], {result[i].replace(',', '')}}\n")


Evaluating Test Set ...


100%|██████████| 4957/4957 [01:13<00:00, 67.77it/s]


In [1]:
import re

input_file = "with_learningratedecay_change_stride_improve_pre_epoch_amp.csv"
output_file = "res_1.csv"

with open(input_file, encoding="utf-8") as fin, open(output_file, "w", encoding="utf-8") as fout:
    fout.write("ID,Answer\n")
    for line in fin:
        # 跳过表头
        if line.strip().startswith("ID,"):
            continue
        # 正则提取ID和答案内容
        match = re.match(r"\((\d+), \{[\"']?(.*?)[\"']?\}\)", line.strip())
        if match:
            idx, answer = match.groups()
            fout.write(f"{idx},{answer}\n")
