In [1]:
from datasets import load_dataset

squad = load_dataset("squad")

README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [2]:
train = squad["train"]
valid = squad["validation"]

In [3]:
from datasets import Dataset

# Giả sử `train_dataset` là tập dữ liệu gốc
def process_data(example):
    # Lấy phần tử đầu tiên từ 'answers'
    answer_text = example["answers"]["text"][0] if example["answers"]["text"] else ""
    answer_start_idx = example["answers"]["answer_start"][0] if example["answers"]["answer_start"] else -1
    
    return {
        "id": example["id"],
        "title": example["title"],
        "context": example["context"],
        "question": example["question"],
        "answer_text": answer_text,
        "answer_start_idx": answer_start_idx,
    }

# Chuyển đổi dataset
train = train.map(process_data)
valid = valid.map(process_data)


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [4]:
from datasets import Dataset
import random

# Đảm bảo tính ngẫu nhiên được tái lập
random.seed(42)

# Lấy ngẫu nhiên 20000 chỉ số từ tập có 87599 hàng
num_rows = 87599
sample_size = 30000
random_indices = random.sample(range(num_rows), sample_size)

# Trích xuất dữ liệu ngẫu nhiên
train = train.select(random_indices)

num_rows = 10570
sample_size = 3000
random_indices = random.sample(range(num_rows), sample_size)
valid = valid.select(random_indices)


In [5]:
def assert_sample(sample):
    assert sample['context'][sample['answer_start_idx']: sample['answer_start_idx'] + len(sample['answer_text'])] == \
           sample['answer_text'], sample
    assert len(sample['context']) > 0
    assert len(sample['question']) > 0
    return True

def format_sample(sample):
    context_prev = sample['context'][:sample['answer_start_idx']].split()
    sample['answer_word_start_idx'] = len(context_prev)
    sample['answer_word_end_idx'] = len(context_prev) + len(sample['answer_text'].split()) - 1
    return sample

train.filter(assert_sample)
train = train.map(format_sample)

valid.filter(assert_sample)
valid = valid.map(format_sample)

Filter:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

### Building model

In [6]:
!pip install datasets
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [7]:
import os
os.environ["WANDB_DISABLED"] = "true"

import evaluate
from transformers import TrainingArguments
from transformers import Trainer
import numpy as np
# from datasets import load_metric
import os

import datasets
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
import torch
import numpy as np
from nltk import word_tokenize

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def compute_metrics(eval_pred):
    # metric = datasets.load_metric("squad", cache_dir='./log/metric')
    metric = evaluate.load('squad', cache_dir="./log/metric")
    # print(eval_pred)
    logits, labels = eval_pred
    logits = list(zip(logits[0], logits[1]))
    labels, span_ids, samples_input_ids, word_lengths = list(zip(labels[0], labels[1])), labels[2], labels[3], labels[4]
    predictions = []
    references = []
    for idx, (predict, span_truth, input_ids, sample_words_length) in enumerate(
            list(zip(logits, span_ids, samples_input_ids, word_lengths))):
        span_truth = np.delete(span_truth, np.where(span_truth == -100))
        input_ids = np.delete(input_ids, np.where(input_ids == -100))

        # Get the most likely beginning of answer with the argmax of the score
        answer_start = sum(sample_words_length[:np.argmax(predict[0])])
        # Get the most likely end of answer with the argmax of the score
        answer_end = sum(sample_words_length[:np.argmax(predict[1]) + 1])

        answer = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
        answer_truth = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(span_truth))

        predictions.append({'prediction_text': answer, 'id': str(idx)})
        references.append({'answers': {'answer_start': [answer_start], 'text': [answer_truth]}, 'id': str(idx)})
    results = metric.compute(predictions=predictions, references=references)
    return results

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
def data_collator(samples):
    if len(samples) == 0:
        return {}

    for sample in samples:
        start_idx = sum(sample['words_lengths'][:sample['start_idx']])
        end_idx = sum(sample['words_lengths'][:sample['end_idx'] + 1])
        sample['span_answer_ids'] = sample['input_ids'][start_idx:end_idx]

    def collate_tokens(values, pad_idx, eos_idx=None, left_pad=False, move_eos_to_beginning=False):
        """Convert a list of 1d tensors into a padded 2d tensor."""
        size = max(v.size(0) for v in values)
        res = values[0].new(len(values), size).fill_(pad_idx)

        def copy_tensor(src, dst):
            assert dst.numel() == src.numel()
            if move_eos_to_beginning:
                assert src[-1] == eos_idx
                dst[0] = eos_idx
                dst[1:] = src[:-1]
            else:
                dst.copy_(src)

        for i, v in enumerate(values):
            copy_tensor(v, res[i][size - len(v):] if left_pad else res[i][:len(v)])
        return res

        
    input_ids = collate_tokens([torch.tensor(item['input_ids']) for item in samples], pad_idx=tokenizer.pad_token_id)
    attention_mask = torch.zeros_like(input_ids)
    for i in range(len(samples)):
        attention_mask[i][:len(samples[i]['input_ids'])] = 1
        
    words_lengths = collate_tokens([torch.tensor(item['words_lengths']) for item in samples], pad_idx=0)
    answer_start = collate_tokens([torch.tensor([item['start_idx']]) for item in samples], pad_idx=0)
    answer_end = collate_tokens([torch.tensor([item['end_idx']]) for item in samples], pad_idx=0)
    span_answer_ids = collate_tokens([torch.tensor(item['span_answer_ids']) for item in samples],
                                     pad_idx=-100)

    batch_samples = {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'words_lengths': words_lengths,
        'start_positions': answer_start,
        'end_positions': answer_end,
        'span_answer_ids': span_answer_ids
    }

    return batch_samples


In [9]:
# from transformers import AutoTokenizer

# class TokenizerHelper:
#     def __init__(self, model_name):
#         # self.tokenizer = AutoTokenizer.from_pretrained(model_name)
#         self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


#     def tokenize_function(self, example):
#         example["question"] = example["question"].split()
#         max_len_single_sentence = 368
#         sep_tok = "[SEP]"
#         example["context"] = example["context"].split()
#         example["title"] = example["title"].split()

#         question_sub_words_ids = [self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(w)) for w in example["question"]]
#         context_sub_words_ids = [self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(w)) for w in example["context"]]
#         title_sub_words_ids = [self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(w)) for w in example['title']]

#         valid = True
#         if len([j for i in question_sub_words_ids + title_sub_words_ids + context_sub_words_ids for j in i]) > max_len_single_sentence - 1:
#             question_ids = [j for i in question_sub_words_ids for j in i]
#             context_ids = [j for i in context_sub_words_ids[:example['answer_word_end_idx'] + 1] for j in i]
#             title_ids = [j for i in title_sub_words_ids for j in i]
#             remain_tokens = max_len_single_sentence - 1 - len(question_ids) - len(title_ids)
#             if len(question_ids + context_ids + title_ids) < max_len_single_sentence - 1:
#                 context_sub_words_ids_revise = context_sub_words_ids[:example['answer_word_end_idx'] + 1]
#                 idx = example['answer_word_end_idx'] + 1
#                 while len([j for i in (context_sub_words_ids_revise + [context_sub_words_ids[idx]]) for j in i]) < remain_tokens and idx < len(context_sub_words_ids):
#                     context_sub_words_ids_revise.append(context_sub_words_ids[idx])
#                     idx += 1
#                 context_sub_words_ids = context_sub_words_ids_revise
#             else:
#                 valid = False

#         # question_sub_words_ids = [[self.tokenizer.bos_token_id]] + question_sub_words_ids + [[self.tokenizer.eos_token_id]]
#         if self.tokenizer.bos_token_id is not None:
#             question_sub_words_ids = [[self.tokenizer.bos_token_id]] + question_sub_words_ids

#         if self.tokenizer.eos_token_id is not None:
#             question_sub_words_ids += [[self.tokenizer.eos_token_id]]
#             context_sub_words_ids = context_sub_words_ids + [[self.tokenizer.eos_token_id]]

#         # title_sub_words_ids = [[self.tokenizer.sep_token_id]] + title_sub_words_ids + [[self.tokenizer.sep_token_id]]
#         if self.tokenizer.sep_token_id is not None:
#             title_sub_words_ids = [[self.tokenizer.sep_token_id]] + title_sub_words_ids + [[self.tokenizer.sep_token_id]]


#         input_ids = [j for i in question_sub_words_ids + title_sub_words_ids + context_sub_words_ids for j in i]
#         if len(input_ids) > max_len_single_sentence + 4:  # 4 special token
#             valid = False

#         if None in input_ids:
#             valid = False
            
#         words_lengths = [len(item) for item in question_sub_words_ids + title_sub_words_ids + context_sub_words_ids]

#         return {
#             "input_ids": input_ids,
#             "words_lengths": words_lengths,
#             "start_idx": (example['answer_word_start_idx'] + len(question_sub_words_ids) + len(title_sub_words_ids)) if len(example["answer_text"]) > 0 else 0,
#             "end_idx": (example['answer_word_end_idx'] + len(question_sub_words_ids) + len(title_sub_words_ids)) if len(example["answer_text"]) > 0 else 0,
#             "valid": valid
#         }

In [10]:
from transformers import AutoTokenizer

class TokenizerHelper:
    def __init__(self, model_name):
        # Khởi tạo AutoTokenizer từ model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize_function(self, example):
        """
        Hàm xử lý token hóa câu hỏi, ngữ cảnh, và tiêu đề.
        """
        # Xác định các trường thông tin cần thiết từ input
        question = example.get("question", "")
        context = example.get("context", "")
        title = example.get("title", "")

        # Token hóa input với tokenizer
        inputs = self.tokenizer(
            text=question,
            text_pair=context + " " + title,  # Gộp ngữ cảnh và tiêu đề thành một chuỗi
            max_length=368,  # Giới hạn độ dài đầu vào
            truncation=True,  # Cắt chuỗi nếu quá dài
            padding="max_length",  # Thêm padding để đủ độ dài max_length
            add_special_tokens=True,  # Thêm token đặc biệt (CLS, SEP, ...)
            return_tensors=None  # Trả về danh sách, không phải tensor
        )

        # Lấy các chỉ số của câu trả lời nếu có
        answer_start_idx = example.get("answer_word_start_idx", -1)
        answer_end_idx = example.get("answer_word_end_idx", -1)

        # Xác thực dữ liệu
        valid = True
        if None in inputs["input_ids"] or len(inputs["input_ids"]) > 368:
            valid = False

        # Trả về kết quả token hóa và các thông tin cần thiết
        return {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs.get("attention_mask", []),  # Đảm bảo có attention mask
            "words_lengths": [len(self.tokenizer.decode([id]).strip()) for id in inputs["input_ids"]],  # Độ dài mỗi từ
            "start_idx": answer_start_idx + len(self.tokenizer.encode(question, add_special_tokens=True)) if answer_start_idx >= 0 else 0,
            "end_idx": answer_end_idx + len(self.tokenizer.encode(question, add_special_tokens=True)) if answer_end_idx >= 0 else 0,
            "valid": valid
        }


In [11]:
from transformers import BertPreTrainedModel, BertConfig, BertModel
from transformers.modeling_outputs import QuestionAnsweringModelOutput
import torch
from torch import nn
from torch.nn import CrossEntropyLoss


class MRCQuestionAnswering(BertPreTrainedModel):
    config_class = BertConfig  # Chỉ định config class của BERT

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        # Thay thế RobertaModel bằng BertModel
        self.bert = BertModel(config, add_pooling_layer=False)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    def forward(
            self,
            input_ids=None,
            words_lengths=None,
            start_idx=None,
            end_idx=None,
            attention_mask=None,
            token_type_ids=None,
            position_ids=None,
            head_mask=None,
            inputs_embeds=None,
            start_positions=None,
            end_positions=None,
            span_answer_ids=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Gọi BertModel thay vì RobertaModel
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        context_embedding = sequence_output

        # Compute align word sub_word matrix
        batch_size = input_ids.shape[0]
        max_sub_word = input_ids.shape[1]
        max_word = words_lengths.shape[1]
        align_matrix = torch.zeros((batch_size, max_word, max_sub_word))

        for i, sample_length in enumerate(words_lengths):
            for j in range(len(sample_length)):
                start_idx = torch.sum(sample_length[:j])
                align_matrix[i][j][start_idx: start_idx + sample_length[j]] = 1 if sample_length[j] > 0 else 0

        align_matrix = align_matrix.to(context_embedding.device)
        # Combine sub_word features to make word feature
        context_embedding_align = torch.bmm(align_matrix, context_embedding)

        logits = self.qa_outputs(context_embedding_align)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # Nếu đang sử dụng multi-GPU, thêm một chiều
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Xử lý các vị trí ngoài phạm vi input
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


In [12]:
model = MRCQuestionAnswering.from_pretrained("bert-base-uncased")


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of MRCQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Sử dụng
tokenizer_helper = TokenizerHelper("bert-base-uncased")

train_dataset = train.map(tokenizer_helper.tokenize_function, batched=False, num_proc=10)
valid_dataset = valid.map(tokenizer_helper.tokenize_function, batched=False, num_proc=10)


Map (num_proc=10):   0%|          | 0/30000 [00:00<?, ? examples/s]

Map (num_proc=10):   0%|          | 0/3000 [00:00<?, ? examples/s]

In [14]:
training_args = TrainingArguments("model-bin/test",
                                      do_train=True,
                                      do_eval=True,
                                      num_train_epochs= 5, # 8, #10,
                                      learning_rate=1e-5, #1e-4
                                      warmup_ratio=0.05,
                                      weight_decay=0.01,
                                      per_device_train_batch_size=4,
                                      per_device_eval_batch_size=4,
                                      gradient_accumulation_steps=1,
                                      logging_dir='./log',
                                      logging_steps=5,
                                      label_names=['start_positions',
                                                   'end_positions',
                                                   'span_answer_ids',
                                                   'input_ids',
                                                   'words_lengths'],
                                      group_by_length=True,
                                      save_strategy="epoch",
                                      save_safetensors=False,
                                      metric_for_best_model='f1',
                                      load_best_model_at_end=True,
                                      save_total_limit=2,
                                      #eval_steps=1,
                                      #evaluation_strategy="steps",
                                      evaluation_strategy="epoch",
                                      )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [15]:
trainer.train()

Epoch,Training Loss,Validation Loss,Exact Match,F1
1,5.1758,5.31615,19.4,27.73349
2,5.5832,5.169879,11.5,28.147681
3,5.1147,5.116936,18.7,28.827032
4,4.7291,5.172539,17.933333,29.303436
5,4.6417,5.2584,19.866667,29.845335


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


TrainOutput(global_step=37500, training_loss=5.043306376190186, metrics={'train_runtime': 12816.4035, 'train_samples_per_second': 11.704, 'train_steps_per_second': 2.926, 'total_flos': 2.81710565856e+16, 'train_loss': 5.043306376190186, 'epoch': 5.0})

In [16]:
trainer.save_model("/kaggle/working/abc")