In [26]:
!pip install -qq datasets

In [28]:
# load data
import os
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, "data")
train_conll= os.path.join(DATA_DIR, "train.conll")
val_conll = os.path.join(DATA_DIR, "val.conll")
test_conll = os.path.join(DATA_DIR, "test.conll")
print(train_conll)
print(val_conll)
print(test_conll)

C:\Users\Admin\DHCNTT\HK6\NER-VNLegalText-BERT\data\train.conll
C:\Users\Admin\DHCNTT\HK6\NER-VNLegalText-BERT\data\val.conll
C:\Users\Admin\DHCNTT\HK6\NER-VNLegalText-BERT\data\test.conll


In [29]:
def read_conll(file_path):
    sentences = []
    sentence_labels = []
    unique_labels = set()  # To collect unique labels

    with open(file_path, 'r', encoding='utf-8') as file:
        current_sentence_tokens = []
        current_sentence_labels = []

        for line in file:
            line = line.strip()  # Remove leading/trailing whitespace, including '\n'

            # If it's an empty line, sentence boundary detected
            if not line:
                if current_sentence_tokens:  # Check if there's a sentence to append
                    sentences.append(' '.join(current_sentence_tokens))
                    sentence_labels.append(' '.join(current_sentence_labels))
                current_sentence_tokens = []  # Reset for the next sentence
                current_sentence_labels = []  # Reset for the next sentence
            else:
                line_parts = line.split()  # Split line into token and label
                current_sentence_tokens.append(line_parts[0])

                if len(line_parts) >= 2:
                    current_sentence_labels.append(line_parts[1])
                    unique_labels.add(line_parts[1])  # Add label to the set of unique labels
                else:
                    current_sentence_labels.append('O')  # Default to 'O' if no label provided

    # Append the last sentence if the file doesn't end with an empty line
    if current_sentence_tokens:
        sentences.append(' '.join(current_sentence_tokens))
        sentence_labels.append(' '.join(current_sentence_labels))

    print(f"Unique labels found: {unique_labels}")
    return sentences, sentence_labels

# Load the datasets
test_sentences, test_labels = read_conll(test_conll)
dev_sentences, dev_labels = read_conll(val_conll)
train_sentences, train_labels = read_conll(train_conll)

# Now, test_sentences, test_labels, dev_sentences, dev_labels, train_sentences, and train_labels are arrays of strings


Unique labels found: {'I-TTLT', 'B-TT', 'I-HP', 'B-PL', 'B-TTLT', 'B-QĐ', 'I-PL', 'I-BL', 'B-HP', 'I-TT', 'I-NQ', 'B-BL', 'I-L', 'B-L', 'I-QĐ', 'I-NĐ', 'B-NĐ', 'O', 'B-NQ'}
Unique labels found: {'B-TT', 'I-HP', 'B-PL', 'I-PL', 'B-QĐ', 'B-TTLT', 'I-BL', 'B-HP', 'I-TT', 'B-BL', 'I-NQ', 'I-L', 'B-L', 'I-QĐ', 'I-NĐ', 'B-NĐ', 'O', 'B-NQ', 'I-TTLT'}
Unique labels found: {'I-TTLT', 'B-TT', 'I-HP', 'B-PL', 'B-TTLT', 'B-QĐ', 'I-PL', 'I-BL', 'B-HP', 'I-TT', 'I-NQ', 'B-BL', 'I-L', 'B-L', 'I-QĐ', 'I-NĐ', 'B-NĐ', 'O', 'B-NQ'}


In [30]:
test_sentences[1]

'Nghị định Điều chỉnh thu nhập tháng đã đóng bảo hiểm xã hội đối với người lao động tham gia bảo hiểm xã hội tự nguyện Căn_cứ Luật Tổ_chức Chính_phủ ngày 25 tháng 12 năm 2001 .'

In [31]:
test_labels[1]

'O O O O O O O O O O O O O O O O O O O O O O O O O O O B-L I-L I-L I-L I-L I-L I-L I-L I-L O'

In [32]:
from datasets import Dataset

# Step 1: Prepare the datasets from sentences and labels
def prepare_dataset(sentences, labels):
    return {'tokens': sentences, 'labels': labels}

train_dataset = prepare_dataset(train_sentences, train_labels)
dev_dataset = prepare_dataset(dev_sentences, dev_labels)
test_dataset = prepare_dataset(test_sentences, test_labels)

# Step 2: Convert strings of tokens and labels into arrays
def process_string_to_array(dataset):
    return {
        'tokens': [sentence.split() for sentence in dataset['tokens']],
        'labels': [label_seq.split() for label_seq in dataset['labels']]
    }

# Step 3: Process the dataset for token and label lists
train_dataset = process_string_to_array(train_dataset)
dev_dataset = process_string_to_array(dev_dataset)
test_dataset = process_string_to_array(test_dataset)

# Step 4: Convert processed datasets into Hugging Face Dataset objects
train_dataset = Dataset.from_dict(train_dataset)
dev_dataset = Dataset.from_dict(dev_dataset)
test_dataset = Dataset.from_dict(test_dataset)

# Print the size of each dataset and a sample for verification
print(f"Train dataset size: {len(train_dataset)}")
print(f"Dev dataset size: {len(dev_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")
print("Train dataset sample:", train_dataset[0])
print("Dev dataset sample:", dev_dataset[0])
print("Test dataset sample:", test_dataset[0])

# Step 5: Define an Example class
class Example:
    def __init__(self, words, slot_labels, guid=None):
        self.words = words
        self.slot_labels = slot_labels
        self.guid = guid

# Step 6: Convert the dataset to Example objects
def convert_to_examples(dataset):
    return [
        Example(words=tokens, slot_labels=labels, guid=i)
        for i, (tokens, labels) in enumerate(zip(dataset['tokens'], dataset['labels']))
    ]

# Convert datasets into Example objects
train_examples = convert_to_examples(train_dataset)
dev_examples = convert_to_examples(dev_dataset)
test_examples = convert_to_examples(test_dataset)


Train dataset size: 34180
Dev dataset size: 4272
Test dataset size: 4273
Train dataset sample: {'tokens': ['Trong', 'thời_gian', 'từ', 'khi', 'nộp', 'hồ_sơ', 'đăng_ký_chủ', 'nguồn', 'thải', 'CTNH', 'cho', 'đến', 'khi', 'được', 'cấp', 'Sổ', 'đăng_ký', ',', 'chủ', 'nguồn', 'thải', 'CTNH', 'được', 'coi', 'là', 'đã', 'thực_hiện', 'trách_nhiệm', 'đăng_ký', 'về', 'việc', 'phát_sinh', 'CTNH', 'với', 'cơ_quan', 'chuyên_môn', 'về', 'bảo_vệ', 'môi_trường', 'cấp', 'tỉnh', 'theo', 'quy_định', 'tại', 'Khoản', '1', 'Điều', '70', 'Luật', 'Bảo_vệ', 'môi_trường', '.'], 'labels': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-L', 'I-L', 'I-L', 'O']}
Dev dataset sample: {'tokens': ['Sửa_đổi', ',', 'bổ_sung', 'một_số', 'điều', 'của', 'Nghị_định', 'số', '204/2004/NĐ-CP', 'ngày', '14', 'tháng', '12', 'năm', '2004'

In [33]:
import logging
logger = logging.getLogger(__name__)

import copy
import json
import logging
import os

In [37]:
def extract_labels(file_path):
    labels = set()
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line:  # Bỏ qua dòng trống
                parts = line.split()
                if len(parts) >= 2:  # Nếu có label
                    labels.add(parts[1])
    return labels

# Gộp labels từ tất cả các file
all_labels = set()
for file_path in [train_conll, val_conll, test_conll]:
    file_labels = extract_labels(file_path)
    all_labels.update(file_labels)

sorted_labels = sorted(list(all_labels))
label_map = {label: i for i, label in enumerate(sorted_labels)}

print(f"\n✨ Total labels: {len(label_map)}")
label_list= list(label_map.keys())
print("Label Map:", label_list)



✨ Total labels: 19
Label Map: ['B-BL', 'B-HP', 'B-L', 'B-NQ', 'B-NĐ', 'B-PL', 'B-QĐ', 'B-TT', 'B-TTLT', 'I-BL', 'I-HP', 'I-L', 'I-NQ', 'I-NĐ', 'I-PL', 'I-QĐ', 'I-TT', 'I-TTLT', 'O']


In [38]:
def convert_examples_to_features(
    examples,
    max_seq_len,
    tokenizer,
    pad_label_id=-100,
    cls_token_segment_id=0,
    pad_token_segment_id=0,
    sequence_segment_id=0,
    mask_padding_with_zero=True,
):
    # Get special tokens from the tokenizer
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    unk_token = tokenizer.unk_token
    pad_token_id = tokenizer.pad_token_id

    # List to hold the converted features
    features = []

    for example_index, example in enumerate(examples):
        # Log progress every 5000 examples
        if example_index % 400 == 0:
            logger.info(f"Processing example {example_index} of {len(examples)}")

        # Tokenize each word and align its corresponding label
        tokens = []
        label_ids = []

        for word, label in zip(example.words, example.slot_labels):
            word_tokens = tokenizer.tokenize(word)

            # If the word cannot be tokenized, use [UNK] token
            if not word_tokens:
                word_tokens = [unk_token]

            tokens.extend(word_tokens)

            # Map string label to integer ID, apply pad_label_id for subword tokens
            label_id = label_map[label]
            label_ids.extend([label_id] + [pad_label_id] * (len(word_tokens) - 1))

        # Handle sequence truncation for [CLS] and [SEP] tokens
        special_tokens_count = 2
        if len(tokens) > max_seq_len - special_tokens_count:
            tokens = tokens[:max_seq_len - special_tokens_count]
            label_ids = label_ids[:max_seq_len - special_tokens_count]

        # Add [SEP] token at the end of the sentence
        tokens.append(sep_token)
        label_ids.append(pad_label_id)
        token_type_ids = [sequence_segment_id] * len(tokens)

        # Add [CLS] token at the start of the sentence
        tokens = [cls_token] + tokens
        label_ids = [pad_label_id] + label_ids
        token_type_ids = [cls_token_segment_id] + token_type_ids

        # Convert tokens to input IDs
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # Create attention masks (1 for real tokens, 0 for padding tokens)
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)


        # Pad sequences to the maximum sequence length
        padding_length = max_seq_len - len(input_ids)
        input_ids += [pad_token_id] * padding_length
        attention_mask += [0 if mask_padding_with_zero else 1] * padding_length
        token_type_ids += [pad_token_segment_id] * padding_length
        label_ids += [pad_label_id] * padding_length

        # Create InputFeatures object and append it to the list of features
        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                slot_labels_ids=label_ids,
            )
        )

    return features


In [39]:
import json

In [40]:
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, attention_mask, token_type_ids, slot_labels_ids):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.token_type_ids = token_type_ids
        self.slot_labels_ids = slot_labels_ids

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"

In [41]:
from transformers import AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("roberta-base")

tokenizer.cls_token, tokenizer.sep_token, tokenizer.unk_token, tokenizer.pad_token_id

('<s>', '</s>', '<unk>', 1)

In [42]:
from transformers import RobertaTokenizerFast

# Initialize the tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', add_prefix_space=True)

# Set the maximum sequence length
max_seq_len = 128  # You can adjust this based on your model/input

# Convert examples to features
train_features = convert_examples_to_features(train_examples, max_seq_len, tokenizer)
dev_features = convert_examples_to_features(dev_examples, max_seq_len, tokenizer)
test_features = convert_examples_to_features(test_examples, max_seq_len, tokenizer)


In [44]:
import torch
from torch.utils.data import Dataset

# Define a Dataset class to wrap the tokenized features for training
class NERDataset(Dataset):
    def __init__(self, features):
        self.features = features

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        feature = self.features[idx]
        return {
            'input_ids': torch.tensor(feature.input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(feature.attention_mask, dtype=torch.long),
            'token_type_ids': torch.tensor(feature.token_type_ids, dtype=torch.long),
            'labels': torch.tensor(feature.slot_labels_ids, dtype=torch.long),
        }

# Convert tokenized features into PyTorch datasets
train_dataset = NERDataset(train_features)
dev_dataset = NERDataset(dev_features)
test_dataset = NERDataset(test_features)


In [45]:
train_dataset[0]

{'input_ids': tensor([    0,  2393,  1657,  3553,  1376,  2023,    46,   118,  1215, 42859,
           326,  1376,  2023,  4958,   449,  3592,   295,  1376,  2023,    27,
           642,  1368,  1376,  2023,  9085,  1215,    29,  8188,  5543,  4236,
          3602,   649,   862,  2590,  1215,   330,  3849, 10809,  1215,   611,
          1376,  2023,  6248,   295,  5521,  1376,  2023,  9085,   282,  3553,
          1376,  3070,  2469,   118, 12464, 28812, 14310,  4236,  3602,  1376,
          3070,  9470,   282,   449,  3592,  4236,  3602,  8188,  7487,  1376,
          2023,  2469,   438,   740,  1376,  3070,  8210,   642,   208,  1376,
          2023, 15722,  4236,  3602,   649,   862,  2590,  1215,   330,  3849,
         10809,  2156,  1855,  1376,  2023,  6248,   295,  5521,  1376,  2023,
          9085,   282,  3553,  1376,  3070,  2469,   118, 12464, 28812,  4236,
          3602,  8188,  7487,  1376,  2023,  2469,   438,  1029,   118,   784,
          5269,  4236,  3602, 17682,  3

In [46]:
from transformers import RobertaForTokenClassification

# Define the number of unique labels (ensure this matches your dataset's label set)
num_labels = len(label_list)  # e.g., the number of unique labels such as O, B-ORG, etc.

# Load the RoBERTa model for token classification
model = RobertaForTokenClassification.from_pretrained('roberta-base', num_labels=num_labels)


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
import os

output_dir = './results'

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print(f"created folder: {output_dir}")


created folder: ./results


In [48]:
from transformers import TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,           # output directory to save model checkpoints and results
    evaluation_strategy="epoch",      # evaluation is done at the end of every epoch
    per_device_train_batch_size=16,   # batch size per device during training
    per_device_eval_batch_size=16,    # batch size for evaluation
    num_train_epochs=3,               # number of epochs to train the model
    weight_decay=0.01,                # strength of weight decay
    logging_dir='./logs',             # directory for storing logs
    logging_steps=10,                 # log every 10 steps
    save_steps=500,                   # save model checkpoint every 500 steps
    save_total_limit=2,               # limit the number of total checkpoints to save
)




In [49]:
!pip install seqeval



In [50]:
label_list

['B-BL',
 'B-HP',
 'B-L',
 'B-NQ',
 'B-NĐ',
 'B-PL',
 'B-QĐ',
 'B-TT',
 'B-TTLT',
 'I-BL',
 'I-HP',
 'I-L',
 'I-NQ',
 'I-NĐ',
 'I-PL',
 'I-QĐ',
 'I-TT',
 'I-TTLT',
 'O']

In [52]:
from transformers import EvalPrediction
def compute_metrics(p: EvalPrediction):
    predictions = p.predictions.argmax(axis=2)  # Get predicted label indices
    labels = p.label_ids  # True label IDs

    # Debugging: Print shapes of predictions and labels
    print(f"Shape of predictions: {predictions.shape}")
    print(f"Shape of labels: {labels.shape}")

    # Debugging: Log first few predictions and labels for inspection
    print(f"First few predictions: {predictions[:2]}")
    print(f"First few labels: {labels[:2]}")

    pred_labels = []
    true_labels = []

    # Iterate through predictions and labels
    for i, (pred_seq, true_seq) in enumerate(zip(predictions, labels)):
        pred_label_seq = []
        true_label_seq = []

        # Iterate through each token in the sequence
        for pred_idx, true_idx in zip(pred_seq, true_seq):
            if true_idx == -100:
                # Debugging: Log any padding tokens encountered
                # print(f"Padding token encountered at position {i}")
                continue

            # Check if the indices are within the valid range
            if pred_idx < len(label_list) and true_idx < len(label_list):
                pred_label_seq.append(label_list[pred_idx])
                true_label_seq.append(label_list[true_idx])
            else:
                # Debugging: Log when out-of-bound indices are encountered
                print(f"Index out of range: pred_idx={pred_idx}, true_idx={true_idx} at position {i}")

        pred_labels.append(pred_label_seq)
        true_labels.append(true_label_seq)

    # Debugging: Log final processed predictions and labels
    print(f"Processed pred_labels: {pred_labels[:2]}")
    print(f"Processed true_labels: {true_labels[:2]}")

    # Compute token-level F1, Precision, and Recall
    precision = precision_score(true_labels, pred_labels)
    # Trong 10 lần dự đoán nhãn PER: thì chúng ta đoán đúng 6 lần -> 6/10 = 60%

    recall = recall_score(true_labels, pred_labels)
    # Trong 8 nhãn PER thật: thì chúng ta đoán đúng 6 lần -> 6/8 = 75%

    f1 = f1_score(true_labels, pred_labels)

    # Debugging: Print classification report
    print("Classification Report:")
    print(classification_report(true_labels, pred_labels))

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [53]:
from transformers import Trainer
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
from transformers import EvalPrediction


In [None]:
# Initialize the Trainer with the modified compute_metrics function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # Updated function
)

# Train the model
trainer.train()


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,0.1226,0.252638,0.78976,0.840457,0.81432
2,0.0608,0.131611,0.885309,0.89917,0.892186
3,0.0561,0.127506,0.885836,0.913574,0.899491


Shape of predictions: (2000, 128)
Shape of labels: (2000, 128)
First few predictions: [[ 6  6  6  6  6  6  6  6  6  6  6  6  6  6  6 16  6  6  6  6  6  6  6  6
   6  6  6  6  6  6  6  6  6 16 17  6  6 17 17 17 17 17 17 17  6  6  6  6
   6  6  6 16 16 17 17 17 17  6 16 17 17 17 17 17 17 17 17 17  6 17 17  6
   6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6
   6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6
   6  6  6  6 10  5  5  6]
 [ 6  6  6  6  6  6  6  6  6  6  6  6  6  7  7  6  6 14  6  6 19  6  6  6
   6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6 16
  16 17 17 17 17 17 17 17 17  6 16 17 17  6 17 17 17  6 16 16  6 17 17 17
  17 17  6  6  6  6  6  6  6  6 16 16 17 17 17 17 17 17 17 17  6 16 17 17
  17  6 17  6 17  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6
   6  7  7  6  6  6  6  6]]
First few labels: [[-100    6 -100 -100 -100 -100 -100 -100    6 -100 -100 -100 -100 -100
  -100 -100 -100 -100 -100 -1

  _warn_prf(average, modifier, msg_start, len(result))


                     precision    recall  f1-score   support

                AGE       0.88      0.97      0.92       308
               DATE       0.94      0.99      0.96       993
             GENDER       0.85      0.94      0.90       245
                JOB       0.00      0.00      0.00       112
           LOCATION       0.70      0.88      0.78      2295
               NAME       0.96      0.73      0.83       169
       ORGANIZATION       0.54      0.33      0.41       500
         PATIENT_ID       0.93      0.98      0.96      1067
SYMPTOM_AND_DISEASE       0.71      0.70      0.70       619
     TRANSPORTATION       0.86      0.85      0.85        79

          micro avg       0.79      0.84      0.81      6387
          macro avg       0.74      0.74      0.73      6387
       weighted avg       0.78      0.84      0.80      6387

Shape of predictions: (2000, 128)
Shape of labels: (2000, 128)
First few predictions: [[ 6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  6  

TrainOutput(global_step=945, training_loss=0.1582743813909551, metrics={'train_runtime': 380.1462, 'train_samples_per_second': 39.672, 'train_steps_per_second': 2.486, 'total_flos': 985314418007040.0, 'train_loss': 0.1582743813909551, 'epoch': 3.0})