In [1]:
!pip install transformers evaluate seqeval datasets -q

In [2]:
from transformers import AutoModel, AutoTokenizer, AutoModelForTokenClassification
from torch.utils.data import DataLoader
from torch.optim import SGD, Adam
from seqeval.metrics import classification_report
from tqdm import tqdm

In [3]:
tokenizer = AutoTokenizer.from_pretrained("uitnlp/visobert")

config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/471k [00:00<?, ?B/s]



In [4]:
import datasets

data = datasets.load_dataset('neihc/key_phrase')
train_dataset = data["train"]
train_dataset = train_dataset.train_test_split(test_size=0.2)

updated_data.json:   0%|          | 0.00/11.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [5]:
def align_label(text, labels, flag=False):
    label_all_tokens = flag #flag xác định cách thực hiện align_label

    tokenized_input = tokenizer(text, padding='max_length', max_length=256, truncation=True, is_split_into_words=True)
    word_ids = tokenized_input.input_ids

    start_part = True
    label_ids = []
    count = 0

    for i in range(len(word_ids)):

        if word_ids[i] == 0 or word_ids[i] == 1 or word_ids[i] == 2:
            label_ids.append(-100)

        elif count < len(text) and ''.join(tokenizer.decode(tokenized_input['input_ids'][i]).split()) == text[count]:
            label_ids.append(labels[count])
            count+=1
            start_part = True
        else:
            if start_part:
                # Check if count is within the bounds of labels before accessing it
                if count < len(labels):
                    label_ids.append(labels[count])
                else:
                    # Handle the case where count is out of bounds, e.g., by appending -100
                    label_ids.append(-100)
                count+=1
                start_part = False
            else:
                label_ids.append(labels[count] if label_all_tokens and count < len(labels) else -100)


    return label_ids

In [6]:
from torch.utils.data import DataLoader
from torch.optim import SGD, Adam
from seqeval.metrics import classification_report
from tqdm import tqdm
import torch

class DataSet(torch.utils.data.Dataset):

    def __init__(self, data, flag_align_label=False):
        self.texts = [tokenizer(i, padding='max_length', max_length = 256,
                                truncation=True, return_tensors="pt", is_split_into_words=True) for i in data['text']]
        self.labels = [align_label(i,j,flag_align_label) for i,j in zip(data['text'], data['label'])]

    def __len__(self):

        return len(self.labels)

    def get_data(self, idx):
        return self.texts[idx]

    def get_labels(self, idx):
        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):
        data = self.get_data(idx)
        labels = self.get_labels(idx)

        return data, labels

In [7]:
class KeyphraseModel(torch.nn.Module):

    def __init__(self):

        super(KeyphraseModel, self).__init__()

        self.phobert = AutoModelForTokenClassification.from_pretrained("uitnlp/visobert", num_labels=2)

    def forward(self, input_id, mask, label):

        output = self.phobert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

        return output

In [8]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [9]:
def train_loop(model, train_df, val_df, flag_align_label):

    train_dataset = DataSet(train_df, flag_align_label)
    val_dataset = DataSet(val_df, flag_align_label)

    train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_dataset, num_workers=4, batch_size=BATCH_SIZE)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

    if use_cuda:
        model = model.cuda()

    min_val_loss = 1000

    for epoch_num in range(EPOCHS):

        total_acc_train = 0
        total_loss_train = 0

        model.train()

        for train_data, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            mask = train_data['attention_mask'].squeeze(1).to(device)
            input_id = train_data['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            loss, logits = model(input_id, mask, train_label)

            for i in range(logits.shape[0]):

                logits_clean = logits[i][train_label[i] != -100]
                label_clean = train_label[i][train_label[i] != -100]

                predictions = logits_clean.argmax(dim=1)
                acc = (predictions == label_clean).float().mean()
                total_acc_train += acc
                total_loss_train += loss.item()

            loss.backward()
            optimizer.step()

        model.eval()

        total_acc_val = 0
        total_loss_val = 0

        for val_data, val_label in val_dataloader:

            val_label = val_label.to(device)
            mask = val_data['attention_mask'].squeeze(1).to(device)
            input_id = val_data['input_ids'].squeeze(1).to(device)

            loss, logits = model(input_id, mask, val_label)

            for i in range(logits.shape[0]):

                logits_clean = logits[i][val_label[i] != -100]
                label_clean = val_label[i][val_label[i] != -100]

                predictions = logits_clean.argmax(dim=1)
                acc = (predictions == label_clean).float().mean()
                total_acc_val += acc
                total_loss_val += loss.item()

        val_accuracy = total_acc_val / len(val_df)
        val_loss = total_loss_val / len(val_df)

        print(
            f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(train_df): .3f} | Accuracy: {total_acc_train / len(train_df): .3f} | Val_Loss: {total_loss_val / len(val_df): .3f} | Accuracy: {total_acc_val / len(val_df): .3f}')
        if val_loss < min_val_loss:
            min_val_loss = val_loss
            torch.save(model.state_dict(), '/kaggle/working/results')
           

LEARNING_RATE = 5e-5
EPOCHS = 30
BATCH_SIZE = 32

model = KeyphraseModel()
train_loop(model, train_dataset['train'], train_dataset['test'], False)

pytorch_model.bin:   0%|          | 0.00/390M [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at uitnlp/visobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.pid = os.fork()
  self.pid = os.fork()
100%|██████████| 125/125 [01:33<00:00,  1.33it/s]


Epochs: 1 | Loss:  0.279 | Accuracy:  0.821 | Val_Loss:  0.249 | Accuracy:  0.837


100%|██████████| 125/125 [01:34<00:00,  1.33it/s]


Epochs: 2 | Loss:  0.178 | Accuracy:  0.897 | Val_Loss:  0.255 | Accuracy:  0.844


100%|██████████| 125/125 [01:34<00:00,  1.33it/s]


Epochs: 3 | Loss:  0.088 | Accuracy:  0.954 | Val_Loss:  0.327 | Accuracy:  0.844


100%|██████████| 125/125 [01:34<00:00,  1.33it/s]


Epochs: 4 | Loss:  0.044 | Accuracy:  0.980 | Val_Loss:  0.419 | Accuracy:  0.844


100%|██████████| 125/125 [01:34<00:00,  1.33it/s]


Epochs: 5 | Loss:  0.025 | Accuracy:  0.989 | Val_Loss:  0.493 | Accuracy:  0.839


100%|██████████| 125/125 [01:34<00:00,  1.33it/s]


Epochs: 6 | Loss:  0.018 | Accuracy:  0.994 | Val_Loss:  0.471 | Accuracy:  0.836


100%|██████████| 125/125 [01:34<00:00,  1.33it/s]


Epochs: 7 | Loss:  0.012 | Accuracy:  0.995 | Val_Loss:  0.567 | Accuracy:  0.839


100%|██████████| 125/125 [01:34<00:00,  1.33it/s]


Epochs: 8 | Loss:  0.010 | Accuracy:  0.997 | Val_Loss:  0.584 | Accuracy:  0.841


100%|██████████| 125/125 [01:34<00:00,  1.33it/s]


Epochs: 9 | Loss:  0.008 | Accuracy:  0.997 | Val_Loss:  0.602 | Accuracy:  0.839


100%|██████████| 125/125 [01:34<00:00,  1.33it/s]


Epochs: 10 | Loss:  0.009 | Accuracy:  0.997 | Val_Loss:  0.604 | Accuracy:  0.837


100%|██████████| 125/125 [01:34<00:00,  1.33it/s]


Epochs: 11 | Loss:  0.007 | Accuracy:  0.998 | Val_Loss:  0.629 | Accuracy:  0.840


100%|██████████| 125/125 [01:34<00:00,  1.32it/s]


Epochs: 12 | Loss:  0.007 | Accuracy:  0.998 | Val_Loss:  0.605 | Accuracy:  0.840


100%|██████████| 125/125 [01:34<00:00,  1.33it/s]


Epochs: 13 | Loss:  0.006 | Accuracy:  0.996 | Val_Loss:  0.659 | Accuracy:  0.838


100%|██████████| 125/125 [01:34<00:00,  1.33it/s]


Epochs: 14 | Loss:  0.008 | Accuracy:  0.997 | Val_Loss:  0.585 | Accuracy:  0.843


100%|██████████| 125/125 [01:34<00:00,  1.32it/s]


Epochs: 15 | Loss:  0.007 | Accuracy:  0.997 | Val_Loss:  0.597 | Accuracy:  0.828


100%|██████████| 125/125 [01:34<00:00,  1.32it/s]


Epochs: 16 | Loss:  0.007 | Accuracy:  0.995 | Val_Loss:  0.589 | Accuracy:  0.837


100%|██████████| 125/125 [01:34<00:00,  1.32it/s]


Epochs: 17 | Loss:  0.004 | Accuracy:  0.998 | Val_Loss:  0.689 | Accuracy:  0.841


100%|██████████| 125/125 [01:34<00:00,  1.33it/s]


Epochs: 18 | Loss:  0.002 | Accuracy:  0.999 | Val_Loss:  0.736 | Accuracy:  0.840


100%|██████████| 125/125 [01:34<00:00,  1.33it/s]


Epochs: 19 | Loss:  0.003 | Accuracy:  0.999 | Val_Loss:  0.691 | Accuracy:  0.842


100%|██████████| 125/125 [01:34<00:00,  1.32it/s]


Epochs: 20 | Loss:  0.010 | Accuracy:  0.996 | Val_Loss:  0.615 | Accuracy:  0.835


100%|██████████| 125/125 [01:34<00:00,  1.33it/s]


Epochs: 21 | Loss:  0.008 | Accuracy:  0.997 | Val_Loss:  0.634 | Accuracy:  0.836


100%|██████████| 125/125 [01:34<00:00,  1.33it/s]


Epochs: 22 | Loss:  0.005 | Accuracy:  0.998 | Val_Loss:  0.679 | Accuracy:  0.830


100%|██████████| 125/125 [01:34<00:00,  1.33it/s]


Epochs: 23 | Loss:  0.006 | Accuracy:  0.998 | Val_Loss:  0.646 | Accuracy:  0.823


100%|██████████| 125/125 [01:34<00:00,  1.32it/s]


Epochs: 24 | Loss:  0.006 | Accuracy:  0.998 | Val_Loss:  0.632 | Accuracy:  0.832


100%|██████████| 125/125 [01:34<00:00,  1.32it/s]


Epochs: 25 | Loss:  0.003 | Accuracy:  0.999 | Val_Loss:  0.741 | Accuracy:  0.830


100%|██████████| 125/125 [01:34<00:00,  1.32it/s]


Epochs: 26 | Loss:  0.004 | Accuracy:  0.998 | Val_Loss:  0.704 | Accuracy:  0.841


100%|██████████| 125/125 [01:34<00:00,  1.33it/s]


Epochs: 27 | Loss:  0.004 | Accuracy:  0.999 | Val_Loss:  0.709 | Accuracy:  0.840


100%|██████████| 125/125 [01:34<00:00,  1.33it/s]


Epochs: 28 | Loss:  0.004 | Accuracy:  0.999 | Val_Loss:  0.730 | Accuracy:  0.832


100%|██████████| 125/125 [01:34<00:00,  1.32it/s]


Epochs: 29 | Loss:  0.004 | Accuracy:  0.998 | Val_Loss:  0.666 | Accuracy:  0.825


100%|██████████| 125/125 [01:34<00:00,  1.33it/s]


Epochs: 30 | Loss:  0.005 | Accuracy:  0.997 | Val_Loss:  0.653 | Accuracy:  0.836


In [10]:
torch.save(model.state_dict(), '/kaggle/working/model')

In [11]:
def align_word_ids(text, flag):
    label_all_tokens = flag
    
    text = text.split()
  
    tokenized_inputs = tokenizer(text, padding='max_length', max_length=256, truncation=True, is_split_into_words=True)

    word_ids = tokenized_inputs.input_ids

    start_part = True
    label_ids = []
    count = 0
    
    for i in range(len(word_ids)):
        
        if word_ids[i] == 0 or word_ids[i] == 1 or word_ids[i] == 2:
            label_ids.append(-100)
            
        elif count < len(text) and ''.join(tokenizer.decode(tokenized_inputs['input_ids'][i]).split()) == text[count]:
            label_ids.append(1)
            count+=1
            start_part = True
        else:
            if start_part:
                label_ids.append(1)
                count+=1
                start_part = False
            else:
                label_ids.append(1 if label_all_tokens else -100)           
    return label_ids

In [12]:
def ner(model, sentence, flag_align_label):

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()
    text = tokenizer(sentence, padding='max_length', max_length = 256, truncation=True, return_tensors="pt")

    mask = text['attention_mask'].to(device)
    input_id = text['input_ids'].to(device)
    label_ids = torch.Tensor(align_word_ids(sentence, flag_align_label)).unsqueeze(0).to(device)

    logits = model(input_id, mask, None)
    logits_clean = logits[0][label_ids != -100]

    predictions = logits_clean.argmax(dim=1).tolist()
    return predictions

In [13]:
ner(model,
    'Bệnh nhân nhập viện tối qua ở Bệnh Viện 115 là bệnh nhân thứ 82, di chuyển qua nhiều thành phố bằng xe biển hiệu E-402',
    flag_align_label=False)

[1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]

In [14]:
def concatenate_strings_with_format(strings, bits):
    if len(strings) != len(bits):
        raise ValueError("Hai mảng phải có cùng chiều dài.")
    
    result = []
    current_phrase = []
    
    for i in range(len(bits)):
        if bits[i] == 1:
            current_phrase.append(strings[i])
        else:
            if current_phrase:
                result.append(' '.join(current_phrase))
                current_phrase = []
    
    # Thêm cụm cuối cùng nếu còn lại
    if current_phrase:
        result.append(' '.join(current_phrase))
    
    # Định dạng kết qu
#     formatted_result = ', '.join([f'{phrase}' for  phrase in result])
    
    return result

# Ví dụ sử dụng hàm
strings = ["Bệnh", "nhân", "nhập", "viện", "tối", "qua", "ở", "Bệnh", "Viện", "115"]
bits = [1, 1, 0, 0, 1, 0, 0, 1, 1, 0]

output = concatenate_strings_with_format(strings, bits)
print(output)


['Bệnh nhân', 'tối', 'Bệnh Viện']


In [15]:
import json

def to_json(sentence):
    labels = ner(model, sentence, flag_align_label=False)
    words = sentence.split()
    keyphrases = concatenate_strings_with_format(words, labels)
    result = {
        "text": words,
        "custom_id": "task-1",
        "explaination": "None",
        "keyphrase": keyphrases,
        "label": labels
    }
    json_result = json.dumps(result, ensure_ascii=False, indent=4)
    return json_result

text ='''
30tỷ với người làm lương tháng 8tr như tao thì tao ăn hết đời luôn
''' 

print(to_json(text))

{
    "text": [
        "30tỷ",
        "với",
        "người",
        "làm",
        "lương",
        "tháng",
        "8tr",
        "như",
        "tao",
        "thì",
        "tao",
        "ăn",
        "hết",
        "đời",
        "luôn"
    ],
    "custom_id": "task-1",
    "explaination": "None",
    "keyphrase": [],
    "label": [
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0
    ]
}
