In [1]:
import torch
import torch.nn as nn
import numpy as np
from tqdm.auto import tqdm
from transformers import AutoModel, AutoModelForTokenClassification, AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader

MAX_LEN = 256

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [3]:
env = ".env"

try:
    with open(env, "r") as file:
        AUTH_TOKEN = file.read()
except FileNotFoundError:
    print(f"The file {env} does not exist.")
except Exception as e:
    print(f"An error occurred: {e}")


MRC_PATH = 'nguyenvulebinh/vi-mrc-base'

In [4]:
TRAIN_PATH = 'data/span_detection_datasets_split_word_IOB/train.jsonl'
DEV_PATH = 'data/span_detection_datasets_split_word_IOB/dev.jsonl'
TEST_PATH = 'data/span_detection_datasets_split_word_IOB/test.jsonl'

# Prepare material

In [5]:
# function read jsonl file as dataframe
import pandas as pd
import json

def read_jsonl_to_dataframe(file_path):
    data = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                json_obj = json.loads(line)
                data.append(json_obj)
            except json.JSONDecodeError as e:
                print(f"Skipping invalid JSON: {e}")

    df = pd.DataFrame(data)

    return df

In [10]:
import json

# load tag_to_id
with open('data/tag_to_id_bert.json', 'r') as f:
    tag_to_id = json.load((f))

# convert tag_to_id to id_to_tag
id_to_tag = {v: k for k, v in tag_to_id.items()}

# load train and dev data

df_train = read_jsonl_to_dataframe(TRAIN_PATH)
df_dev = read_jsonl_to_dataframe(DEV_PATH)

df_train.text = df_train.text.apply(lambda x: " ".join(x))
df_dev.text = df_dev.text.apply(lambda x: " ".join(x))


# train_sentences = list(df_train.text)
# dev_sentences = list(df_dev.text)

# train_labels = list(df_train.labels)
# dev_labels = list(df_dev.labels)

In [6]:
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MRC_PATH, use_auth_token=AUTH_TOKEN)



# Prepare data

In [11]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Word piece tokenization makes it difficult to match word labels
    back up with individual word pieces. This function tokenizes each
    word one at a time so that it is easier to preserve the correct
    label for each subword. It is, of course, a bit slower in processing
    time, but it will help our model achieve higher accuracy.
    """

    tokenized_sentence = []
    labels = []

    sentence = sentence.strip()

    for word, label in zip(sentence.split(), text_labels):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels


sample = df_train.text.iloc[0]
sample_label = df_train.labels.iloc[0]

tokenized_sentence, labels = tokenize_and_preserve_labels(sample, sample_label, tokenizer)
print(tokenized_sentence)
print(labels)

['▁pin', '▁s', 'à', 'i', '_', 't', 'ầ', 'm', '▁50', '▁h', '▁cho', '▁pin', '▁100', '▁100', '▁camera', '▁ổn', '▁tất', '_', 'c', 'ả', '▁đều', '▁ok', '▁nhân', '_', 'vi', 'ên', '▁thế', '_', 'gi', 'ới', '▁di', '_', 'đ', 'ộ', 'ng', '▁trần', '_', 'vă', 'n', '_', 'th', 'ời', '▁cà', '_', 'ma', 'u', '▁nhiệt', '_', 't', 'ình', '▁và', '▁vui', '_', 'v', 'ẻ', '▁chúc', '▁các', '▁a', 'e', '▁sức', '▁khỏe', '▁tốt', '▁và', '▁phục', '_', 'ok', '▁h', 'oài', '_', 'n', 'ha']
['B-BATTERY', 'I-BATTERY', 'I-BATTERY', 'I-BATTERY', 'I-BATTERY', 'I-BATTERY', 'I-BATTERY', 'I-BATTERY', 'I-BATTERY', 'I-BATTERY', 'I-BATTERY', 'I-BATTERY', 'I-BATTERY', 'I-BATTERY', 'B-CAMERA', 'I-CAMERA', 'B-GENERAL', 'B-GENERAL', 'B-GENERAL', 'B-GENERAL', 'I-GENERAL', 'I-GENERAL', 'B-SER&ACC', 'B-SER&ACC', 'B-SER&ACC', 'B-SER&ACC', 'I-SER&ACC', 'I-SER&ACC', 'I-SER&ACC', 'I-SER&ACC', 'I-SER&ACC', 'I-SER&ACC', 'I-SER&ACC', 'I-SER&ACC', 'I-SER&ACC', 'I-SER&ACC', 'I-SER&ACC', 'I-SER&ACC', 'I-SER&ACC', 'I-SER&ACC', 'I-SER&ACC', 'I-SER&ACC',

In [12]:
# define special tokens
pad_token = tokenizer.pad_token
sep_token = tokenizer.sep_token
cls_token = tokenizer.cls_token

In [13]:
class SpanDetectionDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=MAX_LEN):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        # step 1: tokenize (and adapt corresponding labels)
        sentence = self.data.text[index]  
        word_labels = self.data.labels[index]  
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)
        
        # step 2: add special tokens (and corresponding labels)
        tokenized_sentence = [cls_token] + tokenized_sentence + [sep_token] # add special tokens
        labels.insert(0, "O") # add outside label for [CLS] token
        labels.insert(-1, "O") # add outside label for [SEP] token

        # step 3: truncating/padding
        maxlen = self.max_len

        if (len(tokenized_sentence) > maxlen):
          # truncate
          tokenized_sentence = tokenized_sentence[:maxlen]
          labels = labels[:maxlen]
        else:
          # pad
          tokenized_sentence = tokenized_sentence + [pad_token for _ in range(maxlen - len(tokenized_sentence))]
          labels = labels + ["O" for _ in range(maxlen - len(labels))]

        # step 4: obtain the attention mask
        attn_mask = [1 if tok != pad_token else 0 for tok in tokenized_sentence]
        
        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)

        label_ids = [tag_to_id[label] for label in labels]
        # the following line is deprecated
        #label_ids = [label if label != 0 else -100 for label in label_ids]
        
        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [14]:
# create train and dev dataset
train_dataset = SpanDetectionDataset(df_train, tokenizer)
dev_dataset = SpanDetectionDataset(df_dev, tokenizer)

In [15]:
def print_sample(dataset, index, k=30): # k will be positive
    # print the first k tokens and corresponding labels
    for token, label in zip(tokenizer.convert_ids_to_tokens(dataset[index]["ids"][:k]), dataset[index]["targets"][:k]):
        print('{0:10}  {1}'.format(token, id_to_tag[label.item()]))

print_sample(train_dataset, 1, 30)

<s>         O
▁lag        B-PERFORMANCE
▁va         O
▁hao        B-BATTERY
▁pin        I-BATTERY
▁là         O
▁cái        O
▁tóm        O
_           O
t           O
ắ           O
t           O
▁về         O
▁máy        O
▁sam        B-GENERAL
▁làm        I-GENERAL
▁tệ         I-GENERAL
▁quá        I-GENERAL
▁không      I-GENERAL
▁bằng       I-GENERAL
▁mấy        I-GENERAL
▁con        I-GENERAL
▁tàu        I-GENERAL
▁cùng       I-GENERAL
▁phân       I-GENERAL
_           I-GENERAL
kh          I-GENERAL
úc          O
</s>        I-GENERAL
<pad>       O


In [16]:
# function to create dataloader
def create_data_loader(datasets, params):

    return DataLoader(
        datasets,
        **params
    )

# define dataloader params
train_params = {
    'batch_size': 16,
    'shuffle': True,
    'num_workers': 0
}

dev_params = {
    'batch_size': 8,
    'shuffle': False,
    'num_workers': 0
}

# create dataloader
train_dataloader = create_data_loader(train_dataset, train_params)
dev_dataloader = create_data_loader(dev_dataset, dev_params)

# Define model

Here we define the model, BertForTokenClassification, and load it with the pretrained weights of "bert-base-uncased". The only thing we need to additionally specify is the number of labels (as this will determine the architecture of the classification head).

Note that only the base layers are initialized with the pretrained weights. The token classification head of top has just randomly initialized weights, which we will train, together with the pretrained weights, using our labelled dataset. This is also printed as a warning when you run the code cell below.

Then, we move the model to the GPU.

In [17]:
model = AutoModelForTokenClassification.from_pretrained(MRC_PATH,
                                                   num_labels=len(tag_to_id),
                                                   id2label=id_to_tag,
                                                   label2id=tag_to_id,
                                                   use_auth_token=AUTH_TOKEN)

model.to(device)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at nguyenvulebinh/vi-mrc-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

In [30]:
# define loss function for NER
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score


def loss_fn(logits, targets, masks):
    """
    outputs: (batch_size, seq_len, num_labels)
    targets: (batch_size, seq_len)
    masks: (batch_size, seq_len)
    """
    # reshape to (batch_size * seq_len, num_labels)
    logits = logits.view(-1, logits.shape[-1])
    targets = targets.view(-1)

    # ignore padded tokens
    masks = masks.view(-1)
    logits = logits[masks == 1]
    targets = targets[masks == 1]

    # compute cross entropy loss
    return nn.CrossEntropyLoss()(logits, targets)

def accuracy_f1(logits, targets, masks):
    """
    outputs: (batch_size, seq_len, num_labels)
    targets: (batch_size, seq_len)
    masks: (batch_size, seq_len)
    """
    # reshape to (batch_size * seq_len, num_labels)
    logits = logits.view(-1, logits.shape[-1])
    targets = targets.view(-1)

    # ignore padded tokens
    masks = masks.view(-1)
    logits = logits[masks == 1]
    targets = targets[masks == 1]

    # compute accuracy
    preds = torch.argmax(logits, dim=1)
    acc = accuracy_score(targets.cpu().numpy(), preds.cpu().numpy())

    # compute f1 score
    f1 = f1_score(targets.cpu().numpy(), preds.cpu().numpy(), average='macro')

    return acc, f1

In [None]:
# sample input
model.train()
sample = next(iter(train_dataloader))
print(sample['ids'].shape)
print(sample['mask'].shape)
print(sample['targets'].shape)

out = model(sample['ids'].to(device), sample['mask'].to(device))
print(out)

In [31]:
accuracy_f1(out.logits, sample['targets'].to(device), sample['mask'].to(device))
# loss_fn(out.logits, sample['targets'].to(device), sample['mask'].to(device))

(0.006666666666666667, 0.004806066161403662)

In [139]:
# optimizer
from transformers import get_linear_schedule_with_warmup
import torch.optim as optim

def optimizer_scheduler(model, num_train_steps, lr=5e-5):
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", 'LayerNorm.bias', "LayerNorm.weight"]
    optimizer_parameters = [
            {
                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.001,
            },
            {
                "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]

    opt = optim.AdamW(optimizer_parameters, lr=lr)
    sch = get_linear_schedule_with_warmup(
        opt,
        num_warmup_steps=int(0.05*num_train_steps),
        num_training_steps=num_train_steps,
        last_epoch=-1,
    )
    return opt, sch

In [147]:
# prarams for training
epochs = 10
accumulation_steps = 4

total_steps = len(train_dataloader) * epochs // accumulation_steps
optimizer, scheduler = optimizer_scheduler(model, total_steps)

In [152]:
def train(epoch):

    loss, accuracy, f1_score = 0, 0, 0
    train_steps = len(train_dataloader) // accumulation_steps
    model.train()

    for step, batch in tqdm(enumerate(train_dataloader), total=train_steps):

        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.long)

        optimizer.zero_grad()

        # forward pass
        outputs = model(ids, mask)
        logits = outputs.logits

        # compute loss
        batch_loss = loss_fn(logits, targets, mask)

        if step % 100 == 0:
            print(f"Batch loss of epoch {epoch} at step {step}: {batch_loss.item()}")
        
        # backward pass
        batch_loss /= accumulation_steps
        batch_loss.backward()

        # update weights
        if (step + 1) % accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        
        # compute accuracy and f1 score
        batch_acc, batch_f1 = accuracy_f1(logits, targets, mask)

        loss += batch_loss.item()
        accuracy += batch_acc.item()
        f1_score += batch_f1     

    return loss / train_steps, accuracy / train_steps, f1_score / train_steps

def evaluate(epoch):

    with torch.no_grad():
        loss, accuracy, f1_score = 0, 0, 0
        dev_steps = len(dev_dataloader)
        model.eval()
        for step, batch in tqdm(enumerate(dev_dataloader), total=dev_steps):

            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.long)

            # forward pass
            outputs = model(ids, mask)
            logits = outputs.logits

            # compute loss
            batch_loss = loss_fn(logits, targets, mask)

            # compute accuracy and f1 score
            batch_acc, batch_f1 = accuracy_f1(logits, targets, mask)

            loss += batch_loss.item()
            accuracy += batch_acc.item()
            f1_score += batch_f1     

    return loss / dev_steps, accuracy / dev_steps, f1_score / dev_steps

In [155]:
for epoch in range(epochs):
    print(f"Epoch {epoch + 1} of {epochs}")
    train_loss, train_acc, train_f1 = train(epoch)
    print(f"Train loss: {train_loss}, Train accuracy: {train_acc}, Train F1 score: {train_f1}")
    dev_loss, dev_acc, dev_f1 = evaluate(epoch)
    print(f"Dev loss: {dev_loss}, Dev accuracy: {dev_acc}, Dev F1 score: {dev_f1}")

Epoch 1 of 10


  0%|          | 0/119 [00:00<?, ?it/s]

Batch loss of epoch 0 at step 0: 3.228870153427124


In [None]:
# save model
model.save_pretrained('model/span_detection_bert_base')

# End