<a href="https://colab.research.google.com/github/duahauby/character-classifier-cnn-chars74k/blob/master/Tagging_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the notebook's assets from Google Drive by file ID, then unpack them.
# In the recorded run the first ID was saved as ner_data.zip.
!gdown --id 1wzHGvZWoehUnBT_xlF3DM9ZoEeOd8Ibt
# In the recorded run this ID failed with "Access denied", so nothing was
# downloaded for it.
!gdown --id 1aUPi1SowAscj6liNRZ1rW8YcOrs2b5hl
# In the recorded run this ID was saved as ner_absa.zip; no later command in
# this cell unpacks that archive.
!gdown --id 12rUJLDuRd5gokNMwk_yHIrI8uYqatuJj
# NOTE(review): in the recorded run lm_pretrained.zip was not present, so this
# unzip reported "cannot find or open" — presumably it is the archive behind
# the ID that was denied above; confirm.
!unzip lm_pretrained.zip
# !rm lm_pretrained.zip
# Extract the NER corpus into ./data/ and delete the archive afterwards.
!unzip ner_data.zip
!rm ner_data.zip
# List the working directory to check what was fetched.
!ls

Downloading...
From: https://drive.google.com/uc?id=1wzHGvZWoehUnBT_xlF3DM9ZoEeOd8Ibt
To: /content/ner_data.zip
100% 2.07M/2.07M [00:00<00:00, 116MB/s]
Access denied with the following error:

 	Cannot retrieve the public link of the file. You may need to change
	the permission to 'Anyone with the link', or have had many accesses. 

You may still be able to access the file from the browser:

	 https://drive.google.com/uc?id=1aUPi1SowAscj6liNRZ1rW8YcOrs2b5hl 

Downloading...
From: https://drive.google.com/uc?id=12rUJLDuRd5gokNMwk_yHIrI8uYqatuJj
To: /content/ner_absa.zip
100% 1.08M/1.08M [00:00<00:00, 96.6MB/s]
unzip:  cannot find or open lm_pretrained.zip, lm_pretrained.zip.zip or lm_pretrained.zip.ZIP.
Archive:  ner_data.zip
  inflating: data/dev_syl.txt        
  inflating: data/test_syl.txt       
  inflating: data/Untitled.ipynb     
  inflating: data/dev.txt            
  inflating: data/train_syl.txt      
  inflating: data/test.txt           
  inflating: data/train.txt          

In [None]:
# Install the two third-party packages imported by later cells: `transformers`
# (tokenizer, encoder, optimizer, scheduler) and `seqeval` (F1 metric).
# NOTE(review): versions are not pinned, so a fresh run may pick up releases
# that differ from the ones used for the recorded outputs.
!pip install transformers seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Tag inventory for the NER task. A tag's position in this list is its class
# index, so the order must not change.
ENTITIES = [
    "O",
    "B-PER", "I-PER",
    "B-LOC", "I-LOC",
    "B-ORG", "I-ORG",
    "B-MISC", "I-MISC",
]

# Lookup tables in both directions, both derived from ENTITIES.
ID2ENTITIES = dict(enumerate(ENTITIES))
ENTITIES2ID = {label: index for index, label in ID2ENTITIES.items()}

# Locations of the training / validation / test splits.
NER_TRAINING_FILE, NER_VALID_FILE, NER_TEST_FILE = (
    './data/train.txt',
    './data/dev.txt',
    './data/test.txt',
)

In [None]:
import numpy as np
import torch
import itertools


class NERDataset(torch.utils.data.Dataset):
    """Word-level NER dataset for a sub-word (BPE) encoder.

    Each sample is a sequence of ``(word, tag)`` pairs. An item holds the
    sub-word ids of ``<bos> words... <eos>``, the matching attention mask, a
    word-to-sub-word alignment matrix and one label per word.

    Args:
        samples: Indexable collection of samples; each sample is a sequence of
            ``(word, tag)`` pairs whose tags are keys of ``ENTITIES2ID``.
        tokenizer: Object exposing ``bos_token``, ``eos_token``,
            ``pad_token_id`` and ``encode(token, add_special_tokens=False)``.
        max_word_length: Maximum number of words kept per sentence, counting
            the two special tokens. The sub-word budget is twice this value.
    """

    def __init__(self, samples, tokenizer, max_word_length=64):
        self.samples = samples
        self.tokenizer = tokenizer
        self.max_word_length = max_word_length
        self.max_subword_length = int(self.max_word_length * 2.0)

    def __getitem__(self, idx):
        """Build the tensors for sample ``idx``.

        Returns:
            dict with ``input_ids`` (max_subword_length,), ``input_mask``
            (max_subword_length,), ``word_matrix`` (max_word_length,
            max_subword_length) and ``entity_labels`` (max_word_length,).
            Label positions without a kept word are filled with -100.
        """
        sample = self.samples[idx]
        words = [tk[0] for tk in sample]
        sub_word_ids, attention_mask, word_matrix, n_words = self._encode_words(words)

        # The <bos>/<eos> positions are labelled "O".
        outside = ENTITIES2ID['O']
        entity_labels = [outside] + [ENTITIES2ID[tk[1]] for tk in sample] + [outside]
        # Keep labels only for words that made it into the encoded input, so
        # words dropped by truncation are padded with -100 like any other gap.
        entity_labels = entity_labels[:n_words]
        entity_labels += [-100] * (self.max_word_length - len(entity_labels))

        item = {}
        item["input_ids"] = torch.tensor(sub_word_ids)
        item["input_mask"] = torch.tensor(attention_mask)
        item["word_matrix"] = torch.tensor(word_matrix, dtype=torch.float32)
        item["entity_labels"] = torch.tensor(entity_labels, dtype=torch.long)
        return item

    def bpe_tokenizer(self, words):
        """Encode ``words`` into padded sub-word ids plus alignment data.

        Args:
            words: List of word strings (without special tokens).

        Returns:
            Tuple ``(sub_word_ids, attention_mask, word_matrix)``:
            a list of exactly ``max_subword_length`` ids, a numpy mask that is
            0 wherever the id equals the pad id, and a
            ``(max_word_length, max_subword_length)`` numpy matrix whose entry
            ``[i, j]`` is 1 when sub-word ``j`` belongs to word ``i``.
        """
        sub_word_ids, attention_mask, word_matrix, _ = self._encode_words(words)
        return sub_word_ids, attention_mask, word_matrix

    def _encode_words(self, words):
        """Shared encoder; also reports how many words were kept.

        Words are truncated to ``max_word_length`` and then, at whole-word
        boundaries, to ``max_subword_length`` sub-words. Without the second
        limit a sentence with many multi-piece words overflows both the
        alignment matrix and the fixed-size id vector.
        """
        pieces = [self.tokenizer.bos_token]
        pieces.extend(words)
        pieces.append(self.tokenizer.eos_token)
        # Only the first max_word_length pieces can ever be used.
        encoded = [
            self.tokenizer.encode(token, add_special_tokens=False)
            for token in pieces[: self.max_word_length]
        ]

        # Keep whole words while they fit in the sub-word budget.
        kept = []
        used = 0
        for ids in encoded:
            if used + len(ids) > self.max_subword_length:
                break
            kept.append(ids)
            used += len(ids)

        pad_id = self.tokenizer.pad_token_id
        word_matrix = np.zeros((self.max_word_length, self.max_subword_length))
        column = 0
        for row, ids in enumerate(kept):
            # Alignment stops at the first word that starts with the pad id.
            # A word with no sub-words simply gets an all-zero row.
            if ids and ids[0] == pad_id:
                break
            word_matrix[row, column:column + len(ids)] = 1
            column += len(ids)

        sub_word_ids = list(itertools.chain.from_iterable(kept))
        sub_word_ids.extend([pad_id] * (self.max_subword_length - len(sub_word_ids)))
        attention_mask = np.ones(len(sub_word_ids))
        attention_mask[np.array(sub_word_ids) == pad_id] = 0
        return sub_word_ids, attention_mask, word_matrix, len(kept)

    def __len__(self):
        """Number of samples."""
        return len(self.samples)

In [None]:
import torch
from tqdm.auto import tqdm
from seqeval.metrics import f1_score


def ner_evaluate(y_true, y_pred):
    """Compute the macro-averaged seqeval F1 for batches of label ids.

    Args:
        y_true: Iterable of gold label-id sequences; entries equal to -100
            are dropped.
        y_pred: Iterable of predicted label-id sequences, aligned with
            ``y_true``. Each one is cut to the length of its filtered gold
            sequence.

    Returns:
        The value returned by ``f1_score(..., average='macro')``.
    """
    gold_tags, predicted_tags = [], []
    for gold_ids, predicted_ids in zip(y_true, y_pred):
        gold = [ID2ENTITIES[label_id] for label_id in gold_ids if label_id != -100]
        predicted = [ID2ENTITIES[label_id] for label_id in predicted_ids[:len(gold)]]
        gold_tags.append(gold)
        predicted_tags.append(predicted)
    return f1_score(gold_tags, predicted_tags, average='macro')


def train_fn(
    dataloader, model, entity_criterion, optimizer, scheduler, device="cuda", accu_step=1
):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        dataloader: Sized iterable of batches; each batch is a mapping with
            ``input_ids``, ``input_mask``, ``word_matrix`` and
            ``entity_labels`` tensors.
        model: Module called as ``model(input_ids, input_mask, word_matrix)``
            and returning logits of shape [batch, words, classes].
        entity_criterion: Loss called with logits transposed to
            [batch, classes, words] and the label tensor.
        optimizer: Optimizer updating the model.
        scheduler: Learning-rate scheduler; stepped once per optimizer update.
        device: Device the batch tensors are moved to.
        accu_step: Number of batches whose gradients are accumulated before
            each optimizer update.

    Returns:
        Sum of the unscaled batch losses divided by ``len(dataloader)``.
    """
    model.train()
    total_loss = 0
    n_batches = len(dataloader)

    pbar = tqdm(dataloader, total=n_batches)
    for i, batch in enumerate(pbar):
        input_ids = batch["input_ids"].to(device)
        input_mask = batch["input_mask"].to(device)
        word_matrix = batch["word_matrix"].to(device)
        entity_labels = batch["entity_labels"].to(device)

        entity_logits = model(input_ids, input_mask, word_matrix)

        # The criterion expects the class dimension second.
        entity_logits = torch.transpose(entity_logits, 2, 1)
        loss = entity_criterion(entity_logits, entity_labels)

        # Scale so the accumulated gradient is the average over the window
        # rather than the sum; with accu_step == 1 this is a no-op.
        (loss / accu_step).backward()

        # Also flush on the final batch. Otherwise a trailing partial window
        # is never applied and its gradients leak into the next epoch.
        is_last_batch = (i + 1) == n_batches
        if (i + 1) % accu_step == 0 or is_last_batch:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            optimizer.zero_grad()

        # Report the unscaled loss so the number does not depend on accu_step.
        total_loss += loss.item()

    total_loss /= len(dataloader)

    return total_loss


def validation_fn(dataloader, model, entity_criterion, device="cuda"):
    """Evaluate the model on a dataloader without updating it.

    Args:
        dataloader: Sized iterable of batches with ``input_ids``,
            ``input_mask``, ``word_matrix`` and ``entity_labels`` tensors.
        model: Module called as ``model(input_ids, input_mask, word_matrix)``
            and returning logits of shape [batch, words, classes].
        entity_criterion: Loss called with logits transposed to
            [batch, classes, words] and the label tensor.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean_loss, entity_f1)``: the summed batch loss divided by
        ``len(dataloader)`` and the score from ``ner_evaluate``. The F1 score
        is also printed.
    """
    model.eval()
    running_loss = 0
    predicted_rows, gold_rows = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, total=len(dataloader)):
            ids = batch["input_ids"].to(device)
            mask = batch["input_mask"].to(device)
            matrix = batch["word_matrix"].to(device)
            gold = batch["entity_labels"].to(device)

            word_logits = model(ids, mask, matrix)

            # The criterion wants classes on dim 1; predictions are taken
            # from the untransposed logits (classes on the last dim).
            batch_loss = entity_criterion(word_logits.transpose(2, 1), gold)
            running_loss += batch_loss.item()

            best_class = word_logits.argmax(dim=-1)
            predicted_rows.extend(best_class.detach().cpu().numpy())
            gold_rows.extend(gold.detach().cpu().numpy())

    entity_f1 = ner_evaluate(gold_rows, predicted_rows)
    print("F1 score: ", entity_f1)

    return running_loss / len(dataloader), entity_f1

In [None]:
import numpy as np
import torch
import random
import os


def seed_all(seed_value):
    """Seed every random number generator this notebook relies on.

    Seeds NumPy, PyTorch (CPU) and Python's ``random``, and exports
    ``PYTHONHASHSEED``. When CUDA is available it also seeds the CUDA
    generators and sets the cuDNN flags for deterministic behaviour.

    Args:
        seed_value: Seed handed to each generator.
    """
    # Same order as before: NumPy, torch CPU, stdlib random.
    for seeder in (np.random.seed, torch.manual_seed, random.seed):
        seeder(seed_value)
    os.environ["PYTHONHASHSEED"] = str(seed_value)

    if not torch.cuda.is_available():
        return

    # GPU generators (current device, then all devices).
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    # Deterministic cuDNN kernels, with autotuning switched off.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def count_parameters(model) -> int:
    """Return the number of scalar parameters in ``model`` that require grad.

    Args:
        model: Object exposing ``parameters()``, such as a ``torch.nn.Module``.

    Returns:
        Sum of ``numel()`` over parameters whose ``requires_grad`` is True.
    """
    trainable = 0
    for param in model.parameters():
        if param.requires_grad is True:
            trainable += param.numel()
    return trainable

In [None]:
import torch
from transformers import RobertaModel, RobertaConfig
import torch.nn.functional as F
import torch.nn as nn


class NERModel(nn.Module):
    """Pretrained encoder with a word-level entity classification head.

    Sub-word hidden states from the encoder are pooled into one vector per
    word with a word-to-sub-word matrix, then passed through
    Linear -> Tanh -> Dropout -> Linear to produce per-word entity logits.

    Args:
        model_name: Checkpoint name or path. Names containing ``'envibert'``
            are loaded through ``AutoConfig``/``AutoModel``; all others
            through ``RobertaConfig``/``RobertaModel``.
        max_word_length: Stored on the instance; not otherwise used here.
        num_bert_layer: Accepted for interface compatibility; this class does
            not read it and always uses the final hidden layer only.
        device: Stored on the instance; the module is not moved by this class.
    """

    def __init__(self, model_name="vinai/phobert-base", max_word_length=64, num_bert_layer=12, device='cuda'):
        super().__init__()
        self.model_name = model_name
        self.device = device
        # Load the configuration once, with the loader that matches the
        # checkpoint family; hidden states must be returned for forward().
        if 'envibert' not in self.model_name:
            self.config = RobertaConfig.from_pretrained(
                self.model_name, from_tf=False, output_hidden_states=True
            )
            self.roberta = RobertaModel.from_pretrained(self.model_name, config=self.config)
        else:
            from transformers import AutoModel, AutoConfig
            self.config = AutoConfig.from_pretrained(self.model_name, from_tf=False, output_hidden_states=True)
            self.roberta = AutoModel.from_pretrained(self.model_name, config=self.config)

        self.max_word_length = max_word_length
        self.num_entity_classes = len(ENTITIES)

        # Entity head
        self.activation = nn.Tanh()
        self.entity_hidden_layer = nn.Linear(
            self.config.hidden_size * 1, self.config.hidden_size
        )
        self.entity_dropout = nn.Dropout(0.2)
        self.entity_classifier = nn.Linear(
            self.config.hidden_size, self.num_entity_classes
        )

    def agg_bpe2word(self, bpe_embeddings, word_bpe_matrix, mode="sum"):
        """Pool sub-word embeddings into word embeddings.

        Args:
            bpe_embeddings: Tensor [batch, num_subword, hidden].
            word_bpe_matrix: Tensor [batch, num_word, num_subword] marking
                which sub-words belong to each word.
            mode: ``"sum"`` for the plain matrix product, ``"mean"`` to divide
                each word by its row sum (rows summing to 0 are divided by 1).

        Returns:
            Tensor [batch, num_word, hidden].

        Raises:
            ValueError: If ``mode`` is neither ``"sum"`` nor ``"mean"``.
        """
        if mode not in ("sum", "mean"):
            # Without this check the method would silently return None.
            raise ValueError(f"Unsupported aggregation mode: {mode!r}")

        word_embeddings = torch.bmm(word_bpe_matrix, bpe_embeddings)
        if mode == "sum":
            return word_embeddings

        d_n = word_bpe_matrix.sum(dim=-1).unsqueeze(-1)
        # Avoid division by zero for rows that select no sub-word.
        d_n = torch.where(d_n == 0, torch.ones_like(d_n), d_n)
        return word_embeddings / d_n

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        word_matrix=None
    ):
        """Compute per-word entity logits.

        Args:
            input_ids: Sub-word ids, passed to the encoder.
            attention_mask: Attention mask, passed to the encoder.
            word_matrix: Tensor [batch, num_word, num_subword] used to pool
                sub-word states into word states.

        Returns:
            Logits of shape [batch, num_word, n_labels].
        """
        bert_outputs = self.roberta(
            input_ids,
            attention_mask,
        )

        # Index 2 holds the per-layer hidden states; only the last is used.
        sequence_represent_last = torch.cat(
            bert_outputs[2][-1:], dim=-1
        )  # Batch-size, num_subword, bert_embedding_size
        word_embedding_last = self.agg_bpe2word(
            sequence_represent_last, word_matrix, "sum"
        )  # Batch-size, num_word, bert_embedding_size

        # Entity classification
        entity_hidden = self.entity_hidden_layer(word_embedding_last)
        entity_hidden = self.activation(entity_hidden)
        entity_dropout = self.entity_dropout(entity_hidden)
        entity_logits = self.entity_classifier(entity_dropout)  # [batch, sent_len, n_labels]

        return entity_logits

In [None]:
# Run configuration read by the training cell below.
# model_name = "./lm_pretrained/"
# Checkpoint handed to both the tokenizer and NERModel.
model_name = "vinai/phobert-base"
# NOTE(review): not referenced by any other visible cell — looks like a
# leftover from another notebook; confirm before removing.
training_file = "./intent_smt.csv"
# Batch size shared by the train, validation and test DataLoaders.
batch_size = 64
# Learning rate passed to AdamW.
lr = 3e-5
# Number of passes over the training loader.
n_epochs = 15
# Seed passed to seed_all().
seed = 96
# Batches accumulated per optimizer update (forwarded to train_fn).
accu_step = 1
# Forwarded to NERModel as num_bert_layer.
n_bert_layers = 1

In [None]:
import gc
import argparse
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, SequentialSampler
from transformers import RobertaTokenizer, AutoTokenizer
from transformers import get_linear_schedule_with_warmup
from transformers import AdamW
import torch.nn as nn


seed_all(seed_value=seed)
device = "cuda" if torch.cuda.is_available() else "cpu"


def read_ner_samples(path):
    """Parse a blank-line-separated, two-column tagging file.

    Sentences are separated by an empty line; every line inside a sentence
    is split on whitespace, and only lines with exactly two fields are kept.

    Args:
        path: Path of the text file to read.

    Returns:
        One list per sentence block, each a list of two-element lists. Blocks
        without any valid line yield an empty list, so the number of samples
        equals the number of blocks in the file.
    """
    samples = []
    # Explicit encoding so parsing does not depend on the platform default.
    # Assumes the corpus is UTF-8 — TODO confirm.
    with open(path, 'r', encoding='utf-8') as f_r:
        for sent in f_r.read().split('\n\n'):
            rows = (line.split() for line in sent.strip().split('\n'))
            samples.append([row for row in rows if len(row) == 2])
    return samples


training_samples = read_ner_samples(NER_TRAINING_FILE)
valid_samples = read_ner_samples(NER_VALID_FILE)
test_samples = read_ner_samples(NER_TEST_FILE)

print('Number of training samples: ', len(training_samples))
print('Number of validation samples: ', len(valid_samples))
# Prefer the RoBERTa tokenizer class and fall back to AutoTokenizer for
# checkpoints it cannot load. Catch Exception rather than everything so a
# KeyboardInterrupt is not swallowed.
try:
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
except Exception:
    tokenizer = AutoTokenizer.from_pretrained(model_name)

# Training data is reshuffled every epoch.
train_dataset = NERDataset(
    training_samples, tokenizer=tokenizer
)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=1
)

# Validation data is read in file order.
valid_dataset = NERDataset(
    valid_samples,
    tokenizer=tokenizer
)
valid_sampler = SequentialSampler(valid_dataset)
valid_loader = DataLoader(
    valid_dataset,
    batch_size=batch_size,
    shuffle=False,
    sampler=valid_sampler,
    num_workers=1
)

# Test data is read in file order. The sampler must be built from the test
# dataset itself: a sampler built from the validation set yields
# len(valid_dataset) indices, so the test set would be cut short or indexed
# out of range.
test_dataset = NERDataset(
    test_samples,
    tokenizer=tokenizer
)
test_sampler = SequentialSampler(test_dataset)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    sampler=test_sampler,
    num_workers=1
)

model = NERModel(model_name, num_bert_layer=n_bert_layers, device=device)
print('The number of parameters of the model: ', count_parameters(model))
model.to(device)

# Two parameter groups: LayerNorm weights and biases are exempt from weight
# decay, everything else is decayed.
param_optimizer = list(model.named_parameters())
no_decay = ["LayerNorm.bias", "LayerNorm.weight"]

optimizer_grouped_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if (not any(nd in n for nd in no_decay))
        ],
        "weight_decay": 0.01,
    },
    {
        "params": [
            p for n, p in param_optimizer if (any(nd in n for nd in no_decay))
        ],
        "weight_decay": 0.0,
    },
]
# NOTE(review): this is the AdamW class imported from `transformers`;
# `correct_bias` is specific to that implementation — confirm it is still
# available in the installed version.
optimizer = AdamW(optimizer_grouped_parameters, lr=lr, correct_bias=False)

# The scheduler is stepped once per optimizer update, not once per batch, so
# the schedule length must account for gradient accumulation. Ceiling
# division keeps the schedule at least as long as the number of updates.
steps_per_epoch = -(-len(train_loader) // accu_step)
total_steps = steps_per_epoch * n_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=200, num_training_steps=total_steps
)

# Default ignore_index is -100, which is the value used to pad entity labels.
entity_criterion = nn.CrossEntropyLoss()

# Train for n_epochs. After each epoch score the validation set, and whenever
# the validation F1 matches or beats the best seen so far, also score the
# test set (printed between the '#' markers).
max_score = -1
for epoch in range(n_epochs):
    gc.collect()
    print("Training on epoch", epoch + 1)

    total_loss = train_fn(
        train_loader,
        model,
        entity_criterion,
        optimizer,
        scheduler,
        device=device,
        accu_step=accu_step,
    )
    print('Training loss: ', total_loss)

    total_loss, entity_f1 = validation_fn(
        dataloader=valid_loader,
        model=model,
        entity_criterion=entity_criterion,
        device=device,
    )
    print('Validation loss', total_loss)

    if entity_f1 >= max_score:
        max_score = entity_f1
        print('#')
        validation_fn(
            dataloader=test_loader,
            model=model,
            entity_criterion=entity_criterion,
            device=device,
        )
        print('#')
    print('*' * 100)

Number of training samples:  14861
Number of validation samples:  2000


Downloading:   0%|          | 0.00/557 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/895k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The number of parameters of the model:  135595785
The number of parameters of the model:  135595785




Training on epoch 1


  0%|          | 0/233 [00:00<?, ?it/s]

Training loss:  0.3208655013419016


  0%|          | 0/32 [00:00<?, ?it/s]

F1 score:  0.7552791886474955
Validation loss 0.03038227641081903
#


  0%|          | 0/32 [00:00<?, ?it/s]

F1 score:  0.7309303326968698
#
****************************************************************************************************
Training on epoch 2


  0%|          | 0/233 [00:00<?, ?it/s]

Training loss:  0.02864701554874686


  0%|          | 0/32 [00:00<?, ?it/s]

F1 score:  0.8278226871259399
Validation loss 0.022516585973789915
#


  0%|          | 0/32 [00:00<?, ?it/s]

F1 score:  0.817587203160573
#
****************************************************************************************************
Training on epoch 3


  0%|          | 0/233 [00:00<?, ?it/s]

Training loss:  0.016438562463961


  0%|          | 0/32 [00:00<?, ?it/s]

F1 score:  0.8581097245274468
Validation loss 0.02029654111720447
#


  0%|          | 0/32 [00:00<?, ?it/s]

F1 score:  0.8211024569720222
#
****************************************************************************************************
Training on epoch 4


  0%|          | 0/233 [00:00<?, ?it/s]

Training loss:  0.011132971009403435


  0%|          | 0/32 [00:00<?, ?it/s]

F1 score:  0.8289977932748821
Validation loss 0.02227721925919468
****************************************************************************************************
Training on epoch 5


  0%|          | 0/233 [00:00<?, ?it/s]

Training loss:  0.007756049484973813


  0%|          | 0/32 [00:00<?, ?it/s]

F1 score:  0.8339414420477521
Validation loss 0.02336326714066672
****************************************************************************************************
Training on epoch 6


  0%|          | 0/233 [00:00<?, ?it/s]

Training loss:  0.005645445134798505


  0%|          | 0/32 [00:00<?, ?it/s]

F1 score:  0.8342957455491021
Validation loss 0.02464068989365842
****************************************************************************************************
Training on epoch 7


  0%|          | 0/233 [00:00<?, ?it/s]

Training loss:  0.00436193111056338


  0%|          | 0/32 [00:00<?, ?it/s]

F1 score:  0.8391352905018383
Validation loss 0.024897033521483536
****************************************************************************************************
Training on epoch 8


  0%|          | 0/233 [00:00<?, ?it/s]

Training loss:  0.0032114531633008446


  0%|          | 0/32 [00:00<?, ?it/s]

F1 score:  0.826420281828018
Validation loss 0.027236046653342783
****************************************************************************************************
Training on epoch 9


  0%|          | 0/233 [00:00<?, ?it/s]

Training loss:  0.0025501228499729906


  0%|          | 0/32 [00:00<?, ?it/s]

F1 score:  0.8333148479074873
Validation loss 0.02763855239527402
****************************************************************************************************
Training on epoch 10


  0%|          | 0/233 [00:00<?, ?it/s]