In [1]:
from src.module.data_module import DataModule
from src.model.phobert_base import PhoBert_base
from src.model.phobert_large import *
from src.model.phobert_lstm import *
from src.model.phobert_cnn import *
from src.trainer.fasttext import *
from src.trainer.phobert import *
import os
import yaml
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
from lightning.pytorch.loggers import TensorBoardLogger
from lightning.pytorch import Trainer, seed_everything


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

class config:
    root_data_dir = './dataset/datashopee/'
    model_type = 'phobert'  
    batch_size = 32
    max_epochs = 50
    drop_out = 0.1
    num_labels = 2
    vector_size = 300  
    num_workers = 2
    fasttext_embedding = None  # Otherwise specify path to embedding like src/embedding/fasttext_train_dev.model
    seed = 42
    freeze_backbone = False
    val_each_epoch = 2
    learning_rate = 1e-4
    accelarator = "gpu"

    tensorboard = {
        'dir': 'logging',
        'name': 'experiment',
        'version': 0
    }

    ckpt_dir = 'logging/experiment/0/ckpt'

In [3]:
dm = DataModule(root_data_dir=config.root_data_dir,
                    model_type=config.model_type,
                    batch_size=config.batch_size,
                    num_workers=config.num_workers,
                    fasttext_embedding=config.fasttext_embedding)

dm.setup('fit')
loss_weight = dm.train_data.class_weights

MODEL TYPE: phobert


In [4]:

print("Chon model:\n")
print("(1)PhoBERT(base)")
print("(2)PhoBERT(large)")
print("(3)PhoBERT(base) + LSTM")
print("(4)PhoBERT(base) + CNN")
print("(5)FastText + LSTM")

model_num = int(input("Chon model so: "))

Chon model:

(1)PhoBERT(base)
(2)PhoBERT(large)
(3)PhoBERT(base) + LSTM
(4)PhoBERT(base) + CNN
(5)FastText + LSTM


In [5]:

if model_num == 1:
    model = PhoBert_base(freeze_backbone=config.freeze_backbone,drop_out=config.drop_out,num_labels=config.num_labels)
elif model_num == 2:
    model = PhoBert_large(drop_out=config.drop_out,num_labels=config.num_labels)
elif model_num == 3:
    model = PhoBERTLSTM(drop_out=config.drop_out,num_labels=config.num_labels)
elif model_num == 4:
    model = PhoBERTCNN_base(drop_out=config.drop_out,num_labels=config.num_labels)
elif model_num == 5:
    pass
else:
    raise ValueError(f"Not support model")

if model_num == 5:
    system = FastTextLSTMModel(dropout=config.drop_out,
                                num_labels=config.num_labels,
                                hidden_size=config.vector_size,
                                loss_weight=loss_weight)
else:
    system = PhoBERTModel(model=model,
                            num_labels=config.num_labels,
                            loss_weight=loss_weight)

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
checkpoint_callback = ModelCheckpoint(dirpath=config.ckpt_dir, monitor="val_loss",save_top_k=5, mode="min")

early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=40)

logger = TensorBoardLogger(save_dir=config.tensorboard['dir'], name=config.tensorboard['name'], version=config.tensorboard['version'])

trainer = Trainer(accelerator=config.accelarator, check_val_every_n_epoch=config.val_each_epoch,gradient_clip_val=1.0,
                max_epochs=config.max_epochs,enable_checkpointing=True, deterministic=True, default_root_dir=config.ckpt_dir,
                callbacks=[checkpoint_callback, early_stopping], logger=logger, accumulate_grad_batches=4,log_every_n_steps=1)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [7]:
trainer.fit(model=system, datamodule=dm)


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type               | Params
-----------------------------------------------------
0 | model         | PhoBert_base       | 136 M 
1 | train_loss_fn | CrossEntropyLoss   | 0     
2 | loss_fn       | CrossEntropyLoss   | 0     
3 | test_metrics  | MetricCollection   | 0     
4 | val_acc_fn    | MulticlassAccuracy | 0     
-----------------------------------------------------
136 M     Trainable params
0         Non-trainable params
136 M     Total params
544.458   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

e:\DOAN\phobert-classifier\env\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:436: Consider setting `persistent_workers=True` in 'val_dataloader' to speed up the dataloader worker initialization.


                                                                           

e:\DOAN\phobert-classifier\env\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:436: Consider setting `persistent_workers=True` in 'train_dataloader' to speed up the dataloader worker initialization.


Epoch 0:   1%|          | 3/566 [00:44<2:18:11,  0.07it/s, v_num=0, train_loss_step=0.697]

In [7]:
import torch
import torch.nn as nn
from transformers import RobertaModel, RobertaConfig
import torch.nn.functional as F

class PhoBertFeedForward_base(nn.Module):
    def __init__(self, from_pretrained: bool = True, freeze_backbone: bool = False, drop_out: float = 0.1, out_channels: int = 2):
        super(PhoBertFeedForward_base, self).__init__()
        phobert_config = RobertaConfig.from_pretrained("vinai/phobert-base-v2")
        self.bert = RobertaModel(config=phobert_config)
        if from_pretrained:
            self.bert = RobertaModel.from_pretrained("vinai/phobert-base-v2")
        self.classifier = nn.Sequential(
            nn.Linear(768, 768),
            nn.Dropout(drop_out),
            nn.Linear(768, out_channels))

        if freeze_backbone:
            for param in self.bert.parameters():
                param.require_grad = False

    def forward(self, input_ids, attn_mask):
        bert_feature = self.bert(input_ids=input_ids, attention_mask=attn_mask)
        last_hidden_cls = bert_feature[0][:, 0, :]
        logits = self.classifier(last_hidden_cls)
        return logits

In [8]:
import torch
state_dict = torch.load('./dataset/datatiki/model/PhoBertbs.pth')
state_dict.pop('train_loss_fn.weight', None)

# Create model and load modified state_dict
model = PhoBertFeedForward_base()
model.load_state_dict(state_dict, strict=False)

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


_IncompatibleKeys(missing_keys=['bert.embeddings.word_embeddings.weight', 'bert.embeddings.position_embeddings.weight', 'bert.embeddings.token_type_embeddings.weight', 'bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.self.query.weight', 'bert.encoder.layer.0.attention.self.query.bias', 'bert.encoder.layer.0.attention.self.key.weight', 'bert.encoder.layer.0.attention.self.key.bias', 'bert.encoder.layer.0.attention.self.value.weight', 'bert.encoder.layer.0.attention.self.value.bias', 'bert.encoder.layer.0.attention.output.dense.weight', 'bert.encoder.layer.0.attention.output.dense.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.intermediate.dense.weight', 'bert.encoder.layer.0.intermediate.dense.bias', 'bert.encoder.layer.0.output.dense.weight', 'bert.encoder.layer.0.output.dense.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.en

In [27]:
from transformers import PhobertTokenizer, RobertaForSequenceClassification
import torch

# Load pre-trained model and tokenizer
model_name = "vinai/phobert-base-v2"
tokenizer = PhobertTokenizer.from_pretrained(model_name)

# Text input
text_input = 'áo đẹp xinh nên mua'
# Tokenize text
tokenized_input = tokenizer(text_input, return_tensors="pt", truncation=True, padding=True)
# Extract input_ids and attention_mask
input_ids = tokenized_input["input_ids"]
attention_mask = tokenized_input["attention_mask"]

# Thực hiện dự đoán
with torch.no_grad():
    logits = model(input_ids, attention_mask)
    pred = torch.nn.functional.log_softmax(logits, dim=1)
    pred = torch.argmax(pred, dim=1).item()

# In ra dự đoán
if pred == 0:
    print("Tiêu cực")
else: print("Tích cực")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


tensor([[-0.6454, -0.7433]])
Tiêu cực
