In [27]:
!pip install evaluate
!pip install pyvi



In [28]:
import numpy as np
from tqdm.auto import tqdm
import collections

import torch
from torch.utils.data import DataLoader
from pyvi import ViTokenizer

from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
import evaluate
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import random

In [29]:
def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices),
    then switches cuDNN to deterministic mode with autotuning disabled.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


# Seed once, before any data loading or model initialisation happens.
set_seed()

In [30]:
# Load the UIT-VSMEC dataset from the Hugging Face Hub (train/validation/test splits).
ds = load_dataset("tridm/UIT-VSMEC")

In [31]:
# --- Experiment configuration -------------------------------------------------
# Checkpoint used for both the tokenizer and the classification model.
MODEL_NAME = 'vinai/phobert-base-v2'
# Sequences are truncated/padded to this many tokens.
MAX_LENGTH = 128
# Training hyperparameters.
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
EPOCHS = 7
WEIGHT_DECAY = 0.01

In [32]:
# Display the available splits with their columns and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['Sentence', 'Emotion'],
        num_rows: 5548
    })
    validation: Dataset({
        features: ['Sentence', 'Emotion'],
        num_rows: 686
    })
    test: Dataset({
        features: ['Sentence', 'Emotion'],
        num_rows: 693
    })
})

In [33]:
# Tokenizer matching the pretrained checkpoint named in MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [34]:
# Sanity check: word-segment one training sentence, tokenize it, and decode the
# ids back to text to inspect the special tokens.
sentence = ds["train"][10]['Sentence']
emotion = ds["train"][10]['Emotion']
segmented_sentence = ViTokenizer.tokenize(sentence)
# NOTE(review): passing `emotion` as the second argument encodes the label as a
# second text segment (it appears after `</s> </s>` in the decoded string).
# This is for inspection only; `preprocess` tokenizes the sentence alone.
inputs = tokenizer(segmented_sentence, emotion)
tokenizer.decode(inputs["input_ids"])

'<s> công_nhận sáng_tạo thật đấy </s> </s> Enjoyment </s>'

In [35]:
# Class distribution of the training split, printed in first-seen order.
emotion_counts = collections.Counter(ds["train"]["Emotion"])

for emotion in emotion_counts:
    count = emotion_counts[emotion]
    print(f"{emotion}: {count}")

Other: 1021
Disgust: 1071
Enjoyment: 1558
Anger: 391
Surprise: 242
Sadness: 947
Fear: 318


In [36]:
# Alphabetically sorted label names give a stable label <-> id assignment.
emotions = sorted(set(ds["train"]["Emotion"]))
id2label = dict(enumerate(emotions))
label2id = {label: i for i, label in id2label.items()}

In [37]:
def preprocess(data_points):
    """Tokenize a batch of examples and attach integer class labels.

    Intended to be used with ``Dataset.map(..., batched=True)``.

    Args:
        data_points: Batch mapping with a "Sentence" list of raw strings and
            an "Emotion" list of label names (keys of ``label2id``).

    Returns:
        The tokenizer output, with every sequence truncated/padded to
        ``MAX_LENGTH``, plus a "labels" list of integer class ids.

    Raises:
        KeyError: If an emotion name is not present in ``label2id``.
    """
    # Word-segment with pyvi before tokenizing (PhoBERT is trained on
    # word-segmented Vietnamese text).
    segmented_sentences = [ViTokenizer.tokenize(sentence) for sentence in data_points["Sentence"]]

    # No return_tensors here: Dataset.map stores plain columns, so tensors would
    # only be converted back to lists, and mixing tensors with the plain-list
    # "labels" below makes the batch inconsistent. The Trainer's collator
    # builds the tensors at batch time.
    inputs = tokenizer(
        segmented_sentences,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )

    inputs["labels"] = [label2id[emotion] for emotion in data_points["Emotion"]]
    return inputs

In [38]:
# Apply `preprocess` to every split in batches and drop the raw text/label
# columns so only the model inputs and integer labels remain.
processed_data = ds.map(preprocess, batched=True, remove_columns=ds["train"].column_names)

In [39]:
# Summarise the tokenized dataset and the number of examples per split.
print(processed_data)
for split_title, split_key in (
    ("Training", "train"),
    ("Validation", "validation"),
    ("Test", "test"),
):
    print(f"{split_title} examples: {len(processed_data[split_key])}")

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 5548
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 686
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 693
    })
})
Training examples: 5548
Validation examples: 686
Test examples: 693


In [40]:
# Load the pretrained checkpoint with a sequence-classification head that has
# one output per emotion. The head's weights are newly initialised (see the
# warning below), so the model must be fine-tuned before it is used.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
def compute_metrics(eval_pred):
    """Compute classification metrics from raw model outputs.

    Args:
        eval_pred: Pair ``(predictions, labels)`` where ``predictions`` holds
            per-class scores and ``labels`` the true class ids.

    Returns:
        dict with 'accuracy', weighted 'precision' and 'recall',
        'f1_weighted' and 'f1_macro'. Classes with no predicted or true
        samples contribute 0 instead of raising a warning.
    """
    scores, labels = eval_pred
    predicted_ids = np.argmax(scores, axis=1)

    def _averaged(metric_fn, average):
        # Shared call shape for the averaged sklearn metrics.
        return metric_fn(labels, predicted_ids, average=average, zero_division=0)

    metrics = {'accuracy': accuracy_score(labels, predicted_ids)}
    metrics['precision'] = _averaged(precision_score, 'weighted')
    metrics['recall'] = _averaged(recall_score, 'weighted')
    metrics['f1_weighted'] = _averaged(f1_score, 'weighted')
    metrics['f1_macro'] = _averaged(f1_score, 'macro')
    return metrics

In [42]:
# Hyperparameters come from the configuration cell (LEARNING_RATE, BATCH_SIZE,
# EPOCHS, WEIGHT_DECAY) so there is a single place to tune them.
training_args = TrainingArguments(
    output_dir="./phobert-finetuned-vsmec",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=64,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    # Evaluate and checkpoint on the same step schedule so that
    # load_best_model_at_end can pick a checkpoint that was actually evaluated.
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,
    save_total_limit=2,
    # Best checkpoint is selected by the 'f1_weighted' key of compute_metrics.
    metric_for_best_model="f1_weighted",
    report_to="tensorboard",
    logging_steps=50
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_data["train"],
    eval_dataset=processed_data["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

In [43]:
# Run fine-tuning; validation metrics are reported every `eval_steps` steps.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 Weighted,F1 Macro
100,1.6098,1.45495,0.495627,0.413045,0.495627,0.414009,0.251991
200,1.3633,1.257619,0.581633,0.603991,0.581633,0.529758,0.362944
300,1.3311,1.168302,0.61516,0.585178,0.61516,0.587014,0.462572
400,1.1176,1.179988,0.590379,0.594861,0.590379,0.554538,0.418819
500,1.0047,1.122329,0.604956,0.640222,0.604956,0.59074,0.47189
600,1.0202,1.102531,0.61516,0.647222,0.61516,0.587904,0.486007
700,0.9441,1.045654,0.650146,0.664483,0.650146,0.641297,0.56368
800,0.7548,1.086554,0.620991,0.626554,0.620991,0.601784,0.539625
900,0.7634,1.058242,0.631195,0.642727,0.631195,0.630276,0.560526
1000,0.7663,1.065319,0.644315,0.64187,0.644315,0.63457,0.553141


TrainOutput(global_step=2429, training_loss=0.7021565835826336, metrics={'train_runtime': 642.5079, 'train_samples_per_second': 60.444, 'train_steps_per_second': 3.78, 'total_flos': 2554659917644800.0, 'train_loss': 0.7021565835826336, 'epoch': 7.0})

In [44]:
# Evaluate without an explicit dataset argument — presumably this uses the
# `eval_dataset` (validation split) passed to the Trainer.
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 1.2160348892211914, 'eval_accuracy': 0.6530612244897959, 'eval_precision': 0.6511711288918175, 'eval_recall': 0.6530612244897959, 'eval_f1_weighted': 0.6484461101383624, 'eval_f1_macro': 0.5935782360031142, 'eval_runtime': 2.5524, 'eval_samples_per_second': 268.768, 'eval_steps_per_second': 4.31, 'epoch': 7.0}


In [45]:
# Evaluate on the held-out test split.
# NOTE(review): the returned keys still carry the `eval_` prefix (see output),
# so they are named identically to the validation metrics.
test_results = trainer.evaluate(processed_data["test"])
print(f"Test Results: {test_results}")

Test Results: {'eval_loss': 1.1548160314559937, 'eval_accuracy': 0.6608946608946609, 'eval_precision': 0.6595301960053936, 'eval_recall': 0.6608946608946609, 'eval_f1_weighted': 0.6584137145033844, 'eval_f1_macro': 0.6409220354903379, 'eval_runtime': 2.6393, 'eval_samples_per_second': 262.567, 'eval_steps_per_second': 4.168, 'epoch': 7.0}


In [46]:
# Save the fine-tuned model and the tokenizer to the same directory so both can
# be reloaded from it with `from_pretrained`.
# NOTE(review): absolute Kaggle path — adjust when running in another environment.
model_save_path = "/kaggle/working/phobert-finetuned-vsmec-final"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

('/kaggle/working/phobert-finetuned-vsmec-final/tokenizer_config.json',
 '/kaggle/working/phobert-finetuned-vsmec-final/special_tokens_map.json',
 '/kaggle/working/phobert-finetuned-vsmec-final/vocab.txt',
 '/kaggle/working/phobert-finetuned-vsmec-final/bpe.codes',
 '/kaggle/working/phobert-finetuned-vsmec-final/added_tokens.json')

In [48]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [49]:
def infer(text, model, tokenizer):
    """Predict the emotion of a single raw sentence.

    Args:
        text: Raw (not yet word-segmented) sentence.
        model: Sequence-classification model; inputs are moved to
            ``model.device`` before the forward pass.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        dict with:
            "emotion": name of the highest-scoring class,
            "confidence": softmax probability of that class,
            "all_probabilities": mapping of every class name to its probability.
    """
    segmented_sentence = ViTokenizer.tokenize(text)
    inputs = tokenizer(
        segmented_sentence,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt"
    )

    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Take label names from the model's own config rather than a notebook
    # global, so the mapping always matches the model that is actually passed
    # in (e.g. one reloaded from disk in a fresh kernel).
    labels = model.config.id2label

    # Make sure dropout is disabled for the forward pass, then restore whatever
    # mode the caller had the model in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    predicted_class_id = torch.argmax(logits, dim=1).item()
    probabilities = torch.nn.functional.softmax(logits, dim=1)[0]

    return {
        "emotion": labels[predicted_class_id],
        "confidence": probabilities[predicted_class_id].item(),
        "all_probabilities": {labels[i]: prob.item() for i, prob in enumerate(probabilities)}
    }

In [50]:
# Reload the saved artefacts to confirm the exported model works on its own.
inference_model = AutoModelForSequenceClassification.from_pretrained(model_save_path)
inference_model.to(device)
inference_tokenizer = AutoTokenizer.from_pretrained(model_save_path)

# Slice the split once and pair each sentence with its label, instead of
# indexing the full "Emotion" column again on every iteration.
test_samples = ds["test"][:10]

for sentence, true_emotion in zip(test_samples["Sentence"], test_samples["Emotion"]):
    result = infer(sentence, inference_model, inference_tokenizer)
    print(f"Predicted: {sentence}, {result['emotion']}, confidence: {result['confidence']:.4f}")
    print(f"True label: {true_emotion}")
    # Full distribution, most likely class first.
    for emotion, prob in sorted(result['all_probabilities'].items(), key=lambda x: x[1], reverse=True):
        print(f"{emotion}: {prob:.4f}")
    print()

Predicted: người ta có bạn bè nhìn vui thật, Enjoyment, confidence: 0.9772
True label: Sadness
Enjoyment: 0.9772
Other: 0.0069
Sadness: 0.0044
Surprise: 0.0039
Disgust: 0.0028
Fear: 0.0026
Anger: 0.0021

Predicted: cho nghỉ viêc mói đúng sao goi là kỷ luật, Disgust, confidence: 0.7041
True label: Surprise
Disgust: 0.7041
Anger: 0.2039
Other: 0.0578
Surprise: 0.0138
Fear: 0.0112
Enjoyment: 0.0050
Sadness: 0.0041

Predicted: kinh vãi 😡, Disgust, confidence: 0.9568
True label: Disgust
Disgust: 0.9568
Anger: 0.0217
Other: 0.0057
Fear: 0.0048
Sadness: 0.0042
Enjoyment: 0.0036
Surprise: 0.0031

Predicted: nhà thì không xa lắm nhưng chưa bao giờ đi vì sợ bị đè bẹp luôn người =))))), Enjoyment, confidence: 0.5041
True label: Fear
Enjoyment: 0.5041
Fear: 0.3556
Other: 0.0713
Sadness: 0.0355
Surprise: 0.0198
Anger: 0.0088
Disgust: 0.0049

Predicted: bố không thích nộp đấy mày thích ý kiến không, Other, confidence: 0.9495
True label: Anger
Other: 0.9495
Enjoyment: 0.0205
Disgust: 0.0091
Surprise: