In [26]:
!pip install evaluate



In [27]:
import numpy as np
from tqdm.auto import tqdm
import collections

import torch
from torch.utils.data import DataLoader
from pyvi import ViTokenizer

from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
import evaluate
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import random

In [28]:
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices), then sets the cuDNN flags
    ``deterministic=True`` and ``benchmark=False``.

    Args:
        seed: Integer seed handed to each generator. Defaults to 42.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # cuDNN flags are set after seeding, in the same order as before.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# Seed once, up front, with the default value.
set_seed()

In [29]:
# Load the "tridm/UIT-VSMEC" dataset; every later cell reads its
# train/validation/test splits through `ds`.
ds = load_dataset("tridm/UIT-VSMEC")

In [30]:
# Checkpoint name passed to both the tokenizer and the classification model.
MODEL_NAME = 'vinai/phobert-base-v2'
# Token length every sequence is truncated/padded to (training and inference).
MAX_LENGTH = 128
# Optimisation hyperparameters for fine-tuning.
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
EPOCHS = 7
WEIGHT_DECAY = 0.01

In [31]:
# Display the loaded DatasetDict (splits, columns and row counts).
ds

DatasetDict({
    train: Dataset({
        features: ['Sentence', 'Emotion'],
        num_rows: 5548
    })
    validation: Dataset({
        features: ['Sentence', 'Emotion'],
        num_rows: 686
    })
    test: Dataset({
        features: ['Sentence', 'Emotion'],
        num_rows: 693
    })
})

In [32]:
# Tokenizer belonging to the MODEL_NAME checkpoint; shared by preprocessing and the Trainer.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [33]:
# Peek at one training example. The sentence and its label are encoded as a
# text pair here only to inspect the special-token layout; `preprocess`
# encodes the sentence alone.
sample = ds["train"][10]
sentence, emotion = sample["Sentence"], sample["Emotion"]
inputs = tokenizer(sentence, emotion)
tokenizer.decode(inputs["input_ids"])

'<s> công nhận sáng tạo thật đấy </s> </s> Enjoyment </s>'

In [34]:
# Class distribution of the training split, keyed by emotion name.
train_emotions = ds["train"]["Emotion"]
emotion_counts = collections.Counter(train_emotions)

# Printed in first-seen order (Counter keeps insertion order).
for emotion, count in emotion_counts.items():
    print(f"{emotion}: {count}")

Other: 1021
Disgust: 1071
Enjoyment: 1558
Anger: 391
Surprise: 242
Sadness: 947
Fear: 318


In [35]:
# Label vocabulary: the distinct emotion names of the training split in
# alphabetical order, so ids are stable across runs.
emotions = sorted(set(ds["train"]["Emotion"]))
# name -> id and id -> name lookups built from the same ordering.
label2id = dict(zip(emotions, range(len(emotions))))
id2label = dict(enumerate(emotions))

In [36]:
def preprocess(data_points):
    """Tokenize a batch of sentences and attach integer class labels.

    Written for ``Dataset.map(..., batched=True)``: ``data_points`` maps each
    column name to the list of values for the current batch.

    Args:
        data_points: Batch with a "Sentence" list (raw text) and an "Emotion"
            list (label names that must be keys of ``label2id``).

    Returns:
        The tokenizer output, every sequence truncated/padded to
        ``MAX_LENGTH`` tokens, with an extra "labels" list of class ids.

    Raises:
        KeyError: If an emotion name is not present in ``label2id``.
    """
    # NOTE(review): the PhoBERT model card asks for word-segmented input.
    # This notebook imports pyvi.ViTokenizer but never calls it - confirm
    # whether segmentation should be applied, consistently for training and
    # inference.
    #
    # No `return_tensors` here: Dataset.map stores the result as Arrow
    # columns, so building torch tensors would only be converted straight
    # back, and plain lists keep "labels" the same container type as the
    # tokenizer fields.
    inputs = tokenizer(
        data_points["Sentence"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )

    inputs["labels"] = [label2id[emotion] for emotion in data_points["Emotion"]]
    return inputs

In [37]:
# Tokenize every split in batches and drop the raw text columns, leaving only
# the tokenizer fields and the integer labels.
processed_data = ds.map(preprocess, batched=True, remove_columns=ds["train"].column_names)

Map:   0%|          | 0/5548 [00:00<?, ? examples/s]

Map:   0%|          | 0/686 [00:00<?, ? examples/s]

Map:   0%|          | 0/693 [00:00<?, ? examples/s]

In [38]:
# Sanity check: show the processed features and the size of each split.
print(processed_data)
print(f"Training examples: {len(processed_data['train'])}")
print(f"Validation examples: {len(processed_data['validation'])}")
print(f"Test examples: {len(processed_data['test'])}")

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 5548
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 686
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 693
    })
})
Training examples: 5548
Validation examples: 686
Test examples: 693


In [39]:
# Sequence-classification model initialised from MODEL_NAME with one output
# per emotion. The label maps are passed along so class ids can be decoded
# back to emotion names. The classifier head is newly initialised (see the
# warning printed by this cell), so the model must be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
def compute_metrics(eval_pred):
    """Turn raw model outputs into classification metrics.

    Args:
        eval_pred: Pair ``(scores, labels)`` where ``scores`` has one row per
            example and one column per class, and ``labels`` holds the true
            class ids.

    Returns:
        dict with accuracy, weighted precision, weighted recall, weighted F1
        and macro F1 (undefined per-class scores count as 0).
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    def _averaged(scorer, average):
        # All averaged metrics share the same call shape.
        return scorer(gold, predicted, average=average, zero_division=0)

    return {
        'accuracy': accuracy_score(gold, predicted),
        'precision': _averaged(precision_score, 'weighted'),
        'recall': _averaged(recall_score, 'weighted'),
        'f1_weighted': _averaged(f1_score, 'weighted'),
        'f1_macro': _averaged(f1_score, 'macro'),
    }

In [41]:
# Fine-tuning configuration. The tunable hyperparameters are read from the
# config-cell constants (LEARNING_RATE, BATCH_SIZE, EPOCHS, WEIGHT_DECAY) so
# that editing the config cell actually changes the run.
training_args = TrainingArguments(
    output_dir="./phobert-finetuned-vsmec",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=64,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    # Evaluate and checkpoint on the same 100-step cadence so that the best
    # checkpoint can be reloaded when training finishes.
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,
    save_total_limit=2,
    # "f1_weighted" is one of the keys returned by compute_metrics.
    metric_for_best_model="f1_weighted",
    report_to="tensorboard",
    logging_steps=50
)

# Train on the train split and validate on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_data["train"],
    eval_dataset=processed_data["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

In [42]:
# Run fine-tuning; evaluation and checkpointing follow the step-based
# schedule configured in training_args.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 Weighted,F1 Macro
100,1.6203,1.466131,0.488338,0.428939,0.488338,0.40759,0.247397
200,1.4121,1.294645,0.543732,0.452724,0.543732,0.49247,0.314875
300,1.3815,1.221966,0.577259,0.555923,0.577259,0.546636,0.400235
400,1.1378,1.216116,0.574344,0.635524,0.574344,0.54212,0.40877
500,1.0562,1.163149,0.599125,0.613594,0.599125,0.596811,0.520857
600,1.0905,1.12302,0.588921,0.610197,0.588921,0.573265,0.49367
700,0.9645,1.067624,0.607872,0.60475,0.607872,0.600383,0.541664
800,0.7792,1.128487,0.603499,0.620152,0.603499,0.599343,0.548016
900,0.7969,1.044357,0.631195,0.627214,0.631195,0.62365,0.571815
1000,0.7788,1.102238,0.623907,0.633845,0.623907,0.612676,0.570248


TrainOutput(global_step=2429, training_loss=0.7303706738820239, metrics={'train_runtime': 641.1029, 'train_samples_per_second': 60.577, 'train_steps_per_second': 3.789, 'total_flos': 2554659917644800.0, 'train_loss': 0.7303706738820239, 'epoch': 7.0})

In [43]:
# Score the validation split (the Trainer's eval_dataset) with compute_metrics.
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 1.2517346143722534, 'eval_accuracy': 0.6399416909620991, 'eval_precision': 0.6489090935108826, 'eval_recall': 0.6399416909620991, 'eval_f1_weighted': 0.6404711424200802, 'eval_f1_macro': 0.5983947981334242, 'eval_runtime': 2.5204, 'eval_samples_per_second': 272.183, 'eval_steps_per_second': 4.364, 'epoch': 7.0}


In [44]:
# Score the held-out test split with the same Trainer.
# NOTE(review): no metric_key_prefix is passed, so the returned keys carry the
# same "eval_" prefix as the validation metrics (see the printed dict).
test_results = trainer.evaluate(processed_data["test"])
print(test_results)

{'eval_loss': 1.1593859195709229, 'eval_accuracy': 0.6623376623376623, 'eval_precision': 0.6679908654640972, 'eval_recall': 0.6623376623376623, 'eval_f1_weighted': 0.663880379023954, 'eval_f1_macro': 0.6520484324436367, 'eval_runtime': 2.5532, 'eval_samples_per_second': 271.419, 'eval_steps_per_second': 4.308, 'epoch': 7.0}


In [45]:
# Persist the fine-tuned model and its tokenizer side by side so both can be
# reloaded from the same directory.
# NOTE(review): absolute Kaggle path - adjust when running elsewhere.
model_save_path = "/kaggle/working/phobert-finetuned-vsmec-final"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

('/kaggle/working/phobert-finetuned-vsmec-final/tokenizer_config.json',
 '/kaggle/working/phobert-finetuned-vsmec-final/special_tokens_map.json',
 '/kaggle/working/phobert-finetuned-vsmec-final/vocab.txt',
 '/kaggle/working/phobert-finetuned-vsmec-final/bpe.codes',
 '/kaggle/working/phobert-finetuned-vsmec-final/added_tokens.json')

In [46]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [49]:
def infer(text, model, tokenizer):
    """Predict the emotion of a single piece of text.

    The forward pass always runs with the model in evaluation mode (dropout
    disabled); the model's previous train/eval mode is restored afterwards.

    Args:
        text: The sentence to classify (one example, not a batch).
        model: Sequence-classification model exposing ``device`` and
            returning an object with a ``logits`` attribute.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        dict with:
            "emotion": name of the highest-scoring class,
            "confidence": softmax probability of that class,
            "all_probs": mapping of every class name to its probability.
    """
    encoded = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt"
    )
    # Move every input tensor to wherever the model lives.
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    # Dropout must be off for stable predictions; do not assume the caller
    # already switched the model to eval mode.
    was_training = model.training
    model.eval()
    try:
        with torch.inference_mode():
            logits = model(**encoded).logits
    finally:
        if was_training:
            model.train()

    predicted_class_id = torch.argmax(logits, dim=1).item()
    probs = torch.softmax(logits, dim=1)[0]

    # Prefer the label names stored with the model so the function also works
    # for a model reloaded from disk; fall back to the notebook-level mapping.
    labels = getattr(getattr(model, "config", None), "id2label", None) or id2label

    return {
        "emotion": labels[predicted_class_id],
        "confidence": probs[predicted_class_id].item(),
        "all_probs": {labels[i]: prob.item() for i, prob in enumerate(probs)}
    }

In [50]:
# Reload the saved model and tokenizer from disk to check that the exported
# artefacts work on their own.
inference_model = AutoModelForSequenceClassification.from_pretrained(model_save_path)
inference_model.to(device)
inference_tokenizer = AutoTokenizer.from_pretrained(model_save_path)

# Slice the first 10 test rows once and walk sentence/label pairs together,
# rather than indexing the full "Emotion" column inside the loop.
preview = ds["test"][:10]
for sentence, true_emotion in zip(preview["Sentence"], preview["Emotion"]):
    result = infer(sentence, inference_model, inference_tokenizer)
    print(f"Predicted: {sentence}, {result['emotion']}, confidence: {result['confidence']:.4f}")
    print(f"True label: {true_emotion}")
    # Full class distribution, most likely class first.
    for emotion, prob in sorted(result['all_probs'].items(), key=lambda x: x[1], reverse=True):
        print(f"{emotion}: {prob:.4f}")
    print()

Predicted: người ta có bạn bè nhìn vui thật, Enjoyment, confidence: 0.9684
True label: Sadness
Enjoyment: 0.9684
Other: 0.0146
Disgust: 0.0051
Surprise: 0.0048
Sadness: 0.0033
Fear: 0.0022
Anger: 0.0017

Predicted: cho nghỉ viêc mói đúng sao goi là kỷ luật, Disgust, confidence: 0.7575
True label: Surprise
Disgust: 0.7575
Anger: 0.1768
Other: 0.0276
Surprise: 0.0206
Fear: 0.0099
Sadness: 0.0044
Enjoyment: 0.0032

Predicted: kinh vãi 😡, Disgust, confidence: 0.9320
True label: Disgust
Disgust: 0.9320
Anger: 0.0404
Fear: 0.0078
Sadness: 0.0066
Other: 0.0053
Enjoyment: 0.0042
Surprise: 0.0037

Predicted: nhà thì không xa lắm nhưng chưa bao giờ đi vì sợ bị đè bẹp luôn người =))))), Fear, confidence: 0.8047
True label: Fear
Fear: 0.8047
Enjoyment: 0.0937
Other: 0.0352
Surprise: 0.0253
Sadness: 0.0192
Anger: 0.0161
Disgust: 0.0058

Predicted: bố không thích nộp đấy mày thích ý kiến không, Other, confidence: 0.9475
True label: Anger
Other: 0.9475
Disgust: 0.0189
Surprise: 0.0073
Enjoyment: 0.00