In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.
Collecting transformers
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.33.4-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.2-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading transformers-4.53.2-py3-none-any.whl (10.8 MB)
   ---------------------------------------- 0.0/10.8 MB ? eta -:--:--
   ----------- ---------------------------- 3.1/10.8 MB 20.6 MB/s eta 0:00:01
   ----------------------------------- ---- 9.7/10.8 MB 28.8 MB/s eta 0:00:01
   ---------------------------------------- 10.8/10.8 MB 22.6 MB/s eta 0:00:00
Downloading huggingface_hub-0.33.4-py3-none-any.whl (515 kB)
Downloading safetensors-0.5.3-cp38


[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd
import numpy as np
import torch
import re
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments
from sklearn.metrics import f1_score, accuracy_score, classification_report

  from .autonotebook import tqdm as notebook_tqdm


ImportError: Traceback (most recent call last):
  File "c:\Users\MY MSI\AppData\Local\Programs\Python\Python312\Lib\site-packages\tensorflow\python\pywrap_tensorflow.py", line 73, in <module>
    from tensorflow.python._pywrap_tensorflow_internal import *
ImportError: DLL load failed while importing _pywrap_tensorflow_internal: A dynamic link library (DLL) initialization routine failed.


Failed to load the native TensorFlow runtime.
See https://www.tensorflow.org/install/errors for some common causes and solutions.
If you need help, create an issue at https://github.com/tensorflow/tensorflow/issues and include the entire stack trace above this error message.

In [None]:
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"[^\w\sàáảãạăắằẳẵặâấầẩẫậèéẻẽẹêếềểễệ"
                  r"ìíỉĩịòóỏõọôốồổỗộơớờởỡợùúủũụưứừửữự"
                  r"ỳýỷỹỵđ]", " ", text)
    text = re.sub(r"(.)\1{2,}", r"\1", text)  # Giảm lặp
    text = re.sub(r"\s+", " ", text).strip()
    return text


In [None]:
df_train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/NLP/ABSA/dataset/train.csv")
df_val = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/NLP/ABSA/dataset/dev.csv")
df_test = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/NLP/ABSA/dataset/test.csv")

aspect_cols = df_train.columns.tolist()[1:]  # Bỏ cột 'review'
print("Các aspect:", aspect_cols)

for df in [df_train, df_val, df_test]:
    df["Review"] = df["Review"].apply(clean_text)
    df[aspect_cols] = df[aspect_cols].fillna(0).astype(int)


Các aspect: ['AMBIENCE#GENERAL', 'DRINKS#PRICES', 'DRINKS#QUALITY', 'DRINKS#STYLE&OPTIONS', 'FOOD#PRICES', 'FOOD#QUALITY', 'FOOD#STYLE&OPTIONS', 'LOCATION#GENERAL', 'RESTAURANT#GENERAL', 'RESTAURANT#MISCELLANEOUS', 'RESTAURANT#PRICES', 'SERVICE#GENERAL']


In [None]:
# Đọc file CSV ABSA: data + các cột aspect
def load_absa_csv(filepath):
    df = pd.read_csv(filepath)
    records = []
    for _, row in df.iterrows():
        text = row['Review']
        cents = []
        for aspect in row.index:
            if aspect == 'Review':
                continue
            sentiment = row[aspect]
            if pd.notna(sentiment):
                cents.append((aspect, sentiment))
        records.append({'text': text, 'cents': cents})
    return pd.DataFrame(records)

In [None]:
class ABSADataset(Dataset):
    def __init__(self, df, tokenizer, aspect_cols, max_len=128):
        self.df = df
        self.tokenizer = tokenizer
        self.aspect_cols = aspect_cols
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = row["Review"]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        labels = torch.tensor([row[a] for a in self.aspect_cols], dtype=torch.long)
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": labels
        }

In [None]:

class MultiTaskPhoBERT(nn.Module):
    def __init__(self, model_name, num_labels=4, num_aspects=4):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        hidden = self.encoder.config.hidden_size
        self.classifiers = nn.ModuleList([
            nn.Linear(hidden, num_labels) for _ in range(num_aspects)
        ])

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0, :]  # CLS
        logits = torch.stack([clf(pooled) for clf in self.classifiers], dim=1)  # [B, A, 4]

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = sum([loss_fn(logits[:, i], labels[:, i]) for i in range(labels.size(1))]) / labels.size(1)
        return {"loss": loss, "logits": logits}


In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    preds_flat = preds.reshape(-1)
    labels_flat = labels.reshape(-1)

    # Aspect Detection
    aspect_pred = preds_flat != 0
    aspect_true = labels_flat != 0
    f1_aspect = f1_score(aspect_true, aspect_pred)
    acc_aspect = accuracy_score(aspect_true, aspect_pred)

    # Sentiment classification (mask nhãn 0)
    mask = labels_flat != 0
    f1_sent = f1_score(labels_flat[mask], preds_flat[mask], average='macro')
    acc_sent = accuracy_score(labels_flat[mask], preds_flat[mask])

    # Tổng thể
    f1_all = f1_score(labels_flat, preds_flat, average='macro')
    acc_all = accuracy_score(labels_flat, preds_flat)

    print("\n📊 Classification report (sentiment only):")
    print(classification_report(labels_flat[mask], preds_flat[mask], digits=4))

    return {
        "F1_aspect_detection": f1_aspect,
        "Acc_aspect_detection": acc_aspect,
        "F1_sentiment": f1_sent,
        "Acc_sentiment": acc_sent,
        "F1_overall": f1_all,
        "Acc_overall": acc_all,
    }


In [None]:
model_name = "vinai/phobert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

train_dataset = ABSADataset(df_train, tokenizer, aspect_cols)
val_dataset   = ABSADataset(df_val, tokenizer, aspect_cols)
test_dataset  = ABSADataset(df_test, tokenizer, aspect_cols)

model = MultiTaskPhoBERT(model_name, num_labels=4, num_aspects=len(aspect_cols))

In [None]:
training_args = TrainingArguments(
    output_dir="./absa_outputs",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    save_strategy="no",           # không lưu checkpoint
    logging_strategy="epoch",
    load_best_model_at_end=False, # tránh lỗi
    report_to="none",
    seed=42,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


Step,Training Loss


KeyboardInterrupt: 

In [None]:
print("\nĐánh giá trên tập test:")
results = trainer.predict(test_dataset)
