In [1]:
!pip -q install transformers==4.47.0 datasets accelerate underthesea imblearn

import torch
import numpy as np
import pandas as pd
import os
from datasets import load_dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq, EarlyStoppingCallback
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from underthesea import word_tokenize
from tqdm import tqdm
from IPython.display import display_html
import logging
import gc

# Thiết lập biến môi trường để tránh phân mảnh bộ nhớ
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Thiết lập logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Định nghĩa lớp PolarityMapping
class PolarityMapping:
    INDEX_TO_POLARITY = {0: None, 1: 'positive', 2: 'negative', 3: 'neutral'}
    INDEX_TO_ONEHOT = {0: [1, 0, 0, 0], 1: [0, 1, 0, 0], 2: [0, 0, 1, 0], 3: [0, 0, 0, 1]}
    POLARITY_TO_INDEX = {None: 0, 'positive': 1, 'negative': 2, 'neutral': 3}

# Định nghĩa lớp VLSP2018SklearnEvaluator
class VLSP2018SklearnEvaluator:
    def __init__(self, y_test, y_pred, aspect_category_names):
        aspect_cate_test, aspect_cate_pred = [], []
        aspect_cate_polar_test, aspect_cate_polar_pred = [], []

        for row_test, row_pred in zip(y_test, y_pred):
            for index, (col_test, col_pred) in enumerate(zip(row_test, row_pred)):
                aspect_cate_test.append(aspect_category_names[index] if col_test != 0 else 'None#None')
                aspect_cate_pred.append(aspect_category_names[index] if col_pred != 0 else 'None#None')
                aspect_cate_polar_test.append(aspect_category_names[index] + f',{PolarityMapping.INDEX_TO_POLARITY[col_test]}')
                aspect_cate_polar_pred.append(aspect_category_names[index] + f',{PolarityMapping.INDEX_TO_POLARITY[col_pred]}')

        self.aspect_cate_polar_report = classification_report(aspect_cate_polar_test, aspect_cate_polar_pred, output_dict=True, zero_division=1.0)
        self.aspect_cate_report = classification_report(aspect_cate_test, aspect_cate_pred, output_dict=True, zero_division=1.0)
        self.polarity_report = classification_report(y_test.flatten(), y_pred.flatten(), target_names=PolarityMapping.POLARITY_TO_INDEX, output_dict=True)
        self._merge_all_reports()
        self._build_macro_avg_df()

    def report(self, report_type='all'):
        if report_type.lower() == 'all': self._display_all_reports()
        elif report_type.lower() == 'aspect#category,polarity': return pd.DataFrame(self.aspect_cate_polar_report).T
        elif report_type.lower() == 'aspect#category': return pd.DataFrame(self.aspect_cate_report).T
        elif report_type.lower() == 'polarity': return pd.DataFrame(self.polarity_report).T
        elif report_type.lower() == 'macro_avg': return self.macro_avg_df
        else: raise ValueError('report_type must be in ["all", "aspect#category,polarity", "aspect#category", "polarity", "macro_avg"]')

    def _merge_all_reports(self):
        self.merged_report = {}
        for key, metrics in self.aspect_cate_polar_report.items():
            if key in ['accuracy', 'macro avg', 'weighted avg']:
                self.merged_report[key] = {
                    'aspect#category': self.aspect_cate_report[key],
                    'aspect#category,polarity': metrics
                }
            else:
                aspect_cate, polarity = key.split(',')
                if aspect_cate not in self.merged_report:
                    self.merged_report[aspect_cate] = {'aspect#category': self.aspect_cate_report[aspect_cate]}
                self.merged_report[aspect_cate][polarity] = metrics

    def _build_macro_avg_df(self):
        self.macro_avg_df = pd.DataFrame([{
            'accuracy': f"{report['accuracy']:.3f}",
            'precision': f"{report['macro avg']['precision']:.3f}",
            'recall': f"{report['macro avg']['recall']:.3f}",
            'f1-score': f"{report['macro avg']['f1-score']:.3f}",
            'support': report['macro avg']['support']
        } for report in [self.aspect_cate_polar_report, self.aspect_cate_report, self.polarity_report]])
        self.macro_avg_df.index = ['Aspect#Category,Polarity', 'Aspect#Category', 'Polarity']

    def _display_all_reports(self):
        metric_names = list(self.merged_report.values())[0]['aspect#category']
        html_str = f"""
            <tr>
                <th style="font-weight: bold; text-align: center;" rowspan="2">ACSA Report (w/o "None" polarity)</th>
                <th style="font-weight: bold; text-align: center;" colspan="{len(metric_names)}">Aspect#Category</th>
                <th style="font-weight: bold; text-align: center;" colspan="{len(metric_names)}">Aspect#Category,Polarity</th>
            </tr>
            <tr>
                {''.join([f'<th>{metric_name}</th>' for metric_name in metric_names] * 2)}
            </tr>
        """

        for key, merged_metrics in tqdm(self.merged_report.items()):
            if key in ['accuracy', 'macro avg', 'weighted avg']: continue
            polarities = merged_metrics.keys() - {'aspect#category', 'None'}
            aspect_cate_html = ''.join(
                f'<td rowspan="{len(polarities)}">{value if name == "support" else f"{value:.3f}"}</td>'
                for name, value in self.merged_report[key]['aspect#category'].items()
            )
            for index, polarity in enumerate(polarities):
                aspect_cate_polar_html = ''.join(
                    f'<td>{value if name == "support" else f"{value:.3f}"}</td>'
                    for name, value in self.merged_report[key][polarity].items()
                )
                html_str += f"""
                    <tr>
                        <td>{key},{polarity}</td>
                        {aspect_cate_html if index == 0 else ''}
                        {aspect_cate_polar_html}
                    </tr>
                """

        display_html(f'''
            <div style="display: flex; align-items: flex-start; flex-wrap: nowrap">
                <table style="margin-right: 10px">{html_str}</table>
                <div style="display: flex; align-items: center; flex-direction: column">
                    <b>Polarity Report</b><br>
                    {pd.DataFrame(self.polarity_report).T.to_html()}<br>
                    <b>Macro Avg Report</b><br>
                    {self.macro_avg_df.to_html()}
                </div>
            </div>
        ''', raw=True)

# Tải dữ liệu
TRAIN_PATH = "/kaggle/input/vnese-datasets/train_df_with_VNESE_output.csv"
VAL_PATH = "/kaggle/input/vnese-datasets/val_df_with_VNESE_output.csv"
TEST_PATH = "/kaggle/input/vnese-datasets/test_df_with_VNESE_output.csv"
dataset = load_dataset("csv", data_files={"train": TRAIN_PATH, "val": VAL_PATH, "test": TEST_PATH})

# Lọc dữ liệu để loại bỏ giá trị None hoặc chuỗi rỗng
dataset = dataset.filter(lambda x: x['clean_comment'] is not None and x['VNESE_text_output'] is not None and x['clean_comment'].strip() != "" and x['VNESE_text_output'].strip() != "")
logger.info("Dữ liệu sau khi lọc: %s", dataset)

# Kiểm tra phân phối dữ liệu
logger.info("Phân phối VNESE_text_output trong tập train:")
print(dataset['train'].to_pandas()['VNESE_text_output'].value_counts())

# Oversampling để xử lý mất cân bằng
def oversample_dataset(dataset):
    X = dataset['clean_comment']
    y = dataset['VNESE_text_output']
    ros = RandomOverSampler(random_state=42)
    X_res, y_res = ros.fit_resample(np.array(X).reshape(-1, 1), y)
    return {'clean_comment': X_res.flatten(), 'VNESE_text_output': y_res}

# Áp dụng oversampling
oversampled_data = oversample_dataset(dataset['train'])
dataset['train'] = dataset['train'].from_dict(oversampled_data)
logger.info("Dữ liệu train sau oversampling: %s", len(dataset['train']))

# Tải tokenizer và mô hình
tokenizer = T5Tokenizer.from_pretrained("VietAI/vit5-base")
model = T5ForConditionalGeneration.from_pretrained("VietAI/vit5-base")
model.gradient_checkpointing_enable()  # Bật gradient checkpointing để tiết kiệm bộ nhớ
logger.info("Đã tải tokenizer và mô hình VietAI/vit5-base")

# Tiền xử lý với đệm đồng nhất
def preprocess(example):
    # Kiểm tra dữ liệu đầu vào
    clean_comments = [c for c in example['clean_comment'] if isinstance(c, str) and c.strip()]
    vnese_outputs = [o for o in example['VNESE_text_output'] if isinstance(o, str) and o.strip()]
    if not clean_comments or not vnese_outputs:
        logger.warning("Tìm thấy mẫu không hợp lệ trong batch, trả về empty input")
        return {'input_ids': [], 'attention_mask': [], 'labels': []}

    # Mã hóa đầu vào
    model_input = tokenizer(
        clean_comments,
        truncation=True,
        max_length=384,
        padding="max_length"  # Thêm đệm đồng nhất
    )
    # Mã hóa nhãn
    labels = tokenizer(
        text_target=vnese_outputs,
        truncation=True,
        max_length=128,
        padding="max_length"  # Thêm đệm đồng nhất
    )
    model_input['labels'] = labels['input_ids']
    # Chuyển lists thành numpy arrays
    model_input['input_ids'] = np.array(model_input['input_ids'])
    model_input['attention_mask'] = np.array(model_input['attention_mask'])
    model_input['labels'] = np.array(model_input['labels'])
    return model_input

tokenized_dataset = dataset.map(preprocess, batched=True)
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
logger.info("Đã tiền xử lý dữ liệu: %s", tokenized_dataset)

# Làm sạch bộ nhớ GPU
gc.collect()
torch.cuda.empty_cache()
logger.info("Đã làm sạch bộ nhớ GPU")

# Tham số huấn luyện tối ưu
training_args = TrainingArguments(
    output_dir="./t5-finetuned",
    run_name="t5-finetune-v8",
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=15,
    learning_rate=3e-5,
    lr_scheduler_type="cosine",
    gradient_accumulation_steps=4,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none",
    logging_steps=100,
    logging_dir="./logs",
    fp16=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Sử dụng Trainer mặc định
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['val'],
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)
logger.info("Đã khởi tạo Trainer")

# Huấn luyện mô hình
logger.info("Bắt đầu huấn luyện mô hình...")
trainer.train()
logger.info("Hoàn tất huấn luyện")

# Chuyển mô hình sang thiết bị
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
logger.info("Mô hình đã được chuyển sang %s", device)

# Thử nghiệm với một bình luận
comment = """
Nhà hàng này món ăn cực kỳ ngon và lạ nữa. Không gian đẹp, nhẹ nhàng và tinh tế. Phục vụ cũng rất ok, giá thì hơi cao nhưng so với mặt bằng Q. 1 thì giá như thế là phù hợp rồi. chả cá chiên thuyền buồm 199k. Xôi đùi gà quay 139k. Black Currant 50k. Coronarita 199k. Xôi ghẹ nè...
"""
input_text = f"Phân tích bình luận sau và rút trích các khía cạnh (không gian, chất lượng đồ ăn, phục vụ,...) cùng trạng thái (tốt, tệ, tạm): {comment}"
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

output = model.generate(input_ids, max_length=512, num_beams=6, top_k=50, temperature=0.7)
print("Đầu ra mẫu:", tokenizer.decode(output[0], skip_special_tokens=True))

# Đánh giá trên tập test
test_dataloader = DataLoader(tokenized_dataset['test'], batch_size=4)
predictions = []

logger.info("Bắt đầu đánh giá trên tập test...")
for batch in test_dataloader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=128,
            num_beams=6,
            top_k=50,
            temperature=0.7,
            early_stopping=True
        )
    preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    predictions.extend(preds)
logger.info("Hoàn tất sinh dự đoán trên tập test")

# Phân tích đầu ra cải tiến
def convert_output_to_list(output_text, aspect_category_list):
    sentiment_map = {'tốt': 1, 'tệ': 2, 'tạm': 3}
    result = [0] * len(aspect_category_list)
    tokens = word_tokenize(output_text.lower())
    for i, aspect in enumerate(aspect_category_list):
        aspect_lower = aspect.lower()
        for sentiment_word, sentiment_value in sentiment_map.items():
            if aspect_lower in output_text.lower() and sentiment_word in output_text.lower():
                result[i] = sentiment_value
                break
    return result

aspect_category_list = [
    'không gian', 'giá tiền đồ uống', 'chất lượng đồ uống', 'lựa chọn đồ uống',
    'giá tiền món ăn', 'chất lượng đồ ăn', 'lựa chọn đồ ăn', 'địa chỉ', 'nhà hàng nói chung',
    'vấn đề khác', 'giá cả nhà hàng', 'phục vụ'
]

# Chuyển đổi dự đoán và nhãn
y_pred = [convert_output_to_list(pred, aspect_category_list) for pred in predictions]
y_test_raw_text = dataset['test']['VNESE_text_output']
y_test = [convert_output_to_list(raw, aspect_category_list) for raw in y_test_raw_text]

y_pred = np.array(y_pred)
y_test = np.array(y_test)
logger.info("Đã chuyển đổi dự đoán và nhãn")

# Đánh giá
ASPECT_CATEGORY_NAMES = ['AMBIENCE#GENERAL', 'DRINKS#PRICES', 'DRINKS#QUALITY', 'DRINKS#STYLE&OPTIONS', 'FOOD#PRICES', 'FOOD#QUALITY', 'FOOD#STYLE&OPTIONS', 'LOCATION#GENERAL', 'RESTAURANT#GENERAL', 'RESTAURANT#MISCELLANEOUS', 'RESTAURANT#PRICES', 'SERVICE#GENERAL']
sk_eval = VLSP2018SklearnEvaluator(y_test, y_pred, ASPECT_CATEGORY_NAMES)
logger.info("Bắt đầu đánh giá...")
sk_eval.report(report_type='all')
logger.info("Hoàn tất đánh giá")

# Phân tích lỗi
errors = []
for i, (pred, true) in enumerate(zip(y_pred, y_test)):
    if not np.array_equal(pred, true):
        errors.append({
            "comment": dataset['test'][i]['clean_comment'],
            "true": true,
            "pred": pred
        })
pd.DataFrame(errors).to_csv("errors.csv")
logger.info("Lưu phân tích lỗi vào errors.csv")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m90.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m92.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m657.8/657.8 kB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

2025-05-08 04:55:23.663844: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746680123.844417      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746680123.903739      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/2961 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1290 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

VNESE_text_output
chất lượng đồ ăn tốt                                                                                                               269
lựa chọn đồ ăn tốt và chất lượng đồ ăn tốt                                                                                         145
chất lượng đồ ăn tốt và lựa chọn đồ ăn tốt                                                                                         131
chất lượng đồ ăn tốt và giá tiền món ăn tốt                                                                                         68
chất lượng đồ ăn tạm                                                                                                                51
                                                                                                                                  ... 
chất lượng đồ ăn tốt và địa chỉ tạm và lựa chọn đồ ăn tốt và nhà hàng nói chung tốt                                                  1
chất lượng đồ uống tốt và giá tiền đồ

tokenizer_config.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/820k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.12k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.40M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/904M [00:00<?, ?B/s]

Map:   0%|          | 0/382518 [00:00<?, ? examples/s]

Map:   0%|          | 0/1290 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
500,0.0426,0.049384
1000,0.0186,0.059367
1500,0.0098,0.072975
2000,0.0067,0.06808
2500,0.0048,0.075082
3000,0.004,0.073438


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Đầu ra mẫu: nhà hàng nói chung tốt và chất lượng đồ ăn tốt và không gian tốt và phục vụ tốt và giá tiền món ăn tạm


100%|██████████| 15/15 [00:00<00:00, 64793.57it/s]


"ACSA Report (w/o ""None"" polarity)",Aspect#Category,Aspect#Category,Aspect#Category,Aspect#Category,"Aspect#Category,Polarity","Aspect#Category,Polarity","Aspect#Category,Polarity","Aspect#Category,Polarity"
"ACSA Report (w/o ""None"" polarity)",precision,recall,f1-score,support,precision,recall,f1-score,support
"AMBIENCE#GENERAL,neutral",0.858,0.902,0.88,255.0,0.0,1.0,0.0,0.0
"AMBIENCE#GENERAL,negative",0.858,0.902,0.88,255.0,1.0,0.4,0.571,5.0
"AMBIENCE#GENERAL,positive",0.858,0.902,0.88,255.0,0.845,0.892,0.868,250.0
"DRINKS#PRICES,neutral",0.667,0.447,0.535,76.0,0.0,1.0,0.0,0.0
"DRINKS#PRICES,negative",0.667,0.447,0.535,76.0,1.0,0.0,0.0,2.0
"DRINKS#PRICES,positive",0.667,0.447,0.535,76.0,0.653,0.432,0.52,74.0
"DRINKS#QUALITY,negative",0.913,0.592,0.718,71.0,1.0,0.333,0.5,3.0
"DRINKS#QUALITY,positive",0.913,0.592,0.718,71.0,0.889,0.588,0.708,68.0
"DRINKS#STYLE&OPTIONS,negative",0.479,0.761,0.588,46.0,0.0,0.0,0.0,2.0
"DRINKS#STYLE&OPTIONS,positive",0.479,0.761,0.588,46.0,0.458,0.75,0.569,44.0

Unnamed: 0,precision,recall,f1-score,support
,0.882437,0.861491,0.871838,3581.0
positive,0.786535,0.821965,0.80386,2331.0
negative,0.680851,0.421053,0.520325,76.0
neutral,0.142857,0.25,0.181818,12.0
accuracy,0.839333,0.839333,0.839333,0.839333
macro avg,0.62317,0.588627,0.59446,6000.0
weighted avg,0.841147,0.839333,0.839596,6000.0

Unnamed: 0,accuracy,precision,recall,f1-score,support
"Aspect#Category,Polarity",0.839,0.659,0.616,0.52,6000.0
Aspect#Category,0.849,0.752,0.714,0.698,6000.0
Polarity,0.839,0.623,0.589,0.594,6000.0
