In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install -U -qqq transformers datasets accelerate sentencepiece wandb

# Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import torch
import datasets
from sklearn.metrics import *

# Load data

In [None]:
# Root folder of the datasets on the mounted Google Drive. Despite the name,
# this value is used as a path *prefix* for the three CSV files below.
sufix_path = "/content/drive/MyDrive/NLU_NCKH/notebook/Data/"
# Train / dev / test splits of the preprocessed hotel reviews.
df_train = pd.read_csv(f"{sufix_path}Hotel Preprocessed/Train.csv")
df_dev = pd.read_csv(f"{sufix_path}Hotel Preprocessed/Dev.csv")
df_test = pd.read_csv(f"{sufix_path}Hotel Preprocessed/Test.csv")

# Report (rows, columns) of each split as a quick sanity check.
print("Train: ", df_train.shape)
print("Dev: ",  df_dev.shape)
print("Test: ", df_test.shape )

Train:  (7180, 35)
Dev:  (795, 35)
Test:  (2030, 35)


In [None]:
!wget https://raw.githubusercontent.com/stopwords/vietnamese-stopwords/master/vietnamese-stopwords-dash.txt

--2024-07-14 05:49:27--  https://raw.githubusercontent.com/stopwords/vietnamese-stopwords/master/vietnamese-stopwords-dash.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20475 (20K) [text/plain]
Saving to: ‘vietnamese-stopwords-dash.txt’


2024-07-14 05:49:28 (25.6 MB/s) - ‘vietnamese-stopwords-dash.txt’ saved [20475/20475]



In [None]:
aspects_map = {
  'FACILITIES#CLEANLINESS' : 'Vệ sinh cơ sở vật chất',
  'FACILITIES#COMFORT' : 'Sự thoải mái cơ sở vật chất',
  'FACILITIES#DESIGN&FEATURES' : "Thiết kế cơ sở hạ tầng",
  'FACILITIES#GENERAL' : 'Cơ sở hạ tầng',
  'FACILITIES#MISCELLANEOUS' : 'Vấn đề tiện nghi',
  'FACILITIES#PRICES' : 'Giá cả tiện nghi',
  'FACILITIES#QUALITY' : 'Chất lượng tiện nghi',
  'FOOD&DRINKS#MISCELLANEOUS' : 'Vấn đề đồ ăn',
  'FOOD&DRINKS#PRICES' : 'Giá cả đồ ăn',
  'FOOD&DRINKS#QUALITY' : 'Chất lượng đồ ăn',
  'FOOD&DRINKS#STYLE&OPTIONS' : 'Lựa chọn đồ ăn',
  'HOTEL#CLEANLINESS' : 'Vệ sinh khách sạn',
  'HOTEL#COMFORT' : 'Sự thoải mái khách sạn',
  'HOTEL#DESIGN&FEATURES':'Thiết kế khách sạn',
  'HOTEL#GENERAL' : 'Khách sạn',
  'HOTEL#MISCELLANEOUS' : 'Vấn đề khách sạn',
  'HOTEL#PRICES' : 'Giá tiền khách sạn',
  'HOTEL#QUALITY' : 'Chất lượng khách sạn',
  'LOCATION#GENERAL' : 'Địa chỉ',
  'ROOMS#CLEANLINESS' : "Vệ sinh phòng",
  'ROOMS#COMFORT' : 'Sự thoải mái phòng',
  'ROOMS#DESIGN&FEATURES': 'Thiết kế phòng',
  'ROOMS#GENERAL' : 'Phòng',
  'ROOMS#MISCELLANEOUS' : 'Vấn đề về phòng',
  'ROOMS#PRICES' : 'Giá phòng',
  'ROOMS#QUALITY' : 'Chất lượng phòng',
  'ROOM_AMENITIES#CLEANLINESS' : 'Vệ sinh tiện nghi phòng',
  'ROOM_AMENITIES#COMFORT' : 'Thoải mái tiện nghi phòng',
  'ROOM_AMENITIES#DESIGN&FEATURES' : 'Thiết kế tiện nghi phòng',
  'ROOM_AMENITIES#GENERAL' : 'Tiện nghi phòng',
  'ROOM_AMENITIES#MISCELLANEOUS' : 'Vấn đề về tiện nghi phòng',
  'ROOM_AMENITIES#PRICES' : 'Giá cả tiện nghi phòng',
  'ROOM_AMENITIES#QUALITY' : 'Chất lượng tiện nghi phòng',
  'SERVICE#GENERAL' : 'Dịch vụ'
}
polarity_map = {
    1: "tệ",
    2: "tạm",
    3: "tốt"
}

# Preprocess data

In [None]:
aspects = [
 'FACILITIES#CLEANLINESS',
 'FACILITIES#COMFORT',
 'FACILITIES#DESIGN&FEATURES',
 'FACILITIES#GENERAL',
 'FACILITIES#MISCELLANEOUS',
 'FACILITIES#PRICES',
 'FACILITIES#QUALITY',
 'FOOD&DRINKS#MISCELLANEOUS',
 'FOOD&DRINKS#PRICES',
 'FOOD&DRINKS#QUALITY',
 'FOOD&DRINKS#STYLE&OPTIONS',
 'HOTEL#CLEANLINESS',
 'HOTEL#COMFORT',
 'HOTEL#DESIGN&FEATURES',
 'HOTEL#GENERAL',
 'HOTEL#MISCELLANEOUS',
 'HOTEL#PRICES',
 'HOTEL#QUALITY',
 'LOCATION#GENERAL',
 'ROOMS#CLEANLINESS',
 'ROOMS#COMFORT',
 'ROOMS#DESIGN&FEATURES',
 'ROOMS#GENERAL',
 'ROOMS#MISCELLANEOUS',
 'ROOMS#PRICES',
 'ROOMS#QUALITY',
 'ROOM_AMENITIES#CLEANLINESS',
 'ROOM_AMENITIES#COMFORT',
 'ROOM_AMENITIES#DESIGN&FEATURES',
 'ROOM_AMENITIES#GENERAL',
 'ROOM_AMENITIES#MISCELLANEOUS',
 'ROOM_AMENITIES#PRICES',
 'ROOM_AMENITIES#QUALITY',
 'SERVICE#GENERAL'
]

In [None]:
import numpy as np

def toSeq(row):
    """Render one row of aspect polarities as a Vietnamese label sentence.

    Reads the module-level ``aspects_map`` (aspect name -> Vietnamese phrase)
    and ``polarity_map`` (integer polarity code -> Vietnamese word).

    Args:
        row: pandas Series whose index holds aspect names and whose values
            are polarity codes. A value of 0 marks an absent aspect and is
            skipped.

    Returns:
        str: "<aspect phrase> <polarity word>" segments joined by ", " and
        passed through ``str.capitalize`` (first character upper-cased, the
        rest lower-cased). A row with no non-zero value yields "".

    Raises:
        KeyError: if an aspect name is not a key of ``aspects_map`` or a
            non-zero polarity code is not a key of ``polarity_map``.
    """
    seqs = [
        aspects_map[aspect] + " " + polarity_map[polarity]
        for aspect, polarity in zip(row.index, row.values)
        if polarity != 0
    ]
    return ", ".join(seqs).capitalize()

# df_train.drop("review", axis=1).apply(toTextLabel, axis=1).iloc[0]
df_train["Labels"] = df_train.drop("review", axis=1).apply(toSeq, axis=1)
df_test["Labels"] = df_test.drop("review", axis=1).apply(toSeq, axis=1)
df_dev["Labels"] = df_dev.drop("review", axis=1).apply(toSeq, axis=1)
df_train['Labels'].head()

0                           Khách sạn tốt, dịch vụ tốt
1               Chất lượng đồ ăn tệ, lựa chọn đồ ăn tệ
2                     Chất lượng đồ ăn tệ, dịch vụ tốt
3                                          Dịch vụ tốt
4    Chất lượng khách sạn tốt, thiết kế tiện nghi p...
Name: Labels, dtype: object

In [None]:
# Character lengths of every target sentence and every review in the training split
label_lengths = df_train["Labels"].map(len)
review_lengths = df_train["review"].map(len)

# Find the maximum length and the index label of the row that holds it
max_review_length = review_lengths.max()
max_review_index = review_lengths.idxmax()

max_labels_length = label_lengths.max()
max_labels_index = label_lengths.idxmax()

# Each caption names the column that is actually printed
print(f"Max length review: {max_review_length}")
print(f"Index of max review: {max_review_index}")
print(f"Value of Labels at max index: {df_train.loc[max_review_index, 'Labels']}")
print(f"Value of Review at max index: {df_train.loc[max_review_index, 'review']}")
print("-"*80)
print(f"Max length labels: {max_labels_length}")
print(f"Index of max length: {max_labels_index}")
print(f"Value of Labels at max index: {df_train.loc[max_labels_index, 'Labels']}")
print(f"Value of Review at max index: {df_train.loc[max_labels_index, 'review']}")

Max length review: 814
Index of max review: 4060
Value of Review at max index: Khách sạn tốt
Value of Review at max index: chúng_tôi đi team building tại cần giờ và đặt phòng tại resort này rất nhiều điểm cộng nhân_viên tiếp_tân rất thân_thiện lúc_nào cũng nở nụ cười giúp_đỡ khách rất ân_cần chu_đáo nhà_hàng carot khá đẹp nấu_ăn rất ngon nhân_viên thân_thiện niềm_nở tư_vấn menu rất nhiệt_tình hồ bơi đẹp khá sạch_sẽ vị_trí ngay bãi 304 gần chợ gần biển gần trạm xe_buýt thuận_tiện cho việc ăn_chơi nhảy_múa khá nhiều nhà_hàng quán ăn gần resort nấu thức_ăn cũng rất ngon hải_sản luôn_luôn tươi phòng superior khá nhỏ nhưng cũng đầy_đủ vật_dụng cần_thiết cho khách quang_cảnh vườn yên_tĩnh và mát_mẻ xíu xíu điểm trừ nhà_hàng carot nấu ngon nhưng giá_cả khá mắc phòng superior khá nhỏ lúc muốn tụ_tập đông người tại 1 phòng để chơi hơi chật phòng tắm cũng nhỏ không có bồn_tắm trong nhà_tắm tổng_quan resort 3 sao rất đáng để nghỉ_ngơi
--------------------------------------------------------------

In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import datasets
MODEL_NAME = "VietAI/vit5-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Longest tokenized training review plus a fixed allowance.
# NOTE(review): 13 is presumably the token count of the task prefix that
# preprocess_examples prepends to every review -- confirm against the tokenizer.
max_len_input = max(len(tokenizer.encode(review)) for review in df_train['review']) + 13
# Measure every target in tokens. The label with the most characters
# (max_labels_index) is not guaranteed to be the one with the most tokens, and
# a budget that is too small truncates targets during preprocessing.
max_len_target = max(len(tokenizer.encode(label)) for label in df_train['Labels'])
print('Max len input:', max_len_input)
print('Max len target:', max_len_target)

Max len input: 274
Max len target: 46


In [None]:
import random
import torch
from torch.utils.data import Dataset, DataLoader


from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)
def set_seed(seed):
  """Seed Python's `random`, NumPy and PyTorch generators with one value.

  The CUDA generators are seeded as well when a CUDA device is available.
  """
  for seeder in (random.seed, np.random.seed, torch.manual_seed):
    seeder(seed)
  if not torch.cuda.is_available():
    return
  torch.cuda.manual_seed_all(seed)

set_seed(42)

In [None]:
from datasets import Dataset
dataset_train =  Dataset.from_pandas(df_train)
dataset_dev = Dataset.from_pandas(df_dev)
dataset_test = Dataset.from_pandas(df_test)

example = dataset_train[0]

print("Review:", example["review"])
print("Traget:", example["Labels"])

Review: vừa_qua tôi có dùng dịch_vụ tại khách_sạn tc hotel premium ngọc_lan ngọc_lan đà_lạt
Traget: Khách sạn tốt, dịch vụ tốt


In [None]:
dataset_train

Dataset({
    features: ['review', 'FACILITIES#CLEANLINESS', 'FACILITIES#COMFORT', 'FACILITIES#DESIGN&FEATURES', 'FACILITIES#GENERAL', 'FACILITIES#MISCELLANEOUS', 'FACILITIES#PRICES', 'FACILITIES#QUALITY', 'FOOD&DRINKS#MISCELLANEOUS', 'FOOD&DRINKS#PRICES', 'FOOD&DRINKS#QUALITY', 'FOOD&DRINKS#STYLE&OPTIONS', 'HOTEL#CLEANLINESS', 'HOTEL#COMFORT', 'HOTEL#DESIGN&FEATURES', 'HOTEL#GENERAL', 'HOTEL#MISCELLANEOUS', 'HOTEL#PRICES', 'HOTEL#QUALITY', 'LOCATION#GENERAL', 'ROOMS#CLEANLINESS', 'ROOMS#COMFORT', 'ROOMS#DESIGN&FEATURES', 'ROOMS#GENERAL', 'ROOMS#MISCELLANEOUS', 'ROOMS#PRICES', 'ROOMS#QUALITY', 'ROOM_AMENITIES#CLEANLINESS', 'ROOM_AMENITIES#COMFORT', 'ROOM_AMENITIES#DESIGN&FEATURES', 'ROOM_AMENITIES#GENERAL', 'ROOM_AMENITIES#MISCELLANEOUS', 'ROOM_AMENITIES#PRICES', 'ROOM_AMENITIES#QUALITY', 'SERVICE#GENERAL', 'Labels'],
    num_rows: 7180
})

# Prepare input for model

In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import datasets

# Initialize constants and tokenizer
# MAX_LEN_INPUT = 170
# MAX_LEN_TARGET = 70
# MODEL_NAME = "VietAI/vit5-base"
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess_examples(examples):
    """Tokenize a batch of reviews and their label sentences for training.

    Uses the module-level ``tokenizer``, ``max_len_input`` and
    ``max_len_target``.

    Args:
        examples: batch mapping with a 'review' column (input texts) and a
            'Labels' column (target sentences).

    Returns:
        The tokenizer output for the prefixed reviews, extended with
        'labels' (target token ids, padding replaced by -100) and
        'decoder_attention_mask' (attention mask of the tokenized targets).
    """
    task_prefix = "aspect-based sentiment analysis: "
    source_texts = [task_prefix + text for text in examples['review']]
    target_texts = list(examples['Labels'])

    # Both sides share the same options: fixed-length padding, truncation, PyTorch tensors
    shared_options = dict(padding='max_length', truncation=True, return_tensors="pt")

    model_inputs = tokenizer(source_texts, max_length=max_len_input, **shared_options)
    target_encodings = tokenizer(target_texts, max_length=max_len_target, **shared_options)

    # Padded target positions are overwritten with -100 (the value used to mark
    # positions that should not contribute to the loss)
    target_ids = target_encodings['input_ids']
    target_ids[target_ids == tokenizer.pad_token_id] = -100

    model_inputs['labels'] = target_ids
    model_inputs['decoder_attention_mask'] = target_encodings['attention_mask']
    return model_inputs


# Apply the preprocessing function to the datasets
dataset_train = dataset_train.map(preprocess_examples, batched=True)
dataset_dev = dataset_dev.map(preprocess_examples, batched=True)
dataset_test = dataset_test.map(preprocess_examples, batched=True)

# Set the format of the datasets to PyTorch tensors
dataset_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'])
dataset_dev.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'])
dataset_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'])

Map:   0%|          | 0/7180 [00:00<?, ? examples/s]

Map:   0%|          | 0/795 [00:00<?, ? examples/s]

Map:   0%|          | 0/2030 [00:00<?, ? examples/s]

# Define metrics

In [None]:
import re
import numpy as np
from sklearn.metrics import classification_report

# true = ["Địa điểm tốt, nhà hàng nói chung tốt, chất lượng món ăn tốt", "Chất lượng đồ uống tốt, lựa chọn đồ uống tốt"]
# pred = ["Gía tiền món ăn tốt, nhà hàng nói chung tốt, chất lượng món ăn tạm", "Chất lượng đồ uống tệ, lựa chọn đồ uống tốt"]

# Vietnamese polarity word -> English polarity label. Kept under its own name
# so the integer-keyed `polarity_map` read by toSeq is not overwritten;
# rebinding it here made the label-building cell fail with KeyError on re-run.
polarity_words = {
    "tệ": "negative",
    "tạm": "neutral",
    "tốt": "positive"
}

# Lower-cased aspect column names; their order defines the layout of every polarity vector
aspects = df_train.drop(["review", "Labels"], axis=1).columns.tolist()
aspects = [aspect.lower() for aspect in aspects]
sentiments = {'negative': 1, 'neutral': 2, 'positive': 3}
replacements = {0: 'none', 1: 'negative', 2: 'neutral', 3: 'positive'}

def parse_output(batchs):
    """Convert label sentences into per-aspect polarity vectors.

    Each input string is split on commas. Every segment is expected to read
    "<aspect phrase> <polarity word>", the format produced by toSeq.

    Args:
        batchs: iterable of label strings (gold labels or model generations).

    Returns:
        list[list[int]]: one vector per input string, aligned with the
        module-level ``aspects`` list. 0 means the aspect is absent; any
        other value is the code from ``sentiments``.

    Notes:
        * Segments that are empty or contain no known polarity word are
          skipped (previously they raised ``KeyError: None``).
        * A segment with a polarity word but no recognizable aspect phrase is
          still attributed to HOTEL#GENERAL, as before.
    """
    # Try the longest phrases first. Several phrases are substrings of others
    # ("phòng" is contained in "chất lượng phòng" and in every
    # "... tiện nghi phòng" phrase; "khách sạn" in "chất lượng khách sạn"),
    # so a first-match scan in dictionary order picks the wrong aspect.
    phrases_longest_first = sorted(
        ((phrase.lower(), aspect_key) for aspect_key, phrase in aspects_map.items()),
        key=lambda item: len(item[0]),
        reverse=True,
    )

    aspects_polarity = []
    for label in batchs:
        y = [0] * len(aspects)
        for sentence in label.split(','):
            sentence = sentence.strip().lower()
            if not sentence:
                continue
            polarity = next(
                (name for word, name in polarity_words.items() if word in sentence),
                None,
            )
            if polarity is None:
                # Malformed or truncated segment: nothing reliable to record
                continue
            aspect = next(
                (key for phrase, key in phrases_longest_first if phrase in sentence),
                "HOTEL#GENERAL",
            )
            y[aspects.index(aspect.lower())] = sentiments[polarity]
        aspects_polarity.append(y)
    return aspects_polarity
# y_true = parse_output(true)
# y_pred = parse_output(pred)

In [None]:
def aspect_detection_eval(y_true, y_pred):
    """Build a classification report for aspect presence, ignoring polarity.

    Every cell of the polarity vectors becomes one sample: the aspect name
    (from the module-level ``aspects`` list) when the cell is non-zero, and
    the empty string when it is zero.

    Args:
        y_true: gold polarity vectors, one per review.
        y_pred: predicted polarity vectors, one per review.

    Returns:
        The text report produced by ``classification_report``.
    """
    pairs = [
        (aspects[index] if col_test else "", aspects[index] if col_pred else "")
        for row_test, row_pred in zip(y_true, y_pred)
        for index, (col_test, col_pred) in enumerate(zip(row_test, row_pred))
    ]
    aspect_test = [gold for gold, _ in pairs]
    aspect_pred = [guess for _, guess in pairs]
    return classification_report(aspect_test, aspect_pred, zero_division=1, digits=4)

def sentiment_classification_eval(y_true, y_pred):
    """Build a classification report over polarity codes, ignoring aspect identity.

    Both inputs are flattened, so every (review, aspect) cell is one sample
    whose class is its polarity code.

    Args:
        y_true: gold polarity vectors, one per review.
        y_pred: predicted polarity vectors, one per review.

    Returns:
        The text report produced by ``classification_report``, with one row
        per entry of the module-level ``replacements`` mapping.
    """
    y_true_flat = np.array(y_true).flatten()
    y_pred_flat = np.array(y_pred).flatten()
    # Pin the label set explicitly. With `target_names` alone the report raises
    # when a polarity code never occurs in the data, because the number of
    # names no longer matches the number of observed classes.
    class_codes = list(replacements.keys())
    target_names = [str(replacements[code]) for code in class_codes]
    metrics = classification_report(
        y_true_flat,
        y_pred_flat,
        labels=class_codes,
        target_names=target_names,
        zero_division=1,
        digits=4,
    )
    return metrics

def combination_eval(y_true, y_pred):
    """Build a classification report over joint "<aspect>,<polarity>" classes.

    Every cell of the polarity vectors becomes one sample whose class is the
    aspect name (from ``aspects``) joined by a comma with the polarity name
    (from ``replacements``).

    Args:
        y_true: gold polarity vectors, one per review.
        y_pred: predicted polarity vectors, one per review.

    Returns:
        The text report produced by ``classification_report``.
    """
    true_tags, pred_tags = [], []
    for gold_row, guess_row in zip(y_true, y_pred):
        for position, (gold, guess) in enumerate(zip(gold_row, guess_row)):
            true_tags.append("{},{}".format(aspects[position], replacements[gold]))
            pred_tags.append("{},{}".format(aspects[position], replacements[guess]))
    return classification_report(true_tags, pred_tags, zero_division=1, digits=4)

# print(list(map(str, replacements.values())))
# # Đánh giá
# print("## Aspect Detection Evaluate ##")
# metrics = aspect_detection_eval(y_true, y_pred)
# print(metrics)

# print("\n## Sentiment Classification Evaluate ##")
# metrics = sentiment_classification_eval(y_true, y_pred)
# print(metrics)

# print("\n## Combination Evaluate (Aspect + Polarity detection) ##")
# metrics = combination_eval(y_true, y_pred)
# print(metrics)

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print('Device:', device)
!nvidia-smi

Device: cuda
Sun Jul 14 06:21:59 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P8               9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                       

# Training model

In [None]:
import wandb
api = wandb.Api()
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m21130500[0m ([33mquid[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
!pwd

/content


In [None]:
# Fine-tuning cell: loads weights, configures the Trainer and runs training.
# The weights come from a previously saved checkpoint directory on Drive, not
# from MODEL_NAME, so this run continues training from that checkpoint.
# NOTE(review): a fresh runtime needs that directory to exist already -- confirm
# whether the first run was meant to start from MODEL_NAME instead.
model = T5ForConditionalGeneration.from_pretrained("/content/drive/MyDrive/NLU_NCKH/notebook/Checkpoints/Hotel_S2S_U/")
# model.to(device)
# Define training arguments
DATA = "Hotel"
# NOTE(review): OUT_DIR is assigned but not referenced in this cell; output_dir
# below is relative to the current working directory instead -- confirm which
# location is intended for checkpoints.
OUT_DIR = "/content/drive/MyDrive/NLU_NCKH/notebook/Checkpoints/"
training_args = TrainingArguments(
    # Checkpoint folder name is built from DATA and the last path component of MODEL_NAME
    output_dir= f'./{DATA}_seqtoseq_unified_{MODEL_NAME.split("/")[-1]}',
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=10,
    weight_decay=0.01,
    # Evaluate and save once per epoch; keep at most two checkpoints
    eval_strategy="epoch",
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True
)

# Initialize Trainer
# No compute_metrics is passed, so evaluation on dataset_dev presumably reports the loss only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_dev,
    # compute_metrics=compute_metrics
)

# Train the model
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33m21130500[0m ([33mquid[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,0.294675
2,0.247300,0.287343


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=718, training_loss=0.23642950668972515, metrics={'train_runtime': 1074.2881, 'train_samples_per_second': 13.367, 'train_steps_per_second': 0.668, 'total_flos': 4679746239283200.0, 'train_loss': 0.23642950668972515, 'epoch': 2.0})

# Save and reload model

In [None]:
save = "/content/drive/MyDrive/NLU_NCKH/notebook/Checkpoints/Hotel_S2S_U/"
trainer.save_model(save)

In [None]:
trainer.save_model()

In [None]:
import gc
# Drop the notebook-level reference to the model if there is one. A bare
# `del model` raises NameError when this cell is re-run or executed before the
# model has been created.
# NOTE(review): `trainer` was built with model=model and may still keep the
# weights alive -- confirm whether it should be released here as well.
globals().pop("model", None)
# Collect unreachable objects first, then release the cached CUDA memory
gc.collect()
torch.cuda.empty_cache()

NameError: name 'model' is not defined

In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration
device = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "VietAI/vit5-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(save).to(device)

# Prepare input for inference

In [None]:
class customDataset(torch.utils.data.Dataset):
    """Dataset over pre-tokenized encodings.

    ``labels`` is stored and used only to report the dataset length; items
    returned by indexing contain the encoding fields only.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Copy the idx-th entry of every field so the result does not share
        # storage with the stored encodings
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx].clone().detach()
        return sample

# Inference and evaluation

In [None]:
from torch.utils.data import DataLoader
from tqdm import tqdm
# Evaluate the model
# results = trainer.evaluate()
# Generate predictions
# MAX_LEN_INPUT = 170
# MAX_LEN_TARGET = 70

def generate_predictions(dataset):
    """Generate a label sentence for every review in ``dataset``.

    Uses the module-level ``tokenizer``, ``model``, ``device``,
    ``max_len_input`` and ``max_len_target``.

    Args:
        dataset: object exposing a 'review' column (input texts) and a
            'Labels' column (gold label sentences).

    Returns:
        tuple: ``(y_test, y_preds)`` -- the gold 'Labels' column as read from
        ``dataset`` and the list of decoded generations, in input order.
    """
    task_prefix = "aspect-based sentiment analysis: "
    prompts = [task_prefix + text for text in dataset['review']]
    y_test = dataset['Labels']

    # Tokenize all prompts up front with the same options used for training
    encoded = tokenizer(
        prompts,
        truncation=True,
        padding='max_length',
        max_length=max_len_input,
        return_tensors="pt",
    )
    # shuffle=False keeps predictions aligned with y_test
    loader = DataLoader(
        customDataset(encoded, y_test),
        batch_size=8,
        shuffle=False,
        num_workers=0,
    )

    model.eval()
    y_preds = []
    for batch in tqdm(loader, desc="Predicting"):
        generated = model.generate(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            max_length=max_len_target,
            num_beams=4,
            early_stopping=True,
        )
        for sequence in generated:
            y_preds.append(tokenizer.decode(sequence, skip_special_tokens=True))

    return y_test, y_preds

dataset_test = datasets.Dataset.from_pandas(df_test)

y_test, y_preds = generate_predictions(dataset_test)

Predicting: 100%|██████████| 254/254 [03:16<00:00,  1.29it/s]


In [None]:
for y, p in zip(y_test[:10], y_preds[:10]):
  print(y)
  print(p)
  print("-"*80)

Dịch vụ tốt
Dịch vụ tốt
--------------------------------------------------------------------------------
Thiết kế phòng tạm
Thiết kế phòng tốt
--------------------------------------------------------------------------------
Vệ sinh phòng tốt, thiết kế phòng tốt
Vệ sinh phòng tốt, tiện nghi phòng tốt
--------------------------------------------------------------------------------
Khách sạn tốt
Khách sạn tốt
--------------------------------------------------------------------------------
Chất lượng đồ ăn tốt
Chất lượng đồ ăn tốt
--------------------------------------------------------------------------------
Khách sạn tốt
Phòng tạm
--------------------------------------------------------------------------------
Cơ sở hạ tầng tốt
Cơ sở hạ tầng tốt
--------------------------------------------------------------------------------
Chất lượng đồ ăn tốt
Chất lượng đồ ăn tốt, lựa chọn đồ ăn tốt
--------------------------------------------------------------------------------
Dịch vụ tốt
Dịch vụ t

In [None]:
y_true = parse_output(y_test)
y_pred = parse_output(y_preds)

print("## Aspect Detection Evaluate ##")
report = aspect_detection_eval(y_true, y_pred)
print(report)

print("\n## Sentiment Classification Evaluate ##")
report = sentiment_classification_eval(y_true, y_pred)
print(report)

print("\n## Combination Evaluate (Aspect + Polarity detection) ##")
report = combination_eval(y_true, y_pred)
print(report)

## Aspect Detection Evaluate ##
                            precision    recall  f1-score   support

                               0.9899    0.9910    0.9905     65852
    facilities#cleanliness     0.8750    0.8750    0.8750        16
        facilities#comfort     0.0000    0.0000    0.0000         6
facilities#design&features     0.6364    0.4118    0.5000        17
        facilities#general     0.5000    0.2444    0.3284        45
  facilities#miscellaneous     0.3000    0.2727    0.2857        11
         facilities#prices     0.3750    0.4286    0.4000         7
        facilities#quality     0.5867    0.5432    0.5641        81
 food&drinks#miscellaneous     0.2500    0.1111    0.1538         9
        food&drinks#prices     0.5000    0.3750    0.4286         8
       food&drinks#quality     0.8426    0.8922    0.8667       102
 food&drinks#style&options     0.8125    0.8784    0.8442        74
         hotel#cleanliness     0.8125    0.5000    0.6190        52
             ho

In [None]:
# Use a dedicated name for the results table. `save` already holds the
# checkpoint directory that the model-reload cell passes to from_pretrained, so
# rebinding it to a DataFrame breaks that cell when it is re-run.
predictions_df = pd.DataFrame({
    'y_test': y_true,
    'y_pred': y_pred
})

# Written to the current working directory, with the DataFrame index as the first column
predictions_df.to_csv("Unified_Seq2Seq_predicted.csv")