In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset folder used by the following cells is reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Switch the working directory to the dataset folder on the mounted drive;
# the data-loading cell reads Train/Dev/Test CSVs via relative paths ("./...").
%cd '/content/drive/MyDrive/NLU_NCKH/notebook/hotel_data_pre/'

/content/drive/MyDrive/NLU_NCKH/notebook/hotel_data_pre


In [3]:
!pip install --upgrade scikit-learn



In [4]:
import sklearn
# Globally enable scikit-learn's metadata-routing mechanism.
# NOTE(review): no cell visible here passes routed metadata (e.g. sample_weight)
# to fit/predict — confirm this flag is still required.
sklearn.set_config(enable_metadata_routing=True)

# Import libraries

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import torch
from sklearn.metrics import *

# Load data

In [6]:
# Read the three dataset splits from the current working directory.
df_train, df_dev, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("./Train.csv", "./Dev.csv", "./Test.csv")
)

# Report (rows, columns) of every split as a quick sanity check.
for split_label, split_df in (("Train: ", df_train),
                              ("Dev: ", df_dev),
                              ("Test: ", df_test)):
    print(split_label, split_df.shape)

Train:  (7180, 35)
Dev:  (795, 35)
Test:  (2030, 35)


In [51]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Label column names: every column of the training frame after the first one
# (presumably the first column is the review text — confirm against the CSV).
aspects = df_train.columns[1:]
def make_outputs(df):
    """Encode per-aspect sentiment codes as one concatenated one-hot row per review.

    Every column of ``df`` after the first is read as a sentiment code in
    {0, 1, 2, 3}. Each code becomes a 4-wide one-hot group
    (0 -> [1, 0, 0, 0], 1 -> [0, 1, 0, 0], 2 -> [0, 0, 1, 0], 3 -> [0, 0, 0, 1])
    and the groups of one row are concatenated in column order.

    Parameters
    ----------
    df : pandas.DataFrame
        The first column is skipped; all remaining columns must hold codes 0-3.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array of shape ``(len(df), 4 * (df.shape[1] - 1))``.

    Raises
    ------
    ValueError
        If any label cell is not one of 0, 1, 2, 3 (NaN included). The previous
        loop-based version silently reused the preceding cell's one-hot vector
        in that case (or failed with NameError on the very first cell).
    """
    # NOTE(review): the earlier inline comments named the codes
    # 0=None, 1=Pos, 2=Neg, 3=Neu, whereas `label_map` in the evaluation cell
    # uses 1=negative, 2=neutral, 3=positive — confirm which matches the CSVs.
    n_classes = 4

    # Label block as a float matrix so NaN survives and can be reported below.
    codes = df.iloc[:, 1:].to_numpy(dtype=float)

    # Broadcast compare: shape (rows, label_cols, n_classes), True at the code's slot.
    one_hot = codes[..., np.newaxis] == np.arange(n_classes)

    # A cell whose code matched none of 0..3 is invalid; fail loudly with its location.
    is_valid = one_hot.any(axis=-1)
    if not is_valid.all():
        bad_row, bad_col = np.argwhere(~is_valid)[0]
        raise ValueError(
            f"Unexpected sentiment code {codes[bad_row, bad_col]!r} at row position "
            f"{bad_row}, column {df.columns[1 + bad_col]!r}; expected one of 0, 1, 2, 3"
        )

    # Explicit target shape keeps the reshape well-defined for empty frames too.
    return one_hot.reshape(codes.shape[0], codes.shape[1] * n_classes).astype('uint8')

# --- Targets ---------------------------------------------------------------
# Two views of the same labels: the per-aspect code frame (y_*) and the
# flattened one-hot matrix produced by make_outputs (y_*_136).
train_label_cols = df_train.columns[1:]
y_train = df_train[train_label_cols]
y_train_136 = make_outputs(df_train)

test_label_cols = df_test.columns[1:]
y_test = df_test[test_label_cols]
y_test_136 = make_outputs(df_test)

# --- Features --------------------------------------------------------------
# TF-IDF over 1- to 3-grams; a term must occur in at least 2 documents and in
# no more than 90% of them to enter the vocabulary.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.9,
)

# Learn the vocabulary on the training reviews only, then reuse it for test.
x_train = vectorizer.fit_transform(df_train['review'])
x_test = vectorizer.transform(df_test['review'])

x_train.shape, y_train.shape

((7180, 19589), (7180, 34))

In [8]:
import os
import math
from tqdm import tqdm, trange
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report, f1_score

# Sentiment name -> integer code used by the evaluation helpers below.
# NOTE(review): the inline comments in make_outputs name the codes
# 1=Pos, 2=Neg, 3=Neu, which disagrees with this mapping — confirm which one
# matches the CSV encoding, since report row names come from here.
label_map = dict(zip(('None', 'negative', 'neutral', 'positive'), range(4)))

# Inverse lookup (integer code -> sentiment name), in ascending code order.
replacements = {code: name for name, code in label_map.items()}

# Display names for classification reports, ordered by code.
target_names = [str(name) for name in replacements.values()]

def aspect_detection_eval(y_test, y_pred):
  """
  Evaluate aspect detection only (is an aspect mentioned at all?), ignoring polarity.

  Every (review, aspect) cell becomes one sample. Its class is the aspect's
  column name when the cell's label is truthy (non-zero) and the empty string
  otherwise, so the "no aspect" class '' is part of every average.

  y_test: ground-truth labels, DataFrame (codes, or names found in label_map)
  y_pred: predicted labels, DataFrame; its columns supply the aspect names
          (assumed to be strings, in the same order as y_test's columns)

  Prints micro/macro F1 and the text classification report.
  Returns the same report as a dict (classification_report(..., output_dict=True)).
  """
  categories = y_pred.columns
  # A missing cell means "aspect not mentioned". Fill with 'None', which
  # label_map turns into 0; the former filler 'not_exist' is not a label_map
  # key, stayed a truthy string and was counted as a detected aspect.
  y_test = y_test.fillna('None').replace(label_map).values.tolist()
  y_pred = y_pred.fillna('None').replace(label_map).values.tolist()

  aspect_test = []
  aspect_pred = []

  for row_test, row_pred in zip(y_test, y_pred):
    for category, col_test, col_pred in zip(categories, row_test, row_pred):
      # Any non-zero sentiment code counts as "aspect present".
      aspect_test.append(category if col_test else '')
      aspect_pred.append(category if col_pred else '')

  print("## Aspect Detection Evaluate ##")
  print("F1-score micro: ", f1_score(aspect_test, aspect_pred, average='micro'))
  print("F1-score macro: ", f1_score(aspect_test, aspect_pred, average='macro'))
  print(classification_report(aspect_test, aspect_pred, digits=4, zero_division=1))
  return classification_report(aspect_test, aspect_pred, digits=4, zero_division=1, output_dict=True)


def sentiment_classification_eval(y_test, y_pred):
  """
  Evaluate sentiment classification over all (review, aspect) cells pooled together.

  Both frames are flattened into one long vector of sentiment codes and
  compared class by class; the aspect a cell belongs to is ignored.

  y_test: ground-truth labels, DataFrame (codes, or names found in label_map)
  y_pred: predicted labels, DataFrame of the same shape

  Prints micro/macro F1 and the text classification report (rows named via
  `replacements`).
  Returns the report as a dict keyed by the raw class codes
  (classification_report(..., output_dict=True)).
  """
  # Missing cells are treated as class 0: 'None' is a label_map key, whereas
  # the former filler 'not_exist' was not and left stray strings in the data.
  y_test = y_test.fillna('None').replace(label_map).values.tolist()
  y_pred = y_pred.fillna('None').replace(label_map).values.tolist()

  y_test_flat = np.array(y_test).flatten()
  y_pred_flat = np.array(y_pred).flatten()
  label_codes = list(replacements.keys())
  target_names = list(map(str, replacements.values()))

  print("## Sentiment Classification Evaluate ##")
  print("F1-score micro: ", f1_score(y_test_flat, y_pred_flat, average='micro'))
  print("F1-score macro: ", f1_score(y_test_flat, y_pred_flat, average='macro'))
  # Pass `labels` explicitly: without it classification_report raises when a
  # class is absent from the data, because target_names always has four entries.
  print(classification_report(y_test_flat, y_pred_flat, labels=label_codes,
                              target_names=target_names, digits=4))

  # Returned dict is intentionally built as before (no labels/target_names) so
  # its keys and averages stay what existing callers expect.
  return classification_report(y_test_flat, y_pred_flat, digits=4, output_dict=True)

def combination_eval(y_test, y_pred):
  """
  Evaluate aspect and polarity jointly.

  Every (review, aspect) cell becomes one sample whose class is the string
  "<aspect column name>,<sentiment name>", so a prediction is correct only if
  the sentiment for that aspect matches exactly.

  y_test: ground-truth labels, DataFrame (codes, or names found in label_map)
  y_pred: predicted labels, DataFrame; its columns supply the aspect names
          (same column order as y_test)

  Prints micro/macro F1 and the text classification report.
  Returns the text report (a str, unlike the dict returned by the two sibling
  evaluation functions).
  """
  categories = y_pred.columns
  # Missing cells map to code 0 via 'None'. The former filler 'not_exist' is
  # not in label_map and caused a KeyError in the `replacements` lookup below.
  y_test = y_test.fillna('None').replace(label_map).values.tolist()
  y_pred = y_pred.fillna('None').replace(label_map).values.tolist()

  aspect_polarity_test = []
  aspect_polarity_pred = []

  for row_test, row_pred in zip(y_test, y_pred):
    for category, col_test, col_pred in zip(categories, row_test, row_pred):
      aspect_polarity_test.append(f'{category},{replacements[col_test]}')
      aspect_polarity_pred.append(f'{category},{replacements[col_pred]}')

  print("## Combination Evaluate (Aspect + Polarity dection) ##")
  print("F1-score micro: ", f1_score(aspect_polarity_test, aspect_polarity_pred, average='micro'))
  print("F1-score macro: ", f1_score(aspect_polarity_test, aspect_polarity_pred, average='macro'))
  # Build the report once; the same text is both printed and returned.
  report = classification_report(aspect_polarity_test, aspect_polarity_pred, digits=4, zero_division=1)
  print(report)
  return report

In [52]:
def analyze_label_distribution(y):
    """Print how often each label column is switched on and flag degenerate columns.

    Parameters
    ----------
    y : array-like with ``.shape``
        Indicator matrix; rows are samples, columns are labels.

    Returns
    -------
    Column-wise sums of ``y`` (one count per label).

    A label is reported as a potential issue when its count is 0 (never on)
    or equals the number of rows (always on).
    """
    n_samples = y.shape[0]
    positives_per_label = np.sum(y, axis=0)

    print("Label distribution:")
    for label_idx, n_positive in enumerate(positives_per_label):
        share = (n_positive / n_samples) * 100
        print(f"Label {label_idx}: {n_positive} ({share:.2f}%)")

    print("\nLabels with potential issues:")
    for label_idx, n_positive in enumerate(positives_per_label):
        if n_positive == 0:
            print(f"Label {label_idx}: No positive samples")
            continue
        if n_positive == n_samples:
            print(f"Label {label_idx}: No negative samples")

    return positives_per_label

label_counts = analyze_label_distribution(y_train_136)

# Identify degenerate one-hot columns. The "always on" check compares against
# the row count of the very matrix that was counted (y_train_136) instead of a
# different object that merely happens to have the same number of rows.
n_train_rows = y_train_136.shape[0]
zero_labels = np.flatnonzero(label_counts == 0)
all_one_labels = np.flatnonzero(label_counts == n_train_rows)

print("Labels with all zeros:", zero_labels)
print("Labels with all ones:", all_one_labels)

Label distribution:
Label 0: 7122 (99.19%)
Label 1: 42 (0.58%)
Label 2: 2 (0.03%)
Label 3: 14 (0.19%)
Label 4: 7158 (99.69%)
Label 5: 11 (0.15%)
Label 6: 0 (0.00%)
Label 7: 11 (0.15%)
Label 8: 7121 (99.18%)
Label 9: 27 (0.38%)
Label 10: 0 (0.00%)
Label 11: 32 (0.45%)
Label 12: 7020 (97.77%)
Label 13: 23 (0.32%)
Label 14: 10 (0.14%)
Label 15: 127 (1.77%)
Label 16: 7140 (99.44%)
Label 17: 35 (0.49%)
Label 18: 2 (0.03%)
Label 19: 3 (0.04%)
Label 20: 7159 (99.71%)
Label 21: 11 (0.15%)
Label 22: 3 (0.04%)
Label 23: 7 (0.10%)
Label 24: 7101 (98.90%)
Label 25: 60 (0.84%)
Label 26: 1 (0.01%)
Label 27: 18 (0.25%)
Label 28: 7148 (99.55%)
Label 29: 27 (0.38%)
Label 30: 1 (0.01%)
Label 31: 4 (0.06%)
Label 32: 7149 (99.57%)
Label 33: 15 (0.21%)
Label 34: 4 (0.06%)
Label 35: 12 (0.17%)
Label 36: 6815 (94.92%)
Label 37: 88 (1.23%)
Label 38: 36 (0.50%)
Label 39: 241 (3.36%)
Label 40: 6916 (96.32%)
Label 41: 145 (2.02%)
Label 42: 12 (0.17%)
Label 43: 107 (1.49%)
Label 44: 6994 (97.41%)
Label 45: 15 (0.

In [55]:
from sklearn.svm import LinearSVC
from sklearn.multioutput import MultiOutputClassifier as MOC
from sklearn.multioutput import ClassifierChain as MOC_CC

# Fixed seed so repeated runs give the same models: LinearSVC uses its
# random_state when shuffling data for the dual solver, so unseeded fits can
# differ slightly between runs.
RANDOM_STATE = 42

# Note: both models are trained on y_train (one sentiment code per aspect),
# not on the one-hot matrix analysed in the previous cell. If an aspect column
# turns out to contain a single class only, consider dropping it before fitting.

# Classifier chain: one LinearSVC per aspect, each also fed the predictions of
# the aspects earlier in the chain.
base_clf = LinearSVC(random_state=RANDOM_STATE)
cc_clf = MOC_CC(base_clf)
cc_clf.fit(x_train, y_train)
ypred_svm_cc = cc_clf.predict(x_test)

# Multi-output baseline: one independent LinearSVC per aspect.
clf_moc = MOC(LinearSVC(random_state=RANDOM_STATE))
clf_moc.fit(x_train, y_train)
ypred_svm_moc = clf_moc.predict(x_test)

# Evaluate

In [59]:
# Evaluate Classifier Chain
y_pred = ypred_svm_cc
y_test_rs = y_test

# Build the gold and predicted frames once; all three reports share them.
gold_df = df_test[aspects]
pred_df = pd.DataFrame(y_pred, columns=aspects)

ad = aspect_detection_eval(gold_df, pred_df)
ast = sentiment_classification_eval(gold_df, pred_df)
cp = combination_eval(gold_df, pred_df)

## Aspect Detection Evaluate ##
F1-score micro:  0.9780353520718632
F1-score macro:  0.4402872320437999
                                precision    recall  f1-score   support

                                   0.9828    0.9943    0.9885     65760
        FACILITIES#CLEANLINESS     1.0000    0.3750    0.5455        16
            FACILITIES#COMFORT     1.0000    0.0000    0.0000         6
    FACILITIES#DESIGN&FEATURES     1.0000    0.2941    0.4545        17
            FACILITIES#GENERAL     0.5833    0.1556    0.2456        45
      FACILITIES#MISCELLANEOUS     1.0000    0.0909    0.1667        11
             FACILITIES#PRICES     1.0000    0.0000    0.0000         6
            FACILITIES#QUALITY     1.0000    0.1818    0.3077        22
     FOOD&DRINKS#MISCELLANEOUS     1.0000    0.0000    0.0000         9
            FOOD&DRINKS#PRICES     1.0000    0.0000    0.0000         8
           FOOD&DRINKS#QUALITY     0.8857    0.6078    0.7209       102
     FOOD&DRINKS#STYLE&OPTIONS 

In [60]:
# Evaluate Multi ouput classifier
y_pred = ypred_svm_moc
y_test_rs = y_test

# Build the gold and predicted frames once; all three reports share them.
gold_df = df_test[aspects]
pred_df = pd.DataFrame(y_pred, columns=aspects)

ad = aspect_detection_eval(gold_df, pred_df)
ast = sentiment_classification_eval(gold_df, pred_df)
cp = combination_eval(gold_df, pred_df)

## Aspect Detection Evaluate ##
F1-score micro:  0.9778325123152709
F1-score macro:  0.422199376275569
                                precision    recall  f1-score   support

                                   0.9818    0.9951    0.9884     65760
        FACILITIES#CLEANLINESS     1.0000    0.3750    0.5455        16
            FACILITIES#COMFORT     1.0000    0.0000    0.0000         6
    FACILITIES#DESIGN&FEATURES     1.0000    0.2941    0.4545        17
            FACILITIES#GENERAL     0.5833    0.1556    0.2456        45
      FACILITIES#MISCELLANEOUS     1.0000    0.0000    0.0000        11
             FACILITIES#PRICES     1.0000    0.0000    0.0000         6
            FACILITIES#QUALITY     1.0000    0.1818    0.3077        22
     FOOD&DRINKS#MISCELLANEOUS     1.0000    0.0000    0.0000         9
            FOOD&DRINKS#PRICES     1.0000    0.0000    0.0000         8
           FOOD&DRINKS#QUALITY     0.8889    0.6275    0.7356       102
     FOOD&DRINKS#STYLE&OPTIONS  