In [1]:
from google.colab import drive
# Colab-only step: mount Google Drive at /content/drive so files stored under MyDrive are reachable by path.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install -U --q emoji underthesea

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m657.8/657.8 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# Work from the folder that holds the preprocessed CSV splits, so the relative file names passed to pd.read_csv below resolve.
%cd '/content/drive/MyDrive/NLU_NCKH/notebook/res_data_preprocesed/'

/content/drive/MyDrive/NLU_NCKH/notebook/res_data_preprocesed


# Import libraries

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import torch
from sklearn.metrics import *

# Load data

In [5]:
# Read the three preprocessed splits; the relative names resolve against the current working directory.
df_train, df_dev, df_test = map(
    pd.read_csv,
    (
        "train_res_preprocesed.csv",
        "dev_res_preprocesed.csv",
        "test_res_preprocesed.csv",
    ),
)

# Quick sanity check: (rows, columns) of every split.
for split_label, split_frame in (
    ("Train: ", df_train),
    ("Dev: ", df_dev),
    ("Test: ", df_test),
):
    print(split_label, split_frame.shape)

Train:  (7028, 13)
Dev:  (771, 13)
Test:  (1938, 13)


In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Label columns: every column after the first one.
# NOTE(review): presumably column 0 is the 'review' text column that is read further down - confirm against the CSV header.
aspects = df_train.columns[1:]
def make_outputs(df):
    """Build a flat one-hot target matrix from the label columns of ``df``.

    Every column after the first is treated as a label column holding class
    indices 0-3. Each cell becomes a length-4 one-hot group (class index ``k``
    sets position ``k``) and the groups of one row are concatenated in column
    order.

    NOTE(review): the original inline comments named the classes
    0=None, 1=Pos, 2=Neg, 3=Neu, whereas ``label_map`` in the evaluation cell
    uses 1=negative, 2=neutral, 3=positive. The encoding here is purely
    index-based, so the matrix is the same under either reading - confirm
    which naming matches the CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column is skipped and whose remaining columns hold
        class indices in {0, 1, 2, 3}.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array of shape ``(len(df), 4 * (len(df.columns) - 1))``.

    Raises
    ------
    ValueError
        If a label cell is missing (NaN), non-numeric, or not one of 0-3.
        The previous loop silently reused the one-hot vector of the preceding
        cell in that situation, which corrupted the targets.
    """
    n_classes = 4
    labels = df.iloc[:, 1:].to_numpy()
    try:
        labels = labels.astype(float)
    except (TypeError, ValueError) as exc:
        raise ValueError(
            "make_outputs: label columns must contain numeric class indices 0-3"
        ) from exc

    # NaN fails the membership test as well, so missing labels are reported here too.
    invalid = ~np.isin(labels, np.arange(n_classes))
    if invalid.any():
        bad_row, bad_col = np.argwhere(invalid)[0]
        raise ValueError(
            "make_outputs: unexpected label "
            f"{labels[bad_row, bad_col]!r} at row {bad_row}, "
            f"column {df.columns[bad_col + 1]!r}; expected one of 0, 1, 2, 3"
        )

    # Indexing the identity matrix with the class indices yields one one-hot row per cell:
    # (rows, label_columns) -> (rows, label_columns, 4).
    one_hot = np.eye(n_classes, dtype='uint8')[labels.astype(np.intp)]
    # Explicit target shape (no -1) so an empty frame still reshapes cleanly.
    return one_hot.reshape(len(df), labels.shape[1] * n_classes)

# TF-IDF features over 1- to 3-grams; terms must occur in at least 2 documents
# and in at most 90% of them to be kept in the vocabulary.
vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=2, max_df=0.9)

# Fit the vocabulary on the training reviews only, then reuse it unchanged for the test reviews.
x_train = vectorizer.fit_transform(df_train['review'])
x_test = vectorizer.transform(df_test['review'])

# Flat one-hot targets for both splits.
y_train = make_outputs(df_train)
y_test = make_outputs(df_test)

x_train.shape, y_train.shape

((7028, 20068), (7028, 48))

In [11]:
import os
import math
from tqdm import tqdm, trange
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report, f1_score

# String label -> integer class id (passed to DataFrame.replace by the evaluation helpers).
label_map = {
    'None': 0,
    'negative': 1,
    'neutral': 2,
    'positive': 3,
}

# Inverse lookup: integer class id -> string label.
replacements = {class_id: label for label, class_id in label_map.items()}

# Display names in class-id order.
target_names = [str(label) for label in replacements.values()]

def aspect_detection_eval(y_test, y_pred):
  """
  Score aspect detection, i.e. whether each aspect is flagged at all, ignoring polarity.

  Every (row, aspect) cell becomes one sample. Its label is the aspect's
  column name when the cell's class id is truthy (non-zero) and the empty
  string otherwise. Prints micro/macro F1 and the per-label report.

  y_test: ground-truth labels, DataFrame (one column per aspect)
  y_pred: predicted labels, DataFrame (its columns name the aspects)
  Returns the classification report as a dict (output_dict=True).
  """
  categories = y_pred.columns
  absent_id = label_map['None']
  # Map string labels to class ids, then treat missing cells as "aspect absent".
  # The former placeholder 'not_exist' had no entry in label_map, so it stayed a
  # truthy string and missing cells were counted as detected aspects.
  y_test = y_test.replace(label_map).fillna(absent_id).values.tolist()
  y_pred = y_pred.replace(label_map).fillna(absent_id).values.tolist()

  aspect_test = []
  aspect_pred = []

  for row_test, row_pred in zip(y_test, y_pred):
      for index, (col_test, col_pred) in enumerate(zip(row_test, row_pred)):
          # bool * str gives the column name for a detected aspect and '' otherwise.
          aspect_test.append(bool(col_test) * categories[index])
          aspect_pred.append(bool(col_pred) * categories[index])

  print("## Aspect Detection Evaluate ##")
  print("F1-score micro: ", f1_score(aspect_test, aspect_pred, average='micro'))
  print("F1-score macro: ", f1_score(aspect_test, aspect_pred, average='macro'))
  print(classification_report(aspect_test, aspect_pred, digits=4, zero_division=1))
  return classification_report(aspect_test, aspect_pred, digits=4, zero_division=1, output_dict=True)


def sentiment_classification_eval(y_test, y_pred):
  """
  Score polarity classification over all (row, aspect) cells pooled together.

  Both frames are converted to class ids, flattened to 1-D and compared cell
  by cell. Prints micro/macro F1 and a report labelled with the class names.

  y_test: ground-truth labels, DataFrame
  y_pred: predicted labels, DataFrame
  Returns the classification report as a dict (output_dict=True); its class
  rows are keyed by class id because no target_names are passed there.
  """
  absent_id = label_map['None']
  # Map string labels to class ids, then treat missing cells as class 'None'.
  # The former placeholder 'not_exist' had no entry in label_map and turned the
  # flattened arrays into mixed string labels.
  y_test = y_test.replace(label_map).fillna(absent_id).values.tolist()
  y_pred = y_pred.replace(label_map).fillna(absent_id).values.tolist()

  y_test_flat = np.array(y_test).flatten()
  y_pred_flat = np.array(y_pred).flatten()

  # Pin the label set explicitly: without labels=, classification_report raises
  # ValueError when target_names has 4 entries but fewer classes occur in the data.
  class_ids = list(replacements)
  class_names = [str(replacements[class_id]) for class_id in class_ids]

  print("## Sentiment Classification Evaluate ##")
  print("F1-score micro: ", f1_score(y_test_flat, y_pred_flat, labels=class_ids, average='micro'))
  print("F1-score macro: ", f1_score(y_test_flat, y_pred_flat, labels=class_ids, average='macro'))
  print(classification_report(y_test_flat, y_pred_flat, labels=class_ids, target_names=class_names, digits=4))

  return classification_report(y_test_flat, y_pred_flat, labels=class_ids, digits=4, output_dict=True)

def combination_eval(y_test, y_pred):
  """
  Score aspect and polarity jointly.

  Every (row, aspect) cell becomes one sample labelled
  '<aspect column name>,<polarity name>', where the polarity name is looked
  up in `replacements`. Prints micro/macro F1 and the per-label report.

  y_test: ground-truth labels, DataFrame (one column per aspect)
  y_pred: predicted labels, DataFrame (its columns name the aspects)
  Returns the classification report as a dict (output_dict=True).
  """
  categories = y_pred.columns
  absent_id = label_map['None']
  # Map string labels to class ids, then treat missing cells as class 'None'.
  # The former placeholder 'not_exist' is not a key of `replacements`, so any
  # missing cell raised KeyError in the loop below.
  y_test = y_test.replace(label_map).fillna(absent_id).values.tolist()
  y_pred = y_pred.replace(label_map).fillna(absent_id).values.tolist()

  aspect_polarity_test = []
  aspect_polarity_pred = []

  for row_test, row_pred in zip(y_test, y_pred):
      for index, (col_test, col_pred) in enumerate(zip(row_test, row_pred)):
          aspect_polarity_test.append(f'{categories[index]},{replacements[col_test]}')
          aspect_polarity_pred.append(f'{categories[index]},{replacements[col_pred]}')

  print("## Combination Evaluate (Aspect + Polarity dection) ##")
  print("F1-score micro: ", f1_score(aspect_polarity_test, aspect_polarity_pred, average='micro'))
  print("F1-score macro: ", f1_score(aspect_polarity_test, aspect_polarity_pred, average='macro'))
  print(classification_report(aspect_polarity_test, aspect_polarity_pred, digits=4, zero_division=1))
  return classification_report(aspect_polarity_test, aspect_polarity_pred, digits=4, zero_division=1, output_dict=True)

In [12]:
from sklearn.svm import LinearSVC
from sklearn.multioutput import MultiOutputClassifier as MOC
from sklearn.multioutput import ClassifierChain as MOC_CC

# Two multi-output strategies over the flat one-hot target matrix, both wrapping a
# LinearSVC seeded with random_state=5.

# MultiOutputClassifier: fit, then predict on the test features.
clf_moc = MOC(LinearSVC(random_state=5)).fit(x_train, y_train)
ypred_svm_moc = clf_moc.predict(x_test)

# ClassifierChain: fit, then predict on the test features.
clf_cc = MOC_CC(LinearSVC(random_state=5)).fit(x_train, y_train)
ypred_svm_cc = clf_cc.predict(x_test)

# Evaluate

In [13]:
# Evaluate Classifier Chain
n_aspects = len(aspects)  # replaces the hard-coded 12

# Regroup the flat one-hot rows into (n_samples, n_aspects, 4).
# These are the ClassifierChain predictions, so the name now says "cc" (it used to say "moc").
ypred_svm_cc_rs = ypred_svm_cc.reshape(-1, n_aspects, 4)
y_test_rs = y_test.reshape(-1, n_aspects, 4)

# Class id per aspect = position of the largest entry in each group of 4.
# np.argmax returns the first maximum, so an all-zero group resolves to class 0.
y_test_flat = np.argmax(y_test_rs, axis=2)  # kept for parity; the calls below read the truth from df_test
y_pred = np.argmax(ypred_svm_cc_rs, axis=2)

# Build the prediction frame once and share it between the three evaluations.
y_pred_cc_df = pd.DataFrame(y_pred, columns=aspects)
ad = aspect_detection_eval(df_test[aspects], y_pred_cc_df)
ast = sentiment_classification_eval(df_test[aspects], y_pred_cc_df)
cp = combination_eval(df_test[aspects], y_pred_cc_df)

## Aspect Detection Evaluate ##
F1-score micro:  0.9458204334365325
F1-score macro:  0.6913527755463968
                          precision    recall  f1-score   support

                             0.9621    0.9774    0.9697     20627
        AMBIENCE#GENERAL     0.9378    0.7974    0.8619       227
           DRINKS#PRICES     0.7273    0.1702    0.2759        47
          DRINKS#QUALITY     0.7222    0.7044    0.7132       203
    DRINKS#STYLE&OPTIONS     0.7374    0.5659    0.6404       129
             FOOD#PRICES     0.6098    0.2232    0.3268       112
            FOOD#QUALITY     0.7789    0.8520    0.8138       554
      FOOD#STYLE&OPTIONS     0.7353    0.7437    0.7395       437
        LOCATION#GENERAL     0.9859    0.6731    0.8000       104
      RESTAURANT#GENERAL     0.7674    0.5259    0.6241       251
RESTAURANT#MISCELLANEOUS     0.8667    0.5379    0.6638       145
       RESTAURANT#PRICES     0.6903    0.6667    0.6783       117
         SERVICE#GENERAL     0.9434  

In [None]:
# Evaluate Multi output classifier
n_aspects = len(aspects)  # replaces the hard-coded 12

# Regroup the flat one-hot rows into (n_samples, n_aspects, 4).
ypred_svm_moc_rs = ypred_svm_moc.reshape(-1, n_aspects, 4)
y_test_rs = y_test.reshape(-1, n_aspects, 4)

# Class id per aspect = position of the largest entry in each group of 4.
# np.argmax returns the first maximum, so an all-zero group resolves to class 0.
y_test_flat = np.argmax(y_test_rs, axis=2)  # kept for parity; the calls below read the truth from df_test
y_pred = np.argmax(ypred_svm_moc_rs, axis=2)

# Build the prediction frame once and share it between the three evaluations.
y_pred_moc_df = pd.DataFrame(y_pred, columns=aspects)
ad = aspect_detection_eval(df_test[aspects], y_pred_moc_df)
ast = sentiment_classification_eval(df_test[aspects], y_pred_moc_df)
cp = combination_eval(df_test[aspects], y_pred_moc_df)

## Aspect Detection Evaluate ##
F1-score micro:  0.9284055727554179
F1-score macro:  0.5189421725486433
                          precision    recall  f1-score   support

                             0.9309    0.9930    0.9609     20627
        AMBIENCE#GENERAL     0.9606    0.5374    0.6893       227
           DRINKS#PRICES     1.0000    0.0426    0.0816        47
          DRINKS#QUALITY     0.8056    0.4286    0.5595       203
    DRINKS#STYLE&OPTIONS     0.8529    0.2248    0.3558       129
             FOOD#PRICES     0.5769    0.1339    0.2174       112
            FOOD#QUALITY     0.8427    0.5704    0.6803       554
      FOOD#STYLE&OPTIONS     0.8549    0.3776    0.5238       437
        LOCATION#GENERAL     1.0000    0.4423    0.6133       104
      RESTAURANT#GENERAL     0.8696    0.2390    0.3750       251
RESTAURANT#MISCELLANEOUS     1.0000    0.2621    0.4153       145
       RESTAURANT#PRICES     0.9130    0.3590    0.5153       117
         SERVICE#GENERAL     0.9842  