### Step 1: Prepare the data

In [134]:
import pandas as pd

In [135]:
reviews = pd.read_csv('../02.Dataset/labeled/reviews.csv')

In [136]:
reviews.head()

Unnamed: 0,ProductID,CustomerID,Rating,Comment,Label
0,74021317,7991785,5,Một quyển sách hay,pos
1,187827003,18150739,5,"Mình đã từng đọc sơ nội dung sách, rất hay, rấ...",pos
2,271380890,497788,5,"Quyển sách đẹp về hình thức, nội dung mới đọc ...",pos
3,74021317,19165924,5,"Sách đẹp, hài lòng",pos
4,105483727,10170816,5,"sách đóng gói cẩn thận, giao hành nhanh",pos


In [137]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5018 entries, 0 to 5017
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ProductID   5018 non-null   int64 
 1   CustomerID  5018 non-null   int64 
 2   Rating      5018 non-null   int64 
 3   Comment     5018 non-null   object
 4   Label       5017 non-null   object
dtypes: int64(3), object(2)
memory usage: 196.1+ KB


In [138]:
reviews['Label'].value_counts()

Label
neg    2598
pos    1594
neu     825
Name: count, dtype: int64

In [139]:
reviews.dropna(inplace=True)

In [140]:
print(reviews['Label'].unique())

['pos' 'neu' 'neg']


### Step 2: Text preprocessing

In [141]:
import string
import emoji
import re


# Clean icons
def clean_icons(text):
    text = emoji.replace_emoji(text, replace="")
    text = re.sub(r"[:;][-~]?[)D(/\\|pP]", "", text)
    text = text.replace("_x000D_", " ")
    return text

def lower(text):
    return text.lower().strip()

def remove_links(text):
    return re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)

# Convert comment to a full sentence
def convert_teencode_to_vietnamese(sentence, dictionary):
    words = sentence.split()
    converted_words = []
    for word in words:
        if word in dictionary:
            converted_words.append(dictionary[word])
            continue

        punctuation = ""
        temp_word = word
        while temp_word and temp_word[-1] in string.punctuation:
            punctuation = temp_word[-1] + punctuation
            temp_word = temp_word[:-1]
        if temp_word in dictionary:
            converted_words.append(dictionary[temp_word] + punctuation)
            continue

        leading_punctuation = ""
        temp_word = word
        while temp_word and temp_word[0] in string.punctuation:
            leading_punctuation += temp_word[0]
            temp_word = temp_word[1:]
        if temp_word in dictionary:
            converted_words.append(leading_punctuation + dictionary[temp_word])
            continue

        converted_words.append(word)

    return " ".join(converted_words)

# Remove stopwords
vietnamese_stopwords = ["là", "thì", "và", "nhưng", "các", "một", "những", "với", "cho", "này", "đã", "ở", "được", "rất", "có"]


def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in vietnamese_stopwords])

In [142]:
from pyvi import ViTokenizer

def word_segmentation(text):
  if pd.isna(text):
      return ""
  return ViTokenizer.tokenize(text)

In [None]:
import csv

dictionary = {}
with open(
    "../02.Dataset/teencode.csv",
    mode="r",
    encoding="utf-8",
) as file:
    reader = csv.DictReader(file)
    for row in reader:
        dictionary[row["Teencode"]] = row["Meaning"]

reviews["Comment"] = reviews["Comment"].apply(clean_icons)
reviews["Comment"] = reviews["Comment"].apply(lower)
reviews["Comment"] = reviews["Comment"].apply(remove_links)
reviews["Comment"] = reviews["Comment"].apply(
    lambda x: convert_teencode_to_vietnamese(x, dictionary)
)

# Word segmentation
reviews["Comment"] = reviews["Comment"].apply(word_segmentation)

reviews.dropna(subset=["Comment"], inplace=True)
reviews.reset_index(drop=True, inplace=True)

In [144]:
# # Word segmentation
# from underthesea import word_tokenize

# text = "Sản phẩm rất tốt và giao hàng nhanh"
# print(word_tokenize(text))

In [145]:
reviews.head(100)

Unnamed: 0,ProductID,CustomerID,Rating,Comment,Label
0,74021317,7991785,5,một quyển sách hay,pos
1,187827003,18150739,5,"mình đã từng đọc sơ nội_dung sách , rất hay , ...",pos
2,271380890,497788,5,"quyển sách đẹp về hình_thức , nội_dung mới đọc...",pos
3,74021317,19165924,5,"sách đẹp , hài_lòng",pos
4,105483727,10170816,5,"sách đóng_gói cẩn_thận , giao hành nhanh",pos
...,...,...,...,...,...
95,136340700,16493140,5,"đặt sau 1 tiếng có hàng rồi , quá nhanh quá yê...",pos
96,54614797,10456907,5,"bìa đẹp , giao hàng siêu nhanh , đóng_gói cẩn_...",pos
97,113530805,18051370,5,sách viết về các trải nghiệm của tác_giả . 1 c...,pos
98,8886007,16290383,5,"ngay sau khi đọc xong người đua diều , mình đã...",pos


### Step 3: Split the dataset to train set and test set

In [146]:
from sklearn.model_selection import train_test_split

In [147]:
X = reviews["Comment"]

y = reviews["Label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

### Step 4: Train and fit the model

In [148]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [149]:
text_clf = Pipeline([("tfidf", TfidfVectorizer(max_features=3000, min_df=5, max_df=0.8, sublinear_tf=True)), ("clf", LinearSVC())])

In [150]:
text_clf.fit(X_train, y_train)

### Step 5: Run predictions and analyze the results

In [151]:
predictions = text_clf.predict(X_test)

In [152]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

print('Confusion matrix:\n', confusion_matrix(y_test, predictions))
print()
print(f"Accuracy score: {accuracy_score(y_test, predictions)}")
print()
print('Classsification report:\n', classification_report(y_test, predictions))

Confusion matrix:
 [[684  43  46]
 [130  56  62]
 [ 90  38 357]]

Accuracy score: 0.7284196547144755

Classsification report:
               precision    recall  f1-score   support

         neg       0.76      0.88      0.82       773
         neu       0.41      0.23      0.29       248
         pos       0.77      0.74      0.75       485

    accuracy                           0.73      1506
   macro avg       0.64      0.62      0.62      1506
weighted avg       0.70      0.73      0.71      1506



In [122]:
text_clf.predict(["Sách hay quá", "Sách dở quá", "Sách bình thường", "Sách không hay lắm"])

array(['pos', 'neg', 'pos', 'neg'], dtype=object)