In [906]:
import datetime


# Date stamp (YYYYMMDD) used in the names of the files this notebook writes.
today = f"{datetime.datetime.now():%Y%m%d}"
print(today)


20250525


### Step 1: Prepare the data

In [907]:
import pandas as pd

In [908]:
# Load the labelled reviews CSV into a DataFrame.
reviews_df = pd.read_csv('../02.Dataset/labeled/reviews.csv')

In [909]:
# Preview the first rows of the loaded frame.
reviews_df.head()

Unnamed: 0,ProductID,CustomerID,Rating,Comment,Label
0,74021317,7991785,5,M·ªôt quy·ªÉn s√°ch hay,pos
1,187827003,18150739,5,"M√¨nh ƒë√£ t·ª´ng ƒë·ªçc s∆° n·ªôi dung s√°ch, r·∫•t hay, r·∫•...",pos
2,271380890,497788,5,"Quy·ªÉn s√°ch ƒë·∫πp v·ªÅ h√¨nh th·ª©c, n·ªôi dung m·ªõi ƒë·ªçc ...",pos
3,74021317,19165924,5,"S√°ch ƒë·∫πp, h√†i l√≤ng",pos
4,105483727,10170816,5,"s√°ch ƒë√≥ng g√≥i c·∫©n th·∫≠n, giao h√†nh nhanh",pos


In [910]:
# Show column dtypes and non-null counts.
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4380 entries, 0 to 4379
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ProductID   4380 non-null   int64 
 1   CustomerID  4380 non-null   int64 
 2   Rating      4380 non-null   int64 
 3   Comment     4380 non-null   object
 4   Label       4380 non-null   object
dtypes: int64(3), object(2)
memory usage: 171.2+ KB


In [911]:
# Number of reviews per label.
reviews_df["Label"].value_counts()

Label
neg    2103
pos    1921
neu     356
Name: count, dtype: int64

In [912]:
# Drop the rows labelled 'neu'; .copy() detaches the result from the original frame.
reviews_df = reviews_df.loc[reviews_df["Label"].ne("neu")].copy()

In [913]:
# Discard every row that has a missing value in any column.
reviews_df = reviews_df.dropna()

In [914]:
# Distinct label values currently present in the frame.
print(reviews_df["Label"].unique())

['pos' 'neg']


In [915]:
# Number of reviews per label.
reviews_df["Label"].value_counts()

Label
neg    2103
pos    1921
Name: count, dtype: int64

### Step 2: Data preprocessing

In [916]:
import csv
import string
import emoji
import re
from pyvi import ViTokenizer
import pandas as pd
from functools import lru_cache


# === Resource loading (caching is disabled: the lru_cache decorators below are commented out) ===
# @lru_cache(maxsize=1)
def load_teencode_dict():
    """Read the teencode lookup table from ``teencode.csv``.

    The file is opened and parsed on every call.

    Returns:
        dict: maps each row's ``Teencode`` field to its ``Meaning`` field.
    """
    mapping = {}
    with open(
        "../02.Dataset/vietnamese/teencode.csv", mode="r", encoding="utf-8"
    ) as handle:
        for record in csv.DictReader(handle):
            mapping[record["Teencode"]] = record["Meaning"]
    return mapping


# @lru_cache(maxsize=1)
def load_stopwords():
    """Read the stopword list from ``stopwords.txt``.

    The file is opened and parsed on every call.

    Returns:
        set: one entry per non-blank line, stripped of surrounding
        whitespace and lower-cased.
    """
    stopwords = set()
    with open("../02.Dataset/vietnamese/stopwords.txt", "r", encoding="utf-8") as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if entry:
                stopwords.add(entry.lower())
    return stopwords


# @lru_cache(maxsize=1)
def load_phrase_rules():
    """Read the phrase normalisation rules from ``phrase_rules.csv``.

    The file is opened and parsed on every call.

    Returns:
        dict: maps each stripped ``Phrase`` to its stripped ``Normalized``
        value. Rows where either field is missing, empty, or only
        whitespace are skipped.
    """
    rules = {}
    # newline="" is what the csv module expects for files it parses itself.
    with open(
        "../02.Dataset/vietnamese/phrase_rules.csv",
        mode="r",
        encoding="utf-8",
        newline="",
    ) as file:
        for row in csv.DictReader(file):
            # Strip BEFORE the emptiness check: a whitespace-only phrase would
            # otherwise become the key "", and a rule keyed on "" turns into the
            # regex r"\b\b", which matches at every word boundary.
            phrase = (row["Phrase"] or "").strip()
            normalized = (row["Normalized"] or "").strip()
            if phrase and normalized:
                rules[phrase] = normalized
    return rules


# === Text Cleaning ===
def clean_icons(text):
    """Remove emoji, simple ASCII emoticons and ``_x000D_`` markers.

    Args:
        text: the raw comment; a missing value (per ``pd.isna``) gives "".

    Returns:
        str: the text with emoji deleted, emoticons such as ``:)`` or ``;-D``
        deleted, and every ``_x000D_`` replaced by a single space.
    """
    if pd.isna(text):
        return ""
    without_emoji = emoji.replace_emoji(text, replace="")
    # Eyes (: or ;), optional nose (- or ~), then one mouth character.
    without_emoticons = re.sub(r"[:;][-~]?[)D(/\\|pP]", "", without_emoji)
    return without_emoticons.replace("_x000D_", " ")


def lower(text):
    """Lower-case ``text`` and trim surrounding whitespace.

    Returns:
        str: the normalised text, or "" when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    return text.lower().strip()


def remove_links(text):
    """Delete URL-like tokens from ``text``.

    A token is removed from the point where ``http`` or ``www`` starts up to
    the next whitespace character. Matching is case-insensitive, so
    ``HTTPS://...`` and ``WWW....`` are removed too.

    Args:
        text: the text to clean.

    Returns:
        str: the text without links, or "" when ``text`` is not a string
        (same convention as ``remove_punctuation`` and ``remove_numbers``).
    """
    if not isinstance(text, str):
        return ""
    # "http\S+" already covers "https...", so one alternation is enough.
    return re.sub(r"(?:http|www)\S+", "", text, flags=re.IGNORECASE)


def remove_punctuation(text):
    """Replace every ``string.punctuation`` character with a space.

    Returns:
        str: the text with punctuation blanked out, or "" when ``text`` is
        not a string.
    """
    if isinstance(text, str):
        blank_out = str.maketrans(dict.fromkeys(string.punctuation, " "))
        return text.translate(blank_out)
    return ""


def remove_numbers(text):
    """Delete every run of digits from ``text``.

    Returns:
        str: the text without digits, or "" when ``text`` is not a string.
    """
    if isinstance(text, str):
        return re.compile(r"\d+").sub("", text)
    return ""


# === Teencode conversion ===
def convert_teencode_to_vietnamese(sentence):
    """Expand teencode tokens using the table from ``load_teencode_dict``.

    The sentence is split on whitespace. Each token is looked up after its
    leading and trailing punctuation is stripped, so the punctuation stays
    attached to the expanded token. Spaces inside a replacement become
    underscores, keeping it a single whitespace-delimited token.

    Args:
        sentence: the text to convert; a missing value (per ``pd.isna``)
            gives "".

    Returns:
        str: the tokens joined by single spaces.
    """
    if pd.isna(sentence):
        return ""
    lookup = load_teencode_dict()

    def _expand(token):
        bare = token.strip(string.punctuation)
        if bare not in lookup:
            return token
        return token.replace(bare, lookup[bare]).replace(" ", "_")

    return " ".join(_expand(token) for token in sentence.split())


# === Stopwords removal ===
def remove_vietnamese_stopwords(text):
    """Drop stopwords (from ``load_stopwords``) out of whitespace-split text.

    At each position the current token joined to the next one with ``_`` is
    checked first; if that pair is a stopword both tokens are skipped.
    Otherwise the current token is kept unless it is itself a stopword.

    Args:
        text: the text to filter; a missing value (per ``pd.isna``) gives "".

    Returns:
        str: the surviving tokens joined by single spaces.
    """
    if pd.isna(text):
        return ""
    stopwords = load_stopwords()
    tokens = text.split()
    total = len(tokens)
    kept = []
    position = 0
    while position < total:
        current = tokens[position]
        has_next = position + 1 < total
        if has_next and f"{current}_{tokens[position + 1]}" in stopwords:
            # The pair forms a two-word stopword: skip both tokens.
            position += 2
            continue
        if current not in stopwords:
            kept.append(current)
        position += 1
    return " ".join(kept)


def normalize_repeated_chars(text):
    """Collapse every run of a repeated character down to one occurrence.

    Any character other than a newline that appears two or more times in a
    row is reduced to a single copy (e.g. "hayyyy" -> "hay").

    Args:
        text: the text to normalise; a missing value (per ``pd.isna``)
            gives "".

    Returns:
        str: the text with repeated characters collapsed.
    """
    if pd.isna(text):
        return ""
    return re.sub(r"(.)\1{1,}", lambda run: run.group(1), text)


# === Tokenization ===
def word_tokenize(text):
    """Tokenise ``text`` with ``ViTokenizer.tokenize``.

    Returns:
        The tokenizer's output, or "" when ``text`` is missing (per
        ``pd.isna``).
    """
    return "" if pd.isna(text) else ViTokenizer.tokenize(text)


def apply_phrase_rules(text):
    """Rewrite known phrases using the rules from ``load_phrase_rules``.

    Rules are applied one after another, longest phrase first, so a long
    phrase is replaced before any shorter phrase it contains. Each phrase
    must match on word boundaries.

    Args:
        text: the text to rewrite; a missing value (per ``pd.isna``) gives "".

    Returns:
        str: the text after all rules have been applied.
    """
    if pd.isna(text):
        return ""
    rules = load_phrase_rules()

    # Longest phrases first; sorted() is stable, so ties keep file order.
    longest_first = sorted(rules, key=len, reverse=True)
    for phrase in longest_first:
        pattern = rf"\b{re.escape(phrase)}\b"
        text = re.sub(pattern, rules[phrase], text)

    return text


# === Full Preprocessing Pipeline ===
def preprocess_text(text):
    """Run the full cleaning pipeline on one comment.

    Steps, in order: remove links, remove emoji/emoticons, lower-case,
    remove punctuation, remove digits, expand teencode, collapse repeated
    characters, apply phrase rules, tokenise, remove stopwords.

    Args:
        text: the raw comment; a missing value (per ``pd.isna``) gives "".

    Returns:
        str: the cleaned, tokenised comment.
    """
    if pd.isna(text):
        return ""
    # Links must go first: clean_icons deletes the ":/" inside "http://" and
    # remove_punctuation turns "." and "/" into spaces, after which the link
    # regex can no longer match and the URL fragments stay in the text.
    text = remove_links(text)
    # clean_icons runs before lower() because its emoticon pattern and the
    # "_x000D_" marker are written with upper-case letters.
    text = clean_icons(text)
    text = lower(text)
    text = remove_punctuation(text)
    text = remove_numbers(text)
    text = convert_teencode_to_vietnamese(text)
    text = normalize_repeated_chars(text)
    text = apply_phrase_rules(text)
    text = word_tokenize(text)
    text = remove_vietnamese_stopwords(text)
    return text

In [917]:
# Quick manual check of the pipeline on a single short phrase.
preprocess_text("ch∆∞a bao gi·ªù")

'ch∆∞a_bao_gi·ªù'

In [None]:
# Add the cleaned text as "Comment Cleaned", drop rows where it is missing or
# shorter than two characters after stripping, then renumber the index from 0.
reviews_df = (
    reviews_df.assign(
        **{"Comment Cleaned": reviews_df["Comment"].apply(preprocess_text)}
    )
    .dropna(subset=["Comment Cleaned"])
    .loc[lambda frame: frame["Comment Cleaned"].str.strip().str.len() >= 2]
    .reset_index(drop=True)
)

In [None]:
# Save the processed reviews under a date-stamped file name. The DataFrame
# index is written as well, since `index` is left at its default.
reviews_df.to_csv(f'../02.Dataset/{today}_reviews_processed.csv')

### Step 3: Split the dataset into a training set and a test set

In [None]:
from sklearn.model_selection import train_test_split


# Features are the cleaned comments; targets are the labels.
X = reviews_df["Comment Cleaned"]
y = reviews_df["Label"]
# NOTE(review): original_comments is not referenced again in the visible cells.
original_comments = reviews_df["Comment"]

# Splitting reviews_df.index alongside X and y yields idx_train / idx_test,
# the row labels of each sample, so predictions can be traced back to
# reviews_df later. 30% of the rows go to the test set; the seed is fixed.
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X, y, reviews_df.index, test_size=0.3, random_state=42
)

### Step 4: Train and fit the model

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from sklearn.svm import LinearSVC
# TF-IDF features (at most 3000 terms, ignoring terms seen in fewer than 5
# documents or in more than 80% of them, with sublinear tf scaling) feeding a
# linear SVM classifier.
text_clf_svm = Pipeline(
    [
        (
            "tfidf",
            TfidfVectorizer(
                max_features=3000, min_df=5, max_df=0.8, sublinear_tf=True
            ),
        ),
        # Fixed seed so re-running the notebook trains the same model; 42
        # matches the seed used for the train/test split.
        ("clf", LinearSVC(random_state=42)),
    ]
)
text_clf_svm.fit(X_train, y_train)

### Step 5: Run predictions and analyze the results

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Predict on the held-out comments and print the standard metrics.
predictions_svm = text_clf_svm.predict(X_test)

print("-----------------SVM-----------------")
# end="\n\n" leaves one blank line after each section.
print("Confusion matrix:\n", confusion_matrix(y_test, predictions_svm), end="\n\n")
print(f"Accuracy score: {accuracy_score(y_test, predictions_svm)}", end="\n\n")
print("Classification report:\n", classification_report(y_test, predictions_svm))

-----------------SVM-----------------
Confusion matrix:
 [[560  57]
 [ 66 517]]

Accuracy score: 0.8975

Classification report:
               precision    recall  f1-score   support

         neg       0.89      0.91      0.90       617
         pos       0.90      0.89      0.89       583

    accuracy                           0.90      1200
   macro avg       0.90      0.90      0.90      1200
weighted avg       0.90      0.90      0.90      1200



In [None]:
# Boolean mask over the test set: True where the prediction differs from the label.
wrong_preds_mask = y_test != predictions_svm
wrong_indices = idx_test[wrong_preds_mask]

# Pull the misclassified rows out of reviews_df and attach what the model predicted.
errors_df = reviews_df.loc[wrong_indices, ["Comment", "Comment Cleaned", "Label"]].copy()
# predictions_svm is in the same order as y_test, so the mask applies positionally.
errors_df["Predicted"] = predictions_svm[wrong_preds_mask.to_numpy()]

# Save the misclassified rows under a date-stamped log file name.
errors_df.to_csv(f'../06.Log/{today}_error.csv')

print("========= Nh·ªØng b√¨nh lu·∫≠n d·ª± ƒëo√°n sai =========")
print(errors_df.head(10))

                                                Comment  \
1760  T·∫°i s k c√≥ th√™m m·ªôt l·ªõp b·∫£o v·ªá n·ªØa ·∫°:(((_x000D...   
2099  Giao nhanh nh∆∞ 1 c∆°n gi√≥, t·ªëi ƒë·∫∑t s√°ng ƒë√£ c√≥. ...   
109   s√°ch nh√¨n m·ªõi c·ª©ng c√°p, m√πi th∆°m nh∆∞ng ƒë√≥ng g√≥...   
1849  m√¨nh ƒë√£ nghƒ© preorder s·∫Ω c√≥ ch·ªØ k√Ω t√°c gi·∫£ nh∆∞...   
1231       S√°ch x·ªãn nh∆∞ng ko hi·ªÉu sao l·∫°i b·ªã cong m√©p üò•   
8     √î k√™ con d√™ giao nhanh ch√≥ng m·∫∑t tiki l√† nh·∫•t ...   
1288  S√°ch tuy l√† m√¨nh ch∆∞a ƒë·ªçc, ƒë·ªçc √Ω ki·∫øn c·ªßa mn t...   
2153  ƒêi·ªÉm tr·ª´ l·ªõn c·ªßa tiki l√† g√≥i h√†ng s√°ch m√† ko b...   
642   M√¨nh nghƒ© cu·ªën s√°ch n√†y c√≥ th·ªÉ ch·∫°m ƒë·∫øn tr√°i t...   
2118  h√†ng giao nhanh nh∆∞ng g√≥i h√†ng ·∫©u. ch·ªó trang 2...   

                                        Comment Cleaned Label Predicted  
1760  s kh√¥ng_c√≥ l·ªõp b·∫£o_v·ªá may s√°ch kh√¥ng_b·ªã s l·ªõp ...   neg       pos  
2099  giao_nhanh gi√≥ t·ªëi ƒë√≥ng_g√≥i s∆°_s√†i kh√¥ng h·

### TEST

In [None]:
# Hand-written sample comments used to spot-check the trained model.
test_data = [
    "K h·ª£p v·ªõi th·ªùi hi·ªán t·∫°i v·ªõi vƒÉn phong nh∆∞ th·∫ø",
    "B·ªçc c·∫ßn th·∫≠n...to·∫πt z·ªùi",
    "hayyyy",
    "tuy·ªáttttt",
    "S√°ch c√≥ nhi·ªÅu trang b·ªã d√≠nh v√†o nhau",
    "s√°ch hay xu·∫•t s·∫Øc",
    "th√≠ch"
]

In [None]:
# Predict labels
predictions = text_clf_svm.predict([preprocess_text(text) for text in test_data])

preprocessed_data = ["".join(preprocess_text(text)) for text in test_data]

# Display each text with predicted label using lambda and zip
list(
    map(
        lambda x: print(
            f"üìù Original: {x[0]}\nüßπ Preprocessed: {x[1]}\nüîñ Predicted label: {x[2]}\n"
        ),
        zip(test_data, preprocessed_data, predictions),
    )
)

üìù Original: K h·ª£p v·ªõi th·ªùi hi·ªán t·∫°i v·ªõi vƒÉn phong nh∆∞ th·∫ø
üßπ Preprocessed: kh√¥ng_h·ª£p th·ªùi vƒÉn_phong
üîñ Predicted label: neg

üìù Original: B·ªçc c·∫ßn th·∫≠n...to·∫πt z·ªùi
üßπ Preprocessed: b·ªçc c·∫ßn_th·∫≠n tuy·ªát_v·ªùi
üîñ Predicted label: pos

üìù Original: hayyyy
üßπ Preprocessed: hay
üîñ Predicted label: pos

üìù Original: tuy·ªáttttt
üßπ Preprocessed: tuy·ªát
üîñ Predicted label: pos

üìù Original: S√°ch c√≥ nhi·ªÅu trang b·ªã d√≠nh v√†o nhau
üßπ Preprocessed: s√°ch trang b·ªã_d√≠nh
üîñ Predicted label: neg

üìù Original: s√°ch hay xu·∫•t s·∫Øc
üßπ Preprocessed: s√°ch_hay
üîñ Predicted label: pos

üìù Original: th√≠ch
üßπ Preprocessed: th√≠ch
üîñ Predicted label: pos



[None, None, None, None, None, None, None]

### Step 6: Save the model

In [None]:
import joblib

# Persist the fitted pipeline (vectorizer + classifier) under a date-stamped file name.
joblib.dump(text_clf_svm, f"../05.Models/{today}_sentiment_analysis_model.pkl")

['../05.Models/20250525_sentiment_analysis_model.pkl']