# 1. Install & Import Libraries

In [1]:
!pip install underthesea

Collecting underthesea
  Downloading underthesea-6.8.4-py3-none-any.whl.metadata (15 kB)
Collecting python-crfsuite>=0.9.6 (from underthesea)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting underthesea-core==1.0.4 (from underthesea)
  Downloading underthesea_core-1.0.4-cp311-cp311-manylinux2010_x86_64.whl.metadata (1.7 kB)
Downloading underthesea-6.8.4-py3-none-any.whl (20.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m47.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading underthesea_core-1.0.4-cp311-cp311-manylinux2010_x86_64.whl (657 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m657.8/657.8 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m 

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import string

from underthesea import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

from tensorflow import keras
from keras.models import Model
from keras.layers import Dense, Input, Embedding, LSTM, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# 2. Load Dataset

In [3]:
df = pd.read_csv('/kaggle/input/vietnamese-text-classification-dataset/train.csv', names = ['label', 'content'])
df.head()

Unnamed: 0,label,content
0,0,máy dùng hay bị đơ máy
1,0,chỉ có dây cáp nguồn không có adapter sao sử d...
2,0,Chất lượng quá kém Mới dùng được 2 ngày loa ba...
3,0,Usb tôi vừa mới nhận usb này Rất bực bội vì cá...
4,2,Tuyệt vời. Hàng FPT cửa hàng


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3040 entries, 0 to 3039
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    3040 non-null   int64 
 1   content  3040 non-null   object
dtypes: int64(1), object(1)
memory usage: 47.6+ KB


# 3. Text Cleaning

In [5]:
def clean_text(text: str) -> str:
    """
    Làm sạch văn bản:
    - Chuyển thành chữ thường
    - Loại bỏ URL, domain phổ biến (.com, .net, .org, ...)
    - Loại bỏ số, ký tự đặc biệt, punctuation
    - Loại bỏ emoji
    - Chuẩn hóa khoảng trắng
    """
    text = str(text).lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', ' ', text)

    # Remove common domain tokens
    text = re.sub(r'\b(com|net|org)\b', ' ', text, flags=re.IGNORECASE)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove special characters (chỉ giữ chữ, số, dấu cách)
    text = re.sub(r'[^\w\s]', ' ', text)

    # Remove numbers
    text = re.sub(r'\d+', ' ', text)

    # Remove underscore
    text = text.replace('_', ' ')

    # Remove emoji
    emoji_pattern = re.compile(
        "[" 
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags
        u"\U00002702-\U000027B0"  
        u"\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE
    )
    text = emoji_pattern.sub('', text)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text


def preprocessing(df: pd.DataFrame, text_column: str) -> pd.DataFrame:
    """
    Làm sạch toàn bộ cột văn bản trong DataFrame
    """
    df[text_column] = df[text_column].astype(str).apply(clean_text)
    return df

In [6]:
df = preprocessing(df, "content")
df.head()

Unnamed: 0,label,content
0,0,máy dùng hay bị đơ máy
1,0,chỉ có dây cáp nguồn không có adapter sao sử d...
2,0,chất lượng quá kém mới dùng được ngày loa bass...
3,0,usb tôi vừa mới nhận usb này rất bực bội vì cá...
4,2,tuyệt vời hàng fpt cửa hàng


# 4. Train/Test Split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    df["content"], df["label"], test_size=0.2, random_state=42, stratify=df["label"])

# 5. Classical Machine Learning (SVM + TF-IDF)

In [8]:
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

svm = SVC(kernel="linear")
svm.fit(X_train_vec, y_train)

y_pred = svm.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))

Accuracy: 0.8026315789473685
              precision    recall  f1-score   support

           0      0.784     0.905     0.840       221
           1      0.710     0.621     0.663       177
           2      0.899     0.848     0.873       210

    accuracy                          0.803       608
   macro avg      0.798     0.791     0.792       608
weighted avg      0.802     0.803     0.800       608



# 6. Deep Learning with LSTM

In [9]:
# Tokenizer
list_tokens = [word_tokenize(x) for x in df["content"]]
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(list_tokens)
seqs = tokenizer.texts_to_sequences(list_tokens)
padded = pad_sequences(seqs, maxlen=200)

X_train, X_test, y_train, y_test = train_test_split(
    padded, df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)

# Model LSTM
input_layer = Input(shape=(200,))
x = Embedding(10000, 128)(input_layer)
x = LSTM(64)(x)
x = Dropout(0.5)(x)
output = Dense(3, activation="softmax")(x)

model = Model(input_layer, output)
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

early_stop = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=2)

history = model.fit(
    X_train, y_train, validation_split=0.1, epochs=10, batch_size=64,
    callbacks=[early_stop, reduce_lr]
)

print("Evaluate:", model.evaluate(X_test, y_test))


2025-09-13 14:50:03.226236: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757775003.509206      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757775003.590317      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-09-13 14:50:27.778896: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Epoch 1/10
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 184ms/step - accuracy: 0.4301 - loss: 1.0759 - val_accuracy: 0.5738 - val_loss: 0.9135 - learning_rate: 0.0010
Epoch 2/10
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 174ms/step - accuracy: 0.6169 - loss: 0.8485 - val_accuracy: 0.6434 - val_loss: 0.7171 - learning_rate: 0.0010
Epoch 3/10
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 167ms/step - accuracy: 0.7368 - loss: 0.5976 - val_accuracy: 0.7582 - val_loss: 0.5939 - learning_rate: 0.0010
Epoch 4/10
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 169ms/step - accuracy: 0.8550 - loss: 0.4117 - val_accuracy: 0.7869 - val_loss: 0.5648 - learning_rate: 0.0010
Epoch 5/10
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 163ms/step - accuracy: 0.9235 - loss: 0.2667 - val_accuracy: 0.7746 - val_loss: 0.6402 - learning_rate: 0.0010
Epoch 6/10
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6

In [None]:
# !pip install --upgrade transformers
# !pip install transformers datasets torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from datasets import ClassLabel

# 1. Dataset
hf_dataset = Dataset.from_pandas(
    df[["content", "label"]].rename(columns={"label":"labels"})
)

num_classes = len(set(df["label"]))
class_label = ClassLabel(num_classes=num_classes, names=[str(i) for i in range(num_classes)])
hf_dataset = hf_dataset.cast_column("labels", class_label)

# 2. Tokenizer
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
def tokenize_fn(batch):
    return tokenizer(batch["content"], padding="max_length", truncation=True, max_length=256)
hf_dataset = hf_dataset.map(tokenize_fn, batched=True)
hf_dataset = hf_dataset.train_test_split(test_size=0.2, stratify_by_column="labels")

# 3. Model
model = AutoModelForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=num_classes)

# 4. Training args
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="none"   
)

# 5. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset["train"],
    eval_dataset=hf_dataset["test"],
    processing_class=tokenizer   
)

# 6. Train
trainer.train()

# 7. Evaluate
print(trainer.evaluate())

# 8. Predict
test_text = "Sản phẩm này rất tốt và dùng ổn định"
inputs = tokenizer(test_text, return_tensors="pt", padding=True, truncation=True, max_length=256).to(model.device)
outputs = model(**inputs)
pred = outputs.logits.argmax(dim=-1).item()
print("Predicted label:", pred)

Casting the dataset:   0%|          | 0/3040 [00:00<?, ? examples/s]

Map:   0%|          | 0/3040 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
