In [1]:
import os
import logging
import joblib
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from datetime import datetime
from pathlib import Path
from preprocessing.text_processor import TextProcessor

In [10]:
data_path = "data/labeled/reviews.csv"
teencode_path = "resources/teencode.csv"
stopwords_path = "resources/stopwords.txt"
phrases_path = "resources/phrase_mapping.csv"
output_model = "models/svm_model.pkl"

In [2]:
# Logging setup
log_dir = "log"
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, "train_svm_model.log")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.FileHandler(log_file, encoding="utf-8"), logging.StreamHandler()],
)

In [None]:
def load_and_preprocess_data(file_path, processor):
    df = pd.read_csv(file_path)
    df = df[df["label"] != "neu"].dropna(subset=["comment"])
    df["processed"] = df["comment"].apply(lambda x: processor.preprocess(x))
    df = df[df["processed"].str.strip().str.len() >= 2].reset_index(drop=True)
    return df

In [4]:
from sklearn.svm import LinearSVC

def train_model(X_train, y_train):
    pipeline = Pipeline([
        ("tfidf", TfidfVectorizer(max_features=3000, min_df=5, max_df=0.8, sublinear_tf=True)),
        ("clf", LinearSVC(dual="auto")),
    ])
    pipeline.fit(X_train, y_train)
    return pipeline

In [5]:
def evaluate_model(model, X_test, y_test):
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    report = classification_report(y_test, preds)
    cm = confusion_matrix(y_test, preds)
    return acc, report, cm, preds

In [None]:
# Initialize text processor
processor = TextProcessor(
    teencode_path=teencode_path,
    stopword_path=stopwords_path,
    phrase_mapping_path=phrases_path,
)

logging.info("Loading and preprocessing data...")
df = load_and_preprocess_data(data_path, processor)
X = df["processed"]
y = df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
logging.info("Training model...")
model = train_model(X_train, y_train)

acc, report, cm, preds = evaluate_model(model, X_test, y_test)
logging.info(f"Accuracy: {acc:.4f}")
logging.info(f"Classification Report:\n{report}")

today = datetime.now().strftime("%Y%m%d")
output_path = Path(output_model)
os.makedirs(output_path.parent, exist_ok=True)
model_path = output_path.parent / f"{today}_{output_path.name}"
joblib.dump(model, model_path)
logging.info(f"Model saved to {model_path}")
print(f"✅ Model saved to {model_path}")

2025-06-21 22:24:22,567 - INFO - Loading and preprocessing data...


2025-06-21 22:24:30,925 - INFO - Training model...
2025-06-21 22:24:31,060 - INFO - Accuracy: 0.8984
2025-06-21 22:24:31,061 - INFO - Classification Report:
              precision    recall  f1-score   support

         neg       0.87      0.94      0.90       607
         pos       0.94      0.85      0.89       594

    accuracy                           0.90      1201
   macro avg       0.90      0.90      0.90      1201
weighted avg       0.90      0.90      0.90      1201

2025-06-21 22:24:31,114 - INFO - Model saved to models\20250621_svm_model.pkl


✅ Model saved to models\20250621_svm_model.pkl


In [7]:
df.head()

Unnamed: 0,productID,userId,rating,comment,label,processed
0,74021317,7991785,5,Một quyển sách hay,pos,quyển_sách hay
1,187827003,18150739,5,"Mình đã từng đọc sơ nội dung sách, rất hay, rấ...",pos,đọc sơ nội_dung sách hay đáng_để nghiền_ngẫm t...
2,271380890,497788,5,"Quyển sách đẹp về hình thức, nội dung mới đọc ...",pos,quyển_sách đẹp hình_thức nội_dung mới đọc tò_m...
3,74021317,19165924,5,"Sách đẹp, hài lòng",pos,sách_đẹp hài_lòng
4,105483727,10170816,5,"sách đóng gói cẩn thận, giao hành nhanh",pos,sách đóng_gói_cẩn_thận giao_hàng_nhanh


In [8]:
def load_model(model_path):
    """Load a trained sentiment analysis model from file."""
    try:
        model = joblib.load(model_path)
        logging.info(f"Model loaded from {model_path}")
        return model
    except Exception as e:
        logging.error(f"Failed to load model: {e}")
        exit(1)

In [11]:
comments = [
    "Giao hàng nhanh, sản phẩm đúng như mô tả.",
    "Sách bị rách và giao sai hàng.",
    "Chất lượng ổn, giá cả hợp lý.",
    "Thái độ nhân viên không tốt.",
    "Đóng gói cẩn thận, rất hài lòng.",
]

# Initialize text processor
processor = TextProcessor(
    teencode_path=teencode_path,
    stopword_path=stopwords_path,
    phrase_mapping_path=phrases_path,
)

# Load model and preprocess input
model = load_model("models\\20250621_svm_model.pkl")

# Predict and display results
print("📦 Dự đoán cảm xúc:")
for cmt in comments:
    cleaned = processor.preprocess(cmt)
    label = model.predict([cleaned])[0]
    print(f'🗨️ "{cmt}" → 📌 {label}')

2025-06-21 22:41:18,451 - INFO - Model loaded from models\20250621_svm_model.pkl


📦 Dự đoán cảm xúc:
🗨️ "Giao hàng nhanh, sản phẩm đúng như mô tả." → 📌 pos
🗨️ "Sách bị rách và giao sai hàng." → 📌 neg
🗨️ "Chất lượng ổn, giá cả hợp lý." → 📌 pos
🗨️ "Thái độ nhân viên không tốt." → 📌 neg
🗨️ "Đóng gói cẩn thận, rất hài lòng." → 📌 pos
