In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, f1_score
import joblib

In [None]:

# Input data folder (DATA/gold/v1) and output folder for the trained classical models.
# NOTE(review): these are machine-specific absolute paths; adjust them before
# running the notebook on another machine.
DATA_FOLDER = r"/home/check/DATA/university/yr3, hk1/DS102 - ML for Statistic/Đồ án/goodread/DS102/DATA/gold/v1/"
MODELS_FOLDER = r"/home/check/DATA/university/yr3, hk1/DS102 - ML for Statistic/Đồ án/goodread/DS102/models/tradition"
os.makedirs(MODELS_FOLDER, exist_ok=True)

# Build the CSV paths with os.path.join so they stay valid whether or not
# DATA_FOLDER ends with a path separator.
TRAIN_FILE = os.path.join(DATA_FOLDER, "train_1.csv")
TEST_FILE = os.path.join(DATA_FOLDER, "test_1.csv")

# Label columns that are predicted jointly (multi-output classification).
target_cols = ['Commercial_success', 'Popular_success', 'Critical_success']

In [3]:
# TRAIN_FILE / TEST_FILE already hold the complete paths, so read them directly
# instead of joining them with DATA_FOLDER a second time (the extra join only
# worked because os.path.join drops earlier parts when given an absolute path).
train_df = pd.read_csv(TRAIN_FILE)
test_df = pd.read_csv(TEST_FILE)

In [4]:
# Separate the label columns (target_cols) from the remaining feature columns
# for the train split, then for the test split.
X_train, y_train = train_df.drop(columns=target_cols), train_df[target_cols]
X_test, y_test = test_df.drop(columns=target_cols), test_df[target_cols]

In [5]:
# Sanity check: show the shape of both feature matrices.
train_shape, test_shape = X_train.shape, X_test.shape
print(f"Train: {train_shape}, Test: {test_shape}")

Train: (1641, 5021), Test: (417, 5021)


In [6]:
# Copies of the label frames with every value lowered by 1, for the XGBoost run.
# NOTE(review): presumably the labels start at 1, so this yields 0-based class
# ids — confirm against the data.
y_train_xgb = y_train.sub(1)
y_test_xgb = y_test.sub(1)

In [8]:
# Candidate estimators. Each one is wrapped in MultiOutputClassifier below,
# which fits one classifier per target column.
models = {
    "logistic": LogisticRegression(max_iter=1000, class_weight='balanced', n_jobs=-1),
    "rf": RandomForestClassifier(
        n_estimators=400,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ),
    "svm": SVC(class_weight='balanced'),
    "xgb": XGBClassifier(
        n_estimators=400,
        max_depth=7,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )
}

# results[name] maps each target column (plus 'avg') to its macro-F1 on the
# test split, rounded to 4 decimals.
results = {}

print("\nBắt đầu training...\n")

for name, estimator in models.items():
    print(f"--- Training {name.upper()} ---")

    model = MultiOutputClassifier(estimator)

    # Only XGBoost is trained on the shifted labels (y_train_xgb).
    # NOTE(review): the XGB model saved below therefore predicts shifted
    # labels; whoever loads it must add 1 back, exactly as done here.
    if name == "xgb":
        model.fit(X_train, y_train_xgb)
        y_pred = model.predict(X_test) + 1  # shift back to the original 1-5 labels
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    # Macro-F1 per target. The raw scores are kept unrounded so the average is
    # computed from exact values; rounding happens only when storing.
    raw_f1 = {
        col: f1_score(y_test.iloc[:, i], y_pred[:, i], average='macro')
        for i, col in enumerate(target_cols)
    }
    f1_scores = {col: round(score, 4) for col, score in raw_f1.items()}

    avg_f1 = np.mean(list(raw_f1.values()))
    f1_scores['avg'] = round(avg_f1, 4)
    results[name] = f1_scores

    # Persist the fitted multi-output model.
    model_path = os.path.join(MODELS_FOLDER, f"processed_{name}.joblib")
    joblib.dump(model, model_path)
    print(f"   → Lưu: {model_path}")

    print(f"   Commercial: {f1_scores['Commercial_success']:.3f} | "
          f"Popular: {f1_scores['Popular_success']:.3f} | "
          f"Critical: {f1_scores['Critical_success']:.3f} | "
          f"Avg: {avg_f1:.3f}")


Bắt đầu training...

--- Training LOGISTIC ---




   → Lưu: /home/check/DATA/university/yr3, hk1/DS102 - ML for Statistic/Đồ án/goodread/DS102/models/tradition/processed_logistic.joblib
   Commercial: 0.684 | Popular: 0.615 | Critical: 0.681 | Avg: 0.660
--- Training RF ---
   → Lưu: /home/check/DATA/university/yr3, hk1/DS102 - ML for Statistic/Đồ án/goodread/DS102/models/tradition/processed_rf.joblib
   Commercial: 0.805 | Popular: 0.554 | Critical: 0.604 | Avg: 0.655
--- Training SVM ---
   → Lưu: /home/check/DATA/university/yr3, hk1/DS102 - ML for Statistic/Đồ án/goodread/DS102/models/tradition/processed_svm.joblib
   Commercial: 0.701 | Popular: 0.595 | Critical: 0.653 | Avg: 0.650
--- Training XGB ---
   → Lưu: /home/check/DATA/university/yr3, hk1/DS102 - ML for Statistic/Đồ án/goodread/DS102/models/tradition/processed_xgb.joblib
   Commercial: 0.907 | Popular: 0.774 | Critical: 0.744 | Avg: 0.809


In [9]:
# Write the comparison table to a text file next to the saved models.
results_txt = os.path.join(MODELS_FOLDER, "processed_models_results.txt")
with open(results_txt, "w", encoding="utf-8") as report_file:
    # Title, column header and separator row.
    report_file.write("KẾT QUẢ SAU KHI XỬ LÝ ĐỦ (log, ordinal, onehot, TF-IDF, sentiment)\n\n")
    report_file.write("Model      | Commercial | Popular   | Critical  | Avg F1\n")
    report_file.write("-----------|------------|-----------|-----------|-------\n")
    # One row per model, in the order the models were trained.
    report_file.writelines(
        f"{model_name.upper():10} | {model_scores['Commercial_success']:.3f}     | "
        f"{model_scores['Popular_success']:.3f}     | {model_scores['Critical_success']:.3f}     | {model_scores['avg']:.3f}\n"
        for model_name, model_scores in results.items()
    )

# Echo the same summary in the notebook output.
print(f"\n=== TỔNG KẾT ===")
for model_name, model_scores in results.items():
    commercial = model_scores['Commercial_success']
    popular = model_scores['Popular_success']
    critical = model_scores['Critical_success']
    print(f"{model_name.upper():10} | {commercial:.3f} | {popular:.3f} | {critical:.3f} | {model_scores['avg']:.3f}")

print(f"\nHoàn tất! Kết quả lưu tại: {results_txt}")


=== TỔNG KẾT ===
LOGISTIC   | 0.684 | 0.615 | 0.681 | 0.660
RF         | 0.805 | 0.554 | 0.604 | 0.655
SVM        | 0.701 | 0.595 | 0.653 | 0.650
XGB        | 0.907 | 0.774 | 0.744 | 0.809

Hoàn tất! Kết quả lưu tại: /home/check/DATA/university/yr3, hk1/DS102 - ML for Statistic/Đồ án/goodread/DS102/models/tradition/processed_models_results.txt
