In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score
import joblib
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [3]:
DATA_FOLDER = r"/home/check/DATA/university/yr3, hk1/DS102 - ML for Statistic/Đồ án/goodread/DS102/DATA/gold/v1/"
MODELS_FOLDER = r"/home/check/DATA/university/yr3, hk1/DS102 - ML for Statistic/Đồ án/goodread/DS102/models/LightGBM & Catboost/"
os.makedirs(MODELS_FOLDER, exist_ok=True)

TRAIN_FILE = DATA_FOLDER+"train_1.csv"
TEST_FILE = DATA_FOLDER+"test_1.csv"

target_cols = ['Commercial_success', 'Popular_success', 'Critical_success']

In [4]:
# Load data
train_df = pd.read_csv(os.path.join(DATA_FOLDER, TRAIN_FILE))
test_df = pd.read_csv(os.path.join(DATA_FOLDER, TEST_FILE))

X_train = train_df.drop(columns=target_cols)
y_train = train_df[target_cols]
X_test = test_df.drop(columns=target_cols)
y_test = test_df[target_cols]

# Shift nhãn 1-5 → 0-4
y_train_shift = y_train - 1
y_test_shift = y_test - 1

In [7]:
advanced_models = {
    "lightgbm": LGBMClassifier(
        n_estimators=500,
        max_depth=8,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        verbosity=-1
    ),
    "catboost": CatBoostClassifier(
        iterations=500,
        depth=8,
        learning_rate=0.05,
        random_seed=42,
        verbose=False,
        thread_count=-1
    )
}

In [9]:
for model_name, base_estimator in advanced_models.items():
    print(f"--- Training {model_name.upper()} ---")
    
    estimators = {}
    
    for i, col in enumerate(target_cols):
        print(f"   Training for {col}...")
        est = base_estimator.__class__(**base_estimator.get_params())
        est.fit(X_train, y_train_shift[col])
        estimators[col] = est
    
    # Predict
    y_pred_all = np.zeros((len(X_test), 3))
    f1_scores = {}
    for i, col in enumerate(target_cols):
        y_pred_shifted = estimators[col].predict(X_test)
        y_pred = y_pred_shifted.flatten() + 1  # ← SỬA TẠI ĐÂY: .flatten()
        y_pred_all[:, i] = y_pred
        
        f1 = f1_score(y_test[col], y_pred, average='macro')
        f1_scores[col] = round(f1, 4)
    
    avg_f1 = np.mean(list(f1_scores.values()))
    print(f"   F1 Scores: {f1_scores}")
    print(f"   → Avg F1-macro: {avg_f1:.4f}")
    
    # Lưu
    model_path = os.path.join(MODELS_FOLDER, f"processed_{model_name}_separate.joblib")
    joblib.dump(estimators, model_path)
    print(f"   → Đã lưu: {model_path}\n")

--- Training LIGHTGBM ---
   Training for Commercial_success...
   Training for Popular_success...
   Training for Critical_success...
   F1 Scores: {'Commercial_success': 0.9266, 'Popular_success': 0.7893, 'Critical_success': 0.7988}
   → Avg F1-macro: 0.8382
   → Đã lưu: /home/check/DATA/university/yr3, hk1/DS102 - ML for Statistic/Đồ án/goodread/DS102/models/LightGBM & Catboost/processed_lightgbm_separate.joblib

--- Training CATBOOST ---
   Training for Commercial_success...
   Training for Popular_success...
   Training for Critical_success...
   F1 Scores: {'Commercial_success': 0.9244, 'Popular_success': 0.8028, 'Critical_success': 0.7693}
   → Avg F1-macro: 0.8322
   → Đã lưu: /home/check/DATA/university/yr3, hk1/DS102 - ML for Statistic/Đồ án/goodread/DS102/models/LightGBM & Catboost/processed_catboost_separate.joblib

