In [19]:
import numpy as np
import joblib
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter

In [6]:
# Peek at the array names stored in the training archive.
# np.load on a .npz returns a lazy NpzFile that keeps the file open; the
# context manager closes it as soon as the listing has been printed.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
with np.load("/Users/ttdat/Documents/Do_An_Co_So/code/code_final/data_split/train_embeddings.npz") as train:
    print(train.files)

['features', 'labels']


In [7]:
# Peek at the array names stored in the test archive.
# np.load on a .npz returns a lazy NpzFile that keeps the file open; the
# context manager closes it as soon as the listing has been printed.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
with np.load("/Users/ttdat/Documents/Do_An_Co_So/code/code_final/data_split/test_embeddings.npz") as test:
    print(test.files)

['features', 'labels']


In [8]:
# Load the train/test embedding archives into in-memory arrays.
# Indexing an NpzFile reads the requested array from disk, so X_* / y_* remain
# valid after each `with` block closes its archive (avoids leaking file handles).
# NOTE(review): absolute, machine-specific paths - consider a configurable data directory.
with np.load("/Users/ttdat/Documents/Do_An_Co_So/code/code_final/data_split/train_embeddings.npz") as train:
    X_train, y_train = train["features"], train["labels"]

with np.load("/Users/ttdat/Documents/Do_An_Co_So/code/code_final/data_split/test_embeddings.npz") as test:
    X_test, y_test = test["features"], test["labels"]

# Fail early, with a readable message, if features and labels are misaligned.
assert len(X_train) == len(y_train), "train features/labels have different row counts"
assert len(X_test) == len(y_test), "test features/labels have different row counts"

In [21]:
# Optuna objective: cross-validated accuracy on the training set
def objective(trial):
    """Score one random-forest configuration sampled from ``trial``.

    Args:
        trial: Optuna trial used to sample the hyperparameters
            (``n_estimators``, ``max_depth``, ``max_features``,
            ``min_samples_split``, ``class_weight``).

    Returns:
        Mean accuracy over a 3-fold cross-validation of the classifier on
        the notebook-level ``X_train`` / ``y_train`` arrays.
    """
    # Sample the search space; the suggest calls run in this exact order.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 300),
        "max_depth": trial.suggest_int("max_depth", 2, 20),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "class_weight": trial.suggest_categorical("class_weight", [None, "balanced"]),
    }

    # Fixed random_state so a given configuration always builds the same forest.
    forest = RandomForestClassifier(random_state=42, n_jobs=-1, **params)

    fold_scores = cross_val_score(forest, X_train, y_train, cv=3, scoring='accuracy')
    return fold_scores.mean()

In [22]:

# Tune hyperparameters with Optuna on the training set.
# The sampler is seeded so that a fresh "Restart & Run All" explores the same
# sequence of trials and selects the same best configuration. TPESampler is
# the sampler Optuna picks by default for single-objective studies, so passing
# it explicitly only pins the seed.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

[I 2025-05-15 22:04:23,655] A new study created in memory with name: no-name-cd19335c-5e9d-48b5-8afa-e906540a0884
[I 2025-05-15 22:04:24,213] Trial 0 finished with value: 0.9663989856297549 and parameters: {'n_estimators': 166, 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_split': 9, 'class_weight': None}. Best is trial 0 with value: 0.9663989856297549.
[I 2025-05-15 22:04:24,394] Trial 1 finished with value: 0.9841856861087631 and parameters: {'n_estimators': 59, 'max_depth': 7, 'max_features': 'log2', 'min_samples_split': 4, 'class_weight': 'balanced'}. Best is trial 1 with value: 0.9841856861087631.
[I 2025-05-15 22:04:24,849] Trial 2 finished with value: 0.9683713722175261 and parameters: {'n_estimators': 147, 'max_depth': 13, 'max_features': 'sqrt', 'min_samples_split': 7, 'class_weight': None}. Best is trial 1 with value: 0.9841856861087631.
[I 2025-05-15 22:04:34,354] Trial 3 finished with value: 0.9683948530102376 and parameters: {'n_estimators': 151, 'max_depth': 14, '

In [23]:
# Refit the best configuration found by the study on the full training set
best_params = study.best_params
best_rf = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    **best_params,
)
best_rf.fit(X_train, y_train)

In [24]:
# Evaluate the refitted model on the test set
y_pred = best_rf.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("🎯 Accuracy trên tập test:", test_accuracy)

# zero_division=0 reports 0 for undefined per-class metrics
per_class_report = classification_report(y_test, y_pred, zero_division=0)
print(per_class_report)

🎯 Accuracy trên tập test: 1.0
              precision    recall  f1-score   support

    Duy Khôi       1.00      1.00      1.00         1
        Dũng       1.00      1.00      1.00         1
       Dương       1.00      1.00      1.00        13
        Hiếu       1.00      1.00      1.00        19
          Hà       1.00      1.00      1.00         1
        Hưng       1.00      1.00      1.00         1
        Khôi       1.00      1.00      1.00         1
        Linh       1.00      1.00      1.00        29
        Luân       1.00      1.00      1.00         2
        Lành       1.00      1.00      1.00         4
       Nghĩa       1.00      1.00      1.00         2
      Nguyên       1.00      1.00      1.00         7
   Nhật Tiến       1.00      1.00      1.00         1
         Phú       1.00      1.00      1.00         3
      Phương       1.00      1.00      1.00        14
       Quang       1.00      1.00      1.00        16
        Quân       1.00      1.00      1.00        

In [25]:
# Prediction error statistics: collect every (true label, predicted label) pair that disagrees
wrong = [pair for pair in zip(y_test, y_pred) if pair[0] != pair[1]]
confusion_counts = Counter(wrong)

print(f"[!] Tổng số mẫu dự đoán sai: {len(wrong)}")
# Show the five most frequent (true, predicted) confusions
print("Một số lỗi phổ biến:", confusion_counts.most_common(5))

[!] Tổng số mẫu dự đoán sai: 0
Một số lỗi phổ biến: []


In [26]:
# Leakage check: count the feature rows that occur in both train and test.
# Each row is viewed as one structured element (one unnamed field per column)
# so that np.intersect1d compares whole rows rather than individual values.
# Only exactly equal rows are counted as shared.
train_row_dtype = [('', X_train.dtype)] * X_train.shape[1]
test_row_dtype = [('', X_test.dtype)] * X_test.shape[1]

train_rows = X_train.view(train_row_dtype)
test_rows = X_test.view(test_row_dtype)

shared_rows = np.intersect1d(train_rows, test_rows)
len(shared_rows)

0