In [2]:
# Restore the two variables previously persisted with IPython's %store magic.
%store -r cluster_labels cluster_embeddings
# Display both lengths side by side so a mismatch between labels and embeddings is visible.
len(cluster_labels), len(cluster_embeddings)

(778, 778)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

def split_input(X, y, test_size=0.2, random_state=42, stratify=None):
    """
    Split X and y into train and test partitions.

    - test_size, random_state, stratify: forwarded unchanged to
      sklearn's train_test_split (pass stratify=y to keep label proportions).

    Returns a 4-tuple ordered as (X_train, y_train, X_test, y_test); note that
    this differs from train_test_split's own ordering.
    """
    parts = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )
    features_train, features_test, target_train, target_test = parts
    # Regroup so that each split's features and targets sit next to each other.
    return features_train, target_train, features_test, target_test

def encode_labels(labels):
    """
    Encode labels of any type (e.g. strings) as integers.

    Returns (encoded_labels, encoder), where encoder is the fitted
    LabelEncoder needed later to map the integers back.
    """
    label_codec = LabelEncoder()
    return label_codec.fit_transform(labels), label_codec

def decode_labels(encoded_labels, encoder):
    """
    Map integer-encoded labels back to their original values.

    - encoded_labels: the encoded values to convert.
    - encoder: object whose inverse_transform performs the conversion.
    """
    original_labels = encoder.inverse_transform(encoded_labels)
    return original_labels

# 1. Encode the cluster labels as integers; keep the fitted encoder for decoding.
encoded_labels, label_encoder = encode_labels(cluster_labels)

# 2. Train/test split, stratified on the encoded labels to keep label proportions.
X_train, y_train, X_test, y_test = split_input(
    X=cluster_embeddings,
    y=encoded_labels,
    test_size=0.2,
    random_state=42,
    stratify=encoded_labels,
)

# 3. Original (decoded) labels for each split, for when they are needed.
y_train_decoded = decode_labels(encoded_labels=y_train, encoder=label_encoder)
y_test_decoded = decode_labels(encoded_labels=y_test, encoder=label_encoder)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import numpy as np

def train(model, X_train, y_train, X_test, y_test):
    """
    Fit `model` on the training split and print its test-split metrics.

    Prints the model's class name followed by accuracy and weighted-average
    precision, recall and F1 computed on (X_test, y_test).
    Returns the fitted model.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)

    print(model.__class__.__name__)
    print('Accuracy: ', accuracy, end='  ')
    precision = precision_score(y_test, predictions, average='weighted')
    print('Precision: ', precision)
    recall = recall_score(y_test, predictions, average='weighted')
    print('Recall: ', recall, end='  ')
    f1 = f1_score(y_test, predictions, average='weighted')
    print('F1 Score: ', f1)
    return model

# Balanced class weights computed from the training labels, keyed by class.
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weight_dict = dict(zip(classes, class_weights))
# To train without class weighting, set: class_weight_dict = None

if class_weight_dict:
    # Estimators that accept class_weight directly.
    models = [
        DecisionTreeClassifier(random_state=42, class_weight=class_weight_dict),
        RandomForestClassifier(random_state=42, class_weight=class_weight_dict),
        LogisticRegression(random_state=42, class_weight=class_weight_dict, max_iter=1000),
        SVC(random_state=42, class_weight=class_weight_dict)
    ]
else:
    # Imported here because the notebook only imports these two in a later
    # cell; without this the unweighted branch fails with NameError.
    from catboost import CatBoostClassifier
    from xgboost import XGBClassifier

    models = [
        DecisionTreeClassifier(random_state=42),
        RandomForestClassifier(random_state=42),
        LogisticRegression(random_state=42, max_iter=1000),
        SVC(random_state=42),

        GradientBoostingClassifier(random_state=42),
        CatBoostClassifier(verbose=0, random_state=42),
        XGBClassifier(random_state=42)
    ]

In [8]:
# Display the balanced weights; each entry is paired with the same-position entry of `classes`.
class_weights

array([4.31944444, 0.52006689, 0.70681818, 2.32089552])

In [None]:
# Train and report every model in `models`.
for model in models:
    lacks_class_weight = not hasattr(model, 'class_weight')
    # Warn when class weighting is requested but the estimator has no class_weight attribute.
    if class_weight_dict and lacks_class_weight:
        print(f"Chú ý: {model.__class__.__name__} không hỗ trợ class_weight trực tiếp.")
    # train() returns the same estimator it was given, so no rebinding is needed.
    train(model, X_train, y_train, X_test, y_test)

DecisionTreeClassifier
Accuracy:  0.5897435897435898  Precision:  0.610692496853211
Recall:  0.5897435897435898  F1 Score:  0.5983013647856188


RandomForestClassifier
Accuracy:  0.6730769230769231  Precision:  0.6789940828402367
Recall:  0.6730769230769231  F1 Score:  0.674087024087024


LogisticRegression
Accuracy:  0.6987179487179487  Precision:  0.7543910396426893
Recall:  0.6987179487179487  F1 Score:  0.7119653553948425


SVC
Accuracy:  0.6666666666666666  Precision:  0.7195556016310733
Recall:  0.6666666666666666  F1 Score:  0.6761916035353536




In [7]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

def train_with_oversampling(model, X_train, y_train, X_test, y_test):
    """
    Resample the training split with RandomOverSampler, then delegate to train().

    Only (X_train, y_train) are resampled; the test split is passed through
    untouched. Prints a "<ModelName> (with oversampling)" header first and
    returns whatever train() returns.
    """
    oversampler = RandomOverSampler(random_state=42)
    X_balanced, y_balanced = oversampler.fit_resample(X_train, y_train)
    print(f'{model.__class__.__name__} (with oversampling)')
    fitted_model = train(model, X_balanced, y_balanced, X_test, y_test)
    return fitted_model

# Boosting models trained here on an oversampled training split.
models_without_class_weight = [
    GradientBoostingClassifier(random_state=42),
    XGBClassifier(random_state=42),
    CatBoostClassifier(verbose=0, random_state=42),
]

for model in models_without_class_weight:
    # The helper returns the estimator it was given, so rebinding is unnecessary.
    train_with_oversampling(model, X_train, y_train, X_test, y_test)
    print('\n')

GradientBoostingClassifier (with oversampling)


GradientBoostingClassifier
Accuracy:  0.7371794871794872  Precision:  0.7452818202818202
Recall:  0.7371794871794872  F1 Score:  0.7404618242495759


XGBClassifier (with oversampling)
XGBClassifier
Accuracy:  0.6538461538461539  Precision:  0.6670774398151901
Recall:  0.6538461538461539  F1 Score:  0.6585562549347596


CatBoostClassifier (with oversampling)
CatBoostClassifier
Accuracy:  0.6666666666666666  Precision:  0.6964739337030359
Recall:  0.6666666666666666  F1 Score:  0.677119923087665




In [1]:
from sklearn.utils.class_weight import compute_sample_weight

# Balanced per-sample weights for the notebook-level training labels. Kept at
# module level for any cell that still reads `sample_weight`.
sample_weight = compute_sample_weight(class_weight='balanced', y=y_train)

def train(model, X_train, y_train, X_test, y_test, sample_weight=None, encoder=None):
    """
    Fit `model` with per-sample weights and print its test-split evaluation.

    NOTE: this redefines the `train` helper from the earlier cell; after this
    cell runs, every caller of `train` gets this version.

    - sample_weight: weights passed to model.fit. When None, balanced weights
      are computed from the `y_train` given to this call, so they always match
      the training set actually used (e.g. an oversampled one).
    - encoder: fitted label encoder used to decode predictions for the report.
      When None, the notebook-level `label_encoder` is used.

    Prints accuracy and macro-averaged precision/recall/F1, then a per-class
    classification report on the decoded labels. Returns the fitted model.
    """
    # Local import: classification_report is not imported anywhere else in the notebook.
    from sklearn.metrics import classification_report

    if sample_weight is None:
        sample_weight = compute_sample_weight(class_weight='balanced', y=y_train)
    if encoder is None:
        encoder = label_encoder

    model.fit(X_train, y_train, sample_weight=sample_weight)
    y_pred = model.predict(X_test)

    print(f"\n==== {model.__class__.__name__} ====")
    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred, average='macro'))
    print('Recall:', recall_score(y_test, y_pred, average='macro'))
    print('F1 Score:', f1_score(y_test, y_pred, average='macro'))

    # Decode back to the original labels so the report shows readable class names.
    y_pred_labels = encoder.inverse_transform(y_pred.astype(int))
    y_test_labels = encoder.inverse_transform(y_test)

    # Per-class breakdown (accuracy on decoded labels equals the value printed above).
    print("Classification Report:\n", classification_report(y_test_labels, y_pred_labels))

    return model

# Boosting models trained with sample weights (this rebinds the notebook-level `models`).
models = [
    GradientBoostingClassifier(random_state=42),
    # NOTE(review): 'logloss' is the binary metric name; for a multi-class
    # target 'mlogloss' is probably what is intended — confirm.
    XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
]

# Train and evaluate each model. The call previously omitted the four data
# arguments and failed with a TypeError.
for model in models:
    trained_model = train(model, X_train, y_train, X_test, y_test)

NameError: name 'y_train' is not defined

### Hyperparameter tuning with Optuna (RandomForest)

In [None]:
import optuna

def objective(trial):
    """
    Optuna objective: mean cross-validated accuracy of a RandomForest.

    The hyperparameters are sampled from `trial`. Scoring uses stratified
    5-fold cross-validation on the training split only; the test split is
    deliberately not touched here, so that it stays an unbiased estimate for
    the final model (selecting hyperparameters on X_test leaks test data).

    Returns the mean fold accuracy as a float (higher is better).
    """
    # Local import: these names are not imported elsewhere in the notebook.
    from sklearn.model_selection import StratifiedKFold, cross_val_score

    rf = RandomForestClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 300, step=50),
        max_depth=trial.suggest_int("max_depth", 5, 50, step=5),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10),
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 5),
        max_features=trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
        random_state=42,
        n_jobs=-1
    )

    # Fixed, shuffled folds so every trial is scored on the same partitions.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = cross_val_score(rf, X_train, y_train, cv=folds, scoring="accuracy")
    return float(fold_scores.mean())

# joblib is needed for the save step below and is not imported elsewhere in the notebook.
import joblib

# Create and run the Optuna study. The sampler is seeded so that rerunning the
# notebook explores the same sequence of trials.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Report the best result. The objective optimises accuracy, so label it as such.
print("Best Parameters:", study.best_params)
print("Best Accuracy Score:", study.best_value)

# Refit on the full training split with the best parameters and evaluate on the test split.
best_model_rf = RandomForestClassifier(**study.best_params, random_state=42, n_jobs=-1)
best_model_rf.fit(X_train, y_train)
y_pred = best_model_rf.predict(X_test)

print("*" * 100)
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, average='macro'))
print('Recall:', recall_score(y_test, y_pred, average='macro'))
print('F1 Score:', f1_score(y_test, y_pred, average='macro'))


# Save the tuned model
joblib.dump(best_model_rf, 'best_model_rf.pkl')