In [1]:
import os
import pickle

import numpy as np
from scipy import sparse
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.mixture import GaussianMixture

# --- Load data ---
# Training labels. allow_pickle=True lets NumPy unpickle object-dtype arrays;
# NOTE(review): unpickling can execute code, so only load trusted files here.
y_train = np.load('../../data_use/tf_idf/y_train.npy', allow_pickle=True)

# Three stored feature representations of the training set
# (TF-IDF is kept in scipy sparse format; LSA and PCA are plain .npy arrays).
X_train_pca = np.load('../../data_use/pca/X_train_pca.npy')
X_train_lsa = np.load('../../data_use/lsa/X_train_lsa.npy')
X_train_tfidf = sparse.load_npz('../../data_use/tf_idf/X_train_tfidf.npz')

# Number of clusters = number of distinct label values present in y_train.
n_clusters = np.unique(y_train).size
print(f"\n Số cụm (k): {n_clusters}")


 Số cụm (k): 5


In [2]:
# Feature spaces to cluster. The sparse TF-IDF matrix is densified because
# GaussianMixture does not accept sparse input (this can be memory-heavy).
datasets = {
    'TF-IDF': X_train_tfidf.toarray(),
    'LSA': X_train_lsa,
    'PCA': X_train_pca
}

# Make sure the output directory exists BEFORE the expensive fits: otherwise a
# missing folder would only surface at pickle time, after all work is done,
# and the computed results would be lost.
results_path = '../../data_use/cluster/gmm_results.pkl'
os.makedirs(os.path.dirname(results_path), exist_ok=True)

# Fit one GMM per feature space and collect the fitted model, the hard cluster
# assignments, and three internal (label-free) clustering quality metrics.
results = {}
for name, X in datasets.items():
    print(f"\nĐang huấn luyện GMM trên {name}...")
    # 'tied' = all components share a single covariance matrix;
    # random_state is fixed so the initialisation is reproducible.
    model = GaussianMixture(n_components=n_clusters, covariance_type='tied', random_state=42)
    labels = model.fit_predict(X)

    results[name] = {
        'model': model,
        'labels': labels,
        'silhouette': silhouette_score(X, labels),                # higher is better
        'davies_bouldin': davies_bouldin_score(X, labels),        # lower is better
        'calinski_harabasz': calinski_harabasz_score(X, labels)   # higher is better
    }

# Summary of the metrics for each feature space.
print("\nKẾT QUẢ GMM:")
for name, r in results.items():
    print(f"\n{name}:")
    print(f"   Silhouette Score      : {r['silhouette']:.4f}")
    print(f"   Davies-Bouldin Index  : {r['davies_bouldin']:.4f}")
    print(f"   Calinski-Harabasz     : {r['calinski_harabasz']:.4f}")

# Persist models, labels and scores together for later cells/notebooks.
with open(results_path, 'wb') as f:
    pickle.dump(results, f)

print(f"\n Đã lưu kết quả tại: {results_path}")


Đang huấn luyện GMM trên TF-IDF...

Đang huấn luyện GMM trên LSA...

Đang huấn luyện GMM trên PCA...

KẾT QUẢ GMM:

TF-IDF:
   Silhouette Score      : 0.0134
   Davies-Bouldin Index  : 8.0872
   Calinski-Harabasz     : 14.7522

LSA:
   Silhouette Score      : 0.0150
   Davies-Bouldin Index  : 6.5319
   Calinski-Harabasz     : 19.9547

PCA:
   Silhouette Score      : 0.0016
   Davies-Bouldin Index  : 5.2279
   Calinski-Harabasz     : 8.2362

 Đã lưu kết quả tại: ../../data_use/cluster/gmm_results.pkl
