<hr/>
HDBSCAN做關鍵詞分群<br>
Grid search最小分群群內元素量(設計範圍2~10；目前程式碼的range(2, 8)實際只搜尋2~7)<br>
基於歐式距離和餘弦距離的都要算，最後用Silhouette評估優劣<br>
把被HDBSCAN分到-1群組的noise群中元素，使用鄰近法插回其他群組<br>
最後有用Similar_Words_Dictionary.txt這個同義詞辭典把同義的詞彙插入同一個群集內<br>
每次的結果回存到clustering_results_hdbscan_0701.xlsx裡面的單一sheet<br>
*把列表丟回Excel時注意單一儲存格的限制最大字元數，不然會被強制切掉*<br>
<hr/>

In [None]:
import os
import random
import hdbscan
import warnings
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.preprocessing import normalize
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import pdist, squareform

# Cap OpenMP at two threads.
# NOTE(review): this assignment runs after numpy / sklearn / hdbscan have
# already been imported above — confirm the setting still takes effect, or
# move it ahead of those imports.
os.environ["OMP_NUM_THREADS"] = "2"

# Silence one specific warning message.
warnings.filterwarnings("ignore", message="HDBSCAN clustering with cosine metric is less stable than Euclidean metric")

# Load the saved English word vectors (word2vec text format).
english_word_vectors = KeyedVectors.load_word2vec_format("english_word_vectors.txt", binary=False)
# Alternative input file (presumably a small test subset — confirm):
#english_word_vectors = KeyedVectors.load_word2vec_format("test_150.txt", binary=False)
# Collect the vocabulary and the matching vectors, in the same order.
words = english_word_vectors.index_to_key
vectors = np.array([english_word_vectors[word] for word in words])

# Normalise the vectors so they are suitable for cosine distance.
normalized_vectors = normalize(vectors)

def get_chinese_translation(word, translations):
    """Return the translation stored for *word*, or *word* itself if absent.

    Args:
        word: The English word to look up.
        translations: Mapping from English word to its translation.

    Returns:
        The mapped translation when *word* is a key of *translations*;
        otherwise the original *word*, unchanged.
    """
    if word in translations:
        return translations[word]
    # No translation known: fall back to the original English word.
    return word

# Load the mapping between English words and their Chinese terms.
def load_translations(file_path):
    """Read a ``<chinese> | <english>`` file into an English -> Chinese dict.

    Each usable line holds exactly two fields separated by ``" | "``.  The
    English field has underscores replaced by spaces and becomes the key.
    The Chinese field is split on ``"."`` and the segment after the first dot
    is kept (e.g. ``"1. 蘋果"`` -> ``"蘋果"``); a Chinese field without any dot
    is kept whole instead of raising ``IndexError``.

    Lines without the separator, or with more than two fields, are reported
    on stdout and skipped.

    Args:
        file_path: Path of the UTF-8 encoded translation file.

    Returns:
        dict mapping the cleaned English word to the cleaned Chinese term.
    """
    translations = {}
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            stripped = line.strip()
            # The separator test is done on the raw line, as before.
            if " | " not in line:
                print(f"Skipping line due to missing separator: {stripped}")
                continue

            parts = stripped.split(" | ")
            if len(parts) != 2:
                print(f"Skipping line due to unexpected format: {stripped}")
                continue

            chinese, english = parts
            segments = chinese.split('.')
            # Keep the segment after the first dot; when the entry has no
            # dot at all, use the whole field rather than crashing.
            term = segments[1] if len(segments) > 1 else segments[0]
            translations[english.strip().replace("_", " ")] = term.strip()
    return translations

# English -> Chinese lookup table, used when the clusters are exported.
translations = load_translations("answer_all_TC_cleaned_split.txt")

# Parse "Similar_Words_Dictionary.txt" and build the similar-word dictionary.
def load_similar_words(file_path):
    """Read a ``key|syn1,syn2,...`` file into a dict of synonym lists.

    A line with one ``|`` maps the stripped key to the list of stripped,
    comma-separated entries after the bar.  A line without ``|`` maps its
    stripped text to an empty list.  Lines with more than one ``|`` are
    ignored.

    Args:
        file_path: Path of the UTF-8 encoded synonym file.

    Returns:
        dict mapping each key word to its list of similar words.
    """
    mapping = {}
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.strip().split("|")
            if len(fields) > 2:
                # More than one bar: not a recognised layout, skip it.
                continue
            head = fields[0].strip()
            if len(fields) == 2:
                mapping[head] = [entry.strip() for entry in fields[1].split(",")]
            else:
                mapping[head] = []
    return mapping

# Synonym dictionary read by compute_hdbscan_silhouette when it adds similar words to clusters.
similar_words = load_similar_words("Similar_Words_Dictionary.txt")

# Run HDBSCAN for one (min_cluster_size, metric) pair, score the labelling
# with Silhouette, and assemble the per-cluster word lists.
def compute_hdbscan_silhouette(min_cluster_size, metric):
    """Cluster the module-level word vectors and build word clusters.

    Reads the module-level ``words``, ``vectors``, ``normalized_vectors`` and
    ``similar_words``.

    Steps:
      1. Fit HDBSCAN.  For ``metric == 'cosine'`` a precomputed cosine
         distance matrix of ``normalized_vectors`` is used; any other metric
         is passed straight to HDBSCAN on ``vectors``.
      2. Compute cosine and Euclidean Silhouette scores on the raw HDBSCAN
         labels (the ``-1`` noise label is still present at this point).
      3. Reassign every noise point to the label of its nearest non-noise
         point (nearest neighbour search on ``vectors``).
      4. Build, per label, the list
         ``original members + ['-------'] + reassigned (forced) members``.
      5. For each key of ``similar_words`` found in a cluster, insert its
         synonyms into that cluster.

    Args:
        min_cluster_size: ``min_cluster_size`` passed to ``hdbscan.HDBSCAN``.
        metric: ``'cosine'`` or any metric name accepted by HDBSCAN.

    Returns:
        ``(silhouette_cosine, silhouette_euclidean, clusters)`` where both
        scores are ``-1`` when HDBSCAN produced a single label, and
        ``clusters`` maps each label to its list of words.
    """
    if metric == 'cosine':
        distance_matrix = squareform(pdist(normalized_vectors, metric='cosine'))
        clusterer = hdbscan.HDBSCAN(metric='precomputed', min_cluster_size=min_cluster_size)
        labels = clusterer.fit_predict(distance_matrix)
    else:
        clusterer = hdbscan.HDBSCAN(metric=metric, min_cluster_size=min_cluster_size)
        labels = clusterer.fit_predict(vectors)

    if len(set(labels)) > 1:
        silhouette_score_cosine = silhouette_score(normalized_vectors, labels, metric='cosine')
        silhouette_score_euclidean = silhouette_score(vectors, labels, metric='euclidean')
    else:
        # Only one label: Silhouette is undefined, report -1 for both.
        silhouette_score_cosine = -1
        silhouette_score_euclidean = -1

    # Assign noise points (label -1) to the nearest non-noise cluster.
    noise_indices = np.where(labels == -1)[0]
    non_noise_indices = np.where(labels != -1)[0]

    # kneighbors() rejects an empty query array, so the search only runs when
    # there is at least one noise point AND at least one clustered point.
    if len(noise_indices) > 0 and len(non_noise_indices) > 0:
        nbrs = NearestNeighbors(n_neighbors=1).fit(vectors[non_noise_indices])
        _, indices = nbrs.kneighbors(vectors[noise_indices])
        # indices[:, 0] indexes into the non-noise subset; map it back to
        # positions in the full array before reading the labels.
        labels[noise_indices] = labels[non_noise_indices[indices[:, 0]]]

    # Set lookup instead of scanning the index array once per word.
    noise_index_set = set(noise_indices.tolist())

    clusters = {}
    forced_words = set()
    for i, word in enumerate(words):
        word_clean = word.replace("_", " ")
        cluster_label = labels[i]
        if cluster_label not in clusters:
            clusters[cluster_label] = {'original': [], 'forced': []}
        if i in noise_index_set:
            clusters[cluster_label]['forced'].append(word_clean)
            forced_words.add(word_clean)
        else:
            clusters[cluster_label]['original'].append(word_clean)

    # Merge original and forced members, separated by a marker entry.
    for cluster_label in clusters:
        clusters[cluster_label] = clusters[cluster_label]['original'] + ['-------'] + clusters[cluster_label]['forced']

    # Insert similar words into the cluster that contains their key word.
    # NOTE(review): key_word is compared as-is against lower-cased cluster
    # words, so a dictionary key containing upper-case letters never matches
    # — confirm the dictionary keys are lower-case.
    for key_word, similars in similar_words.items():
        # Find the first cluster that contains key_word.
        target_cluster_label = None
        for cluster_label, cluster_words in clusters.items():
            cluster_words_clean = [w.strip().lower() for w in cluster_words]
            if key_word in cluster_words_clean:
                target_cluster_label = cluster_label
                break

        if target_cluster_label is None:
            continue

        target_words = clusters[target_cluster_label]
        for similar in similars:
            # Skip blank entries (e.g. from "a,,b" or a trailing comma)
            # without discarding the synonyms that follow them.
            if similar == '':
                continue
            if similar in target_words:
                continue
            if key_word in forced_words:
                # Key word was a reassigned noise point: insert right after
                # the '-------' marker.
                insert_position = target_words.index('-------') + 1
            else:
                # Key word is an original member: insert at the front.
                insert_position = 0
            target_words.insert(insert_position, similar)

    return silhouette_score_cosine, silhouette_score_euclidean, clusters

# Grid-search min_cluster_size for both metrics.  Every run is kept in
# `results`; the best run is tracked separately for each Silhouette criterion.
best_cosine_silhouette = -1
best_euclidean_silhouette = -1
best_cosine_clusters = None
best_euclidean_clusters = None
# Per-criterion winners, so the summary can report which configuration
# produced which best score.
best_cosine_min_cluster_size = None
best_cosine_metric = None
best_euclidean_min_cluster_size = None
best_euclidean_metric = None
# Kept for backward compatibility: holds the size of whichever criterion was
# updated last, so it does not identify a single "best" configuration.
best_min_cluster_size = None
results = []

for min_cluster_size in range(2, 8, 1):  # adjust the range as needed
    # Same order as before: cosine first, then euclidean.
    for run_metric in ('cosine', 'euclidean'):
        run_sil_cosine, run_sil_euclidean, run_clusters = compute_hdbscan_silhouette(min_cluster_size, run_metric)
        print(f"{run_metric.capitalize()} Metric - Min_Cluster_Size: {min_cluster_size}, Silhouette (Cosine): {run_sil_cosine}, Silhouette (Euclidean): {run_sil_euclidean}")
        results.append((min_cluster_size, run_metric, run_sil_cosine, run_sil_euclidean, run_clusters))

        if run_sil_cosine > best_cosine_silhouette:
            best_cosine_silhouette = run_sil_cosine
            best_cosine_clusters = run_clusters
            best_cosine_min_cluster_size = min_cluster_size
            best_cosine_metric = run_metric
            best_min_cluster_size = min_cluster_size
        if run_sil_euclidean > best_euclidean_silhouette:
            best_euclidean_silhouette = run_sil_euclidean
            best_euclidean_clusters = run_clusters
            best_euclidean_min_cluster_size = min_cluster_size
            best_euclidean_metric = run_metric
            best_min_cluster_size = min_cluster_size

# Report each criterion with the configuration that actually achieved it.
print(f"\nBest Silhouette (Cosine): {best_cosine_silhouette} (Metric: {best_cosine_metric}, Min_Cluster_Size: {best_cosine_min_cluster_size})")
print(f"Best Silhouette (Euclidean): {best_euclidean_silhouette} (Metric: {best_euclidean_metric}, Min_Cluster_Size: {best_euclidean_min_cluster_size})")

# Export every grid-search run to its own sheet of one workbook.
_OUTPUT_PATH = "clustering_results_hdbscan_0701.xlsx"
# Excel limits: at most 32,767 characters per cell and 31 per sheet name.
_EXCEL_MAX_CELL_CHARS = 32767
_EXCEL_MAX_SHEET_NAME_CHARS = 31
_MAX_WORDS_PER_CELL = 3000
_WORD_SEPARATOR = ', '


def _pack_words_into_cells(cell_words, max_words=_MAX_WORDS_PER_CELL, max_chars=_EXCEL_MAX_CELL_CHARS):
    """Join words into strings that each fit into one Excel cell.

    Words are packed in order; a new string is started whenever adding the
    next word would exceed *max_words* items or *max_chars* characters.
    A single word longer than *max_chars* still gets a cell of its own.

    Returns:
        List of joined strings (empty when *cell_words* is empty).
    """
    cells = []
    current = []
    current_len = 0
    for word in cell_words:
        extra = len(word) + (len(_WORD_SEPARATOR) if current else 0)
        if current and (len(current) >= max_words or current_len + extra > max_chars):
            cells.append(_WORD_SEPARATOR.join(current))
            current = []
            current_len = 0
            extra = len(word)
        current.append(word)
        current_len += extra
    if current:
        cells.append(_WORD_SEPARATOR.join(current))
    return cells


with pd.ExcelWriter(_OUTPUT_PATH) as writer:
    for min_cluster_size, metric, silhouette_cosine, silhouette_euclidean, clusters in results:
        # Collect plain rows and build the frame once, instead of growing a
        # DataFrame with pd.concat on every iteration.
        rows = []
        for cluster_label, cluster_words in clusters.items():
            if not cluster_words:
                continue
            translated_words = [get_chinese_translation(word, translations) for word in cluster_words]
            # One row per cell-sized piece, so no cell is cut off by Excel.
            for cell_text in _pack_words_into_cells(translated_words):
                rows.append({"Cluster_Label": cluster_label, "Words": cell_text})
        cluster_df = pd.DataFrame(rows, columns=["Cluster_Label", "Words"])

        # Compact name (C = cosine Silhouette, E = Euclidean Silhouette) so it
        # stays within Excel's sheet-name limit; the slice is a final guard.
        sheet_name = f"{metric.capitalize()}_{min_cluster_size}_C{silhouette_cosine:.4f}_E{silhouette_euclidean:.4f}"
        sheet_name = sheet_name[:_EXCEL_MAX_SHEET_NAME_CHARS]
        cluster_df.to_excel(writer, sheet_name=sheet_name, index=False)

print(f"Clustering results have been saved to {_OUTPUT_PATH}")
