In [1]:
import torch
import numpy as np
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from tqdm import tqdm  
from sklearn.cluster import MiniBatchKMeans

In [9]:
# 数据准备
target = 1
data_name = ['0618', '0854', '1066'][target - 1]
features_bgr1x1 = np.load(f'./Features/{data_name}_BGR1x1_featuers.npy')
features_hsv1x1 = np.load(f'./Features/{data_name}_HSV1x1_features.npy')
features_bgr3x3 = np.load(f'./Features/{data_name}_BGR3x3_featuers.npy')
features_hsv3x3 = np.load(f'./Features/{data_name}_HSV3x3_features.npy')
features_resnet = np.load(f'./Features/{data_name}_resnet_features.npy').squeeze().reshape(-1, 1)
features_hog = np.load(f'./Features/{data_name}_HOG_featuers.npy')
features_gist = np.load(f'./Features/{data_name}_GIST_featuers.npy').reshape(-1, 1)
features_pca = np.load(f'./Features/{data_name}_PCA_features.npy')

# 将数据从 NumPy 转换为 PyTorch 张量，并将其转移到 GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
features_bgr1x1 = torch.tensor(features_bgr1x1, dtype=torch.float32, device=device)
features_hsv1x1 = torch.tensor(features_hsv1x1, dtype=torch.float32, device=device)
features_bgr3x3 = torch.tensor(features_bgr3x3, dtype=torch.float32, device=device)
features_hsv3x3 = torch.tensor(features_hsv3x3, dtype=torch.float32, device=device)
features_resnet = torch.tensor(features_resnet, dtype=torch.float32, device=device)
features_hog = torch.tensor(features_hog, dtype=torch.float32, device=device)
features_gist = torch.tensor(features_gist, dtype=torch.float32, device=device)
features_pca = torch.tensor(features_pca, dtype=torch.float32, device=device)


In [3]:
def optimal_c_value(features, max_c=15):
    sse = []  # 存储SSE（聚类内误差平方和）
    
    # 计算不同C值下的SSE，Elbow方法选择合适的C值
    for c in tqdm(range(2, max_c + 1), desc="Calculating optimal C"):
        kmeans = MiniBatchKMeans(n_clusters=c, random_state=42, batch_size=200)
        kmeans.fit(features.cpu().numpy())  # 转换为 CPU 上的 NumPy 数组
        sse.append(kmeans.inertia_)  # 读取当前聚类的SSE存入列表

    # # 绘制 Elbow 图，选择拐点
    # plt.plot(range(2, max_c + 1), sse)
    # plt.xlabel('Number of clusters (C)')
    # plt.ylabel('SSE')
    # plt.title('Elbow Method For Optimal C')
    # plt.show()

    # 找到 Elbow 的最佳 C 值
    optimal_c = np.argmin(np.diff(sse)) + 2  # 选择拐点的前一个值
    return optimal_c

def Kmeans(features, optimal_c):
    # 使用 PyTorch 进行 GPU 加速的 KMeans 聚类
    features_gpu = features.to(device)
    num_samples, num_features = features_gpu.shape
    kmeans = KMeans(n_clusters=optimal_c, random_state=42)
    labels = kmeans.fit_predict(features_gpu.cpu().numpy())  # 将数据从 GPU 转回 CPU 进行处理

    centroids = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32, device=device)

    with_d, between_d, cluster_score = cluster_eval(features_gpu, labels, optimal_c, centroids)
    print("features.shape: ", features.shape)
    print('类内间距: ', with_d)
    print('类外间距: ', between_d)
    print('聚类效果评价参数: ', cluster_score)

    return kmeans, labels

def cluster_eval(data, labels, optimal_c, centroids):
    K = optimal_c

    # 计算类内间距
    within_distances = []
    for i in range(K):
        cluster_points = data[labels == i]
        centroid = centroids[i]

        # 计算每个点到中心的距离并求和
        distance = torch.sum(torch.norm(cluster_points - centroid, dim=1))  # 使用 PyTorch 的 norm 函数
        within_distances.append(distance)

    total_within_distance = torch.mean(torch.tensor(within_distances, device=device)).item()

    # 计算类外间距
    between_distances = []
    for i in range(K):
        for j in range(i + 1, K):
            distance = torch.norm(centroids[i] - centroids[j])  # 计算中心点之间的距离
            between_distances.append(distance.item())

    total_between_distance = torch.mean(torch.tensor(between_distances, device=device)).item()

    cluster_score = total_between_distance / total_within_distance

    return total_within_distance, total_between_distance, cluster_score

In [10]:
# 聚类分析
print("BGR1x1特征聚类分析:")
optimal_c_bgr1x1 = optimal_c_value(features_bgr1x1)
kmeans_bgr1x1, labels_bgr1x1 = Kmeans(features_bgr1x1, optimal_c_bgr1x1)
print(f"optimal C-value with BGR1x1: {optimal_c_bgr1x1}")

print("HSV1x1特征聚类分析:")
optimal_c_bgr1x1 = optimal_c_value(features_bgr1x1)
kmeans_bgr1x1, labels_bgr1x1 = Kmeans(features_bgr1x1, optimal_c_bgr1x1)
print(f"optimal C-value with HSV1x1: {optimal_c_bgr1x1}")

print("BGR3x3特征聚类分析:")
optimal_c_bgr3x3 = optimal_c_value(features_bgr3x3)
kmeans_bgr3x3, labels_bgr3x3 = Kmeans(features_bgr3x3, optimal_c_bgr3x3)
print(f"optimal C-value with BGR3x3: {optimal_c_bgr3x3}")

print("HSV3x3特征聚类分析:")
optimal_c_hsv3x3 = optimal_c_value(features_hsv3x3)
kmeans_hsv3x3, labels_hsv3x3 = Kmeans(features_hsv3x3, optimal_c_hsv3x3)
print(f"optimal C-value with HSV3x3: {optimal_c_hsv3x3}")


BGR1x1特征聚类分析:


Calculating optimal C: 100%|█| 14/14 [00:00<00:00, 152.61it/


features.shape:  torch.Size([1250, 3])
类内间距:  24632.939453125
类外间距:  224.63951110839844
聚类效果评价参数:  0.009119476444777282
optimal C-value with BGR1x1: 2
HSV1x1特征聚类分析:


Calculating optimal C: 100%|█| 14/14 [00:00<00:00, 184.59it/


features.shape:  torch.Size([1250, 3])
类内间距:  24632.939453125
类外间距:  224.63951110839844
聚类效果评价参数:  0.009119476444777282
optimal C-value with HSV1x1: 2
BGR3x3特征聚类分析:


Calculating optimal C: 100%|█| 14/14 [00:00<00:00, 106.60it/


features.shape:  torch.Size([1250, 27])
类内间距:  78595.171875
类外间距:  648.260986328125
聚类效果评价参数:  0.008248101898156515
optimal C-value with BGR3x3: 2
HSV3x3特征聚类分析:


Calculating optimal C: 100%|█| 14/14 [00:00<00:00, 130.98it/

features.shape:  torch.Size([1250, 27])
类内间距:  107475.4609375
类外间距:  389.54180908203125
聚类效果评价参数:  0.0036244720951563144
optimal C-value with HSV3x3: 2





In [5]:
print("HOG特征聚类分析:")
optimal_c_hog = optimal_c_value(features_hog)
kmeans_hog, labels_hog = Kmeans(features_hog, optimal_c_hog)
print(f"optimal C-value with HOG: {optimal_c_resnet}")

HOG特征聚类分析:


Calculating optimal C: 100%|█| 14/14 [00:00<00:00, 158.87it/


features.shape:  torch.Size([1830, 36])
类内间距:  408.5014343261719
类外间距:  0.4068683087825775
聚类效果评价参数:  0.0009960021546893997
optimal C-value with HOG: 2


In [6]:
print("GIST特征聚类分析:")
optimal_c_gist = optimal_c_value(features_gist)
kmeans_gist, labels_gist = Kmeans(features_gist, optimal_c_gist)
print(f"optimal C-value with GIST: {optimal_c_gist}")

GIST特征聚类分析:


Calculating optimal C: 100%|█| 14/14 [00:00<00:00, 214.05it/

features.shape:  torch.Size([128, 1])
类内间距:  607.060791015625
类外间距:  34.81443786621094
聚类效果评价参数:  0.057349178832593814
optimal C-value with GIST: 2





In [7]:
print("PCA特征聚类分析:")
optimal_c_pca = optimal_c_value(features_pca)
kmeans_pca, labels_pca = Kmeans(features_pca, optimal_c_pca)
print(f"optimal C-value with Resnet: {optimal_c_pca}")

PCA特征聚类分析:


Calculating optimal C: 100%|█| 14/14 [00:01<00:00, 12.42it/s


features.shape:  torch.Size([125000, 2])
类内间距:  35653.6484375
类外间距:  3.252089262008667
聚类效果评价参数:  9.121336537856715e-05
optimal C-value with Resnet: 2


In [11]:
print("ResNet特征聚类分析:")
optimal_c_resnet = optimal_c_value(features_resnet)
kmeans_resnet, labels_resnet = Kmeans(features_resnet, optimal_c_resnet)
print(f"optimal C-value with Resnet: {optimal_c_resnet}")

ResNet特征聚类分析:


Calculating optimal C: 100%|█| 14/14 [00:00<00:00, 154.47it/

features.shape:  torch.Size([512, 1])
类内间距:  68.05752563476562
类外间距:  1.6752053499221802
聚类效果评价参数:  0.02461454974005755
optimal C-value with Resnet: 2



