In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

选取数据

In [2]:
target = 3
data_name = ['0618', '0854', '1066'][target - 1]

采用轮廓系数和Elbow方法选取最优C值

In [3]:
def optimal_c_value(features, max_c=15):
    # max_c=10-15基本可以满足大部分特征这也是Elbow的上限
    
    sse = []  # 存储SSE（聚类内误差平方和）
    silhouette_scores = []  # 存储轮廓系数，用于聚类效果评估越大效果越好
    
    # 计算不同C值下的SSE和轮廓系数
    for c in range(2, max_c + 1): # C从2取到max_c
        kmeans = KMeans(n_clusters=c, random_state=42)
        kmeans.fit(features)
        sse.append(kmeans.inertia_) #读取当前聚类的SSE存入列表
        silhouette_scores.append(silhouette_score(features, kmeans.labels_))

    # # 绘制Elbow图和轮廓系数图
    # plt.figure(figsize=(12, 5))
    
    # # Elbow图
    # plt.subplot(1, 2, 1)
    # plt.plot(range(2, max_c + 1), sse, marker='o')
    # plt.xlabel("C-value")
    # plt.ylabel("SSE")
    # plt.title("Elbow search optimal C-value")
    
    # # 轮廓系数图
    # plt.subplot(1, 2, 2)
    # plt.plot(range(2, max_c + 1), silhouette_scores, marker='o')
    # plt.xlabel("C-value")
    # plt.ylabel("silhouette scores")
    # plt.title("silhouette scores search optimal C-value")

    # plt.show()

    # 选取拐点（Elbow法）或轮廓系数最大的C值
    optimal_c = 2 + np.argmax(silhouette_scores)  # 最佳C值
    return optimal_c

Kmeans实现和评价指标

In [4]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def Kmeans(features, optimal_c):

    print(features.shape)
    kmeans = KMeans(n_clusters=optimal_c, random_state=0)
    labels = kmeans.fit_predict(features)

    centroids = kmeans.cluster_centers_
    
    with_d, between_d, cluster_socre = cluster_eval(features, labels, optimal_c, centroids)
    print('类内间距: ', with_d)
    print('类外间距: ', between_d)
    print('聚类效果评价参数: ', cluster_socre) 

    return kmeans, labels

def cluster_eval(data, labels, optimal_c, centroids):
    K = optimal_c

    # 计算类内间距
    within_distances = []
    for i in range(K):
        cluster_points = data[labels == i]
        centroid = centroids[i]

        distance = np.sum(np.linalg.norm(cluster_points - centroid, axis=1))
        within_distances.append(distance)

    total_within_distance = np.mean(within_distances)

    # 计算类外间距
    between_distances = []
    for i in range(K):
        for j in range(i + 1, K):
            distance = np.linalg.norm(centroids[i] - centroids[j])
            between_distances.append(distance)
    total_between_distance = np.mean(between_distances)

    cluster_socre = total_between_distance / total_within_distance

    return total_within_distance, total_between_distance, cluster_socre

寻求最优C值并进行聚类分析

In [5]:
features_bgr1x1 = np.load(f'./Features/{data_name}_BGR1x1_featuers.npy')  
features_hsv1x1 = np.load(f'./Features/{data_name}_HSV1x1_features.npy')
features_bgr3x3 = np.load(f'./Features/{data_name}_BGR3x3_featuers.npy')  
features_hsv3x3 = np.load(f'./Features/{data_name}_HSV3x3_features.npy')  
features_hog    = np.load(f'./Features/{data_name}_hog_featuers.npy').squeeze().reshape(-1, 1)
features_resnet = np.load(f'./Features/{data_name}_NN_features.npy').squeeze().reshape(-1, 1)
print(features_bgr1x1.shape)
print(features_hsv1x1.shape)
print(features_bgr3x3.shape)
print(features_hsv3x3.shape)
print(features_hog.shape)
print(features_resnet.shape)

(1250, 3)
(1250, 3)
(1250, 27)
(1250, 27)
(3326400, 1)
(512, 1)


In [6]:
print("BGR1x1特征聚类分析:")
optimal_c_bgr1x1 = optimal_c_value(features_bgr1x1)
kmeans_bgr1x1, labels_bgr1x1 = Kmeans(features_bgr1x1, optimal_c_bgr1x1)
print(f"optimal C-value with BGR1x1: {optimal_c_bgr1x1}")

BGR1x1特征聚类分析:
(1250, 3)
类内间距:  35715.25068469604
类外间距:  282.26452270772035
聚类效果评价参数:  0.007903193098086542
optimal C-value with BGR1x1: 2


In [7]:
print("HSV1x1特征聚类分析:")
optimal_c_bgr1x1 = optimal_c_value(features_bgr1x1)
kmeans_bgr1x1, labels_bgr1x1 = Kmeans(features_bgr1x1, optimal_c_bgr1x1)
print(f"optimal C-value with HSV1x1: {optimal_c_bgr1x1}")

HSV1x1特征聚类分析:
(1250, 3)
类内间距:  35715.25068469604
类外间距:  282.26452270772035
聚类效果评价参数:  0.007903193098086542
optimal C-value with HSV1x1: 2


In [8]:
print("BGR3x3特征聚类分析:")
optimal_c_bgr3x3 = optimal_c_value(features_bgr3x3)
kmeans_bgr3x3, labels_bgr3x3 = Kmeans(features_bgr3x3, optimal_c_bgr3x3)
print(f"optimal C-value with BGR3x3: {optimal_c_bgr3x3}")

BGR3x3特征聚类分析:
(1250, 27)
类内间距:  109643.29206397198
类外间距:  841.3027634722699
聚类效果评价参数:  0.007673089229949491
optimal C-value with BGR3x3: 2


In [9]:
print("HSV3x3特征聚类分析:")
optimal_c_hsv3x3 = optimal_c_value(features_hsv3x3)
kmeans_hsv3x3, labels_hsv3x3 = Kmeans(features_hsv3x3, optimal_c_hsv3x3)
print(f"optimal C-value with HSV3x3: {optimal_c_hsv3x3}")

HSV3x3特征聚类分析:
(1250, 27)
类内间距:  91935.81550013072
类外间距:  498.1008898989391
聚类效果评价参数:  0.005417919960673334
optimal C-value with HSV3x3: 2


In [10]:
# print("HOG特征聚类分析:")
# optimal_c_hog = optimal_c_value(features_hog)
# kmeans_hog, labels_hog = Kmeans(features_hog, optimal_c_hog)
# print(f"optimal C-value with Resnet: {optimal_c_resnet}")

In [11]:
print("ResNet特征聚类分析:")
optimal_c_resnet = optimal_c_value(features_resnet)
kmeans_resnet, labels_resnet = Kmeans(features_resnet, optimal_c_resnet)
print(f"optimal C-value with Resnet: {optimal_c_resnet}")

ResNet特征聚类分析:
(512, 1)
类内间距:  101.2096
类外间距:  1.9095945
聚类效果评价参数:  0.018867722
optimal C-value with Resnet: 2
