In [12]:
import numpy as np
import sys
from copy import deepcopy
import os

In [4]:
# Append the parent directory (os.pardir, i.e. "..") to the import search path.
# NOTE(review): presumably this is where the hct66 module imported in the next
# cell lives -- confirm.
sys.path.append(os.pardir)

In [5]:
from hct66 import *

In [45]:
# Unpack the two values returned by generate_data().
# NOTE(review): generate_data is not defined in any visible cell; presumably it
# is provided by the `from hct66 import *` star import, and the two values are
# the image samples and their labels -- confirm.
image_data, image_label = generate_data()

In [30]:
# Feature matrix for clustering: one feature vector per image, each reshaped
# to length img.shape[1], stacked into a single 2-D array.
dataSet = np.array(
    [get_feature(img).numpy().reshape(img.shape[1]) for img in image_data]
)
dataSet.shape

(10, 6)

In [18]:
class KMeans:
    """K-means clustering with 'random' or 'kmeans++' centre initialisation.

    Parameters
    ----------
    clusters : int
        Number of clusters (k).
    init : str
        Centre initialisation strategy, either 'random' or 'kmeans++'.
    max_iter : int
        Maximum number of assignment/update iterations performed by ``fit``.
    epsilon : float
        Convergence threshold: iteration stops once the Frobenius norm of the
        change in the centre matrix drops to ``epsilon`` or below.

    Attributes
    ----------
    clusterCenters : numpy.ndarray or None
        Float array of shape (clusters, n_feature); None until centres exist.
    dist : numpy.ndarray or None
        Array of shape (n_sample, clusters) holding the Euclidean distance of
        every training sample to every final centre; None before ``fit``.
    labels : numpy.ndarray or None
        Index of the nearest final centre for every training sample; None
        before ``fit``.

    Notes
    -----
    Randomness comes from the global ``numpy.random`` state, so call
    ``np.random.seed(...)`` beforehand for reproducible results.
    """

    def __init__(self, clusters: int = 3, init: str = 'kmeans++', max_iter: int = 100, epsilon: float = 1e-3):
        self.clusters = clusters  # number of clusters (k)
        self.init = init  # 'random' or 'kmeans++'
        self.max_iter = max_iter
        self.epsilon = epsilon

        self.clusterCenters = None
        self.dist = None
        self.labels = None

    def _distances(self, data):
        """Return the (n_sample, clusters) matrix of Euclidean distances from
        each row of ``data`` to each current cluster centre."""
        diff = data[:, np.newaxis, :] - self.clusterCenters[np.newaxis, :, :]
        return np.linalg.norm(diff, axis=2)

    # Generate the initial cluster centres
    def generateCenter(self, dataset):
        """Initialise ``self.clusterCenters`` from ``dataset``.

        Parameters
        ----------
        dataset : array-like of shape (n_sample, n_feature)

        Raises
        ------
        ValueError
            If ``self.init`` is neither 'random' nor 'kmeans++'.
        """
        # Work in floating point so that centres built from integer data are
        # not truncated when they are later replaced by cluster means.
        dataset = np.asarray(dataset, dtype=float)
        n_sample, n_feature = dataset.shape

        if self.init == 'random':
            # Draw centres around the per-feature mean, scaled by the
            # per-feature standard deviation, so they fall in the data range.
            f_mean = np.mean(dataset, axis=0)
            f_std = np.std(dataset, axis=0)

            self.clusterCenters = f_mean + np.random.randn(self.clusters, n_feature) * f_std

        elif self.init == 'kmeans++':
            # The first centre is a uniformly random sample.
            first_idx = np.random.randint(0, n_sample)
            centers = [dataset[first_idx]]

            # Squared distance of every sample to its nearest chosen centre.
            closest_sq = np.sum((dataset - centers[0]) ** 2, axis=1)

            # Pick the remaining k - 1 centres.
            for _ in range(1, self.clusters):
                total = closest_sq.sum()
                if total > 0:
                    # k-means++ samples proportionally to the squared distance.
                    p = closest_sq / total
                else:
                    # Every sample coincides with a chosen centre; avoid the
                    # 0/0 division and fall back to a uniform draw.
                    p = np.full(n_sample, 1.0 / n_sample)

                next_center_idx = np.random.choice(n_sample, p=p)
                centers.append(dataset[next_center_idx])

                new_sq = np.sum((dataset - dataset[next_center_idx]) ** 2, axis=1)
                closest_sq = np.minimum(closest_sq, new_sq)

            self.clusterCenters = np.array(centers, dtype=float)

        else:
            raise ValueError(
                "init must be 'random' or 'kmeans++', got {!r}".format(self.init))

    # Learn the clustering from data
    def fit(self, dataset):
        """Cluster ``dataset`` and store centres, distances and labels.

        Parameters
        ----------
        dataset : array-like of shape (n_sample, n_feature)

        Raises
        ------
        ValueError
            If ``dataset`` is not a non-empty 2-D array, or ``self.init`` is
            not a supported strategy.
        """
        dataset = np.asarray(dataset, dtype=float)
        if dataset.ndim != 2 or dataset.shape[0] == 0:
            raise ValueError("dataset must be a non-empty 2-D array")

        self.generateCenter(dataset)

        # Start at infinity so the first iteration always runs, regardless of
        # how close the initial centres happen to be to the origin.
        center_differ = np.inf
        epoch = 0

        while epoch < self.max_iter and center_differ > self.epsilon:
            epoch += 1

            # Assign every sample to its nearest centre.
            self.dist = self._distances(dataset)
            self.labels = np.argmin(self.dist, axis=1)

            center_pre = self.clusterCenters.copy()

            # Move each centre to the mean of its members.
            for i in range(self.clusters):
                members = dataset[self.labels == i]
                # An empty cluster keeps its previous centre; the mean of an
                # empty slice would be NaN and poison every later distance.
                if len(members) > 0:
                    self.clusterCenters[i] = members.mean(axis=0)

            # Frobenius norm of the centre movement in this iteration.
            center_differ = np.linalg.norm(center_pre - self.clusterCenters)

        # Final assignment so that dist/labels describe the final centres and
        # agree with what predict() returns for the same samples.
        self.dist = self._distances(dataset)
        self.labels = np.argmin(self.dist, axis=1)

    # Use the fitted model to assign new data to clusters
    def predict(self, testdata):
        """Return the index of the nearest cluster centre for each sample.

        Parameters
        ----------
        testdata : array-like of shape (n_sample, n_feature)

        Returns
        -------
        numpy.ndarray of shape (n_sample,)
            Cluster index for every row of ``testdata``.

        Raises
        ------
        RuntimeError
            If no cluster centres exist yet (``fit`` has not been called).
        """
        if self.clusterCenters is None:
            raise RuntimeError("KMeans.predict called before fit")

        testdata = np.asarray(testdata, dtype=float)
        dist_test = self._distances(testdata)

        # Predicted cluster for each sample
        cluster_pred = np.argmin(dist_test, axis=1)

        return cluster_pred
        

In [32]:
if __name__ == '__main__':
    # Feature matrix for clustering: one feature vector per image, each
    # reshaped to length img.shape[1], stacked into a single 2-D array.
    dataSet = np.array(
        [get_feature(img).numpy().reshape(img.shape[1]) for img in image_data]
    )

    # Cluster the feature vectors into three groups with k-means++ seeding.
    kmeans = KMeans(clusters=3, init='kmeans++')
    kmeans.fit(dataSet)
    

In [58]:
# Select the entries of `datas` whose cluster label is 0 and print them.
# NOTE(review): `datas` is not defined in any visible cell; presumably it is the
# array of raw images (from the hct66 star import or a since-removed cell) --
# confirm it still exists after Restart & Run All.
image_class_1 = datas[kmeans.labels == 0]
print('第一类数据\n',image_class_1)

第一类数据
 [[[0 0 1 1 0 0]
  [0 1 0 0 1 0]
  [0 1 0 0 1 0]
  [0 1 0 0 1 0]
  [0 0 1 1 0 0]
  [0 0 0 0 0 0]]

 [[0 0 1 1 0 0]
  [0 1 0 0 1 0]
  [0 0 0 1 0 0]
  [0 0 1 0 0 0]
  [0 1 1 1 1 0]
  [0 0 0 0 0 0]]

 [[0 0 1 1 0 0]
  [0 0 0 0 1 0]
  [0 0 1 1 0 0]
  [0 0 0 0 1 0]
  [0 0 1 1 0 0]
  [0 0 0 0 0 0]]

 [[0 0 0 0 1 0]
  [0 0 0 1 1 0]
  [0 0 1 0 1 0]
  [0 1 1 1 1 1]
  [0 0 0 0 1 0]
  [0 0 0 0 0 0]]

 [[0 0 1 1 0 0]
  [0 1 0 0 1 0]
  [0 0 1 1 0 0]
  [0 1 0 0 1 0]
  [0 0 1 1 0 0]
  [0 0 0 0 0 0]]

 [[0 0 1 1 1 0]
  [0 1 0 0 1 0]
  [0 1 1 1 1 0]
  [0 0 0 0 1 0]
  [0 0 0 0 1 0]
  [0 0 0 0 0 0]]]


In [61]:
# Select the entries of `datas` whose cluster label is 1 and print them.
# NOTE(review): `datas` is not defined in any visible cell; presumably it is the
# array of raw images (from the hct66 star import or a since-removed cell) --
# confirm it still exists after Restart & Run All.
image_class_2 = datas[kmeans.labels == 1]
print('第二类数据\n', image_class_2)

第二类数据
 [[[0 1 1 1 0 0]
  [0 1 0 0 0 0]
  [0 1 1 1 0 0]
  [0 0 0 0 1 0]
  [0 1 1 1 0 0]
  [0 0 0 0 0 0]]

 [[0 0 1 1 0 0]
  [0 1 0 0 0 0]
  [0 1 1 1 0 0]
  [0 1 0 0 1 0]
  [0 0 1 1 0 0]
  [0 0 0 0 0 0]]]


In [60]:
# Select the entries of `datas` whose cluster label is 2 and print them.
# NOTE(review): `datas` is not defined in any visible cell; presumably it is the
# array of raw images (from the hct66 star import or a since-removed cell) --
# confirm it still exists after Restart & Run All.
image_class_3 = datas[kmeans.labels == 2]
print('第三类数据\n', image_class_3)

第三类数据
 [[[0 0 0 1 0 0]
  [0 0 1 1 0 0]
  [0 0 0 1 0 0]
  [0 0 0 1 0 0]
  [0 0 1 1 1 0]
  [0 0 0 0 0 0]]

 [[0 1 1 1 1 0]
  [0 0 0 0 1 0]
  [0 0 0 1 0 0]
  [0 0 0 1 0 0]
  [0 0 0 1 0 0]
  [0 0 0 0 0 0]]]
