In [1]:
import tensorflow as tf
from random import choice, shuffle
from numpy import array
import numpy as np

import pandas as pd
pd.set_option('display.max_columns',None)
def TFKMeansCluster(vectors, noofclusters):
    """
    K-Means Clustering using TensorFlow.
    `vertors`应该是一个n*k的二维的NumPy的数组，其中n代表着K维向量的数目
    'noofclusters' 代表了待分的集群的数目，是一个整型值
    """
 
    noofclusters = int(noofclusters)
    assert noofclusters < len(vectors)
 
    #找出每个向量的维度
    dim = len(vectors[0])

    #辅助随机地从可得的向量中选取中心点
    vector_indices = list(range(len(vectors)))
    shuffle(vector_indices)
 
    #计算图
    #我们创建了一个默认的计算流的图用于整个算法中，这样就保证了当函数被多次调用      #时，默认的图并不会被从上一次调用时留下的未使用的OPS或者Variables挤满
 
    graph = tf.Graph()
 
    with graph.as_default():
 
        #计算的会话
 
        sess = tf.Session()
 
        ##构建基本的计算的元素
         
        ##首先我们需要保证每个中心点都会存在一个Variable矩阵
        ##从现有的点集合中抽取出一部分作为默认的中心点
        centroids = [tf.Variable((vectors[vector_indices[i]]))
                     for i in range(noofclusters)]
        
        ##创建一个placeholder用于存放各个中心点可能的分类的情况
        centroid_value = tf.placeholder("float64", [dim])
        cent_assigns = []
        for centroid in centroids:
            cent_assigns.append(tf.assign(centroid, centroid_value))
 
        ##对于每个独立向量的分属的类别设置为默认值0
        assignments = [tf.Variable(0) for i in range(len(vectors))]
        
        ##这些节点在后续的操作中会被分配到合适的值
        assignment_value = tf.placeholder("int32")
        cluster_assigns = []
        for assignment in assignments:
            cluster_assigns.append(tf.assign(assignment,
                                             assignment_value))
 
        ##下面创建用于计算平均值的操作节点
        #输入的placeholder
        mean_input = tf.placeholder("float", [None, dim])
        
        #节点/OP接受输入，并且计算0维度的平均值，譬如输入的向量列表
        mean_op = tf.reduce_mean(mean_input, 0)
 
        ##用于计算欧几里得距离的节点
        v1 = tf.placeholder("float", [dim])
        v2 = tf.placeholder("float", [dim])
        euclid_dist = tf.sqrt(tf.reduce_sum(tf.pow(tf.subtract(v1, v2), 2)))
 
        ##这个OP会决定应该将向量归属到哪个节点
        ##基于向量到中心点的欧几里得距离
        #Placeholder for input
        centroid_distances = tf.placeholder("float", [noofclusters])
        cluster_assignment = tf.argmin(centroid_distances, 0)
 
        ##初始化所有的状态值
         ##这会帮助初始化图中定义的所有Variables。Variable-initializer应该定         ##义在所有的Variables被构造之后，这样所有的Variables才会被纳入初始化
        init_op = tf.initialize_all_variables()
        #初始化所有的变量
        sess.run(init_op)
 
        ##集群遍历
         
        #接下来在K-Means聚类迭代中使用最大期望算法。为了简单起见，只让它执行固           #定的次数，而不设置一个终止条件
        noofiterations = 10
        for iteration_n in range(noofiterations):
 
            ##期望步骤
            ##基于上次迭代后算出的中心点的未知
            ##the _expected_ centroid assignments.
            #首先遍历所有的向量
            for vector_n in range(len(vectors)):
                vect = vectors[vector_n]
                
                #计算给定向量与分配的中心节点之间的欧几里得距离
                distances = [sess.run(euclid_dist, feed_dict={
                    v1: vect, v2: sess.run(centroid)})
                             for centroid in centroids]
                #下面可以使用集群分配操作，将上述的距离当做输入
                assignment = sess.run(cluster_assignment, feed_dict = {
                    centroid_distances: distances})
                #接下来为每个向量分配合适的值
                sess.run(cluster_assigns[vector_n], feed_dict={
                    assignment_value: assignment})
 
            ##最大化的步骤
            #基于上述的期望步骤，计算每个新的中心点的距离从而使集群内的平方和最小
            for cluster_n in range(noofclusters):
                #收集所有分配给该集群的向量
                assigned_vects = [vectors[i] for i in range(len(vectors))
                                  if sess.run(assignments[i]) == cluster_n]
                #计算新的集群中心点
                new_location = sess.run(mean_op, feed_dict={
                    mean_input: array(assigned_vects)})
                #为每个向量分配合适的中心点
                sess.run(cent_assigns[cluster_n], feed_dict={
                    centroid_value: new_location})
 
        #返回中心节点和分组
        centroids = sess.run(centroids)
        assignments = sess.run(assignments)
        return centroids, assignments

In [2]:
pd.set_option('display.max_columns',None)
df=pd.read_csv("solo_train_kmeans.csv",header=0)
df=df.values
df1=np.array(df)
df=np.array(df)
for i in range(16):
    df[:,i] = (df[:,i]-df[:,i].min())/(df[:,i].max()-df[:,i].min())
print(df)
#cent, assi=TFKMeansCluster(df,4)

[[0.07142857 0.07575758 0.         ... 0.11293422 0.13043478 0.8462    ]
 [0.         0.01349242 0.         ... 0.03487805 0.13043478 0.2245    ]
 [0.07142857 0.07575758 0.         ... 0.01712491 0.17391304 0.1573    ]
 ...
 [0.07142857 0.15151515 0.25       ... 0.01337768 0.04347826 0.4409    ]
 [0.28571429 0.19924242 0.         ... 0.20399113 0.26086957 0.8454    ]
 [0.         0.06059091 0.         ... 0.05490022 0.08695652 0.4889    ]]


In [3]:
cent, assi=TFKMeansCluster(df,4)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use `tf.global_variables_initializer` instead.


In [4]:
print(cent)

#for i in range(18):
    #assi[:i] = (df[:,i]-df[:,i].min())/(df[:,i].max()-df[:,i].min())
    #cent1[:i] = (df1[:,i].max()*cent[:i]+df[:,i].min())/(df1[:i]+1)
    #sum = (df1[:,i].max()*cent[:i]+df[:,i].min())/(df1[:i]+1)
groupA=0
groupB=0
groupC=0
groupD=0
for n in range(len(assi)):
    if assi[n]==0:
        groupA=groupA+1
    elif assi[n]==1:
        groupB=groupB+1
    elif assi[n]==2:
        groupC=groupC+1
    elif assi[n]==3:
        groupD=groupD+1
print(groupA/len(df))
print(groupB/len(df))
print(groupC/len(df))
print(groupD/len(df))

[array([3.60103957e-02, 4.21900339e-02, 7.91258458e-03, 1.56550314e-02,
       4.88791049e-01, 2.84476168e-02, 3.16502973e-02, 9.17873625e-03,
       8.77788782e-01, 8.48023653e-01, 2.09996104e-02, 3.01431806e-04,
       9.47059039e-03, 1.50715897e-03, 6.82229474e-02, 1.80630043e-01,
       5.00668287e-01]), array([0.13086095, 0.12589401, 0.03730806, 0.06479481, 0.22705032,
       0.11660279, 0.08742729, 0.05412586, 0.8593567 , 0.83328861,
       0.04029142, 0.00460653, 0.01664542, 0.00575816, 0.13102975,
       0.22627918, 0.74526626]), array([4.29475633e-03, 1.73130743e-02, 1.18670880e-03, 1.93956681e-03,
       7.87274241e-01, 4.08755289e-03, 4.90506040e-03, 6.14204386e-04,
       8.69086564e-01, 8.42708945e-01, 1.95568614e-03, 0.00000000e+00,
       4.25542938e-04, 7.91139260e-04, 9.50388890e-03, 6.91734999e-02,
       1.50732592e-01]), array([0.35304439, 0.35991532, 0.16222678, 0.1073508 , 0.04523375,
       0.369991  , 0.11748618, 0.16189931, 0.86987704, 0.84169787,
       0.0641

In [None]:
[array([0.00892857, 0.07475062, 0.00347222, 0.0025463 , 0.08796293,
       0.02095057, 0.00315657, 0.00208333, 0.00122138, 0.67276454,
       0.04772284, 0.02310091, 0.00782688, 0.        , 0.00205903,
       0.        , 0.00671046, 0.16369049]),
 array([0.17827795, 0.19651173, 0.10095812, 0.06889103, 0.16463453,
       0.30997562, 0.21084645, 0.10564588, 0.08662617, 0.58029568,
       0.93155426, 0.91357976, 0.03880493, 0.00472279, 0.01257071,
       0.01437372, 0.13011143, 0.18333836]), 
 array([2.43259799e-02, 2.58506723e-02, 3.79609596e-03, 7.62834074e-03,
       6.49838150e-01, 2.51982093e-01, 1.33109745e-02, 1.46420989e-02,
       3.67045193e-03, 3.82173836e-01, 9.41416085e-01, 9.28051174e-01,
       1.66224432e-03, 2.16919740e-04, 6.72454014e-03, 0.00000000e+00,
       3.28670777e-02, 9.30041745e-02]), 
 array([3.34425233e-02, 2.51483526e-02, 3.30205099e-03, 1.90128870e-02,
       6.23175740e-01, 1.76905230e-01, 1.21338433e-02, 1.33472495e-02,
       3.74864158e-03, 7.82400489e-01, 9.39686358e-01, 9.15314257e-01,
       3.20604704e-02, 6.25651737e-04, 7.54670287e-03, 3.12825851e-03,
       6.27644584e-02, 1.26359165e-01])]
                                A:0.048
                                B:0.324
                                C:0.307
                                D:0.319

In [None]:
[array([3.60103957e-02, 4.21900339e-02, 7.91258458e-03, 1.56550314e-02,
       4.88791049e-01, 2.84476168e-02, 3.16502973e-02, 9.17873625e-03,
       8.77788782e-01, 8.48023653e-01, 2.09996104e-02, 3.01431806e-04,
       9.47059039e-03, 1.50715897e-03, 6.82229474e-02, 1.80630043e-01,
       5.00668287e-01]), 
 array([0.13086095, 0.12589401, 0.03730806, 0.06479481, 0.22705032,
       0.11660279, 0.08742729, 0.05412586, 0.8593567 , 0.83328861,
       0.04029142, 0.00460653, 0.01664542, 0.00575816, 0.13102975,
       0.22627918, 0.74526626]), 
 array([4.29475633e-03, 1.73130743e-02, 1.18670880e-03, 1.93956681e-03,
       7.87274241e-01, 4.08755289e-03, 4.90506040e-03, 6.14204386e-04,
       8.69086564e-01, 8.42708945e-01, 1.95568614e-03, 0.00000000e+00,
       4.25542938e-04, 7.91139260e-04, 9.50388890e-03, 6.91734999e-02,
       1.50732592e-01]), 
 array([0.35304439, 0.35991532, 0.16222678, 0.1073508 , 0.04523375,
       0.369991  , 0.11748618, 0.16189931, 0.86987704, 0.84169787,
       0.06418613, 0.01038251, 0.01529602, 0.04371585, 0.17798644,
       0.25291032, 0.90491992])]
0.44233333333333336
0.3473333333333333
0.42133333333333334
0.122