In [1]:
# %load_ext autotime

import os
import re
import io
import wget

# Define how to get data
def get_iris(storage_folder="temp", data_file="iris_data.txt", splitter=','):
    data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
    # Make a storage folder for models and data
    if not os.path.exists(storage_folder):
        os.makedirs(storage_folder)
        
    data_path = os.path.join(storage_folder, data_file)
    
    if not os.path.isfile(os.path.join(storage_folder, data_file)):        
        _ = wget.download(data_url, data_path) 

    data = [l.strip() for l in open(data_path) if l.strip()]
    features = [tuple(map(float, x.split(splitter)[:-1])) for x in data]
    labels = [x.split(splitter)[-1] for x in data]
    
    return features, labels

def get_satellite(storage_folder="temp", data_file="satellite_data.txt", splitter=' '):
    data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.trn'
    # data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.tst'
    # Make a storage folder for models and data
    if not os.path.exists(storage_folder):
        os.makedirs(storage_folder)
        
    data_path = os.path.join(storage_folder, data_file)
    
    if not os.path.isfile(os.path.join(storage_folder, data_file)):           
        _ = wget.download(data_url, data_path) 

    data = [l.strip() for l in open(data_path) if l.strip()]
    features = [tuple(map(float, x.split(splitter)[:-1])) for x in data]
    labels = [x.split(splitter)[-1] for x in data]
    
    return features, labels

def get_banknote(storage_folder="temp", data_file="banknote_data.txt", splitter=','):
    data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt'
    # Make a storage folder for models and data
    if not os.path.exists(storage_folder):
        os.makedirs(storage_folder)
        
    data_path = os.path.join(storage_folder, data_file)
    
    if not os.path.isfile(os.path.join(storage_folder, data_file)):           
        _ = wget.download(data_url, data_path) 

    data = [l.strip() for l in open(data_path) if l.strip()]
    features = [tuple(map(float, x.split(splitter)[:-1])) for x in data]
    labels = [x.split(splitter)[-1] for x in data]
    
    return features, labels

vector_values, labels = get_iris()

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.mlab import PCA as mlabPCA
import tensorflow as tf
import math

num_clusters = len(set(labels))
num_vectors = len(vector_values)

print("Labels: ")
print(set(labels))
print("Sample size: ")
print(num_vectors)

vectors = tf.constant(vector_values, dtype = tf.half)

npArray = np.array(vector_values)
mlab_pca = mlabPCA(npArray)
users_2d = mlab_pca.project(npArray, minfrac=mlab_pca.fracs[1])

def drawClusters(assignment_values):
    data = {"x": [], "y": [], "cluster": []}
    for i in range(len(assignment_values)):
        data["x"].append(users_2d[i][0])
        data["y"].append(users_2d[i][1])
        data["cluster"].append(assignment_values[i])
    df = pd.DataFrame(data)
    sns.lmplot("x", "y", data=df, 
               fit_reg=False, size=7, 
               hue="cluster", legend=False)
    plt.show()

Labels: 
{'Iris-setosa', 'Iris-virginica', 'Iris-versicolor'}
Sample size: 
150


In [3]:
def initialize_clusters():
    X = vector_values
    n = np.shape(X)[0]
    d = np.shape(X)[1]
    K = num_clusters
    S = np.ones(K)*(int(n/K))
    S[K-1] = S[K-1] + n%K
    
    Y = np.zeros([n, 1])
    for i in range(K):
        yis0 = np.where(Y==0)[0]
        a = np.take(X, yis0, axis=0)
        b = np.random.randn(d, 1)
        xt = np.squeeze(np.dot(a, b))
        inx = np.argsort(xt)
        Y[yis0[inx[range(int(S[i]))]]]=i
    return np.squeeze(Y)

In [4]:
#iteration variable
assignments = tf.Variable(initialize_clusters(), dtype=tf.int64)

#clusters are based on assignments
clusters = [tf.gather(vectors, tf.squeeze(tf.where(tf.equal(assignments, c)), squeeze_dims=[1])) for c in range(num_clusters)]

In [5]:
def crossDistance(a, b, ord='euclidean'):
    diff = tf.subtract(tf.expand_dims(a, 0), tf.expand_dims(b, 1))
    result = tf.norm(diff, ord, 2)
    return result

In [6]:
%%time
means = tf.concat([
    tf.expand_dims(tf.reduce_mean(cluster, 0), 0) 
    for cluster in clusters], 0)
kmeans = tf.argmin(crossDistance(vectors, means), 0)

Wall time: 90 ms


In [7]:
%%time
medoids = tf.concat([
    tf.expand_dims(tf.gather(cluster, tf.argmin(tf.reduce_sum(crossDistance(cluster, cluster), 0), 0)), 0)
    for cluster in clusters], 0)
kmedoids = tf.argmin(crossDistance(vectors, medoids), 0)

Wall time: 66 ms


In [8]:
%%time
def subsample(D, size, t):
    size_D = tf.shape(D)[0]
    selector_1 = tf.range(size_D, dtype=tf.int32)
    selector_t = tf.slice(tf.map_fn(
      lambda one: tf.random_shuffle(selector_1)  
    , tf.ones([t, size_D], dtype=tf.int32)), [0,0], [t, size])
    
    return tf.gather(D, selector_t)

def HM(X, D, t=500, psi = 20, lmbda=1.0, dt=tf.half):
    size_X = tf.shape(X)[0]
    dims_X = tf.shape(X)[1]
    size_D = tf.shape(D)[0]
    
    ones_tX = tf.ones([t, size_X, dims_X], dtype=dt)  
    
    psi = tf.cond(tf.logical_and(tf.less(0, psi), tf.less(psi, size_D)), lambda: tf.constant(psi), lambda: size_D)
    set_t_psi = tf.cond(psi<size_D, 
                        lambda: subsample(D, psi, t), 
                        # order of D will not matter and it will be broadcasted
                        lambda: tf.expand_dims(D, 0)) 
    
    ones_t_psi = tf.ones([t, psi, dims_X], dtype=dt) 
    
    hm_X = tf.zeros([size_X], dtype=dt)
    
    directions = tf.random_uniform([t, dims_X], minval = -1, maxval = 1, dtype = dt)
    projectors = tf.divide(directions, tf.norm(directions, axis=1, keep_dims=True))
    
    projects_X = tf.squeeze(tf.matmul(
        tf.expand_dims(tf.multiply(ones_tX, tf.expand_dims(X, 0)), 2), 
        tf.expand_dims(tf.multiply(ones_tX, tf.expand_dims(projectors, 1)), 3)), [2, 3])
    
    projects_psi = tf.squeeze(tf.matmul(
        tf.expand_dims(tf.multiply(ones_t_psi, set_t_psi), 2), 
        tf.expand_dims(tf.multiply(ones_t_psi, tf.expand_dims(projectors, 1)), 3)), [2, 3])
    
    mid_t_min = tf.add(
        tf.multiply(tf.reduce_max(projects_psi, 1), (1-lmbda)/2),
        tf.multiply(tf.reduce_min(projects_psi, 1), (1+lmbda)/2))
    
    mid_t_max = tf.add(
        tf.multiply(tf.reduce_max(projects_psi, 1), (1+lmbda)/2),
        tf.multiply(tf.reduce_min(projects_psi, 1), (1-lmbda)/2))
    
    mid_t = tf.add(mid_t_min, tf.multiply(tf.random_uniform([t], 0, 1, dtype=dt), tf.subtract(mid_t_max, mid_t_min)))
    
    mass_l_t = tf.divide(tf.reduce_sum(
        tf.where(
            tf.less(projects_psi, tf.expand_dims(mid_t, 1)), 
            tf.ones([t, psi], dtype=dt), 
            tf.zeros([t, psi], dtype=dt)), 1), tf.cast(psi, dtype=dt))
    
    mass_r_t = tf.add(tf.multiply(mass_l_t, -1), 1)
    
    mass_t = tf.where(tf.less(projects_X, tf.expand_dims(mid_t, 1)), 
                      tf.multiply(tf.ones([t, size_X], dtype=dt), tf.expand_dims(mass_l_t, 1)), 
                      tf.multiply(tf.ones([t, size_X], dtype=dt), tf.expand_dims(mass_r_t, 1)))
    hs_mass = tf.reduce_mean(mass_t, 0)
    return hs_mass

kmass = tf.argmax(tf.concat([
    tf.expand_dims(tf.divide(HM(vectors, cluster), tf.reduce_min(HM(vectors, cluster), 0, keep_dims=True)), 1)
    for cluster in clusters
], 1), 1)

Wall time: 1.1 s


In [9]:
%%time
%matplotlib inline

# method = kmeans
# method = kmedoids
# method = kmass

def run(method = kmeans, p=1, round_max=10, output=False):
    assignment_values = initialize_clusters()
    
    # os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
    # tf.logging.set_verbosity(tf.logging.ERROR)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
    sess_config = tf.ConfigProto(gpu_options=gpu_options)

    init_op = tf.global_variables_initializer()
    round_i = 0
    keep_hurdle = num_vectors * p
    keep_max = -1
    best_result = assignment_values

    # drawClusters(assignment_values)
    while round_i < round_max:    
        round_i = round_i + 1
        if output:
            print(round_i)
        with tf.Session(config=sess_config) as sess:
            sess.run(init_op)
            round_result = sess.run(method)

            keep_count = np.sum([
                1 if assignment_values[i]==round_result[i] else 0
                for i in range(num_vectors)
            ])
            if output:
                print(keep_count)
            if keep_max < keep_count:
                keep_max = keep_count
                best_result = round_result
                if output:
                    print("best_result was updated.")

            assignment_values = sess.run(tf.assign(assignments, round_result))
    #         drawClusters(assignment_values)
            sess.close()
        if keep_count >= keep_hurdle:
            if output:
                print("keep_hurdle was hit.")
            break

    if output:
        print("The final P is ")
        print(float(keep_max)/float(num_vectors))    
        drawClusters(best_result)
    return best_result

Wall time: 4.99 ms


In [10]:
%run metrics.py

In [11]:
n_runs = 40 # number of runs with random initialization for clustering evaluation.

def evaluation_scores(groundtruth, labels_pred):
    """
    Eval scores of the predicted results.
     
    :param: groundtruth (type list): the groundtruth (GT) of cluster assignment. Each element denotes an item's GT cluster_id. 
    :param: labels_pred (type list): the predicted cluster assignments. Each element denotes an item's predicted cluster_id.
    """
    NMI = normalized_mutual_info_score(groundtruth,labels_pred)
    A = accuracy(groundtruth,labels_pred)
    F1 = f_measure(groundtruth,labels_pred)
    P = purity(groundtruth,labels_pred)
    RI = random_index(groundtruth,labels_pred)
    ARI = adjusted_rand_score(groundtruth,labels_pred)
    map_pairs = get_map_pairs(groundtruth,labels_pred)
    return NMI, A, F1, P, RI, ARI, map_pairs
    
def evaluation(method = kmeans):    
    import time
    t0 = time.time()
    NMIs,As,F1s = [],[],[]
    i_run = 1
    labels_unique = np.unique(labels)
    labels_indexed = []
    for label in labels:
        labels_indexed.append(np.where(labels_unique==label))
    labels_indexed = np.squeeze(labels_indexed)
    while i_run <= n_runs:
        t1 = time.time()
        result = run(method = method, output=False)
        NMI,A,F1,P,RI,ARI,map_pairs = evaluation_scores(labels_indexed, result)
        print("{}:(time={}s),<Acc,F1,NMI>\t{}\t{}\t{}".format(i_run, time.time()-t1, A, F1, NMI))
        i_run = i_run+1
        NMIs.append(NMI)
        As.append(A)
        F1s.append(F1)

    print("Results of {} runs (mean,std_var):\n\t Acc: {}, {}, {}, {}\n\t F1 : {}, {}, {}, {}\n\t NMI: {}, {}, {}, {}"
          .format(n_runs
                  , np.mean(As),np.std(As), np.min(As), np.max(As)
                  , np.mean(F1s),np.std(F1s), np.min(F1s), np.max(F1s)
                  , np.mean(NMIs),np.std(NMIs), np.min(NMIs), np.max(NMIs)))
    print("Running time: {}s".format(time.time() - t0))

In [12]:
evaluation(kmass)

1:(time=8.766646146774292s),<Acc,F1,NMI>	0.9133333333333333	0.9139893581144864	0.7388481297279407
2:(time=7.7589805126190186s),<Acc,F1,NMI>	0.8933333333333333	0.8938403644236007	0.7137482117490508
3:(time=7.9510016441345215s),<Acc,F1,NMI>	0.88	0.8794278377395751	0.7061027007244721
4:(time=9.026017427444458s),<Acc,F1,NMI>	0.9333333333333333	0.9333273327282736	0.7717795548876403
5:(time=8.79598331451416s),<Acc,F1,NMI>	0.9	0.9003280327982806	0.6983587626086414
6:(time=8.148000001907349s),<Acc,F1,NMI>	0.92	0.920370770881239	0.7618318280705825
7:(time=8.068001747131348s),<Acc,F1,NMI>	0.88	0.8803828134977686	0.6883449133059752
8:(time=8.078001022338867s),<Acc,F1,NMI>	0.92	0.9201960195969606	0.7366500304786324
9:(time=8.172999143600464s),<Acc,F1,NMI>	0.8866666666666667	0.885343537023975	0.7291663144225511
10:(time=8.233000755310059s),<Acc,F1,NMI>	0.9266666666666666	0.9265899645194334	0.7836263121307085
11:(time=8.292001724243164s),<Acc,F1,NMI>	0.8866666666666667	0.885278536958972	0.7419322984

In [13]:
evaluation(kmeans)

1:(time=1.4440135955810547s),<Acc,F1,NMI>	0.9266666666666666	0.9266006072081615	0.7918864294943572
2:(time=1.3620033264160156s),<Acc,F1,NMI>	0.9266666666666666	0.9266006072081615	0.7918864294943572
3:(time=1.387981653213501s),<Acc,F1,NMI>	0.9266666666666666	0.9266006072081615	0.7918864294943572
4:(time=1.3410184383392334s),<Acc,F1,NMI>	0.9266666666666666	0.9266006072081615	0.7918864294943572
5:(time=1.4444606304168701s),<Acc,F1,NMI>	0.9266666666666666	0.9266006072081615	0.7918864294943572
6:(time=1.3670613765716553s),<Acc,F1,NMI>	0.9266666666666666	0.9266006072081615	0.7918864294943572
7:(time=1.2999846935272217s),<Acc,F1,NMI>	0.9266666666666666	0.9266006072081615	0.7918864294943572
8:(time=1.385000467300415s),<Acc,F1,NMI>	0.9266666666666666	0.9266006072081615	0.7918864294943572
9:(time=1.4160146713256836s),<Acc,F1,NMI>	0.9266666666666666	0.9266006072081615	0.7918864294943572
10:(time=1.2910659313201904s),<Acc,F1,NMI>	0.9266666666666666	0.9266006072081615	0.7918864294943572
11:(time=1.

In [14]:
evaluation(kmedoids)

1:(time=1.4239938259124756s),<Acc,F1,NMI>	0.9266666666666666	0.9266593325949269	0.7899623034484412
2:(time=1.528001070022583s),<Acc,F1,NMI>	0.9266666666666666	0.9266593325949269	0.7899623034484412
3:(time=1.4210143089294434s),<Acc,F1,NMI>	0.9266666666666666	0.9266593325949269	0.7899623034484412
4:(time=1.4219982624053955s),<Acc,F1,NMI>	0.9266666666666666	0.9266593325949269	0.7899623034484412
5:(time=1.5530014038085938s),<Acc,F1,NMI>	0.9266666666666666	0.9266593325949269	0.7899623034484412
6:(time=1.3954057693481445s),<Acc,F1,NMI>	0.9266666666666666	0.9266593325949269	0.7899623034484412
7:(time=1.400054931640625s),<Acc,F1,NMI>	0.9266666666666666	0.9266593325949269	0.7899623034484412
8:(time=1.5230138301849365s),<Acc,F1,NMI>	0.9266666666666666	0.9266593325949269	0.7899623034484412
9:(time=1.4030048847198486s),<Acc,F1,NMI>	0.9266666666666666	0.9266593325949269	0.7899623034484412
10:(time=1.4039955139160156s),<Acc,F1,NMI>	0.9266666666666666	0.9266593325949269	0.7899623034484412
11:(time=1.