In [1]:
# %load_ext autotime

import os
import re
import io
import wget

# Define how to get data
def get_iris(storage_folder="temp", data_file="iris_data.txt", splitter=','):
    data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
    # Make a storage folder for models and data
    if not os.path.exists(storage_folder):
        os.makedirs(storage_folder)
        
    data_path = os.path.join(storage_folder, data_file)
    
    if not os.path.isfile(os.path.join(storage_folder, data_file)):        
        _ = wget.download(data_url, data_path) 

    data = [l.strip() for l in open(data_path) if l.strip()]
    features = [tuple(map(float, x.split(splitter)[:-1])) for x in data]
    labels = [x.split(splitter)[-1] for x in data]
    
    return features, labels

def get_satellite(storage_folder="temp", data_file="satellite_data.txt", splitter=' '):
    data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.trn'
    # data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.tst'
    # Make a storage folder for models and data
    if not os.path.exists(storage_folder):
        os.makedirs(storage_folder)
        
    data_path = os.path.join(storage_folder, data_file)
    
    if not os.path.isfile(os.path.join(storage_folder, data_file)):           
        _ = wget.download(data_url, data_path) 

    data = [l.strip() for l in open(data_path) if l.strip()]
    features = [tuple(map(float, x.split(splitter)[:-1])) for x in data]
    labels = [x.split(splitter)[-1] for x in data]
    
    return features, labels

def get_banknote(storage_folder="temp", data_file="banknote_data.txt", splitter=','):
    data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt'
    # Make a storage folder for models and data
    if not os.path.exists(storage_folder):
        os.makedirs(storage_folder)
        
    data_path = os.path.join(storage_folder, data_file)
    
    if not os.path.isfile(os.path.join(storage_folder, data_file)):           
        _ = wget.download(data_url, data_path) 

    data = [l.strip() for l in open(data_path) if l.strip()]
    features = [tuple(map(float, x.split(splitter)[:-1])) for x in data]
    labels = [x.split(splitter)[-1] for x in data]
    
    return features, labels

vector_values, labels = get_banknote()

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.mlab import PCA as mlabPCA
import tensorflow as tf
import math

num_clusters = len(set(labels))
num_vectors = len(vector_values)

print("Labels: ")
print(set(labels))
print("Sample size: ")
print(num_vectors)

vectors = tf.constant(vector_values, dtype = tf.half)

npArray = np.array(vector_values)
mlab_pca = mlabPCA(npArray)
users_2d = mlab_pca.project(npArray, minfrac=mlab_pca.fracs[1])

def drawClusters(assignment_values):
    data = {"x": [], "y": [], "cluster": []}
    for i in range(len(assignment_values)):
        data["x"].append(users_2d[i][0])
        data["y"].append(users_2d[i][1])
        data["cluster"].append(assignment_values[i])
    df = pd.DataFrame(data)
    sns.lmplot("x", "y", data=df, 
               fit_reg=False, size=7, 
               hue="cluster", legend=False)
    plt.show()

Labels: 
{'0', '1'}
Sample size: 
1372


In [3]:
def initialize_clusters():
    X = vector_values
    n = np.shape(X)[0]
    d = np.shape(X)[1]
    K = num_clusters
    S = np.ones(K)*(int(n/K))
    S[K-1] = S[K-1] + n%K
    
    Y = np.zeros([n, 1])
    for i in range(K):
        yis0 = np.where(Y==0)[0]
        a = np.take(X, yis0, axis=0)
        b = np.random.randn(d, 1)
        xt = np.squeeze(np.dot(a, b))
        inx = np.argsort(xt)
        Y[yis0[inx[range(int(S[i]))]]]=i
    return np.squeeze(Y)

In [4]:
#iteration variable
assignments = tf.Variable(initialize_clusters(), dtype=tf.int64)

#clusters are based on assignments
clusters = [tf.gather(vectors, tf.squeeze(tf.where(tf.equal(assignments, c)), squeeze_dims=[1])) for c in range(num_clusters)]

In [5]:
def crossDistance(a, b, ord='euclidean'):
    diff = tf.subtract(tf.expand_dims(a, 0), tf.expand_dims(b, 1))
    result = tf.norm(diff, ord, 2)
    return result

In [6]:
%%time
means = tf.concat([
    tf.expand_dims(tf.reduce_mean(cluster, 0), 0) 
    for cluster in clusters], 0)
kmeans = tf.argmin(crossDistance(vectors, means), 0)

Wall time: 93 ms


In [7]:
%%time
medoids = tf.concat([
    tf.expand_dims(tf.gather(cluster, tf.argmin(tf.reduce_sum(crossDistance(cluster, cluster), 0), 0)), 0)
    for cluster in clusters], 0)
kmedoids = tf.argmin(crossDistance(vectors, medoids), 0)

Wall time: 55 ms


In [8]:
%%time
def subsample(D, size, t):
    size_D = tf.shape(D)[0]
    selector_1 = tf.range(size_D, dtype=tf.int32)
    selector_t = tf.slice(tf.map_fn(
      lambda one: tf.random_shuffle(selector_1)  
    , tf.ones([t, size_D], dtype=tf.int32)), [0,0], [t, size])
    
    return tf.gather(D, selector_t)

def HM(X, D, t=500, psi = 20, lmbda=1.0, dt=tf.half):
    size_X = tf.shape(X)[0]
    dims_X = tf.shape(X)[1]
    size_D = tf.shape(D)[0]
    
    ones_tX = tf.ones([t, size_X, dims_X], dtype=dt)  
    
    psi = tf.cond(tf.logical_and(tf.less(0, psi), tf.less(psi, size_D)), lambda: tf.constant(psi), lambda: size_D)
    set_t_psi = tf.cond(psi<size_D, 
                        lambda: subsample(D, psi, t), 
                        # order of D will not matter and it will be broadcasted
                        lambda: tf.expand_dims(D, 0)) 
    
    ones_t_psi = tf.ones([t, psi, dims_X], dtype=dt) 
    
    hm_X = tf.zeros([size_X], dtype=dt)
    
    directions = tf.random_uniform([t, dims_X], minval = -1, maxval = 1, dtype = dt)
    projectors = tf.divide(directions, tf.norm(directions, axis=1, keep_dims=True))
    
    projects_X = tf.squeeze(tf.matmul(
        tf.expand_dims(tf.multiply(ones_tX, tf.expand_dims(X, 0)), 2), 
        tf.expand_dims(tf.multiply(ones_tX, tf.expand_dims(projectors, 1)), 3)), [2, 3])
    
    projects_psi = tf.squeeze(tf.matmul(
        tf.expand_dims(tf.multiply(ones_t_psi, set_t_psi), 2), 
        tf.expand_dims(tf.multiply(ones_t_psi, tf.expand_dims(projectors, 1)), 3)), [2, 3])
    
    mid_t_min = tf.add(
        tf.multiply(tf.reduce_max(projects_psi, 1), (1-lmbda)/2),
        tf.multiply(tf.reduce_min(projects_psi, 1), (1+lmbda)/2))
    
    mid_t_max = tf.add(
        tf.multiply(tf.reduce_max(projects_psi, 1), (1+lmbda)/2),
        tf.multiply(tf.reduce_min(projects_psi, 1), (1-lmbda)/2))
    
    mid_t = tf.add(mid_t_min, tf.multiply(tf.random_uniform([t], 0, 1, dtype=dt), tf.subtract(mid_t_max, mid_t_min)))
    
    mass_l_t = tf.divide(tf.reduce_sum(
        tf.where(
            tf.less(projects_psi, tf.expand_dims(mid_t, 1)), 
            tf.ones([t, psi], dtype=dt), 
            tf.zeros([t, psi], dtype=dt)), 1), tf.cast(psi, dtype=dt))
    
    mass_r_t = tf.add(tf.multiply(mass_l_t, -1), 1)
    
    mass_t = tf.where(tf.less(projects_X, tf.expand_dims(mid_t, 1)), 
                      tf.multiply(tf.ones([t, size_X], dtype=dt), tf.expand_dims(mass_l_t, 1)), 
                      tf.multiply(tf.ones([t, size_X], dtype=dt), tf.expand_dims(mass_r_t, 1)))
    hs_mass = tf.reduce_mean(mass_t, 0)
    return hs_mass

kmass = tf.argmax(tf.concat([
    tf.expand_dims(tf.divide(HM(vectors, cluster), tf.reduce_min(HM(vectors, cluster), 0, keep_dims=True)), 1)
    for cluster in clusters
], 1), 1)

Wall time: 782 ms


In [9]:
%%time
%matplotlib inline

# method = kmeans
# method = kmedoids
# method = kmass

def run(method = kmeans, p=1, round_max=10, output=0):
    assignment_values = initialize_clusters()
    
    # os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
    # tf.logging.set_verbosity(tf.logging.ERROR)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
    sess_config = tf.ConfigProto(gpu_options=gpu_options)

    init_op = tf.global_variables_initializer()
    round_i = 0
    keep_hurdle = num_vectors * p
    keep_max = -1
    best_result = assignment_values
    best_round = 0

    # drawClusters(assignment_values)
    while round_i < round_max:    
        round_i = round_i + 1
        if output>1:
            print(round_i)
        with tf.Session(config=sess_config) as sess:
            sess.run(init_op)
            round_result = sess.run(method)

            keep_count = np.sum([
                1 if assignment_values[i]==round_result[i] else 0
                for i in range(num_vectors)
            ])
            if output>1:
                print(keep_count)
            if keep_max < keep_count:
                keep_max = keep_count
                best_result = round_result
                best_round=round_i
                if output>1:
                    print("best_result was updated.")

            assignment_values = sess.run(tf.assign(assignments, round_result))
    #         drawClusters(assignment_values)
            sess.close()
        if keep_count >= keep_hurdle:
            if output:
                print("keep_hurdle was hit.")
            break

    if output>0:
        print("The final P is ")
        p = float(keep_max)/float(num_vectors)
        print(p)    
        print("The round with best result is ")
        print(best_round)
        drawClusters(best_result)
    return [best_result, p, best_round]

Wall time: 2 ms


In [10]:
%run metrics.py

In [11]:
n_runs = 40 # number of runs with random initialization for clustering evaluation.

def evaluation_scores(groundtruth, labels_pred):
    """
    Eval scores of the predicted results.
     
    :param: groundtruth (type list): the groundtruth (GT) of cluster assignment. Each element denotes an item's GT cluster_id. 
    :param: labels_pred (type list): the predicted cluster assignments. Each element denotes an item's predicted cluster_id.
    """
    NMI = normalized_mutual_info_score(groundtruth,labels_pred)
    A = accuracy(groundtruth,labels_pred)
    F1 = f_measure(groundtruth,labels_pred)
    P = purity(groundtruth,labels_pred)
    RI = random_index(groundtruth,labels_pred)
    ARI = adjusted_rand_score(groundtruth,labels_pred)
    map_pairs = get_map_pairs(groundtruth,labels_pred)
    return NMI, A, F1, P, RI, ARI, map_pairs
    
def evaluation(method = kmeans):    
    import time
    t0 = time.time()
    NMIs,As,F1s = [],[],[]
    i_run = 1
    labels_unique = np.unique(labels)
    labels_indexed = []
    for label in labels:
        labels_indexed.append(np.where(labels_unique==label))
    labels_indexed = np.squeeze(labels_indexed)
    print("Round\tTime\tAcc\tF1\tNMI\tp\tl")
    while i_run <= n_runs:
        t1 = time.time()
        [result, p, l] = run(method = method, output=0)
        NMI,A,F1,P,RI,ARI,map_pairs = evaluation_scores(labels_indexed, result)
        tt = (time.time()-t1)
        print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
            i_run, 
            "{0:.2f}".format(time.time()-t1), 
            "{0:.2f}".format(A), 
            "{0:.2f}".format(F1), 
            "{0:.2f}".format(NMI),
            "{0:.2f}".format(p),
            l
        ))
        i_run = i_run+1
        NMIs.append(NMI)
        As.append(A)
        F1s.append(F1)

    print("Results of {} runs (mean,std_var,min,max):\n\t Acc: {}, {}, {}, {}\n\t F1 : {}, {}, {}, {}\n\t NMI: {}, {}, {}, {}"
          .format(n_runs
                  , np.mean(As),np.std(As), np.min(As), np.max(As)
                  , np.mean(F1s),np.std(F1s), np.min(F1s), np.max(F1s)
                  , np.mean(NMIs),np.std(NMIs), np.min(NMIs), np.max(NMIs)))
    print("Running time: {}s".format(time.time() - t0))

In [12]:
evaluation(kmass)

Round	Time	Acc	F1	NMI	p	l
1	11.47	0.67	0.67	0.09	1.00	2
2	10.60	0.63	0.63	0.06	1.00	7
3	10.64	0.66	0.66	0.08	1.00	10
4	10.71	0.71	0.70	0.31	1.00	6
5	10.98	0.65	0.64	0.06	1.00	9
6	10.96	0.74	0.73	0.29	1.00	4
7	10.82	0.64	0.63	0.05	1.00	9
8	11.05	0.67	0.67	0.09	1.00	7
9	10.97	0.70	0.69	0.19	1.00	9
10	11.55	0.64	0.64	0.06	1.00	10
11	11.32	0.71	0.71	0.19	1.00	5
12	11.49	0.71	0.70	0.25	1.00	8
13	11.26	0.67	0.67	0.09	1.00	4
14	11.37	0.69	0.68	0.14	1.00	10
15	11.38	0.65	0.65	0.07	1.00	8
16	11.50	0.68	0.68	0.11	1.00	2
17	11.50	0.62	0.61	0.04	1.00	10
18	11.65	0.68	0.68	0.12	1.00	10
19	11.66	0.71	0.71	0.19	1.00	2
20	11.71	0.64	0.64	0.06	1.00	5
21	11.91	0.70	0.70	0.15	1.00	10
22	11.96	0.74	0.74	0.33	1.00	7
23	12.46	0.64	0.64	0.06	1.00	3
24	12.30	0.64	0.64	0.06	1.00	2
25	12.35	0.64	0.64	0.06	1.00	4
26	12.39	0.65	0.65	0.08	1.00	9
27	12.43	0.66	0.66	0.08	1.00	6
28	12.56	0.63	0.63	0.05	1.00	2
29	12.48	0.66	0.66	0.08	1.00	9
30	12.68	0.64	0.64	0.06	1.00	4
31	12.64	0.62	0.62	0.04	1.00	2
32	12.84	0.68	0.

In [13]:
evaluation(kmeans)

Round	Time	Acc	F1	NMI	p	l
1	1.15	0.61	0.61	0.03	1.00	2
2	1.13	0.61	0.61	0.03	1.00	2
3	1.14	0.61	0.61	0.03	1.00	2
4	1.14	0.61	0.61	0.03	1.00	2
5	1.15	0.61	0.61	0.03	1.00	2
6	1.18	0.61	0.61	0.03	1.00	2
7	1.25	0.61	0.61	0.03	1.00	2
8	1.16	0.61	0.61	0.03	1.00	2
9	1.16	0.61	0.61	0.03	1.00	2
10	1.18	0.61	0.61	0.03	1.00	2
11	1.16	0.61	0.61	0.03	1.00	2
12	1.17	0.61	0.61	0.03	1.00	2
13	1.17	0.61	0.61	0.03	1.00	2
14	1.17	0.61	0.61	0.03	1.00	2
15	1.28	0.61	0.61	0.03	1.00	2
16	1.18	0.61	0.61	0.03	1.00	2
17	1.20	0.61	0.61	0.03	1.00	2
18	1.18	0.61	0.61	0.03	1.00	2
19	1.22	0.61	0.61	0.03	1.00	2
20	1.21	0.61	0.61	0.03	1.00	2
21	1.38	0.61	0.61	0.03	1.00	2
22	1.23	0.61	0.61	0.03	1.00	2
23	1.21	0.61	0.61	0.03	1.00	2
24	1.20	0.61	0.61	0.03	1.00	2
25	1.22	0.61	0.61	0.03	1.00	2
26	1.22	0.61	0.61	0.03	1.00	2
27	1.21	0.61	0.61	0.03	1.00	2
28	1.31	0.61	0.61	0.03	1.00	2
29	1.22	0.61	0.61	0.03	1.00	2
30	1.26	0.61	0.61	0.03	1.00	2
31	1.23	0.61	0.61	0.03	1.00	2
32	1.33	0.61	0.61	0.03	1.00	2
33	1.22	0.61	0.61	0.03	

In [14]:
evaluation(kmedoids)

Round	Time	Acc	F1	NMI	p	l
1	1.29	0.63	0.63	0.05	1.00	2
2	1.27	0.63	0.63	0.05	1.00	2
3	1.39	0.63	0.63	0.05	1.00	2
4	1.32	0.63	0.63	0.05	1.00	2
5	1.38	0.63	0.63	0.05	1.00	2
6	1.40	0.63	0.63	0.05	1.00	2
7	1.45	0.63	0.63	0.05	1.00	2
8	1.33	0.63	0.63	0.05	1.00	2
9	1.38	0.63	0.63	0.05	1.00	2
10	1.34	0.63	0.63	0.05	1.00	2
11	1.32	0.63	0.63	0.05	1.00	2
12	1.48	0.63	0.63	0.05	1.00	2
13	1.39	0.63	0.63	0.05	1.00	2
14	1.37	0.63	0.63	0.05	1.00	2
15	1.42	0.63	0.63	0.05	1.00	2
16	1.48	0.63	0.63	0.05	1.00	2
17	1.36	0.63	0.63	0.05	1.00	2
18	1.41	0.63	0.63	0.05	1.00	2
19	1.38	0.63	0.63	0.05	1.00	2
20	1.48	0.63	0.63	0.05	1.00	2
21	1.31	0.63	0.63	0.05	1.00	2
22	1.32	0.63	0.63	0.05	1.00	2
23	1.33	0.63	0.63	0.05	1.00	2
24	1.43	0.63	0.63	0.05	1.00	2
25	1.33	0.63	0.63	0.05	1.00	2
26	1.34	0.63	0.63	0.05	1.00	2
27	1.34	0.63	0.63	0.05	1.00	2
28	1.44	0.63	0.63	0.05	1.00	2
29	1.34	0.63	0.63	0.05	1.00	2
30	1.36	0.63	0.63	0.05	1.00	2
31	1.35	0.63	0.63	0.05	1.00	2
32	1.45	0.63	0.63	0.05	1.00	2
33	1.36	0.63	0.63	0.05	