In [1]:
# %load_ext autotime

import os
import re
import io
import wget

# Define how to get data
def get_iris(storage_folder="temp", data_file="iris_data.txt", splitter=','):
    data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
    # Make a storage folder for models and data
    if not os.path.exists(storage_folder):
        os.makedirs(storage_folder)
        
    data_path = os.path.join(storage_folder, data_file)
    
    if not os.path.isfile(os.path.join(storage_folder, data_file)):        
        _ = wget.download(data_url, data_path) 

    data = [l.strip() for l in open(data_path) if l.strip()]
    features = [tuple(map(float, x.split(splitter)[:-1])) for x in data]
    labels = [x.split(splitter)[-1] for x in data]
    
    return features, labels

def get_satellite(storage_folder="temp", data_file="satellite_data.txt", splitter=' '):
    data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.trn'
    # data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/satimage/sat.tst'
    # Make a storage folder for models and data
    if not os.path.exists(storage_folder):
        os.makedirs(storage_folder)
        
    data_path = os.path.join(storage_folder, data_file)
    
    if not os.path.isfile(os.path.join(storage_folder, data_file)):           
        _ = wget.download(data_url, data_path) 

    data = [l.strip() for l in open(data_path) if l.strip()]
    features = [tuple(map(float, x.split(splitter)[:-1])) for x in data]
    labels = [x.split(splitter)[-1] for x in data]
    
    return features, labels

def get_banknote(storage_folder="temp", data_file="banknote_data.txt", splitter=','):
    data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt'
    # Make a storage folder for models and data
    if not os.path.exists(storage_folder):
        os.makedirs(storage_folder)
        
    data_path = os.path.join(storage_folder, data_file)
    
    if not os.path.isfile(os.path.join(storage_folder, data_file)):           
        _ = wget.download(data_url, data_path) 

    data = [l.strip() for l in open(data_path) if l.strip()]
    features = [tuple(map(float, x.split(splitter)[:-1])) for x in data]
    labels = [x.split(splitter)[-1] for x in data]
    
    return features, labels

vector_values, labels = get_satellite()

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.mlab import PCA as mlabPCA
import tensorflow as tf
import math

num_clusters = len(set(labels))
num_vectors = len(vector_values)

print("Labels: ")
print(set(labels))
print("Sample size: ")
print(num_vectors)

vectors = tf.constant(vector_values, dtype = tf.half)

npArray = np.array(vector_values)
mlab_pca = mlabPCA(npArray)
users_2d = mlab_pca.project(npArray, minfrac=mlab_pca.fracs[1])

def drawClusters(assignment_values):
    data = {"x": [], "y": [], "cluster": []}
    for i in range(len(assignment_values)):
        data["x"].append(users_2d[i][0])
        data["y"].append(users_2d[i][1])
        data["cluster"].append(assignment_values[i])
    df = pd.DataFrame(data)
    sns.lmplot("x", "y", data=df, 
               fit_reg=False, size=7, 
               hue="cluster", legend=False)
    plt.show()

Labels: 
{'5', '4', '2', '3', '1', '7'}
Sample size: 
4435


In [3]:
def initialize_clusters():
    X = vector_values
    n = np.shape(X)[0]
    d = np.shape(X)[1]
    K = num_clusters
    S = np.ones(K)*(int(n/K))
    S[K-1] = S[K-1] + n%K
    
    Y = np.zeros([n, 1])
    for i in range(K):
        yis0 = np.where(Y==0)[0]
        a = np.take(X, yis0, axis=0)
        b = np.random.randn(d, 1)
        xt = np.squeeze(np.dot(a, b))
        inx = np.argsort(xt)
        Y[yis0[inx[range(int(S[i]))]]]=i
    return np.squeeze(Y)

In [4]:
#iteration variable
assignments = tf.Variable(initialize_clusters(), dtype=tf.int64)

#clusters are based on assignments
clusters = [tf.gather(vectors, tf.squeeze(tf.where(tf.equal(assignments, c)), squeeze_dims=[1])) for c in range(num_clusters)]

In [5]:
def crossDistance(a, b, ord='euclidean'):
    diff = tf.subtract(tf.expand_dims(a, 0), tf.expand_dims(b, 1))
    result = tf.norm(diff, ord, 2)
    return result

In [6]:
%%time
means = tf.concat([
    tf.expand_dims(tf.reduce_mean(cluster, 0), 0) 
    for cluster in clusters], 0)
kmeans = tf.argmin(crossDistance(vectors, means), 0)

Wall time: 45 ms


In [7]:
%%time
medoids = tf.concat([
    tf.expand_dims(tf.gather(cluster, tf.argmin(tf.reduce_sum(crossDistance(cluster, cluster), 0), 0)), 0)
    for cluster in clusters], 0)
kmedoids = tf.argmin(crossDistance(vectors, medoids), 0)

Wall time: 124 ms


In [8]:
%%time
def subsample(D, size, t):
    size_D = tf.shape(D)[0]
    selector_1 = tf.range(size_D, dtype=tf.int32)
    selector_t = tf.slice(tf.map_fn(
      lambda one: tf.random_shuffle(selector_1)  
    , tf.ones([t, size_D], dtype=tf.int32)), [0,0], [t, size])
    
    return tf.gather(D, selector_t)

def HM(X, D, t=500, psi = 20, lmbda=1.0, dt=tf.half):
    size_X = tf.shape(X)[0]
    dims_X = tf.shape(X)[1]
    size_D = tf.shape(D)[0]
    
    ones_tX = tf.ones([t, size_X, dims_X], dtype=dt)  
    
    psi = tf.cond(tf.logical_and(tf.less(0, psi), tf.less(psi, size_D)), lambda: tf.constant(psi), lambda: size_D)
    set_t_psi = tf.cond(psi<size_D, 
                        lambda: subsample(D, psi, t), 
                        # order of D will not matter and it will be broadcasted
                        lambda: tf.expand_dims(D, 0)) 
    
    ones_t_psi = tf.ones([t, psi, dims_X], dtype=dt) 
    
    hm_X = tf.zeros([size_X], dtype=dt)
    
    directions = tf.random_uniform([t, dims_X], minval = -1, maxval = 1, dtype = dt)
    projectors = tf.divide(directions, tf.norm(directions, axis=1, keep_dims=True))
    
    projects_X = tf.squeeze(tf.matmul(
        tf.expand_dims(tf.multiply(ones_tX, tf.expand_dims(X, 0)), 2), 
        tf.expand_dims(tf.multiply(ones_tX, tf.expand_dims(projectors, 1)), 3)), [2, 3])
    
    projects_psi = tf.squeeze(tf.matmul(
        tf.expand_dims(tf.multiply(ones_t_psi, set_t_psi), 2), 
        tf.expand_dims(tf.multiply(ones_t_psi, tf.expand_dims(projectors, 1)), 3)), [2, 3])
    
    mid_t_min = tf.add(
        tf.multiply(tf.reduce_max(projects_psi, 1), (1-lmbda)/2),
        tf.multiply(tf.reduce_min(projects_psi, 1), (1+lmbda)/2))
    
    mid_t_max = tf.add(
        tf.multiply(tf.reduce_max(projects_psi, 1), (1+lmbda)/2),
        tf.multiply(tf.reduce_min(projects_psi, 1), (1-lmbda)/2))
    
    mid_t = tf.add(mid_t_min, tf.multiply(tf.random_uniform([t], 0, 1, dtype=dt), tf.subtract(mid_t_max, mid_t_min)))
    
    mass_l_t = tf.divide(tf.reduce_sum(
        tf.where(
            tf.less(projects_psi, tf.expand_dims(mid_t, 1)), 
            tf.ones([t, psi], dtype=dt), 
            tf.zeros([t, psi], dtype=dt)), 1), tf.cast(psi, dtype=dt))
    
    mass_r_t = tf.add(tf.multiply(mass_l_t, -1), 1)
    
    mass_t = tf.where(tf.less(projects_X, tf.expand_dims(mid_t, 1)), 
                      tf.multiply(tf.ones([t, size_X], dtype=dt), tf.expand_dims(mass_l_t, 1)), 
                      tf.multiply(tf.ones([t, size_X], dtype=dt), tf.expand_dims(mass_r_t, 1)))
    hs_mass = tf.reduce_mean(mass_t, 0)
    return hs_mass

kmass = tf.argmax(tf.concat([
    tf.expand_dims(tf.divide(HM(vectors, cluster), tf.reduce_min(HM(vectors, cluster), 0, keep_dims=True)), 1)
    for cluster in clusters
], 1), 1)

Wall time: 2.27 s


In [9]:
%%time
%matplotlib inline

# method = kmeans
# method = kmedoids
# method = kmass

def run(method = kmeans, p=1, round_max=10, output=0):
    assignment_values = initialize_clusters()
    
    # os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
    # tf.logging.set_verbosity(tf.logging.ERROR)
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
    sess_config = tf.ConfigProto(gpu_options=gpu_options)

    init_op = tf.global_variables_initializer()
    round_i = 0
    keep_hurdle = num_vectors * p
    keep_max = -1
    best_result = assignment_values
    best_round = 0

    # drawClusters(assignment_values)
    while round_i < round_max:    
        round_i = round_i + 1
        if output>1:
            print(round_i)
        with tf.Session(config=sess_config) as sess:
            sess.run(init_op)
            round_result = sess.run(method)

            keep_count = np.sum([
                1 if assignment_values[i]==round_result[i] else 0
                for i in range(num_vectors)
            ])
            if output>1:
                print(keep_count)
            if keep_max < keep_count:
                keep_max = keep_count
                best_result = round_result
                best_round=round_i
                if output>1:
                    print("best_result was updated.")

            assignment_values = sess.run(tf.assign(assignments, round_result))
    #         drawClusters(assignment_values)
            sess.close()
        if keep_count >= keep_hurdle:
            if output:
                print("keep_hurdle was hit.")
            break

    p = float(keep_max)/float(num_vectors)
    if output>0:
        print("The final P is ")
        print(p)    
        print("The round with best result is ")
        print(best_round)
        drawClusters(best_result)
    return [best_result, p, best_round]

Wall time: 2 ms


In [10]:
%run metrics.py

In [11]:
n_runs = 40 # number of runs with random initialization for clustering evaluation.

def evaluation_scores(groundtruth, labels_pred):
    """
    Eval scores of the predicted results.
     
    :param: groundtruth (type list): the groundtruth (GT) of cluster assignment. Each element denotes an item's GT cluster_id. 
    :param: labels_pred (type list): the predicted cluster assignments. Each element denotes an item's predicted cluster_id.
    """
    NMI = normalized_mutual_info_score(groundtruth,labels_pred)
    A = accuracy(groundtruth,labels_pred)
    F1 = f_measure(groundtruth,labels_pred)
    P = purity(groundtruth,labels_pred)
    RI = random_index(groundtruth,labels_pred)
    ARI = adjusted_rand_score(groundtruth,labels_pred)
    map_pairs = get_map_pairs(groundtruth,labels_pred)
    return NMI, A, F1, P, RI, ARI, map_pairs
    
def evaluation(method = kmeans):    
    import time
    t0 = time.time()
    NMIs,As,F1s = [],[],[]
    i_run = 1
    labels_unique = np.unique(labels)
    labels_indexed = []
    for label in labels:
        labels_indexed.append(np.where(labels_unique==label))
    labels_indexed = np.squeeze(labels_indexed)
    print("Round\tTime\tAcc\tF1\tNMI\tp\tl")
    while i_run <= n_runs:
        t1 = time.time()
        [result, p, l] = run(method = method, output=0)
        NMI,A,F1,P,RI,ARI,map_pairs = evaluation_scores(labels_indexed, result)
        tt = (time.time()-t1)
        print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
            i_run, 
            "{0:.2f}".format(time.time()-t1), 
            "{0:.2f}".format(A), 
            "{0:.2f}".format(F1), 
            "{0:.2f}".format(NMI),
            "{0:.2f}".format(p),
            l
        ))
        i_run = i_run+1
        NMIs.append(NMI)
        As.append(A)
        F1s.append(F1)

    print("Results of {} runs (mean,std_var,min,max):\n\t Acc: {}, {}, {}, {}\n\t F1 : {}, {}, {}, {}\n\t NMI: {}, {}, {}, {}"
          .format(n_runs
                  , np.mean(As),np.std(As), np.min(As), np.max(As)
                  , np.mean(F1s),np.std(F1s), np.min(F1s), np.max(F1s)
                  , np.mean(NMIs),np.std(NMIs), np.min(NMIs), np.max(NMIs)))
    print("Running time: {}s".format(time.time() - t0))

In [12]:
evaluation(kmass)

Round	Time	Acc	F1	NMI	p	l
1	227.12	0.56	0.48	0.43	0.78	8
2	225.73	0.55	0.52	0.43	0.87	5
3	221.13	0.55	0.44	0.40	0.84	3
4	219.59	0.53	0.44	0.39	0.79	4
5	222.43	0.58	0.51	0.45	0.85	9
6	224.51	0.60	0.57	0.44	0.81	4
7	217.62	0.59	0.54	0.44	0.85	3
8	216.29	0.69	0.58	0.49	0.71	4
9	221.08	0.52	0.44	0.41	0.82	2
10	216.47	0.60	0.54	0.47	0.88	6
11	218.36	0.64	0.62	0.46	0.80	9
12	218.96	0.54	0.52	0.44	0.88	2
13	226.87	0.59	0.56	0.45	0.76	8
14	225.00	0.69	0.65	0.46	0.79	3
15	226.15	0.59	0.54	0.42	0.86	6
16	225.22	0.58	0.53	0.43	0.84	6
17	218.38	0.58	0.54	0.40	0.80	8
18	217.10	0.53	0.51	0.44	0.75	7
19	229.51	0.47	0.41	0.41	0.88	8
20	234.61	0.54	0.55	0.41	0.87	9
21	238.49	0.57	0.53	0.41	0.86	7
22	227.02	0.60	0.55	0.42	0.78	6
23	225.94	0.61	0.58	0.47	0.75	10
24	226.70	0.65	0.56	0.45	0.79	8
25	232.44	0.58	0.53	0.42	0.85	7
26	229.65	0.60	0.55	0.47	0.87	9
27	225.70	0.58	0.51	0.46	0.84	9
28	223.93	0.56	0.50	0.41	0.77	6
29	228.68	0.60	0.52	0.43	0.79	10
30	225.21	0.60	0.55	0.49	0.82	3
31	219.72	0.58	0.56	0

In [13]:
evaluation(kmeans)

Round	Time	Acc	F1	NMI	p	l
1	4.40	0.62	0.62	0.45	1.00	2
2	3.24	0.62	0.62	0.45	1.00	2
3	3.11	0.62	0.62	0.45	1.00	2
4	3.18	0.62	0.62	0.45	1.00	2
5	3.23	0.62	0.62	0.45	1.00	2
6	3.09	0.62	0.62	0.45	1.00	2
7	3.11	0.62	0.62	0.45	1.00	2
8	3.44	0.62	0.62	0.45	1.00	2
9	3.34	0.62	0.62	0.45	1.00	2
10	3.39	0.62	0.62	0.45	1.00	2
11	3.34	0.62	0.62	0.45	1.00	2
12	3.15	0.62	0.62	0.45	1.00	2
13	3.34	0.62	0.62	0.45	1.00	2
14	3.19	0.62	0.62	0.45	1.00	2
15	3.26	0.62	0.62	0.45	1.00	2
16	3.13	0.62	0.62	0.45	1.00	2
17	3.25	0.62	0.62	0.45	1.00	2
18	3.19	0.62	0.62	0.45	1.00	2
19	3.27	0.62	0.62	0.45	1.00	2
20	3.13	0.62	0.62	0.45	1.00	2
21	3.31	0.62	0.62	0.45	1.00	2
22	3.29	0.62	0.62	0.45	1.00	2
23	3.34	0.62	0.62	0.45	1.00	2
24	3.21	0.62	0.62	0.45	1.00	2
25	3.39	0.62	0.62	0.45	1.00	2
26	3.21	0.62	0.62	0.45	1.00	2
27	3.34	0.62	0.62	0.45	1.00	2
28	3.17	0.62	0.62	0.45	1.00	2
29	3.21	0.62	0.62	0.45	1.00	2
30	3.27	0.62	0.62	0.45	1.00	2
31	3.36	0.62	0.62	0.45	1.00	2
32	3.28	0.62	0.62	0.45	1.00	2
33	3.31	0.62	0.62	0.45	

In [14]:
evaluation(kmedoids)

Round	Time	Acc	F1	NMI	p	l
1	3.38	0.47	0.44	0.36	1.00	2
2	3.31	0.47	0.44	0.36	1.00	2
3	3.43	0.47	0.44	0.36	1.00	2
4	3.49	0.47	0.44	0.36	1.00	2
5	3.30	0.47	0.44	0.36	1.00	2
6	3.42	0.47	0.44	0.36	1.00	2
7	3.29	0.47	0.44	0.36	1.00	2
8	3.43	0.47	0.44	0.36	1.00	2
9	3.28	0.47	0.44	0.36	1.00	2
10	3.43	0.47	0.44	0.36	1.00	2
11	3.47	0.47	0.44	0.36	1.00	2
12	3.35	0.47	0.44	0.36	1.00	2
13	3.30	0.47	0.44	0.36	1.00	2
14	3.56	0.47	0.44	0.36	1.00	2
15	3.50	0.47	0.44	0.36	1.00	2
16	3.35	0.47	0.44	0.36	1.00	2
17	3.53	0.47	0.44	0.36	1.00	2
18	3.40	0.47	0.44	0.36	1.00	2
19	3.48	0.47	0.44	0.36	1.00	2
20	3.37	0.47	0.44	0.36	1.00	2
21	3.47	0.47	0.44	0.36	1.00	2
22	3.34	0.47	0.44	0.36	1.00	2
23	3.52	0.47	0.44	0.36	1.00	2
24	3.36	0.47	0.44	0.36	1.00	2
25	3.47	0.47	0.44	0.36	1.00	2
26	3.52	0.47	0.44	0.36	1.00	2
27	3.49	0.47	0.44	0.36	1.00	2
28	3.55	0.47	0.44	0.36	1.00	2
29	3.49	0.47	0.44	0.36	1.00	2
30	3.47	0.47	0.44	0.36	1.00	2
31	3.52	0.47	0.44	0.36	1.00	2
32	3.35	0.47	0.44	0.36	1.00	2
33	3.55	0.47	0.44	0.36	