Install TensorFlow 1.x and import packages
The neural-network part is adapted from https://github.com/MaziarMF/deep-k-means, which is written for TensorFlow 1.x.

In [None]:
!pip install tensorflow-gpu==1.15.0


In [None]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
import argparse
import math

from collections import defaultdict
from collections import Counter

from sklearn.utils.linear_assignment_ import linear_assignment
from sklearn.utils import shuffle
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder  
from sklearn.preprocessing import LabelEncoder 



Process the data

In [None]:
# Load the raw Spotify track table; the CSV is expected next to the notebook.
dataset = pd.read_csv('SpotifyFeatures.csv')
print(dataset.shape)

# Seed for the row shuffle below, so that re-running the cell reproduces the same sample order.
SHUFFLE_SEED = 42

# Categorical columns are label-encoded first and then one-hot encoded.
encoder = LabelEncoder()
encoder_oh = OneHotEncoder()


def _one_hot_column(column):
    """Return the dense one-hot matrix (one row per track) for `dataset[column]`."""
    # OneHotEncoder expects a 2-D input, hence the reshape to a single column.
    codes = encoder.fit_transform(dataset[column].values).reshape(-1, 1)
    return encoder_oh.fit_transform(codes).toarray()


key = _one_hot_column('key')
mode = _one_hot_column('mode')
time_signature = _one_hot_column('time_signature')
print(key, mode, time_signature)

# Remove the raw categorical columns and the identifier columns, then append the one-hot
# blocks. Each block gets a column prefix so the combined frame has unique, all-string
# column labels instead of three overlapping runs of 0, 1, 2, ...
data = dataset.drop(['key', 'mode', 'time_signature', 'artist_name', 'track_name', 'track_id'], axis=1)
data = pd.concat(
    [data,
     pd.DataFrame(key, index=dataset.index).add_prefix('key_'),
     pd.DataFrame(mode, index=dataset.index).add_prefix('mode_'),
     pd.DataFrame(time_signature, index=dataset.index).add_prefix('time_signature_')],
    axis=1)

# Shuffle the rows (seeded) and split off the genre column, which is the ground-truth label.
data = shuffle(data, random_state=SHUFFLE_SEED)
index = data.index.tolist()  # original row positions, in shuffled order
label = data['genre']
data = data.drop(['genre'], axis=1)

# Rescale every feature to [0, 1]; `data` is a NumPy array from here on.
min_max_scaler = MinMaxScaler()
data = min_max_scaler.fit_transform(data)

(232725, 18)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]] [[1. 0.]
 [0. 1.]
 [0. 1.]
 ...
 [1. 0.]
 [1. 0.]
 [0. 1.]] [[0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]
 ...
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]]


Define functions
(borrowed from the deep k-means paper)

In [None]:
# Floating-point dtype used for every placeholder and variable created by the DKM graph below.
TF_FLOAT_TYPE = tf.float32

def fc_layers(input, specs):
    """Chain fully connected layers on top of `input`.

    `specs` is a triple `[dimensions, activations, names]`; the three sequences are
    zipped together, and each (units, activation, name) entry adds one
    `tf.layers.dense` layer fed by the previous layer's output. Layers are created
    with `reuse=tf.AUTO_REUSE`.

    Returns the output of the last layer, or `input` itself when `specs` describes
    no layers.
    """
    dimensions, activations, names = specs
    layer = input
    for units, activation_fn, layer_name in zip(dimensions, activations, names):
        layer = tf.layers.dense(inputs=layer, units=units, activation=activation_fn,
                                name=layer_name, reuse=tf.AUTO_REUSE)
    return layer

def autoencoder(input, specs):
    """Build an autoencoder from a flat list of layer specifications.

    `specs` is `[dimensions, activations, names]`. The first half of each sequence
    describes the encoder and the second half the decoder; both halves are built with
    `fc_layers`.

    Returns `(embedding, output)`: the encoder's last layer and the decoder's last layer.
    """
    dimensions, activations, names = specs
    split = len(dimensions) // 2

    encoder_specs = [dimensions[:split], activations[:split], names[:split]]
    decoder_specs = [dimensions[split:], activations[split:], names[split:]]

    embedding = fc_layers(input, encoder_specs)
    output = fc_layers(embedding, decoder_specs)
    return embedding, output

def f_func(x, y):
    """Sum of squared element-wise differences between `x` and `y`, reduced over axis 1.

    Used as the distance between embeddings and a cluster representative.
    """
    squared_diff = tf.square(x - y)
    return tf.reduce_sum(squared_diff, axis=1)

def g_func(x, y):
    """Sum of squared element-wise differences between `x` and `y`, reduced over axis 1.

    Used as the reconstruction error between the autoencoder's input and output.
    """
    squared_diff = tf.square(x - y)
    return tf.reduce_sum(squared_diff, axis=1)


def next_batch(num, data):
    """Draw up to `num` distinct random rows from `data`.

    Args:
        num: requested batch size. If it exceeds the number of rows, every row is
            returned once (in random order).
        data: 2-D NumPy array with one sample per row.

    Returns:
        `(indices, batch_data)`, where `indices` holds the selected row positions and
        `batch_data[k]` is a copy of `data[indices[k]]`. An empty batch keeps the
        feature dimension, i.e. has shape `(0, data.shape[1])`.
    """
    indices = np.arange(0, data.shape[0])
    np.random.shuffle(indices)
    indices = indices[:num]
    # Fancy indexing gathers all rows in one vectorized copy instead of a Python-level loop.
    batch_data = data[indices]

    return indices, batch_data

class DkmCompGraph(object):
    """Computation graph for Deep K-Means

    Builds, in the default TensorFlow graph, an autoencoder plus a soft k-means term
    computed on its embedding, and exposes the tensors/ops needed for training:

    - input: placeholder of shape (None, input_size) for a batch of samples.
    - alpha: scalar placeholder that scales the distances inside the softmax.
    - embedding, output: the autoencoder's embedding and reconstruction.
    - cluster_rep: variable of shape (n_clusters, embedding_size) holding the cluster
      representatives, initialised uniformly in [-1, 1).
    - stack_dist: distances (f_func) from each embedding to each representative,
      stacked with the cluster on the first axis.
    - ae_loss, kmeans_loss, loss: mean reconstruction error, mean softmax-weighted
      distance, and ae_loss + val_lambda * kmeans_loss.
    - pretrain_op: Adam step on ae_loss only; train_op: Adam step on the full loss.
    """

    def __init__(self, ae_specs, n_clusters, val_lambda):
        """Build the graph.

        Args:
            ae_specs: `[dimensions, activations, names]` as consumed by `autoencoder`.
                The last entry of `dimensions` is taken as the input size and the
                entry at index int((len(dimensions) - 1) / 2) as the embedding size.
            n_clusters: number of cluster representatives.
            val_lambda: weight of the k-means term in the combined loss.
        """
        input_size = ae_specs[0][-1]
        embedding_size = ae_specs[0][int((len(ae_specs[0])-1)/2)]

        # Placeholder tensor for input data
        self.input = tf.placeholder(dtype=TF_FLOAT_TYPE, shape=(None, input_size))

        # Auto-encoder loss computations
        self.embedding, self.output = autoencoder(self.input, ae_specs)  # Get the auto-encoder's embedding and output
        rec_error = g_func(self.input, self.output)  # Reconstruction error based on distance g

        # k-Means loss computations
        ## Tensor for cluster representatives
        minval_rep, maxval_rep = -1, 1
        self.cluster_rep = tf.Variable(tf.random_uniform([n_clusters, embedding_size],
                                                    minval=minval_rep, maxval=maxval_rep,
                                                    dtype=TF_FLOAT_TYPE), name='cluster_rep', dtype=TF_FLOAT_TYPE)

        ## First, compute the distance f between the embedding and each cluster representative
        list_dist = []
        for i in range(0, n_clusters):
            # Representative i is reshaped to a single row so it broadcasts against the batch of embeddings
            dist = f_func(self.embedding, tf.reshape(self.cluster_rep[i, :], (1, embedding_size)))
            list_dist.append(dist)
        self.stack_dist = tf.stack(list_dist)

        ## Second, find the minimum squared distance for softmax normalization
        ## (reduced over the cluster axis, so there is one minimum per sample)
        min_dist = tf.reduce_min(list_dist, axis=0)

        ## Third, compute exponentials shifted with min_dist to avoid underflow (0/0) issues in softmaxes
        self.alpha = tf.placeholder(dtype=TF_FLOAT_TYPE, shape=())  # Placeholder tensor for alpha
        list_exp = []
        for i in range(n_clusters):
            exp = tf.exp(-self.alpha * (self.stack_dist[i] - min_dist))
            list_exp.append(exp)
        stack_exp = tf.stack(list_exp)
        sum_exponentials = tf.reduce_sum(stack_exp, axis=0)

        ## Fourth, compute softmaxes and the embedding/representative distances weighted by softmax
        list_softmax = []  # collected below but not read again in this class
        list_weighted_dist = []
        for j in range(n_clusters):
            softmax = stack_exp[j] / sum_exponentials
            weighted_dist = self.stack_dist[j] * softmax
            list_softmax.append(softmax)
            list_weighted_dist.append(weighted_dist)
        stack_weighted_dist = tf.stack(list_weighted_dist)

        # Compute the full loss combining the reconstruction error and k-means term
        self.ae_loss = tf.reduce_mean(rec_error)
        self.kmeans_loss = tf.reduce_mean(tf.reduce_sum(stack_weighted_dist, axis=0))
        self.loss = self.ae_loss + val_lambda * self.kmeans_loss

        # The optimizer is defined to minimize this loss
        # (both training ops are created from the same AdamOptimizer instance)
        optimizer = tf.train.AdamOptimizer()
        self.pretrain_op = optimizer.minimize(self.ae_loss) # Pretrain the autoencoder before starting DKM
        self.train_op = optimizer.minimize(self.loss) # Train the whole DKM model

Initialize the neural network

In [None]:
# Dataset size and mini-batch schedule
n_samples = data.shape[0]
batch_size = 256  # mini-batch size used by the stochastic optimizer
n_batches = int(math.ceil(n_samples / batch_size))  # mini-batches per epoch

# Training length and loss weighting
n_pretrain_epochs = 50
n_finetuning_epochs = 5
lambda_ = 1

# Run-mode switches
validation = False  # split the data into validation and test sets
pretrain = True  # pretrain DKM's autoencoder
annealing = False  # use an annealing scheme for alpha
#seeded = args.seeded # Specify if runs are seeded

print("Hyperparameters...")
print("lambda =", lambda_)

# Alpha schedule: max_n identical values of 1000, divided by constant_value
constant_value = 1  # specs.embedding_size # Used to modify the range of the alpha scheme
max_n = 15  # number of alpha values to consider
alphas = np.full(max_n, 1000.0) / constant_value

# Ground-truth labels
target = label

# TensorFlow session configuration: cap this process at 5% of the GPU memory
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.05)
config = tf.ConfigProto(gpu_options=gpu_options)

Hyperparameters...
lambda = 1


In [None]:
# Problem dimensions
n_sample = data.shape[0]
input_size = data.shape[1]
n_clusters = 26
embedding_size = n_clusters  # the embedding has one dimension per cluster

# Hidden layer widths; the decoder mirrors the encoder
hidden_1_size = 75
hidden_2_size = 75
hidden_3_size = 300
hidden_sizes = [hidden_1_size, hidden_2_size, hidden_3_size]

# Encoder layers followed by decoder layers
dimensions = hidden_sizes + [embedding_size] + hidden_sizes[::-1] + [input_size]

# ReLU on every hidden layer, linear embedding and output layers
hidden_activations = [tf.nn.relu] * len(hidden_sizes)
activations = hidden_activations + [None] + hidden_activations + [None]

names = ['enc_hidden_1', 'enc_hidden_2', 'enc_hidden_3', 'embedding',
         'dec_hidden_1', 'dec_hidden_2', 'dec_hidden_3', 'output']

# Build the Deep K-Means computation graph
cg = DkmCompGraph([dimensions, activations, names], n_clusters, lambda_)

Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.


Prepare to train

Train the model
(borrowed from the deep k-means paper)

In [None]:
with tf.Session(config=config) as sess:
  # Initialise every variable in the graph (autoencoder weights and cluster representatives).
  sess.run(tf.global_variables_initializer())

  # distances[c, s] holds the distance between sample s and cluster representative c as
  # computed the last time sample s was drawn into a training batch.
  # NOTE(review): entries can therefore be stale, or still zero for a sample that was never
  # drawn, when the assignments are inferred below.
  distances = np.zeros((n_clusters, n_samples))

  # Pretrain if specified
  if pretrain:
    print("Starting autoencoder pretraining...")

    # Latest embedding seen for every sample during pretraining
    embeddings = np.zeros((n_samples, embedding_size), dtype=float)

    # First, pretrain the autoencoder on the reconstruction loss only
    for epoch in range(n_pretrain_epochs):
      print("Pretraining step: epoch {}".format(epoch))

      for _ in range(n_batches):
        # Fetch a random data batch of the specified size
        indices, data_batch = next_batch(batch_size, data)

        # Run the computation graph until pretrain_op (only on autoencoder) on the data batch
        _, embedding_, ae_loss_ = sess.run((cg.pretrain_op, cg.embedding, cg.ae_loss),
                                           feed_dict={cg.input: data_batch})

        # Save the embeddings of the batch samples (batch indices are distinct, so a single
        # vectorized assignment is equivalent to writing them row by row)
        embeddings[indices] = embedding_

    # Second, run k-means++ on the pretrained embeddings
    print("Running k-means on the learned embeddings...")
    kmeans_model = KMeans(n_clusters=n_clusters, init="k-means++").fit(embeddings)

    # The cluster centers are used to initialize the cluster representatives in DKM
    sess.run(tf.assign(cg.cluster_rep, kmeans_model.cluster_centers_))

  # Train the full DKM model
  if len(alphas) > 0:
    print("Starting DKM training...")

  print_val = 1  # evaluate the clustering every print_val alpha values (and for the last one)

  # Loop over alpha (inverse temperature)
  for k in range(len(alphas)):
    print("Training step: alpha[{}]: {}".format(k, alphas[k]))

    # Loop over epochs per alpha
    for _ in range(n_finetuning_epochs):
      # Loop over the mini-batches
      for _ in range(n_batches):
        # Fetch a random data batch of the specified size
        indices, data_batch = next_batch(batch_size, data)

        # Run one optimisation step of the full DKM loss on the data batch
        _, loss_, stack_dist_, cluster_rep_, ae_loss_, kmeans_loss_ = sess.run(
            (cg.train_op, cg.loss, cg.stack_dist, cg.cluster_rep, cg.ae_loss, cg.kmeans_loss),
            feed_dict={cg.input: data_batch, cg.alpha: alphas[k]})

        # Save the distances of the batch samples (one column per sample)
        distances[:, indices] = stack_dist_

    # Report the losses of the last batch and infer the cluster assignments
    if k % print_val == 0 or k == len(alphas) - 1:
      print("loss:", loss_)
      print("ae loss:", ae_loss_)
      print("kmeans loss:", kmeans_loss_)

      # Each sample is assigned to its closest cluster representative. A single vectorized
      # argmin replaces the per-sample loop, which re-copied the whole array on every step.
      cluster_assign = np.argmin(distances, axis=0).astype(np.int64)  # the clustering result we have

Compute the accuracy

In [None]:
# Show the inferred cluster id of every sample, followed by the number of samples
print(cluster_assign, len(cluster_assign), sep="\n")

[17 22  6 ...  8 18  8]
232725


In [None]:
def correct_top_n(labels, predicted, cluster_counts, top=1):
    """Fraction of samples whose true label is among the top labels of their cluster.

    Args:
        labels: sized iterable of true labels, one per sample.
        predicted: iterable of predicted cluster ids, aligned with `labels`.
        cluster_counts: mapping from cluster id to a sequence of labels; only the
            first `top` entries of each sequence are considered.
        top: how many leading labels of a cluster count as a hit.

    Returns:
        Number of hits divided by `len(labels)`, or 0.0 when `labels` is empty
        (instead of raising ZeroDivisionError).
    """
    total = len(labels)
    if total == 0:
        return 0.0

    hits = sum(
        1
        for label, prediction in zip(labels, predicted)
        if label in cluster_counts[prediction][:top]
    )
    return hits / total

In [None]:
# For every predicted cluster, count how often each true genre occurs in it
counters = defaultdict(Counter)
for cluster_id, genre in zip(cluster_assign, label):
    counters[cluster_id][genre] += 1

# Genres of each cluster ordered from most to least frequent
clust_labels = {
    cluster_id: [genre for genre, _ in genre_counts.most_common()]
    for cluster_id, genre_counts in counters.items()
}

In [None]:
# Top-1 to top-5 accuracy: a sample counts as correct when its genre is among the
# `top_k` most frequent genres of the cluster it was assigned to
for top_k in range(1, 6):
    accuracy = correct_top_n(label, cluster_assign, clust_labels, top=top_k)
    print(top_k, accuracy)