# Section 3.1: K-means

## Section 3.1.0: Package initialisations, environment configuration and function definitions

Import relevant packages:

In [1]:
import tensorflow as tf
import numpy as np

import time
import datetime
import os

# Non-interactive plotting
import matplotlib.pyplot as plt

# Interactive plotting
from plotly import tools
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.grid_objs as gro
import plotly.offline as pyo
from plotly.offline import download_plotlyjs

Configure environment:

In [2]:
%config InlineBackend.figure_format = 'retina'
# Print NumPy arrays with 3 decimal places
np.set_printoptions(precision=3)

# Global Variables
# LOG_DIR is resolved against the kernel's working directory at the time this cell runs
CURRENT_DIR = os.getcwd()
LOG_DIR = os.path.join(CURRENT_DIR, 'Logs')

# Activate Plotly Offline for Jupyter
pyo.init_notebook_mode(connected=True)

# Define global variable SEED
# (used below to seed both the NumPy shuffle and the TensorFlow graph / initialisers)
SEED = 521

Load data2D.npy into memory:

In [8]:
"""
data2D.npy contains 10,000 data points of dimension 2
"""
# NOTE(review): the heading and the description above refer to data2D.npy, but this cell
# reads the RBF NN archive below and keeps the first two columns of its 'Y' array.
# Confirm which file is intended before trusting the K-means results.
# Load data
source = np.load("./Data/RBF NN/source.npz")
data2D = source['Y'][:,:2]

# Set random seed (NumPy's global RNG, so the shuffle below is reproducible)
np.random.seed(SEED)

# Generate random index (0 .. N-1, permuted in place below)
randIdx2D = np.arange(len(data2D))

# Randomise data2D
# data2D is rebound to the shuffled copy; randIdx2D keeps the permutation that was applied
np.random.shuffle(randIdx2D)
data2D = data2D[randIdx2D]

### Load results (when resuming work; the plotting cells below expect `results_1_1_3` to be defined)

In [4]:
# results_1_1_3 = np.load('./Results/K-means/1_1_3.npy')
# results_1_1_4 = np.load('./Results/K-means/1_1_4.npy')
# results_2_2_2_4 = np.load('./Results/MoG/2_2_4_K-means.npy')

### Create K-means TensorFlow graph:

Loss function:
$$ \mathcal{L}(\mathbf{\mu}) = \sum_{n=1}^N \min_{k=1}^K || \mathbf{x}_n - \mathbf{\mu}_k ||_2^2 $$ 

In [5]:
def build_k_means(K, data_dim, device='cpu'):
    '''
    Creates a TensorFlow graph for K-means based on the loss function above.

    Inputs
        K:        Number of clusters
        data_dim: Dimension of each data point
        device:   'cpu' or 'gpu'; where the distance, loss and optimizer ops are placed
                  (the placeholder, the cluster centres and the summary merge are pinned to the CPU)

    Outputs (in this order)
        X:         data placeholder (N x data_dim)
        mu:        cluster centres (K x data_dim), drawn from a seeded random normal initializer
        resp:      1-based index of the closest cluster centre for each data point
                   (int64, N x 1 with tf.nn.top_k's default k=1)
        loss:      sum over data points of the squared distance to the closest cluster centre
        optimizer: Adam update op that minimises loss
        merged:    op returned by tf.summary.merge_all() (includes the 'loss' scalar summary)

    Raises
        ValueError: if device is neither 'cpu' nor 'gpu'
    '''
    # Validate the device before touching the graph. An explicit check (rather than
    # assert + quit()) still runs under `python -O` and reports the problem to the caller
    # instead of terminating the interpreter.
    if device not in ('cpu', 'gpu'):
        raise ValueError('Invalid device chosen. Please use \'cpu\' or \'gpu\'')
    device = '/' + device + ':0'

    # Set TF graph seed
    tf.set_random_seed(SEED)

    with tf.device('/cpu:0'):
        # Create placeholder
        with tf.name_scope('placeholders'):
            X = tf.placeholder(tf.float32, shape=[None, data_dim], name='inputs')
        # Define parameters
        with tf.variable_scope('parameters'):
            mu = tf.get_variable('cluster_centres', shape=[K, data_dim],
                                 initializer=tf.random_normal_initializer(seed=SEED))

    with tf.device(device):
        # Calculate distance matrix (N x K)
        # by subtracting expanded X (N x D x 1) with expanded mu (1 x D x K) using broadcasting,
        # then summing the squared differences over the D axis
        with tf.name_scope('distances'):
            diff = tf.expand_dims(X, axis=2) - tf.expand_dims(tf.transpose(mu), axis=0)
            dist = tf.reduce_sum(tf.square(diff), axis=1, name='distances')

        # Create responsibility indices to track which datapoint belongs to which cluster.
        # top_k on the negated distances picks the smallest distance; + 1 makes the index 1-based
        with tf.name_scope('responsibility'):
            _, resp = tf.nn.top_k(-dist, name='responsibility_indices')
            resp = tf.cast(resp + 1, tf.int64)

        # Calculate loss
        with tf.name_scope('loss'):
            loss = tf.reduce_sum(tf.reduce_min(dist, axis=1), name='loss')
            tf.summary.scalar('loss', loss)

        # Create Adam optimizer
        with tf.name_scope('optimizer'):
            optimizer = tf.train.AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.99,
                                               epsilon=1e-5).minimize(loss)

    with tf.device('/cpu:0'):
        # Merge all summaries
        merged = tf.summary.merge_all()

    return X, mu, resp, loss, optimizer, merged

### Define training function:

In [6]:
def run_k_means(K_list, data_dim=2, has_valid=False, device='cpu', return_results=False):
    '''
    Run k-means clustering algorithm to cluster datapoints, once for every K in K_list.

    Inputs
        K_list:         Iterable of cluster counts; a fresh graph and session is built for each K
        data_dim:       Dimension of the data points (only the global data2D is available here)
        has_valid:      If true, subsets:
                            first 2/3 of data as training data
                            remaining 1/3 of data as validation data
        device:         'cpu' or 'gpu', forwarded to build_k_means
        return_results: If true, return the per-K results described below; the default (False)
                        keeps the original behaviour of returning None

    Side effects
        - Writes the loss summaries to
          LOG_DIR/K-means/<timestamp>/K=..,dim=..,valid=../train (and .../valid when has_valid)
        - Prints the loss every 100 iterations, then the duration and the final cluster centres

    Returns
        None, or (when return_results is true) a list with one dict per K:
            'K':                      the number of clusters
            'cluster_centres':        11 x K x D, centres at 0%, 10%, ..., 100% of training
            'responsibility_indices': N x 11, 1-based cluster index of each training point
            'train_loss':             training loss before training and at every update
            'valid_loss':             validation loss at the same steps (only when has_valid)
    '''
    def subset_data(D):
        '''
        Returns (training data, validation data): first 2/3 and remaining 1/3 of the dataset
        '''
        if D != 2:
            # Previously `data` was simply left unbound for any other dimension
            raise ValueError('No dataset available for data_dim={}; only 2 is supported'.format(D))
        data = data2D
        # Floor division keeps the split point an integer on both Python 2 and Python 3
        divider = data.shape[0] * 2 // 3
        return data[:divider], data[divider:]

    #######################
    ##  Function begins  ##
    #######################

    # Define constants local to this run
    MAX_ITER = 500
    CURR_TIME = '{:%b%d %H_%M_%S}'.format(datetime.datetime.now())
    SUMMARY_DIR = LOG_DIR + '/K-means/' + CURR_TIME

    results = []

    for K in K_list:
        # Clear any pre-defined graph
        tf.reset_default_graph()

        # Build TensorFlow graph
        X, mu, resp, loss, optimizer, merged = build_k_means(K, data_dim, device)

        # Select appropriate input_data
        if has_valid:
            input_data, valid_data = subset_data(data_dim)
        else:
            input_data, valid_data = data2D, None

        # Begin session
        session_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        with tf.Session(config=session_config) as sess:
            # Log start time
            start_time = time.time()

            # Create sub-directory title
            sub_dir = '/K={},dim={},valid={}'.format(K, data_dim, has_valid)

            # Create summary writers; they are closed in the finally clause even if training fails
            train_writer = tf.summary.FileWriter(SUMMARY_DIR + sub_dir + '/train', graph=sess.graph)
            valid_writer = None
            try:
                if has_valid:
                    valid_writer = tf.summary.FileWriter(SUMMARY_DIR + sub_dir + '/valid')

                # Initialise all TensorFlow variables
                tf.global_variables_initializer().run()

                # Calculate training (and validation) loss,
                # cluster centres and responsibility indices before any training (step 0)
                err, summaries, clusters, indices = sess.run([loss, merged, mu, resp],
                                                             feed_dict={X: input_data})
                train_writer.add_summary(summaries, 0)
                train_loss = [err]
                valid_loss = []
                centre_snapshots = [clusters]
                resp_snapshots = [indices]

                if has_valid:
                    err, summaries = sess.run([loss, merged], feed_dict={X: valid_data})
                    valid_writer.add_summary(summaries, 0)
                    valid_loss.append(err)

                # Begin training; `step` is the 1-based number of updates performed so far
                for step in range(1, MAX_ITER + 1):
                    # Train graph
                    _, err, summaries = sess.run([optimizer, loss, merged],
                                                 feed_dict={X: input_data})

                    # Add training loss
                    train_writer.add_summary(summaries, step)
                    train_loss.append(err)

                    # Log validation loss at the same step as the training loss
                    # (it used to be written one step behind, duplicating step 0)
                    if has_valid:
                        err, summaries = sess.run([loss, merged], feed_dict={X: valid_data})
                        valid_writer.add_summary(summaries, step)
                        valid_loss.append(err)

                    # Record responsibility indices and cluster centres every 10% of maximum iteration
                    if (step * 10) % MAX_ITER == 0:
                        clusters, indices = sess.run([mu, resp], feed_dict={X: input_data})
                        centre_snapshots.append(clusters)
                        resp_snapshots.append(indices)

                    # Post training progress to user, every 100 iterations.
                    # With has_valid, err holds the validation loss at this point
                    if step % 100 == 0:
                        if not has_valid:
                            print('iter: {:3d}, train_loss: {:3.1f}'.format(step, err))
                        else:
                            print('iter: {:3d}, valid_loss: {:3.1f}'.format(step, err))

                # End of training loop
                print('Max iteration reached')
            finally:
                train_writer.close()
                if valid_writer is not None:
                    valid_writer.close()

            centres = mu.eval()

            # TODO calculate convergence
            print('Duration: {:3.1f}s, Centres:\n{}\n'.format(time.time() - start_time, centres))

        # Assemble the structure consumed by the visualisation helpers
        result = {
            'K': K,
            'cluster_centres': np.array(centre_snapshots),
            'responsibility_indices': np.hstack(resp_snapshots),
            'train_loss': np.array(train_loss),
        }
        if has_valid:
            result['valid_loss'] = np.array(valid_loss)
        results.append(result)

    print('RUN COMPLETED')

    if return_results:
        return results

## Section 3.1.2: Run K-means with a held-out validation split

In [10]:
# Train one model per K = 1..5 on the 2-D data; has_valid=True holds out the last third of
# data2D for validation, so the progress lines printed below report the validation loss
run_k_means(K_list=[1, 2, 3, 4, 5], has_valid=True, data_dim=2)

iter: 100, valid_loss: 251252180555506166506979328.0
iter: 200, valid_loss: 251252180555506166506979328.0
iter: 300, valid_loss: 251252180555506166506979328.0
iter: 400, valid_loss: 251252180555506166506979328.0
iter: 500, valid_loss: 251252180555506166506979328.0
Max iteration reached
Duration: 1.2s, Centres:
[[ 4.914 -6.21 ]]

iter: 100, valid_loss: 251252180555506166506979328.0
iter: 200, valid_loss: 251252180555506166506979328.0
iter: 300, valid_loss: 251252180555506166506979328.0
iter: 400, valid_loss: 251252180555506166506979328.0
iter: 500, valid_loss: 251252180555506166506979328.0
Max iteration reached
Duration: 1.9s, Centres:
[[ 4.914 -6.21 ]
 [ 4.073 -5.505]]

iter: 100, valid_loss: 251252180555506166506979328.0
iter: 200, valid_loss: 251252180555506166506979328.0
iter: 300, valid_loss: 251252180555506166506979328.0
iter: 400, valid_loss: 251252180555506166506979328.0
iter: 500, valid_loss: 251252180555506166506979328.0
Max iteration reached
Duration: 2.1s, Centres:
[[ 4.914 

### Plot loss vs number of updates

In [None]:
def build_RBFNN(eta, lambda_, num_hidden_units, dropout_rate, device='gpu'):
    '''
    - Builds a fully-connected network with len(num_hidden_units) hidden layers, with
      num_hidden_units[i] units in the i-th hidden layer, trained on a mean-squared-error loss
      plus an L2 penalty on weights and biases
    - NOTE(review): despite the name, the hidden layers built here are ReLU + dropout layers;
      no radial-basis function is computed inside this function
    - Reads the globals train_data, train_target and rand_idx, which must exist before the call

    Inputs:
        eta:              Learning rate of the Adam optimizer
        lambda_:          L2 regularization strength
        num_hidden_units: Non-empty list with the number of units of each hidden layer
        dropout_rate:     Dropout rate; every hidden activation is dropped with keep_prob
                          1 - dropout_rate (there is no train/eval switch in the graph)
        device:           'cpu' or 'gpu'; where the layers and the optimizer are placed

    Outputs (in this order):
        X, Y:            input and target placeholders
        Yhat:            output of the last (linear) layer
        weights, biases: lists of the layer variables (graph collections)
        actv:            list of the dropped hidden activations (graph collection)
        total_loss:      mean squared error + L2 penalty
        optimizer:       Adam update op minimising total_loss
        l2_loss_summary: scalar summary of the mean squared error alone
        merged:          op returned by tf.summary.merge_all()

    Raises:
        ValueError: if device is neither 'cpu' nor 'gpu'
    '''
    # Validate the device before any graph construction. An explicit check (rather than
    # assert + quit()) still runs under `python -O` and reports the problem to the caller
    # instead of terminating the interpreter.
    if device not in ('cpu', 'gpu'):
        raise ValueError('Invalid device chosen. Please use \'cpu\' or \'gpu\'.')
    device = '/' + device + ':0'

    # Dimensions are taken from the global training arrays
    dim_features = train_data.shape[1]
    dim_target = train_target.shape[1]

    def connect_layers(h_in, num_next_unit, name='input_to_layer'):
        '''
        - Creates weights and biases to connect previous layer to next layer
        - Returns the weighted sum of inputs to the next layer
        - Initializes weights from a truncated normal with
          stddev = sqrt(3 / (units in + units out)); biases from a truncated normal(0, 1)

        Inputs:
            h_in:          Incoming activation tensor from previous layer
            num_next_unit: Number of units of the next layer
            name:          Name given to the resulting tensor

        Outputs:
            Weighted sum of inputs into the next layer (h_in * W + b)
        '''
        num_prev_unit = h_in.get_shape()[1]

        # Xavier-style scale for the weights
        xavier = tf.sqrt(3 / tf.cast(num_prev_unit + num_next_unit, tf.float32), name='xavier')

        W = tf.get_variable('weights', shape=[num_prev_unit, num_next_unit],
                            initializer=tf.truncated_normal_initializer(stddev=xavier))
        b = tf.get_variable('biases', shape=[1, num_next_unit],
                            initializer=tf.truncated_normal_initializer(0., 1.))

        # Register the variables so the L2 penalty and the histograms can find them
        tf.add_to_collection(tf.GraphKeys.WEIGHTS, W)
        tf.add_to_collection(tf.GraphKeys.BIASES, b)

        return tf.add(tf.matmul(h_in, W), b, name=name)

    def build_layers(num_hidden_units, dropout_rate):
        '''
        - Builds hidden layers and output layer

        Inputs:
            num_hidden_units: List of number of units in each hidden layer
            dropout_rate:     Dropout rate applied to every hidden activation

        Output:
            Output-layer pre-activations (tensor named 'logits')
        '''
        ### Builds hidden layers
        ZList = [] # Stores the inputs to the i-th layer
        hList = [] # Stores the activation of the i-th layer

        for i in range(len(num_hidden_units)):
            # Create the i-th hidden layer
            with tf.variable_scope('hidden_layer_%d' % i):
                # Calculates input to the i-th hidden layer
                # Uses placeholder tensor X instead of the previous activation for the first hidden layer
                if i == 0:
                    ZList.append(connect_layers(X, num_hidden_units[i]))
                else:
                    ZList.append(connect_layers(hList[-1], num_hidden_units[i]))

                # Calculates ReLU activation of each layer, followed by dropout
                hList.append(tf.nn.dropout(tf.nn.relu(ZList[i], name='hidden_activations'),
                                           keep_prob=1-dropout_rate,
                                           name='dropped_activations'))

        # Adds activation variables into GraphKey.ACTIVATIONS collection
        for h in hList:
            tf.add_to_collection(tf.GraphKeys.ACTIVATIONS, h)

        ### Builds output layer
        with tf.variable_scope('output_layer'):
            ZList.append(connect_layers(hList[-1], dim_target, name='logits'))

        return ZList[-1]

    def _add_histogram(vars_):
        '''
        Helper function to add a histogram summary to each variable
        Input:
            vars_: iterable of tensors/variables to be tagged with a histogram summary
        '''
        for var in vars_:
            tf.summary.histogram(var.op.name, var)


    #######################################
    #           Function begins           #
    #######################################

    # Fix the random seed
    # tf.set_random_seed(SEED)

    # Create a Tensor to save data partition index for reproducibility
    rand_idx_tf = tf.constant(rand_idx, name='data_partition_index')

    with tf.device('/cpu:0'):
        with tf.name_scope('placeholders'):
            X = tf.placeholder(tf.float32, shape=[None, dim_features], name='input_layer')
            Y = tf.placeholder(tf.float32, shape=[None, dim_target], name='target')

    with tf.device(device):
        Yhat = build_layers(num_hidden_units, dropout_rate) # Output of the final linear layer

    with tf.device('/cpu:0'):
        # Calculates training metrics
        with tf.name_scope('metrics'):
            # Calculates loss function
            with tf.name_scope('loss'):
                # Calculates the mean squared error between prediction and target
                with tf.name_scope('l2_loss'):
                    loss = tf.reduce_mean((Yhat - Y)**2, name='l2_loss')
                    l2_loss_summary = tf.summary.scalar('l2_loss', loss)
                # Calculates the L2 penalty on all weights and biases, scaled by lambda_
                with tf.name_scope('l2_penalty'):
                    weights = tf.get_collection_ref(tf.GraphKeys.WEIGHTS)
                    biases = tf.get_collection_ref(tf.GraphKeys.BIASES)
                    actv = tf.get_collection_ref(tf.GraphKeys.ACTIVATIONS)
                    l2_loss = tf.multiply(
                        lambda_,
                        tf.add(tf.add_n([tf.nn.l2_loss(W) for W in weights]),
                               tf.add_n([tf.nn.l2_loss(b) for b in biases]),
                               name='l2_penalty'))
                # Calculates total loss
                with tf.name_scope('total_loss'):
                    total_loss = tf.add(loss, l2_loss, name='total_loss')
                    tf.summary.scalar('loss', total_loss)

    with tf.device(device):
        # Optimizer
        with tf.variable_scope('Adam'):
            optimizer = tf.train.AdamOptimizer(eta).minimize(total_loss)

    with tf.device('/cpu:0'):
        # Add histogram summaries for variables of interest
        vars_ = []
        for var in [weights, biases, actv]:
            vars_.extend(var)
        _add_histogram(vars_)

        # Merge all summaries
        merged = tf.summary.merge_all()

    return X, Y, Yhat, weights, biases, actv, total_loss, optimizer, l2_loss_summary, merged

In [8]:
def loss_IGraph(loss):
    '''
    Plots the loss against the number of updates as an interactive Plotly line chart

    Input:
        loss: 1-D array of loss values; entry i is plotted at x = i

    The figure is first sent to the Plotly cloud (private sharing) and then rendered
    offline in the notebook; the value of the offline call is returned.
    '''
    num_updates = loss.shape[0]

    # One trace: loss value against update index
    loss_curve = go.Scatter(x=range(num_updates), y=loss)

    # Assemble data and layout into the figure
    figure = go.Figure(
        data=go.Data([loss_curve]),
        layout=go.Layout(
            title='$\\mathcal{L}({\\mathbf{\\mu}}) \\text{ vs. Number of Updates}$',
            xaxis=dict(title='Updates'),
            yaxis=dict(title='Loss')
        )
    )

    # Cloud copy, then inline rendering
    py.iplot(figure, filename='/ECE521: A3/Q1: K-means/Q1.2_k_means_loss', sharing='private')
    return pyo.iplot(figure)

# Generate loss function graph from the 'train_loss' entry of the third element of results_1_1_3
# NOTE(review): results_1_1_3 must already exist in the kernel; the np.load that defines it
# is commented out in the "Load results" cell
figure = loss_IGraph(results_1_1_3[2]['train_loss'])

### Clustering visualisations

In [14]:
def visualise_k_means_clusters(result):
    '''
    Colour data points by the final clusters generated by the K-means algorithm

    Input:
        result['K']:                      number of clusters (used in the title and filename)
        result['cluster_centres']:        cluster centre snapshots (snapshots x K x D);
                                          only the last snapshot is plotted
        result['responsibility_indices']: 1-based cluster index per data point (N x snapshots);
                                          only the last column is used

    The points themselves are read from the global data2D, so the responsibility indices
    must line up with its rows. The figure is sent to the Plotly cloud (private sharing)
    and then rendered offline; the value of the offline call is returned.
    '''
    # Store final cluster centres and responsibility indices
    cluster_centres = result['cluster_centres'][-1,:,:]
    resp_idx = result['responsibility_indices'][:,-1]

    # Define colour list as per Plotly's default colour list
    colour_list = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']

    # Create traces for each cluster.
    # Iterating over the centres (rather than up to the largest index that occurs in resp_idx)
    # keeps the centre marker of a cluster that ended up with no points
    traces = []
    for k in range(cluster_centres.shape[0]):
        # Select the members of cluster k once and reuse them for both coordinates
        members = data2D[resp_idx == k + 1]

        # Create trace for data points in cluster k; colours wrap around for K > len(colour_list)
        traces.append(go.Scatter(
            x = members[:,0],
            y = members[:,1],
            hoverinfo = 'none',
            mode = 'markers',
            marker = {
                'size': 4,
                'color': colour_list[k % len(colour_list)],
            }
        ))

        # Create trace for cluster centre k
        traces.append(go.Scatter(
            x = [cluster_centres[k][0]],
            y = [cluster_centres[k][1]],
            name = 'Cluster {}'.format(k + 1),
            mode = 'markers',
            marker = {
                'size': 15,
                'symbol': 'diamond',
                'color': '#000000'
            }
        ))

    # Add traces
    traces = go.Data(traces)

    # Generate figure layout
    layout = go.Layout(
        height = 800,
        showlegend = False,
        title = 'Clusters Visualization (K = {})'.format(result['K']),
        xaxis = {'title': 'x'},
        yaxis = {'title': 'y'}
    )

    # Generate figure
    figure = go.Figure(data=traces, layout=layout)

    py.iplot(figure, filename='/ECE521: A3/Q1: K-means/Q1.3_cluster_viz_K={}'.format(result['K']), sharing='private')
    return pyo.iplot(figure)

In [178]:
def visualise_k_means_clusters_animated(result, title, filename):
    '''
    Creates animated plot of data points coloured by clusters, hosted on the Plotly cloud

    Input:
        result['K']:                      number of clusters
        result['cluster_centres']:        coordinates of cluster centres [11 x K x D]
        result['responsibility_indices']: responsibility indices for each run of K [N x 11]
        title:                            NOTE(review): accepted but not used; the figure title
                                          is always 'Cluster Visualisation (K = ...)'
        filename:                         name passed to py.icreate_animations

    Dimension of 11 represent 10% increments from (0%, 100%)

    The points are read from the global data2D. A grid holding every frame's data is uploaded
    first and the figure refers to its columns through xsrc / ysrc. Returns None.
    '''
    # Store cluster centres and responsibility indices
    cluster_centres = result['cluster_centres']
    resp_idx = result['responsibility_indices']
    K = result['K']

    # Define colour list as per Plotly's default colour list
    colour_list = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']

    # Define slider ticker labels; one animation frame per label
    slider_values = ['0%', '10%', '20%', '30%', '40%', '50%', '60%', '70%', '80%', '90%', '100%']
    num_frames = len(slider_values)

    ###
    ### Create Grid to store data for Plotly.v2 animated plot
    ###
    # Define column containing data
    columns = []
    for i in range(num_frames):
        for k in range(K):
            # Members of cluster k at snapshot i, selected once for both coordinates
            members = data2D[resp_idx[:,i] == k + 1]

            # Create columns for data points
            columns.append(gro.Column(members[:,0], 'data_k={}_x_{}'.format(k + 1, i)))
            columns.append(gro.Column(members[:,1], 'data_k={}_y_{}'.format(k + 1, i)))

            # Create columns for cluster centres
            columns.append(gro.Column([cluster_centres[i][k][0]], 'cluster_k={}_x_{}'.format(k + 1, i)))
            columns.append(gro.Column([cluster_centres[i][k][1]], 'cluster_k={}_y_{}'.format(k + 1, i)))

    # Create grid from columns
    grid = gro.Grid(columns)

    # Push grid to cloud; on failure delete the grid and upload once more.
    # `except Exception` (not a bare except) lets KeyboardInterrupt / SystemExit propagate
    grid_name = 'A3Q1.1.3_cluster_subplots_K={}'.format(K)
    try:
        py.grid_ops.upload(grid, grid_name, auto_open=False)
    except Exception:
        py.grid_ops.delete(grid)
        py.grid_ops.upload(grid, grid_name, auto_open=False)

    # Play / pause buttons shown below the plot
    play_button = {
        'label': 'Play', # Button label
        'method': 'animate', # Method name
        'args': [None,
                 { # Args determines which frames to animate
                     'frame': {'duration': 500, 'redraw': False},
                     'fromcurrent': True,
                     'transition': {'duration': 300, 'easing': 'quadratic-in-out'}
                 }
                ]
    }
    pause_button = {
        'label': 'Pause',
        'method': 'animate',
        'args': [[None],
                 { # '[None]' ensures proper 'pause' functionailty
                     'frame': {'duration': 0, 'redraw': False},
                     'mode': 'immediate',
                     'transition': {'duration': 0}
                 }
                ]
    }

    # Create layout ('showlegend' is the valid key; it is set here once)
    layout = {
        'width': 900,
        'height': 900,
        'xaxis': {'range': [-4, 4], 'autorange': False},
        'yaxis': {'range': [-5, 2], 'autorange': False},
        'title': 'Cluster Visualisation (K = {})'.format(K),
        'showlegend': False,
        'updatemenus': [{'type': 'buttons',
                         'direction': 'left', # Arrange placement of buttons
                         'pad': {'r': 10, 't': 87}, # Right and top padding
                         'showactive': False, # Removes highlight from active button
                         'x': 0.1, # Button positions
                         'y': 0, # Button positions
                         'xanchor': 'right',
                         'yanchor': 'top',
                         'buttons': [play_button, pause_button]
                        }
                       ]
    }

    # Define figure
    figure = {
        'data': [],
        'layout': layout,
        'frames': [],
        'config': {'scrollzoom': True}
    }

    # Create frames and slider steps
    slider_steps = []
    for i in range(num_frames):
        # Create single frame variable
        frame = {'data': [], 'name': slider_values[i]} # Without name, slider will not interact with graph

        # Populate data for each cluster k
        for k in range(K):
            # Create trace for coloured data points; colours wrap around for K > len(colour_list)
            data_trace = {
                'xsrc': grid.get_column_reference('data_k={}_x_{}'.format(k + 1, i)),
                'ysrc': grid.get_column_reference('data_k={}_y_{}'.format(k + 1, i)),
                'hoverinfo': 'none',
                'mode': 'markers',
                'marker': {
                    'size': 4,
                    'color': colour_list[k % len(colour_list)]
                }
            }

            # Create trace for cluster centres
            cluster_trace = {
                'xsrc': grid.get_column_reference('cluster_k={}_x_{}'.format(k + 1, i)),
                'ysrc': grid.get_column_reference('cluster_k={}_y_{}'.format(k + 1, i)),
                'name': 'Cluster {}'.format(k + 1),
                'mode': 'markers',
                'marker': {
                    'size': 15,
                    'symbol': 'diamond',
                    'color': '#000000'
                }
            }

            # The first frame doubles as the figure's initial data
            if i == 0:
                figure['data'].append(data_trace)
                figure['data'].append(cluster_trace)

            # Append all data_trace and cluster_trace to frames
            frame['data'].append(data_trace)
            frame['data'].append(cluster_trace)

        # Append frame to frames variable
        figure['frames'].append(frame)

        # Define slider step and append it to the slider steps
        slider_steps.append({
            'args': [
                [slider_values[i]],
                {'frame': {'duration': 300, 'redraw': False},
                 'mode': 'immediate',
                 'transition': {'duration': 300}}
            ],
            'label': slider_values[i],
            'method': 'animate'
        })

    # Define slider dictionary and add it to the layout
    figure['layout']['sliders'] = [{
        'active': 0, # Slider knob's relative starting location
        'pad': {'b': 10, 't': 50}, # Bottom and top padding
        'len': 0.9, # Slider length
        'x': 0.1, # Slider x-position
        'y': 0, # Slider y-position
        'yanchor': 'top',
        'xanchor': 'left',
        'currentvalue': { # Displays current value selected by slider
            'font': {'size': 20},
            'prefix': 'Training: ',
            'visible': True,
            'xanchor': 'right'
        },
        'transition': {'duration': 300, 'easing': 'cubic-in-out'},
        'steps': slider_steps
    }]

    # Create the animation under the requested filename; fall back to the default name on failure
    try:
        py.icreate_animations(figure, filename=filename)
    except Exception:
        py.icreate_animations(figure)

    # The uploaded grid is not deleted here: the figure's xsrc / ysrc entries reference its columns

In [236]:
def visualise_k_means_clusters_animated_offline(result):
    '''
    Creates animated plot of data points coloured by clusters and writes it to a local HTML file

    Input:
        result['K']:                      number of clusters
        result['cluster_centres']:        coordinates of cluster centres [11 x K x D]
        result['responsibility_indices']: responsibility indices for each run of K [N x 11]

    Dimension of 11 represent 10% increments from (0%, 100%)

    The points are read from the global data2D. Returns the value of pyo.plot for
    'A3Q1.1.3_clusters_K=<K>.html' (the file is not opened automatically).
    '''
    # Store cluster centres and responsibility indices
    cluster_centres = result['cluster_centres']
    resp_idx = result['responsibility_indices']
    K = result['K']

    # Define colour list as per Plotly's default colour list
    colour_list = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']

    # Define slider ticker labels; one animation frame per label
    slider_values = ['0%', '10%', '20%', '30%', '40%', '50%', '60%', '70%', '80%', '90%', '100%']
    num_frames = len(slider_values)

    # Play / pause buttons shown below the plot
    play_button = {
        'label': 'Play', # Button label
        'method': 'animate', # Method name
        'args': [None,
                 { # Args determines which frames to animate
                     'frame': {'duration': 500, 'redraw': False},
                     'fromcurrent': True,
                     'transition': {'duration': 300, 'easing': 'quadratic-in-out'}
                 }
                ]
    }
    pause_button = {
        'label': 'Pause',
        'method': 'animate',
        'args': [[None],
                 { # '[None]' ensures proper 'pause' functionailty
                     'frame': {'duration': 0, 'redraw': False},
                     'mode': 'immediate',
                     'transition': {'duration': 0}
                 }
                ]
    }

    # Create layout ('showlegend' is the valid key; it is set here once)
    layout = {
        'width': 900,
        'height': 900,
        'xaxis': {'range': [-4, 4], 'autorange': False},
        'yaxis': {'range': [-5, 2], 'autorange': False},
        'title': 'Cluster Visualisation (K = {})'.format(K),
        'showlegend': False,
        'updatemenus': [{'type': 'buttons',
                         'direction': 'left', # Arrange placement of buttons
                         'pad': {'r': 10, 't': 87}, # Right and top padding
                         'showactive': False, # Removes highlight from active button
                         'x': 0.1, # Button positions
                         'y': 0, # Button positions
                         'xanchor': 'right',
                         'yanchor': 'top',
                         'buttons': [play_button, pause_button]
                        }
                       ]
    }

    # Define figure
    figure = {
        'data': [],
        'layout': layout,
        'frames': []
    }

    # Create frames and slider steps
    slider_steps = []
    for i in range(num_frames):
        # Create single frame variable
        frame = {'data': [], 'name': slider_values[i]} # Without name, slider will not interact with graph

        # Populate data for each cluster k
        for k in range(K):
            # Members of cluster k at snapshot i, selected once for both coordinates
            members = data2D[resp_idx[:,i] == k + 1]

            # Create trace for coloured data points; colours wrap around for K > len(colour_list)
            data_trace = {
                'x': members[:,0],
                'y': members[:,1],
                'hoverinfo': 'none',
                'mode': 'markers',
                'marker': {
                    'size': 4,
                    'color': colour_list[k % len(colour_list)]
                }
            }

            # Create trace for cluster centres
            cluster_trace = {
                'x': [cluster_centres[i][k][0]],
                'y': [cluster_centres[i][k][1]],
                'name': 'Cluster {}'.format(k + 1),
                'mode': 'markers',
                'marker': {
                    'size': 15,
                    'symbol': 'diamond',
                    'color': '#000000'
                }
            }

            # The first frame doubles as the figure's initial data
            if i == 0:
                figure['data'].append(data_trace)
                figure['data'].append(cluster_trace)

            # Append all data_trace and cluster_trace to frames
            frame['data'].append(data_trace)
            frame['data'].append(cluster_trace)

        # Append frame to frames variable
        figure['frames'].append(frame)

        # Define slider step and append it to the slider steps
        slider_steps.append({
            'args': [
                [slider_values[i]],
                {'frame': {'duration': 300, 'redraw': False},
                 'mode': 'immediate',
                 'transition': {'duration': 300}}
            ],
            'label': slider_values[i],
            'method': 'animate'
        })

    # Define slider dictionary and add it to the layout
    figure['layout']['sliders'] = [{
        'active': 0, # Slider knob's relative starting location
        'pad': {'b': 10, 't': 50}, # Bottom and top padding
        'len': 0.9, # Slider length
        'x': 0.1, # Slider x-position
        'y': 0, # Slider y-position
        'yanchor': 'top',
        'xanchor': 'left',
        'currentvalue': { # Displays current value selected by slider
            'font': {'size': 20},
            'prefix': 'Training: ',
            'visible': True,
            'xanchor': 'right'
        },
        'transition': {'duration': 300, 'easing': 'cubic-in-out'},
        'steps': slider_steps
    }]

    return pyo.plot(figure, filename='A3Q1.1.3_clusters_K={}.html'.format(K), auto_open=False)

### Visualise clusters

In [15]:
# Render the final clustering of every run stored in results_1_1_3 and keep each call's return value
# (the enumerate index was never used, so the loop collapses to a comprehension)
figures = [visualise_k_means_clusters(result) for result in results_1_1_3]

### Create gifs

In [12]:
def generate_gif_images(result):
    '''
    Saves one static PNG snapshot per training checkpoint, with data points
    coloured by their assigned cluster and cluster centres drawn as black diamonds.

    Input:
        result['cluster_centres']:        coordinates of cluster centres [11 x K x D]
        result['responsibility_indices']: responsibility indices for each run of K [N x 11]
        result['K']:                      number of clusters

    The dimension of 11 represents 10% increments of training, from 0% to 100%.

    Point coordinates are read from the module-level data2D array (only its
    first two columns are plotted). Each snapshot is written through
    py.plotly.image.save_as as 'Q1.1.3_K=<K>_gif_<NN>.png'. Returns None.
    '''
    # Store cluster centres and responsibility indices
    cluster_centres = result['cluster_centres']
    resp_idx = result['responsibility_indices']
    K = result['K']

    # Define colour list as per Plotly's default colour list
    colour_list = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']

    # Define slider ticker labels (one per snapshot)
    slider_values = ['0%', '10%', '20%', '30%', '40%', '50%', '60%', '70%', '80%', '90%', '100%']

    # Define one slider step per ticker label
    slider_steps = [
        {
            'args': [
                [label],
                {'frame': {'duration': 300, 'redraw': False},
                 'mode': 'immediate',
                 'transition': {'duration': 300}}
            ],
            'label': label,
            'method': 'animate'
        }
        for label in slider_values
    ]

    # Define figure with its fixed layout; data and sliders are filled per snapshot
    figure = {
        'data': [],
        'layout': {
            'width': 900,
            'height': 900,
            'xaxis': {'range': [-4, 4], 'autorange': False},
            'yaxis': {'range': [-5, 2], 'autorange': False},
            'title': 'Cluster Visualisation (K = {})'.format(K),
            'showlegend': False
        }
    }

    #####
    ### SNAPSHOTS
    #####

    # Create snapshots
    for i in range(len(slider_values)):
        # Reset figure data
        traces = []

        # Populate data for each cluster k
        for k in range(K):
            # Select the points assigned to this cluster once and reuse for x and y.
            # The comparison against k + 1 assumes 1-based cluster labels in
            # responsibility_indices -- TODO confirm against run_k_means.
            members = data2D[resp_idx[:, i] == k + 1]

            # Create trace for coloured data points; the colour index wraps
            # so that K larger than the palette does not raise IndexError
            traces.append({
                'x': members[:, 0],
                'y': members[:, 1],
                'hoverinfo': 'none',
                'mode': 'markers',
                'marker': {
                    'size': 4,
                    'color': colour_list[k % len(colour_list)]
                }
            })

            # Create trace for cluster centres
            traces.append({
                'x': [cluster_centres[i][k][0]],
                'y': [cluster_centres[i][k][1]],
                'name': 'Cluster {}'.format(k + 1),
                'mode': 'markers',
                'marker': {
                    'size': 15,
                    'symbol': 'diamond',
                    'color': '#000000'
                }
            })

        figure['data'] = traces

        # Add slider to layout, with the knob placed on the current snapshot
        figure['layout']['sliders'] = [{
            'active': i, # Slider knob's relative starting location
            'pad': {'b': 10, 't': 50}, # Bottom and top padding
            'len': 1, # Slider length
            'x': 0, # Slider x-position
            'y': 0, # Slider y-position
            'yanchor': 'top',
            'xanchor': 'left',
            'currentvalue': { # Displays current value selected by slider
                'font': {'size': 20},
                'prefix': 'Training: ',
                'visible': True,
                'xanchor': 'right'
            },
            'transition': {'duration': 300, 'easing': 'cubic-in-out'},
            'steps': slider_steps
        }]

        # Save snapshots locally
        py.plotly.image.save_as(figure, filename='Q1.1.3_K={0}_gif_{1:02d}.png'.format(K, i),
                                width=900, height=900, scale=1)

In [352]:
# Write PNG snapshots for every run except the first two entries of results_1_1_3
# (presumably K = 1 and K = 2 -- confirm against the K_list used for that run)
for result in results_1_1_3[2:]:
    generate_gif_images(result)

## Section 3.1.3: Run K-means with validation

In [26]:
# Run K-means for K = 1..5 on the 2-D data with has_valid=True
# (the recorded output reports a validation loss next to the training loss)
results_1_1_4 = run_k_means(
    K_list=[1, 2, 3, 4, 5],
    data_dim=2,
    has_valid=True,
)

iter: 100, train_loss: 25576.7, valid_loss: 12877.1
iter: 200, train_loss: 25576.6, valid_loss: 12877.4
iter: 300, train_loss: 25576.6, valid_loss: 12877.4
iter: 400, train_loss: 25576.6, valid_loss: 12877.4
iter: 500, train_loss: 25576.6, valid_loss: 12877.4
Max iteration reached
K:   1, train loss: 25576.6, valid loss: 12877.4, duration: 1.4s

iter: 100, train_loss: 13357.7, valid_loss: 6647.4
iter: 200, train_loss: 6567.5, valid_loss: 3363.8
iter: 300, train_loss: 6072.9, valid_loss: 3139.7
iter: 400, train_loss: 6063.8, valid_loss: 3139.9
iter: 500, train_loss: 6063.8, valid_loss: 3140.1
Max iteration reached
K:   2, train loss: 6063.8, valid loss: 3140.1, duration: 2.5s

iter: 100, train_loss: 7211.1, valid_loss: 3581.0
iter: 200, train_loss: 3997.4, valid_loss: 2061.2
iter: 300, train_loss: 3423.1, valid_loss: 1776.9
iter: 400, train_loss: 3369.5, valid_loss: 1745.6
iter: 500, train_loss: 3369.0, valid_loss: 1744.9
Max iteration reached
K:   3, train loss: 3369.0, valid loss: 174

### Save results

In [27]:
# Persist the validation-run results so a later session can reload them
np.save(file='./Results/K-means/1_1_4.npy', arr=results_1_1_4)

### Generate bar chart for cluster assignment $\%$

In [16]:
def cluster_assignment_IGraph(results):
    '''
    Generate a stacked bar chart with one bar per model, showing the share of
    data points belonging to each cluster.

    Input:
        results: iterable of result dictionaries, each providing
            result['K']:           number of clusters for that model
            result['composition']: per-cluster bar heights, indexed 0..K-1
                                   (presumably percentages, as the y-axis label
                                   states -- confirm against run_k_means)

    Each bar is positioned at its model's K so that the x-axis matches its
    'Number of clusters, K' label even when the results do not cover
    K = 1, 2, 3, ... in order.

    The figure is first passed to py.iplot under a private filename and then
    rendered with pyo.iplot; the return value is whatever pyo.iplot returns.
    '''
    # Define colour list as per Plotly's default colour list
    colour_list = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']

    # Define empty figure
    figure = {
        'data': [],
        'layout': {}
    }

    # Define data to plot: one bar segment per cluster of every model
    for result in results:
        for k in range(result['K']):
            trace = go.Bar(
                x = [result['K']],
                y = [result['composition'][k]],
                # Wrap the colour index so K larger than the palette does not raise IndexError
                marker = {'color': colour_list[k % len(colour_list)]},
                name = 'Cluster {}'.format(k + 1)
            )
            figure['data'].append(trace)

    # Define layout
    figure['layout'] = {
        'title': 'Percentage of data points assigned to each cluster',
        'xaxis': {'title': 'Number of clusters, K'},
        'yaxis': {'title': 'Assignment to cluster, %'},
        'barmode': 'stack',
        'showlegend': False
    }

    # Generate plot
    py.iplot(figure, filename='/ECE521: A3/Q1: K-means/Q1.3_assignment_bar_chart', sharing='private')
    return pyo.iplot(figure)

# Render the cluster-assignment bar chart for the results_1_1_3 runs;
# `figure` receives whatever cluster_assignment_IGraph returns
figure = cluster_assignment_IGraph(results=results_1_1_3)

## Section 2.2.2.4: K-Means on $\textit{data100D.npy}$ with validation $(K = 1, 2, 3, 4, 5, 6)$

In [40]:
# Run K-means on the 100-D data with has_valid=True.
# NOTE(review): the section heading lists K = 1..6 and the recorded output
# reports K: 15, but this call requests K_list=[1] -- confirm the intended
# K_list before re-running.
results_2_2_2_4 = run_k_means(
    K_list=[1],
    data_dim=100,
    device='cpu',
    has_valid=True,
)

iter: 100, train_loss: 322298.4, valid_loss: 162575.0
iter: 200, train_loss: 174001.2, valid_loss: 88365.3
iter: 300, train_loss: 148354.3, valid_loss: 75443.4
iter: 400, train_loss: 143330.5, valid_loss: 72840.5
iter: 500, train_loss: 142040.7, valid_loss: 72171.0
Max iteration reached
K:  15, train loss: 142031.0, valid loss: 72167.2, duration: 180.7s

RUN COMPLETED


### Save results

In [47]:
# NOTE(review): `holder` is not defined in the part of the notebook visible here;
# this rebinding replaces the run_k_means result assigned above. Confirm where
# `holder` comes from -- if it only exists as leftover kernel state, this cell
# will fail on a fresh Restart & Run All.
results_2_2_2_4 = holder

In [48]:
# Persist the 100-D K-means results so a later session can reload them
np.save(file='./Results/MoG/2_2_4_K-means.npy', arr=results_2_2_2_4)