# Section 3.1: K-means

## Section 3.1.0: Package initialisations, environment configuration and function definitions

Import relevant packages:

In [1]:
import tensorflow as tf
import numpy as np

import time
import datetime

# Non-interactive plotting
import matplotlib.pyplot as plt

# Interactive plotting
from plotly import tools
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.grid_objs as gro
import plotly.offline as pyo
from plotly.offline import download_plotlyjs

Configure environment:

In [2]:
%config InlineBackend.figure_format = 'retina'
np.set_printoptions(precision=3)

# Global Variables
LOG_DIR = './Logs'

# Activate Plotly Offline for Jupyter
pyo.init_notebook_mode(connected=True)

# Define global variable SEED
SEED = 521

Load data2D.npy into memory:

In [3]:
"""
data2D.npy contains 10,000 data points of dimension 2
data100D.npy contains 10,000 data points of dimension 100
"""
# Load data
data2D = np.load("./Data/data2D.npy")
data100D = np.load("./Data/data100D.npy")

# Set random seed
np.random.seed(521)

# Generate random index
randIdx2D = np.arange(len(data2D))
randIdx100D = np.arange(len(data100D))

# Randomise data2D
np.random.shuffle(randIdx2D)
data2D = data2D[randIdx2D]

# Randomise data100D
np.random.shuffle(randIdx100D)
data100D = data100D[randIdx100D]

### Load results (optional; when resuming work)

In [5]:
# results_1_1_3 = np.load('./Results/K-means/1_1_3.npy')
# results_1_1_4 = np.load('./Results/K-means/1_1_4.npy')
# results_2_2_2_4 = np.load('./Results/MoG/2_2_4_K-means.npy')

### Create K-means TensorFlow graph:

Loss function:
$$ \mathcal{L}(\mathbf{\mu}) = \sum_{n=1}^N \min_{k=1}^K || \mathbf{x}_n - \mathbf{\mu}_k ||_2^2 $$ 

In [4]:
'''
Creates a graph for K-means based on the loss function above:

Inputs
    K:       Number of classifiers
    data_dim: dimension of datapoint
    
X:  data placeholder (N x data_dim)
mu: cluster centres (K x data_dim)
'''
def build_k_means(K, data_dim, device='cpu'):
    
    # Set TF graph seed
    tf.set_random_seed(SEED)
    
    # Define computation device
    try:
        assert device == 'cpu' or device == 'gpu'
    except AssertionError:
        print 'Invalid device chosen. Please use \'cpu\' or \'gpu\''
        quit()
    device = '/' + device + ':0'
    
    with tf.device('/cpu:0'):
        # Create placeholder
        with tf.name_scope('placeholders'):
            X = tf.placeholder(tf.float32, shape=[None, data_dim], name='inputs')
        # Define parameters
        with tf.variable_scope('parameters'):
            mu = tf.get_variable('cluster_centres', shape=[K, data_dim], initializer=tf.random_normal_initializer(seed=SEED))

    with tf.device(device):
        # Calculate distance matrix (N x K)
        # by subtracting expanded X (N x D x 1) with expanded mu (1 x D x K) using broadcasting
        with tf.name_scope('distances'):
            dist = tf.reduce_sum(tf.square(tf.expand_dims(X, axis=2) - tf.expand_dims(tf.transpose(mu), axis=0)), \
                                 axis=1, name='distances')
        
        # Create responsibility indices to track which datapoint belongs to which cluster
        with tf.name_scope('responsibility'):
            _, resp = tf.nn.top_k(-dist, name='responsibility_indices')
            resp = tf.cast(resp + 1, tf.int64)
        
        # Calculate loss
        with tf.name_scope('loss'):
            loss = tf.reduce_sum(tf.reduce_min(dist, axis=1), name='loss')
            tf.summary.scalar('loss', loss)

        # Create Adam optimizer
        with tf.name_scope('optimizer'):
            optimizer = tf.train.AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.99, epsilon=1e-5).minimize(loss)
    
    with tf.device('/cpu:0'):
        # Merge all summaries
        merged = tf.summary.merge_all()
            
    return X, mu, resp, loss, optimizer, merged

### Define training function:

In [8]:
'''
Run k-means clustering algorithm to cluster datapoints
'''
def run_k_means(K_list, data_dim, has_valid=False, device='cpu'):
    '''
    If has_valid is true, subsets:
        first 2/3 of data as training data
        remaining 1/3 of data as validation data
    '''
    def subset_data(D):
        if D == 2:
            data = data2D
        elif D == 100:
            data = data100D
        divider = data.shape[0] * 2 / 3
        return data[:divider], data[divider:]
    
    '''
    Calculate percentage of points belonging to each of K clusters
    '''
    def calculate_composition(K, resp_idx):
        composition = np.array([])
        for k in range(K):
            composition = np.append(composition, np.true_divide(np.sum(resp_idx == k + 1), resp_idx.shape[0]))
        return composition
    
    #######################
    ##  Function begins  ##
    #######################
    '''
    cluster_centres: 11 x K x D
    resp_idx:        N x 11
    '''
    
    # Assert correct value for data_dim
    assert data_dim == 2 or data_dim == 100
    
    # Define locally global function
    MAX_ITER = 500
    CURR_TIME = '{:%b%d %H_%M_%S}'.format(datetime.datetime.now())
    SUMMARY_DIR = LOG_DIR + '/K-means/' + CURR_TIME
    
    # Create list to store run results
    results = []
    
    for K in K_list:
        # Clear any pre-defined graph
        tf.reset_default_graph()
        
        # Build TensorFlow graph
        X, mu, resp, loss, optimizer, merged = build_k_means(K, data_dim, device)
        
        # Select appropriate input_data
        if has_valid:
            input_data, valid_data = subset_data(data_dim)
        else:
            input_data = data2D if data_dim == 2 else data100D

        # Create arrays to log session losses, cluster_centres and responsbility indices
        train_loss = np.array([])[:, np.newaxis]            
        if has_valid:
            valid_loss = np.array([])[:, np.newaxis]
        cluster_centres = np.array([])[:, np.newaxis, np.newaxis].reshape(0, K, data_dim)
        resp_idx = np.array([])[:, np.newaxis].reshape(input_data.shape[0], 0)
        
        # Begin session
        with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess:
            # Log start time
            start_time = time.time()

            # Create sub-directory title
            sub_dir = '/K={},dim={},valid={}'.format(K, data_dim, has_valid)
            
            # Create summary writers
            train_writer = tf.summary.FileWriter(SUMMARY_DIR + sub_dir + '/train', graph=sess.graph)
            if has_valid:
                valid_writer = tf.summary.FileWriter(SUMMARY_DIR + sub_dir + '/valid')

            # Initialise all TensorFlow variables
            tf.global_variables_initializer().run()
            
            # Define iterator
            currIter = 0
            
            # Calculate training (and validation) loss, 
            # cluster centres and responsibility indices before any training
            err, summaries, clusters, indices = sess.run([loss, merged, mu, resp], feed_dict={X:input_data})
            train_loss = np.append(train_loss, err)
            train_writer.add_summary(summaries, currIter)
            
            if has_valid:
                err, summaries = sess.run([loss, merged], feed_dict={X:valid_data})
                valid_loss = np.append(valid_loss, err)
                valid_writer.add_summary(summaries, currIter)
            
            if data_dim == 2:
                cluster_centres = np.append(cluster_centres, clusters[np.newaxis, :, :], axis=0)
                resp_idx = np.append(resp_idx, indices, axis=1)
            
            # Begin training
            while currIter < MAX_ITER:                
                # Run optimizer and log metadata every 100th iteration
                if currIter % 10 == 9:
                    # Configure run options and metadata
                    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    
                    # Train graph whilst logging metadata
                    _, err, summaries = sess.run([optimizer, loss, merged], \
                                 feed_dict={X:input_data}, \
                                 options=run_options, \
                                 run_metadata=run_metadata)
                    
                    # Log training metadata and run summary
                    train_writer.add_run_metadata(run_metadata, 'step {}'.format(currIter + 1))
                    train_writer.add_summary(summaries, currIter + 1)
                    train_loss = np.append(train_loss, err)
                else:
                    # Train graph
                    _, err, summaries = sess.run([optimizer, loss, merged], feed_dict={X:input_data})

                    # Add training loss
                    train_writer.add_summary(summaries, currIter + 1)
                    train_loss = np.append(train_loss, err)

                # Log validation loss
                if has_valid:
                    err, summaries = sess.run([loss, merged], feed_dict={X:valid_data})
                    valid_loss = np.append(valid_loss, err)
                    valid_writer.add_summary(summaries, currIter)
                
                # Log responsibility indices and cluster centres every 10% of maximum iteration
                if ((float(currIter) + 1) * 100 / MAX_ITER) % 10 == 0:
                    clusters, indices = sess.run([mu, resp], feed_dict={X:input_data})
                    
                    cluster_centres = np.append(cluster_centres, clusters[np.newaxis, :, :], axis=0)
                    resp_idx = np.append(resp_idx, indices, axis=1)
                
                # Post training progress to user, every 100 iterations
                if currIter % 100 == 99:
                    if not has_valid:
                        print 'iter: {:3d}, train_loss: {:3.1f}'.format(currIter, train_loss[currIter])
                    else:
                        print 'iter: {:3d}, train_loss: {:3.1f}, valid_loss: {:3.1f}'\
                                .format(currIter + 1, train_loss[currIter], valid_loss[currIter])
                
                currIter += 1
            
            # End of while loop
            print 'Max iteration reached'
            train_writer.close()
            if has_valid:
                valid_writer.close()
            
            # Calculate composition of points belonging to each cluster
            composition = calculate_composition(K, resp_idx[:, -1])
            
            if not has_valid:
                results.append(
                    {
                        'K': K,
                        'train_loss': train_loss,
                        'cluster_centres': cluster_centres,
                        'responsibility_indices': resp_idx.astype(int),
                        'composition': composition,
                        'time_of_run': '{:%b%d %H_%M_%S}'.format(datetime.datetime.now())
                    }
                )
            else:
                results.append(
                {
                    'K': K,
                    'train_loss': train_loss,
                    'valid_loss': valid_loss,
                    'cluster_centres': cluster_centres,
                    'responsibility_indices': resp_idx.astype(int),
                    'composition': composition,
                    'time_of_run': '{:%b%d %H_%M_%S}'.format(datetime.datetime.now())
                }
            )
            
            # TODO calculate convergence
            if not has_valid:
                print 'K: {:3d}, train loss: {:3.1f}, duration: {:3.1f}s\n'\
                        .format(K, train_loss[-1], time.time() - start_time)
            else:
                print 'K: {:3d}, train loss: {:3.1f}, valid loss: {:3.1f}, duration: {:3.1f}s\n'\
                        .format(K, train_loss[-1], valid_loss[-1], time.time() - start_time)
                                                                              
    print 'RUN COMPLETED'
    return results

## Section 3.1.2: Run K-means without validation

In [9]:
results_1_1_3 = run_k_means(K_list=[1, 2, 3, 4, 5], data_dim=2)

iter:  99, train_loss: 38453.5
iter: 199, train_loss: 38453.5
iter: 299, train_loss: 38453.5
iter: 399, train_loss: 38453.5
iter: 499, train_loss: 38453.5
Max iteration reached
K:   1, train loss: 38453.5, duration: 7.1s

iter:  99, train_loss: 19253.0
iter: 199, train_loss: 9907.8
iter: 299, train_loss: 9216.2
iter: 399, train_loss: 9203.4
iter: 499, train_loss: 9203.4
Max iteration reached
K:   2, train loss: 9203.4, duration: 9.3s

iter:  99, train_loss: 10833.8
iter: 199, train_loss: 6051.7
iter: 299, train_loss: 5198.8
iter: 399, train_loss: 5112.0
iter: 499, train_loss: 5111.1
Max iteration reached
K:   3, train loss: 5111.1, duration: 9.6s

iter:  99, train_loss: 7783.0
iter: 199, train_loss: 3823.0
iter: 299, train_loss: 3392.3
iter: 399, train_loss: 3374.1
iter: 499, train_loss: 3374.0
Max iteration reached
K:   4, train loss: 3374.0, duration: 8.9s

iter:  99, train_loss: 7474.9
iter: 199, train_loss: 3380.9
iter: 299, train_loss: 2872.0
iter: 399, train_loss: 2848.4
iter: 49

### Save results

In [10]:
np.save('./Results/K-means/1_1_3.npy', results_1_1_3)

### Plot loss vs number of updates

In [11]:
def loss_IGraph(loss):
    # Define data to plot
    trace = go.Scatter(
        x = range(loss.shape[0]),
        y = loss
    )
    data = go.Data([trace])
    
    # Define layout
    layout = go.Layout(
        title = '$\\mathcal{L}({\\mathbf{\\mu}}) \\text{ vs. Number of Updates}$',
        xaxis = {'title': 'Updates'},
        yaxis = {'title': 'Loss'}
    )
    
    # Define figure
    figure = go.Figure(data=data, layout=layout)
    
    # Generate plot
    py.iplot(figure, filename='/ECE521: A3/Q1: K-means/Q1.2_k_means_loss', sharing='private')
    return pyo.iplot(figure)

# Generate loss function graph
figure = loss_IGraph(results_1_1_3[2]['train_loss'])

### Clustering visualisations

In [12]:
'''
Colour data points by clusters generated by K-means algorithm
Input:
    cluster_centres: coordinates of cluster centres (K x D)
    resp_idxes:      final responsibility indices for each run of K (N x num_subplots)
'''
def visualise_k_means_clusters(result):    
    # Store cluster centres and responsibility indices
    
    cluster_centres = result['cluster_centres'][-1,:,:]
    resp_idx = result['responsibility_indices'][:,-1]
    
    # Define colour list as per Plotly's default colour list
    colour_list = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']
    
    # Create traces for each cluster
    traces = []
    for k in range(np.amax(resp_idx)):
        # Create trace for data points in cluster k
        traces.append(go.Scatter(
            x = data2D[resp_idx == k + 1][:,0],
            y = data2D[resp_idx == k + 1][:,1],
            hoverinfo = 'none',
            mode = 'markers',
            marker = {
                'size': 4,
                'color': colour_list[k],
            }
        ))

        # Create trace for cluster centre k
        traces.append(go.Scatter(
            x = [cluster_centres[k][0]],
            y = [cluster_centres[k][1]],
            name = 'Cluster {}'.format(k + 1),
            mode = 'markers',
            marker = {
                'size': 15,
                'symbol': 'diamond',
                'color': '#000000'
            }
        ))

    # Add traces 
    traces = go.Data(traces)

    # Generate figure layout
    layout = go.Layout(
        height = 800,
        showlegend = False,
        title = 'Clusters Visualization (K = {})'.format(result['K']),
        xaxis = {'title': 'x'},
        yaxis = {'title': 'y'}
    )
    
    # Generate figure
    figure = go.Figure(data=traces, layout=layout)
    
    py.iplot(figure, filename='/ECE521: A3/Q1: K-means/Q1.3_cluster_viz_K={}'.format(result['K']), sharing='private')
    return pyo.iplot(figure)

In [13]:
'''
Creates animated plot of data points coloured by clusters

Input:
    result['cluster_centres']:        coordinates of cluster centres [11 x K x D]
    result['responsibility_indices']: responsibility indices for each run of K [N x 11]

Dimension of 11 represent 10% increments from (0%, 100%)
'''
def visualise_k_means_clusters_animated(result, title, filename):
    # Store cluster centres and responsibility indices
    cluster_centres = result['cluster_centres']#[-1,:,:]
    resp_idx = result['responsibility_indices']#[:,-1]
    K = result['K']
    
    # Define colour list as per Plotly's default colour list
    colour_list = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']
    
    # Define slider ticker labels
    slider_values = ['0%', '10%', '20%', '30%', '40%', '50%', '60%', '70%', '80%', '90%', '100%']
    
    ###
    ### Create Grid to store data for Plotly.v2 animated plot
    ###
    # Define column containing data
    columns = []
    for i in range(11):
        for k in range(K):
            # Create columns for data points
            columns.append(gro.Column(data2D[resp_idx[:,i] == k + 1][:,0], 'data_k={}_x_{}'.format(k + 1, i)))
            columns.append(gro.Column(data2D[resp_idx[:,i] == k + 1][:,1], 'data_k={}_y_{}'.format(k + 1, i)))
            
            # Create columns for cluster centres
            columns.append(gro.Column([cluster_centres[i][k][0]], 'cluster_k={}_x_{}'.format(k + 1, i)))
            columns.append(gro.Column([cluster_centres[i][k][1]], 'cluster_k={}_y_{}'.format(k + 1, i)))
    
    # Create grid from columns
    grid = gro.Grid(columns)
    
    # Push grid to cloud
    try:
        py.grid_ops.upload(grid, 'A3Q1.1.3_cluster_subplots_K={}'.format(K), auto_open=False)
    except:
        py.grid_ops.delete(grid)
        py.grid_ops.upload(grid, 'A3Q1.1.3_cluster_subplots_K={}'.format(K), auto_open=False)
    
    # Define barebone figure
    figure = {
        'data': [],
        'layout': {},
        'frames': [],
        'config': {'scrollzoom': True}
    }
    
    # Create layout
    figure['layout'] = {
        'width': 900,
        'height': 900,
        'xaxis': {'range': [-4, 4], 'autorange': False},
        'yaxis': {'range': [-5, 2], 'autorange': False},
        'title': 'Cluster Visualisation (K = {})'.format(K),
        'showlegends': False,
        'updatemenus': [{'type': 'buttons',
                          'direction': 'left', # Arrange placement of buttons
                          'pad': {'r': 10, 't': 87}, # Right and top padding
                          'showactive': False, # Removes highlight from active button
                          'x': 0.1, # Button positions 
                          'y': 0, # Button positions
                          'xanchor': 'right',
                          'yanchor': 'top',
                          'buttons': [
                              {
                                  'label': 'Play', # Button label
                                  'method': 'animate', # Method name
                                  'args': [None, 
                                           { # Args determines which frames to animate
                                               'frame': {'duration': 500, 'redraw': False},
                                               'fromcurrent': True,
                                               'transition': {'duration': 300, 'easing': 'quadratic-in-out'}
                                           }
                                          ]
                              },
                              {
                                  'label': 'Pause',
                                  'method': 'animate',
                                  'args': [[None], 
                                           { # '[None]' ensures proper 'pause' functionailty
                                               'frame': {'duration': 0, 'redraw': False},
                                               'mode': 'immediate',
                                               'transition': {'duration': 0}
                                           }
                                          ]
                              }
                          ]
                         }
                       ]
    }
    
    # Create frames and slider steps
    slider_steps = []
    for i in range(11):        
        # Create single frame variable
        frame = {'data': [], 'name': slider_values[i]} # Without name, slider will not interact with graph

        # Populate data for each cluster k
        for k in range(K):
            # Create trace for coloured data points
            data_trace = {
                'xsrc': grid.get_column_reference('data_k={}_x_{}'.format(k + 1, i)),
                'ysrc': grid.get_column_reference('data_k={}_y_{}'.format(k + 1, i)),
                'hoverinfo': 'none',
                'mode': 'markers',
                'marker': {
                    'size': 4,
                    'color': colour_list[k]
                }
            }
            
            # Create trace for cluster centres
            cluster_trace = {
                'xsrc': grid.get_column_reference('cluster_k={}_x_{}'.format(k + 1, i)),
                'ysrc': grid.get_column_reference('cluster_k={}_y_{}'.format(k + 1, i)),
                'name': 'Cluster {}'.format(k + 1),
                'mode': 'markers',
                'marker': {
                    'size': 15,
                    'symbol': 'diamond',
                    'color': '#000000'
                }
            }
            
            # Append first set of data_trace and cluster_trace to data
            if i == 0:
                figure['data'].append(data_trace)
                figure['data'].append(cluster_trace)
            
            # Append all data_trace and cluster_trace to frames
            frame['data'].append(data_trace)
            frame['data'].append(cluster_trace)
            
        # Append frame to frames variable
        figure['frames'].append(frame)
        
        # Define slider step
        slider_step = {
            'args': [
                [slider_values[i]],
                {'frame': {'duration': 300, 'redraw': False},
                 'mode': 'immediate',
                 'transition': {'duration': 300}}
            ],
            'label': slider_values[i],
            'method': 'animate'
        }
        
        # Append slider stes to slider dictionary
        slider_steps.append(slider_step)
        
    # Define slider dictionary
    slider_dict = {
        'active': 0, # Slider knob's relative starting location
        'pad': {'b': 10, 't': 50}, # Bottom and top padding
        'len': 0.9, # Slider length
        'x': 0.1, # Slider x-position
        'y': 0, # Slider y-position
        'yanchor': 'top', 
        'xanchor': 'left',
        'currentvalue': { # Displays current value selected by slider
            'font': {'size': 20},
            'prefix': 'Training: ',
            'visible': True,
            'xanchor': 'right'
        },
        'transition': {'duration': 300, 'easing': 'cubic-in-out'},
        'steps': slider_steps
    }
    
    # Add sliders to layout
    figure['layout']['sliders'] = [slider_dict]
    
    # Reinstate showlegend property in layout to be False
    figure['layout']['showlegend'] = False
    
    try:
        py.icreate_animations(figure, filename=filename)
    except:
        py.icreate_animations(figure)
        
#     py.grid_ops.delete(grid)

In [14]:
'''
Creates animated plot of data points coloured by clusters

Input:
    result['cluster_centres']:        coordinates of cluster centres [11 x K x D]
    result['responsibility_indices']: responsibility indices for each run of K [N x 11]

Dimension of 11 represent 10% increments from (0%, 100%)
'''
def visualise_k_means_clusters_animated_offline(result):
    # Store cluster centres and responsibility indices
    cluster_centres = result['cluster_centres']
    resp_idx = result['responsibility_indices']
    K = result['K']
    
    # Define colour list as per Plotly's default colour list
    colour_list = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']
    
    # Define slider ticker labels
    slider_values = ['0%', '10%', '20%', '30%', '40%', '50%', '60%', '70%', '80%', '90%', '100%']
    
    # Define barebone figure
    figure = {
        'data': [],
        'layout': {},
        'frames': []
    }
    
    # Create layout
    figure['layout'] = {
        'width': 900,
        'height': 900,
        'xaxis': {'range': [-4, 4], 'autorange': False},
        'yaxis': {'range': [-5, 2], 'autorange': False},
        'title': 'Cluster Visualisation (K = {})'.format(K),
        'showlegends': False,
        'updatemenus': [{'type': 'buttons',
                          'direction': 'left', # Arrange placement of buttons
                          'pad': {'r': 10, 't': 87}, # Right and top padding
                          'showactive': False, # Removes highlight from active button
                          'x': 0.1, # Button positions 
                          'y': 0, # Button positions
                          'xanchor': 'right',
                          'yanchor': 'top',
                          'buttons': [
                              {
                                  'label': 'Play', # Button label
                                  'method': 'animate', # Method name
                                  'args': [None, 
                                           { # Args determines which frames to animate
                                               'frame': {'duration': 500, 'redraw': False},
                                               'fromcurrent': True,
                                               'transition': {'duration': 300, 'easing': 'quadratic-in-out'}
                                           }
                                          ]
                              },
                              {
                                  'label': 'Pause',
                                  'method': 'animate',
                                  'args': [[None], 
                                           { # '[None]' ensures proper 'pause' functionailty
                                               'frame': {'duration': 0, 'redraw': False},
                                               'mode': 'immediate',
                                               'transition': {'duration': 0}
                                           }
                                          ]
                              }
                          ]
                         }
                       ]
    }
    
    # Create frames and slider steps
    slider_steps = []
    for i in range(11):        
        # Create single frame variable
        frame = {'data': [], 'name': slider_values[i]} # Without name, slider will not interact with graph

        # Populate data for each cluster k
        for k in range(K):
            # Create trace for coloured data points
            data_trace = {
                'x': data2D[resp_idx[:,i] == k + 1][:,0], 
                'y': data2D[resp_idx[:,i] == k + 1][:,1],
                'hoverinfo': 'none',
                'mode': 'markers',
                'marker': {
                    'size': 4,
                    'color': colour_list[k]
                }
            }
            
            # Create trace for cluster centres
            cluster_trace = {
                'x': [cluster_centres[i][k][0]],
                'y': [cluster_centres[i][k][1]],
                'name': 'Cluster {}'.format(k + 1),
                'mode': 'markers',
                'marker': {
                    'size': 15,
                    'symbol': 'diamond',
                    'color': '#000000'
                }
            }
            
            # Append first set of data_trace and cluster_trace to data
            if i == 0:
                figure['data'].append(data_trace)
                figure['data'].append(cluster_trace)
            
            # Append all data_trace and cluster_trace to frames
            frame['data'].append(data_trace)
            frame['data'].append(cluster_trace)
            
        # Append frame to frames variable
        figure['frames'].append(frame)
        
        # Define slider step
        slider_step = {
            'args': [
                [slider_values[i]],
                {'frame': {'duration': 300, 'redraw': False},
                 'mode': 'immediate',
                 'transition': {'duration': 300}}
            ],
            'label': slider_values[i],
            'method': 'animate'
        }
        
        # Append slider stes to slider dictionary
        slider_steps.append(slider_step)
        
    # Define slider dictionary
    slider_dict = {
        'active': 0, # Slider knob's relative starting location
        'pad': {'b': 10, 't': 50}, # Bottom and top padding
        'len': 0.9, # Slider length
        'x': 0.1, # Slider x-position
        'y': 0, # Slider y-position
        'yanchor': 'top', 
        'xanchor': 'left',
        'currentvalue': { # Displays current value selected by slider
            'font': {'size': 20},
            'prefix': 'Training: ',
            'visible': True,
            'xanchor': 'right'
        },
        'transition': {'duration': 300, 'easing': 'cubic-in-out'},
        'steps': slider_steps
    }
    
    # Add sliders to layout
    figure['layout']['sliders'] = [slider_dict]
    
    # Reinstate showlegend property in layout to be False
    figure['layout']['showlegend'] = False
    
    return pyo.plot(figure, filename='A3Q1.1.3_clusters_K={}.html'.format(K), auto_open=False)

### Visualise clusters

In [15]:
figures = []
for i, result in enumerate(results_1_1_3):
    figures.append(visualise_k_means_clusters(result))

### Create gifs

In [16]:
'''
Creates snapshots of animated plots, with data points coloured by clusters

Input:
    result['cluster_centres']:        coordinates of cluster centres [11 x K x D]
    result['responsibility_indices']: responsibility indices for each run of K [N x 11]

Dimension of 11 represent 10% increments from (0%, 100%)
'''
def generate_gif_images(result):
    # Store cluster centres and responsibility indices
    cluster_centres = result['cluster_centres']
    resp_idx = result['responsibility_indices']
    K = result['K']
    
    # Define colour list as per Plotly's default colour list
    colour_list = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']
    
    # Define slider ticker labels
    slider_values = ['0%', '10%', '20%', '30%', '40%', '50%', '60%', '70%', '80%', '90%', '100%']
    
    # Define barebone figure
    figure = {
        'data': [],
        'layout': {}
    }
    
    # Create layout
    figure['layout'] = {
        'width': 900,
        'height': 900,
        'xaxis': {'range': [-4, 4], 'autorange': False},
        'yaxis': {'range': [-5, 2], 'autorange': False},
        'title': 'Cluster Visualisation (K = {})'.format(K),
        'showlegend': False
    }
    
    # Define slider step
    slider_steps = []
    
    for i in range(11):
        slider_step = {
            'args': [
                [slider_values[i]],
                {'frame': {'duration': 300, 'redraw': False},
                 'mode': 'immediate',
                 'transition': {'duration': 300}}
            ],
            'label': slider_values[i],
            'method': 'animate'
        }

        # Append slider stes to slider dictionary
        slider_steps.append(slider_step)
    
    #####
    ### SNAPSHOTS
    #####
    
    # Create snapshots
    for i in range(11):        
        # Reset figure data
        figure['data'] = []
        
        # Populate data for each cluster k
        for k in range(K):
            # Create trace for coloured data points
            data_trace = {
                'x': data2D[resp_idx[:,i] == k + 1][:,0], 
                'y': data2D[resp_idx[:,i] == k + 1][:,1],
                'hoverinfo': 'none',
                'mode': 'markers',
                'marker': {
                    'size': 4,
                    'color': colour_list[k]
                }
            }
            
            # Create trace for cluster centres
            cluster_trace = {
                'x': [cluster_centres[i][k][0]],
                'y': [cluster_centres[i][k][1]],
                'name': 'Cluster {}'.format(k + 1),
                'mode': 'markers',
                'marker': {
                    'size': 15,
                    'symbol': 'diamond',
                    'color': '#000000'
                }
            }
            
            # Append first set of data_trace and cluster_trace to data
            figure['data'].append(data_trace)
            figure['data'].append(cluster_trace)
                    
        # Reinstate showlegend property in layout to be False
        figure['layout']['showlegend'] = False
        
        # Define slider dictionary
        slider_dict = {
            'active': i, # Slider knob's relative starting location
            'pad': {'b': 10, 't': 50}, # Bottom and top padding
            'len': 1, # Slider length
            'x': 0, # Slider x-position
            'y': 0, # Slider y-position
            'yanchor': 'top', 
            'xanchor': 'left',
            'currentvalue': { # Displays current value selected by slider
                'font': {'size': 20},
                'prefix': 'Training: ',
                'visible': True,
                'xanchor': 'right'
            },
            'transition': {'duration': 300, 'easing': 'cubic-in-out'},
            'steps': slider_steps
        }

        # Add sliders to layout
        figure['layout']['sliders'] = [slider_dict]
        
        # Save snapshots locally
        py.plotly.image.save_as(figure, filename='Q1.1.3_K={0}_gif_{1:02d}.png'.format(K, i), \
                                width=900, height=900, scale=1)

In [352]:
for i, result in enumerate(results_1_1_3[2:]):
    generate_gif_images(result)

## Section 3.1.3: Run K-means with validation

In [17]:
results_1_1_4 = run_k_means(K_list=[1, 2, 3, 4, 5], data_dim=2, has_valid=True)

iter: 100, train_loss: 25576.7, valid_loss: 12877.1
iter: 200, train_loss: 25576.6, valid_loss: 12877.4
iter: 300, train_loss: 25576.6, valid_loss: 12877.4
iter: 400, train_loss: 25576.6, valid_loss: 12877.4
iter: 500, train_loss: 25576.6, valid_loss: 12877.4
Max iteration reached
K:   1, train loss: 25576.6, valid loss: 12877.4, duration: 9.3s

iter: 100, train_loss: 13357.7, valid_loss: 6647.4
iter: 200, train_loss: 6567.5, valid_loss: 3363.8
iter: 300, train_loss: 6072.9, valid_loss: 3139.7
iter: 400, train_loss: 6063.8, valid_loss: 3139.9
iter: 500, train_loss: 6063.8, valid_loss: 3140.1
Max iteration reached
K:   2, train loss: 6063.8, valid loss: 3140.1, duration: 9.5s

iter: 100, train_loss: 7211.1, valid_loss: 3581.0
iter: 200, train_loss: 3997.4, valid_loss: 2061.2
iter: 300, train_loss: 3423.1, valid_loss: 1776.9
iter: 400, train_loss: 3369.5, valid_loss: 1745.6
iter: 500, train_loss: 3369.0, valid_loss: 1744.9
Max iteration reached
K:   3, train loss: 3369.0, valid loss: 174

### Save results

In [18]:
np.save('./Results/K-means/1_1_4.npy', results_1_1_4)

### Generate bar chart for cluster assignment $\%$

In [19]:
'''
Generate a bar chart for each model showing percentage of data points belong to each cluster
'''
def cluster_assignment_IGraph(results):
    # Define colour list as per Plotly's default colour list
    colour_list = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']
    
    # Define empty figure
    figure = {
        'data': [],
        'layout': {}
    }
    
    # Define data to plot
    for i, result in enumerate(results):
        for k in range(result['K']):
            trace = go.Bar(
                x = [i + 1],
                y = [result['composition'][k]],
                marker = {'color': colour_list[k]},
                name = 'Cluster {}'.format(k + 1)
            )
            figure['data'].append(trace)
    
    # Define layout
    figure['layout'] = {
        'title': 'Percentage of data points assigned to each cluster',
        'xaxis': {'title': 'Number of clusters, K'},
        'yaxis': {'title': 'Assignment to cluster, %'},
        'barmode': 'stack',
        'showlegend': False
    }
    
    # Generate plot
    py.iplot(figure, filename='/ECE521: A3/Q1: K-means/Q1.3_assignment_bar_chart', sharing='private')
    return pyo.iplot(figure)

# Generate loss function graph
figure = cluster_assignment_IGraph(results_1_1_3)

## Section 2.2.2.4: K-Means on $\textit{data100D.npy}$ with validation $(K = 1, \cdots, 15)$

In [21]:
results_2_2_2_4 = run_k_means(K_list=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], data_dim=100, device='cpu', has_valid=True)

iter: 100, train_loss: 778579.2, valid_loss: 392918.2
iter: 200, train_loss: 682138.1, valid_loss: 346432.5
iter: 300, train_loss: 665756.1, valid_loss: 338481.1
iter: 400, train_loss: 663328.9, valid_loss: 337185.3
iter: 500, train_loss: 663121.4, valid_loss: 337042.4
Max iteration reached
K:   1, train loss: 663121.3, valid loss: 337042.1, duration: 14.5s

iter: 100, train_loss: 633318.1, valid_loss: 318378.9
iter: 200, train_loss: 550968.4, valid_loss: 276971.7
iter: 300, train_loss: 538410.8, valid_loss: 270413.4
iter: 400, train_loss: 536625.8, valid_loss: 269427.4
iter: 500, train_loss: 536433.7, valid_loss: 269307.6
Max iteration reached
K:   2, train loss: 536433.5, valid loss: 269307.3, duration: 70.2s

iter: 100, train_loss: 572185.9, valid_loss: 283911.5
iter: 200, train_loss: 471422.8, valid_loss: 232081.2
iter: 300, train_loss: 453459.1, valid_loss: 222402.0
iter: 400, train_loss: 449594.5, valid_loss: 220222.5
iter: 500, train_loss: 448691.6, valid_loss: 219701.6
Max iter

### Save results

In [22]:
np.save('./Results/MoG/2_2_4_K-means.npy', results_2_2_2_4)