# Radial Basis Function Neural Network
## using Mixture of Gaussians

## Package initialisations, environment configuration and function definitions

Import relevant packages:

In [1]:
import tensorflow as tf
import numpy as np

import time
import datetime
import os

# TensorFlow embedding API library
from tensorflow.contrib.tensorboard.plugins import projector

# Non-interactive plotting
import matplotlib.pyplot as plt

# Interactive plotting
import plotly.graph_objs as go
import plotly.offline as pyo

Configure environment:

In [2]:
%config InlineBackend.figure_format = 'retina'
np.set_printoptions(precision=3)

# Global Variables
CURRENT_DIR = os.getcwd()
LOG_DIR = '/Logs'

# Activate Plotly Offline for Jupyter
pyo.init_notebook_mode(connected=True)

Load data2D.npy and data100D.npy into memory:

In [3]:
"""
source.npz contains 20 data points of dimension 2
"""
# Load data
source = np.load("./Data/RBFN/source_ex4.npz")

# Create data and target variables
data = source['Y']
target = source['X']

## Create Mixture of Gaussian (MoG) TensorFlow graph:

### Loss function:

$$ \mathcal{L}(\mathbf{\mu}, \mathbf{\sigma}, \mathbf{\pi}) = \prod_{i=1}^B \sum_{k=1}^K \pi_k \mathcal{N} (\mathbf{x}_n \ | \ \mathbf{\mu}_k, \Sigma_k) $$

In [4]:
def build_MoG(K, D, device='cpu'):
    '''
    Builds TensorFlow graph for MoG

    Input:
        K: number of clusters
        D: dimension of data (only 2 or 100 allowed)
    Internal variables:
        X: input data matrix (N x D)
        Mu: cluster centres (K x D)
        sigma_sq: cluster variance (K x D x D)
        log_pi: log of latent cluster variables (K x 1)
    '''
    
    def calc_log_gaussian_cluster_k(X, Mu, Sigma):
        '''
        Calculate log probability density function for all pairs of B data points and K clusters

        Assumptions:
            Dimensions are independent and have the same standard deviation, sigma
        Output:
            log PDF function (N x K)
        '''
        with tf.name_scope('log_gaussian_cluster'):
            # Infer dimension of data
            N = tf.shape(X)[0]
            D = tf.shape(X)[1]
            K = tf.shape(Mu)[0]

            # Calculate Mahalanobis distance
            ### Expand dim(X) to (N x 1 x D)
            ### Expand dim(Mu) to (1 x K x D)
            ### Reduce sum along the D-axis
            with tf.name_scope('mahalanobis_dist'):
                # Calculate the distance of data points from means (N x K x 1 x D)
                dist_from_mean = tf.subtract(tf.reshape(X, [N, 1, 1, D], name='reshaped_X'), \
                                              tf.reshape(Mu, [1, K, 1, D], name='reshaped_Mu'), \
                                              name='dist_from_mean')
                
                # Reshape Sigma for matmul operation
                Sigma_reshaped = tf.tile(tf.reshape(Sigma, [1, K, D, D]), [N, 1, 1, 1], name='Sigma_reshaped')
                
                # Calculate Mahalanobis distance
                dist = tf.reshape(- 0.5 * tf.matmul(dist_from_mean, \
                                                   tf.matmul(tf.matrix_inverse(Sigma_reshaped), \
                                                             tf.transpose(dist_from_mean, perm=[0, 1, 3, 2]))), \
                                  shape=[N, K], \
                                  name = 'mahalanobis_dist')
                
            # Calculate log of gaussian constant term
            ### Transpose sigma_sq to (1 x K)
            with tf.name_scope('log_gauss_const'):
                log_gauss_const = tf.negative(tf.cast(D, tf.float32) / 2 * tf.log(2 * np.pi) \
                                              + 0.5 * tf.log(tf.matrix_determinant(Sigma)),\
                                                name='log_gauss_const')

            # Sum results
            log_gaussian_cluster = tf.add(dist, log_gauss_const, name='log_gauss_cluster')

        return log_gaussian_cluster
    
    def calc_log_conditional_responsibilities(X, Mu, Sigma, log_pi):
        '''
        Calculate log probability cluster variable z given x, a.k.a. conditional responsibilities, gamma

        Output:
            conditional responsibilities (N x K)
        '''
        with tf.name_scope('log_conditional_responsibilities'):
            # Calculate unnormalised_log_posterior P(z|x)
            with tf.name_scope('unnormalised_log_posterior'):
                unnormalised_log_posterior = calc_log_gaussian_cluster_k(X, Mu, Sigma) + tf.transpose(log_pi)

            # Return log normalised posterior / conditional responsibilities
            with tf.name_scope('log_gamma_z'):
                cond_resp = tf.add(- tf.reduce_logsumexp(unnormalised_log_posterior, axis=1, keep_dims=True),\
                                   unnormalised_log_posterior, \
                                   name='log_gamma_z')
        return cond_resp
    
    def calc_neg_log_marg_prob(X, Mu, Sigma, log_pi):
        '''
        Calculates the negative log marginal probability, -log P(X), aka the loss function for MoG

        Output:
            - log P(X) (scalar)
        '''
        with tf.name_scope('loss'):
            loss = tf.negative(tf.reduce_sum(tf.reduce_logsumexp(calc_log_gaussian_cluster_k(X, Mu, Sigma) \
                                                                 + tf.transpose(log_pi), axis=1),\
                                             axis=0), name='-log_P_X')
        return loss
    
    def _add_histogram(vars_):
        '''
        Helper function to add histogram tag to variables
        Input:
            var: variable to be tagged with histogram summary
        '''
        for var in vars_:
            tf.summary.histogram(var.op.name, var)
    
    #######################
    ##  Function begins  ##
    #######################
    
    # Define computation device
    try:
        assert device == 'cpu' or device == 'gpu'
    except AssertionError:
        print 'Invalid device chosen. Please use \'cpu\' or \'gpu\''
        quit()
    device = '/' + device + ':0'
    
    with tf.device('/cpu:0'):
        # Define placeholder
        with tf.name_scope('placeholder'):
            X = tf.placeholder(tf.float32, shape=[None, D], name='inputs')
            
        # Define parameters
        with tf.variable_scope('parameters'):
            Mu = tf.get_variable('cluster_centres', shape=[K, D], \
                                initializer=tf.truncated_normal_initializer())
            Square = tf.get_variable('random_square_mat', shape=[K, D, D], \
                                    initializer=tf.truncated_normal_initializer())
            Sigma = tf.matmul(Square, tf.transpose(Square, perm=[0, 2, 1]))
            psi = tf.get_variable('latent_for_pi', shape=[K, 1], \
                                 initializer=tf.truncated_normal_initializer())
            
            with tf.name_scope('log_pi'):
                log_pi = tf.transpose(tf.nn.log_softmax(tf.transpose(psi)), name='log_pi') 
        
    with tf.device(device):
        # Calculate conditional responsibilities
        log_resp = calc_log_conditional_responsibilities(X, Mu, Sigma, log_pi)
        
        # Calculate loss
        loss = calc_neg_log_marg_prob(X, Mu, Sigma, log_pi)
        tf.summary.scalar('loss', loss)
        
        # Define optimizer
        optimizer = tf.train.AdamOptimizer(learning_rate=0.01, \
                                           beta1=0.9, beta2=0.99, epsilon=1e-5).minimize(loss)
        
    with tf.device('/cpu:0'):
        # Add histogram summaries for variables of interest
        _add_histogram([Mu, Sigma, psi, log_pi, log_resp])
        
        # Merge all summaries
        merged = tf.summary.merge_all()
        
    return X, Mu, Sigma, log_pi, log_resp, loss, optimizer, merged

### Define training function:

In [5]:
def run_MoG(K_list, data, D=2, has_valid=False, device='cpu'):
    '''
    Runs MoG training algorithm more efficiently by not saving loss values.
        Tensorboard embedding enabled
    '''
    
    def subset_data(D, data):
        '''
        If has_valid is True, subsets:
            first 2/3 of data as training data
            remaining 1/3 of data as validation data
        '''
        if D == 2:
            temp = data
        divider = temp.shape[0] * 2 / 3
        return temp[:divider], temp[divider:]   
    
    def embed_data(D, train_writer):
        '''
        Embed data for visualization purposes
        '''
        # Define input data
        input_data = data
        input_data_name = 'data{}D.npy'.format(D)
        
        # Create variable to embed
        data_to_embed = tf.Variable(input_data, name=input_data_name, trainable=False, collections=[])

        # Define projector configurations
        config = projector.ProjectorConfig()
        
        # Add embedding
        embedding = config.embeddings.add()
        
        # Connect tf.Variable to embedding
        embedding.tensor_name = data_to_embed.name

        # Evaluate tf.Variable
        sess.run(data_to_embed.initializer)
        
        # Create save checkpoint
        saver = tf.train.Saver([data_to_embed])
        saver.save(sess, SUMMARY_DIR + sub_dir + '/train/model.ckpt', MAX_ITER)

        # Write projector_config.pbtxt in LOG_DIR
        projector.visualize_embeddings(train_writer, config)
    
    #######################
    ##  Function begins  ##
    #######################
    '''
    cluster_centres: 11 x K x D
    train_resp:        11 x N x K
    '''
    
    # Assert correct value for D
    assert D == 2 or D == 100
    
    # Define locally global function
    MAX_ITER = 1500
    CURR_TIME = '{:%b%d %H_%M_%S}'.format(datetime.datetime.now())
    SUMMARY_DIR = CURRENT_DIR + LOG_DIR + '/MoG/' + CURR_TIME
    
    # Create list to store run results
    results = []
    
    for K in K_list:
        # Clear any pre-defined graph
        tf.reset_default_graph()
        
        # Build TensorFlow graph
        X, Mu, Sigma, log_pi, log_resp, loss, optimizer, merged = build_MoG(K, D, device)
        
        # Select appropriate input_data
        if has_valid:
            input_data, valid_data = subset_data(D)
        else:
            input_data = data

        # Create arrays to log cluster_centres, cluster_variances, pi's, and responsbility indices
        train_loss = np.array([])[:, np.newaxis]
        if has_valid:
            valid_loss = np.array([])[:, np.newaxis]
            valid_resp = np.array([]).reshape(0, valid_data.shape[0], K)
        cluster_centres = np.array([]).reshape(0, K, D)
        cluster_variances = np.array([])[:, np.newaxis].reshape(0, K, D, D)
        cluster_pi = np.array([])[:, np.newaxis].reshape(0, K)
        train_resp = np.array([]).reshape(0, input_data.shape[0], K)
        
        # Begin session
        with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)) as sess:
            # Log start time
            start_time = time.time()

            # Create sub-directory title
            sub_dir = '/K={},D={},valid={}'.format(K, D, has_valid)
            
            # Create summary writers
            train_writer = tf.summary.FileWriter(SUMMARY_DIR + sub_dir + '/train', graph=sess.graph)
            if has_valid:
                valid_writer = tf.summary.FileWriter(SUMMARY_DIR + sub_dir + '/valid')
                
            # Initialise all TensorFlow variables
            tf.global_variables_initializer().run()
            
            # Define iterator
            curr_iter = 0
            
            # Calculate training (and validation) loss, 
            # cluster centres and responsibility indices before any training
            err, summaries, clusters, variances, log_prior_pi, log_train_indices = \
                sess.run([loss, merged, Mu, Sigma, log_pi, log_resp], feed_dict={X:input_data})
            train_loss = np.append(train_loss, err)
            train_writer.add_summary(summaries, curr_iter)
            
            # Log clusters and responsibility indices
            cluster_centres = np.append(cluster_centres, clusters[np.newaxis,:,:], axis=0)
            cluster_variances = np.append(cluster_variances, variances[np.newaxis,:,:,:], axis=0)
            cluster_pi = np.append(cluster_pi, np.exp(np.transpose(log_prior_pi)), axis=0)

            train_resp = np.append(train_resp, np.exp(log_train_indices)[np.newaxis,:,:], axis=0)
            
            # Log validation data
            if has_valid:
                err, log_valid_indices, summaries  = sess.run([loss, log_resp, merged], feed_dict={X:valid_data})
                
                valid_loss = np.append(valid_loss, err)
                valid_resp = np.append(valid_resp, np.exp(log_valid_indices)[np.newaxis, :, :], axis=0)
                valid_writer.add_summary(summaries, curr_iter)
            
            # Begin training
            while curr_iter < MAX_ITER:                
                # Train graph
                _, summaries, err = sess.run([optimizer, merged, loss], feed_dict={X:input_data})
                
                # Add training loss
                train_loss = np.append(train_loss, err)
                train_writer.add_summary(summaries, curr_iter + 1)

                # Log validation loss
                if has_valid:
                    summaries, err = sess.run([merged, loss], feed_dict={X:valid_data})
                    
                    valid_loss = np.append(valid_loss, err)
                    valid_writer.add_summary(summaries, curr_iter)
                
                # Log responsibility indices and cluster centres every 10% of maximum iteration
                if ((float(curr_iter) + 1) * 100 / MAX_ITER) % 10 == 0:
                    clusters, variances, log_prior_pi, log_train_indices = \
                        sess.run([Mu, Sigma, log_pi, log_resp], feed_dict={X:input_data})
                    
                    cluster_centres = np.append(cluster_centres, clusters[np.newaxis,:,:], axis=0)
                    cluster_variances = np.append(cluster_variances, variances[np.newaxis,:,:,:], axis=0)
                    cluster_pi = np.append(cluster_pi, np.exp(np.transpose(log_prior_pi)), axis=0)
                    
                    train_resp = np.append(train_resp, np.exp(log_train_indices)[np.newaxis,:,:], axis=0)
                    
                    if has_valid:
                        log_valid_indices = sess.run(log_resp, feed_dict={X:valid_data})
                        valid_resp = np.append(valid_resp, np.exp(log_valid_indices)[np.newaxis, :, :], axis=0)
                
                # Post training progress to user, every 100 iterations
                if curr_iter % 100 == 99:
                    print 'iter: {:3d}'.format(curr_iter + 1)
                
                curr_iter += 1
            
            # End of while loop
            print 'Max iteration reached'
            
            # Embed data
            # embed_data(D, train_writer)
            
            # Close writers
            train_writer.close()
            if has_valid:
                valid_writer.close()
            
            if not has_valid:
                results.append(
                    {
                        'K': K,
                        'train_loss': train_loss,
                        'cluster_centres': cluster_centres,
                        'cluster_variances': cluster_variances,
                        'cluster_pi': cluster_pi,
                        'train_resp': train_resp,
                        'time_of_run': '{:%b%d %H_%M_%S}'.format(datetime.datetime.now())
                    }
                )
            else:
                results.append(
                {
                    'K': K,
                    'train_loss': train_loss,
                    'valid_loss': valid_loss,
                    'cluster_centres': cluster_centres,
                    'cluster_variances': cluster_variances,
                    'cluster_pi': cluster_pi,
                    'train_resp': train_resp,
                    'valid_resp': valid_resp,
                    'time_of_run': '{:%b%d %H_%M_%S}'.format(datetime.datetime.now())
                }
            )
            
            # TODO calculate convergence
            print 'K: {:3d}, duration: {:3.1f}s\n'.format(K, time.time() - start_time)
                                                                              
    print 'RUN COMPLETED'
    return results

## Section 3.2.2.2: MoG on $\textit{data2D.npy}$ without validation $(K = 3)$

In [14]:
result = run_MoG(K_list=[2], data=data, D=2, has_valid=False)
%store result

iter: 100
iter: 200
iter: 300
iter: 400
iter: 500
iter: 600
iter: 700
iter: 800
iter: 900
iter: 1000
iter: 1100
iter: 1200
iter: 1300
iter: 1400
iter: 1500
Max iteration reached
K:   2, duration: 10.3s

RUN COMPLETED
Stored 'result' (list)


In [13]:
result = run_MoG(K_list=[2], data=target, D=2, has_valid=False)
%store result

iter: 100
iter: 200
iter: 300
iter: 400
iter: 500
iter: 600
iter: 700
iter: 800
iter: 900
iter: 1000
iter: 1100
iter: 1200
iter: 1300
iter: 1400
iter: 1500
Max iteration reached
K:   2, duration: 10.5s

RUN COMPLETED
Stored 'result' (list)


### Plot loss vs number of updates

In [33]:
def loss_IGraph(loss):
    # Define data to plot
    trace = go.Scatter(
        x = range(loss.shape[0]),
        y = loss
    )
    data = go.Data([trace])
    
    # Define layout
    layout = go.Layout(
        title = '$-\\log P(\mathbf{X}) \\text{ vs. Number of Updates}$',
        xaxis = {'title': 'Updates'},
        yaxis = {'title': 'Loss'}
    )
    
    # Define figure
    figure = go.Figure(data=data, layout=layout)
    
    # Generate plot
    # py.iplot(figure, filename='/ECE521: A3/Q2: Mixture of Gaussians/Q2.2_MoG_loss', sharing='private')
    return pyo.iplot(figure)

# Generate loss function graph
figure = loss_IGraph(result[0]['train_loss'])

### Visualising clusters

In [38]:
def visualise_MoG_clusters(result, data):
    '''
    Final result by colouring data points by clusters generated by Mixture of Gaussian algorithm
    Input:
        result:           MoG training result with validation
    Notes:
        cluster_centres:  coordinates of cluster centres (K x D)
        cluter_variances: cluster variances (K)
        train_resp:       training responsibility indices for each run of K ((N*2/3) x K)
        valid_resp:       validation responsibility indices for each run of K ((N/3) x K)
    '''
    
    def _hex_to_rgb(colour_list):
        '''
        Convert hex values of type string to RGB of type int
        Input:
            colour_list: numpy array of type string (numColour x 1)
        Output:
            RGB: RGB component of type int (numColour x 3)
        '''
        RGB = np.array([])[np.newaxis,:].reshape(0,3)
        # Split hex values into R, G, B components
        # Convert components to int and store in RGB array
        for colour in colour_list:
            RGB = np.append(RGB, np.array([int(colour[1:3], 16), \
                                           int(colour[3:5], 16), \
                                           int(colour[5:7], 16)]).reshape(1, 3), axis=0)
        return RGB

    def _rgb_to_hex(RGB):
        '''
        Convert RGB of type int to hex string of format '#xxxxxx'
        Input:
            RGB: RGB component of type int (N x 3)
        Output:
            hex_colours: (N x 1)
        '''
        hex_colours = np.array([])
        # Convert RGB ints to a single hex string
        RGB = RGB.astype(int)
        for colour in RGB:
            hex_colours = np.append(hex_colours, '#{:02X}{:02X}{:02X}'.format(colour[0], colour[1], colour[2]))
        return hex_colours
   
    def get_colour_gradient(resp):
        '''
        Return the 'average' colour based on Plotly's default colour list and responsibility index
        Input:
            idx: responsibility index (N x K)
        Output:
            average_colour (N x 1)
        '''
        # Assert error if there are more colours than available colours
        N = resp.shape[0]
        K = resp.shape[1]
        try:
            assert K <= colour_list.shape
        except AssertionError:
            print 'Not enough colours to colour all K clusters. Consider increasing number of colours in colour_list.'

        # Matrix multiply resp (N x K) and RGB-ed colour_list (K x 3) to obtain 'average' colour
        # Multiply max resp to whiten less certain data points
        # assigned_colour = np.matmul(resp, _hex_to_rgb(colour_list[:K]))
        assigned_colour = np.matmul(np.eye(K, dtype='int')[np.argmax(resp, axis=1)], _hex_to_rgb(colour_list[:K]))
        white_layer = np.repeat(255, N * 3).reshape(N, 3)
        
        # Append white_layer to assigned_colour on axis=2
        # pre_whitened (N x K x 2)
        pre_whitened = np.append(assigned_colour[:,:,np.newaxis], white_layer[:,:,np.newaxis], axis=2)

        # Create weights (N x 2)
        # Second layer takes the converse of the maximum responsibility (N x 1)
        weights = np.append(np.ones(N)[:,np.newaxis], 1 - np.amax(resp, axis=1)[:, np.newaxis], axis=1)

        # Conform shape of weights to shape of pre_whitened
        weights = np.transpose(np.tile(weights, (3, 1, 1)), (1, 0, 2))

        # Perform weighted-average to colours
        whitened_colour = np.average(pre_whitened, weights=weights, axis=2)

        # Return matrix of colour in hex form
        return _rgb_to_hex(whitened_colour)

    def calc_ellipse_coordinates(centres, variances):
        '''
        Create x- and y-coordinates for ellipses for each cluster
        Assumptions:
            Dimension of data point is 2
        Returns:
            ellipse: x- and y-coordinates for K ellipses (N x K x D)
        '''
        # Create trace for region to encompass 95% of the points (using Chi-squared critical value)
        # Assuming joint independence and equal marginal variances
        
        # Chi-squared with df 2 and alpha=5%
        crit_val = 5.991
        
        # Obtain eigenvalues (K x D) and eigenvectors (K x D x D) for ellipses
        evalues, evectors = np.linalg.eig(variances)
        
        # Sort evalues and evectors in descending order of eigenvalue
        for k in range(len(evalues)):
            idx = np.argsort(evalues[k])[::-1]
            evalues[k] = evalues[k, idx]
            evectors[k] = evectors[k, :, idx]
        
        # Calculate axes length
        axis_lengths = np.sqrt(evalues * crit_val)
        
        # Calculate coordinates to trace ellipse
        t = np.arange(-np.pi, np.pi + np.pi / 50, np.pi / 50) # Parameter
        x = axis_lengths[:,0] * np.cos(t)[:, np.newaxis]
        y = axis_lengths[:,1] * np.sin(t)[:, np.newaxis]
        
        # Stack x- and y-coordinates along axis=2
        ellipse = np.stack([x, y], axis=2)
        
        # Define rotation angle (K x 1) and rotation matrix (K x 2 x 2)
        angles = np.arctan(evectors[:, 0, 1] / evectors[:, 0, 0])[:, np.newaxis]
        
        rot_mat = np.append(np.append(np.cos(angles), -np.sin(angles), axis=1)[:,:,np.newaxis], \
                            np.append(np.sin(angles), np.cos(angles), axis=1)[:,:,np.newaxis], axis=2)
        
        # Rotate ellipse
        ellipse = np.squeeze(np.matmul(np.expand_dims(ellipse, axis=2), np.expand_dims(rot_mat, axis=0)), axis=2)
        
        # Add centres to ellipse
        ellipse += centres[np.newaxis,:,:]
        
        return ellipse
    
    #######################
    ##  Function begins  ##
    #######################
    
    # Define K and divider between training and validation data
    K = result['K']
    divider = data.shape[0] * 2 / 3 # Anything before K is part of the training data. Anything after is part of validation data
    
    # Store cluster parameters and responsibility indices
    centres = result['cluster_centres'][-1]
    variances = result['cluster_variances'][-1]
    train_resp = result['train_resp'][-1]
    # valid_resp = result['valid_resp'][-1]
    
    # Create ellipse coordinates
    ellipse = calc_ellipse_coordinates(centres, variances)
    
    # Define colour list as per Plotly's default colour list
    colour_list = np.array(['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b'])
    
    # Define blank figure
    figure = {
        'data': [],
        'layout': {}
    }
    
    # Create trace for training data points
    # Create trace for validation data points
    train_data_trace = {
        'x': data[:,0],
        'y': data[:,1],
        'mode': 'markers',
        'marker': {
            'size': 4,
            'color': colour_list[np.argmax(train_resp, axis=1)] #get_colour_gradient(valid_resp)
        }
    }
    
    # Append data traces
    figure['data'].append(train_data_trace)
    
    for k in range(K):
        # Create trace for cluster centres
        centre_trace = {
            'x': np.round([centres[k][0]], 3),
            'y': np.round([centres[k][1]], 3),
            'name': 'Cluster {}'.format(k + 1),
            'mode': 'markers',
            'marker': {
                    'size': 12,
                    'symbol': 'diamond',
                    'color': colour_list[k],
                    'line': {'width': 3}
                }   
        }

        # Create trace for region encompassing 95% of data points
        variance_trace = {
            'x': ellipse[:,k,:][:,0],
            'y': ellipse[:,k,:][:,1],
            'hoverinfo': 'none',
            'mode': 'lines',
            'name': 'Cluster {}'.format(k + 1),
            'marker': {
                'color': colour_list[k]
            }
        }
        
        # Add cluster trace
        for trace in [centre_trace, variance_trace]:
            figure['data'].append(trace)

    # Generate figure layout
    figure['layout'] = go.Layout(
        width = 900,
        height = 900,
        showlegend = False,
        title = 'MoG Clustering Visualisation (K = {})'.format(K),
        xaxis = {'range': [-4, 4], 'autorange': True},
        yaxis = {'range': [-5, 2], 'autorange': True}
    )
    
    return pyo.iplot(figure)

### Create GIFs

In [35]:
def generate_gif_images(result):
    '''
    Creates snapshots of animated plots, with data points coloured by clusters
    Input:
        result:           MoG training result with validation
    Notes:
        cluster_centres:  coordinates of cluster centres (11 x K x D)
        cluter_variances: cluster variances (11 x K)
        train_resp:       training responsibility indices for each run of K (11 x (N*2/3) x K)
        valid_resp:       validation responsibility indices for each run of K (11 x (N/3) x K)
    '''
    
    def _hex_to_rgb(colour_list):
        '''
        Convert hex values of type string to RGB of type int
        Input:
            colour_list: numpy array of type string (numColour x 1)
        Output:
            RGB: RGB component of type int (numColour x 3)
        '''
        RGB = np.array([])[np.newaxis,:].reshape(0,3)
        # Split hex values into R, G, B components
        # Convert components to int and store in RGB array
        for colour in colour_list:
            RGB = np.append(RGB, np.array([int(colour[1:3], 16), \
                                           int(colour[3:5], 16), \
                                           int(colour[5:7], 16)]).reshape(1, 3), axis=0)
        return RGB

    def _rgb_to_hex(RGB):
        '''
        Convert RGB of type int to hex string of format '#xxxxxx'
        Input:
            RGB: RGB component of type int (N x 3)
        Output:
            hex_colours: (N x 1)
        '''
        hex_colours = np.array([])
        # Convert RGB ints to a single hex string
        RGB = RGB.astype(int)
        for colour in RGB:
            hex_colours = np.append(hex_colours, '#{:02X}{:02X}{:02X}'.format(colour[0], colour[1], colour[2]))
        return hex_colours
    
    def get_colour_gradient(resp):
        '''
        Return the 'average' colour based on Plotly's default colour list and responsibility index
        Input:
            idx: responsibility index (N x K)
        Output:
            average_colour (N x 1)
        '''
        # Assert error if there are more colours than available colours
        N = resp.shape[0]
        K = resp.shape[1]
        try:
            assert K <= colour_list.shape
        except AssertionError:
            print 'Not enough colours to colour all K clusters. Consider increasing number of colours in colour_list.'

        # Matrix multiply resp (N x K) and RGB-ed colour_list (K x 3) to obtain 'average' colour
        # Multiply max resp to whiten less certain data points
        # assigned_colour = np.matmul(resp, _hex_to_rgb(colour_list[:K]))
        assigned_colour = np.matmul(np.eye(K, dtype='int')[np.argmax(resp, axis=1)], _hex_to_rgb(colour_list[:K]))
        white_layer = np.repeat(255, N * 3).reshape(N, 3)

        # Append white_layer to assigned_colour on axis=2
        # pre_whitened (N x K x 2)
        pre_whitened = np.append(assigned_colour[:,:,np.newaxis], white_layer[:,:,np.newaxis], axis=2)

        # Create weights (N x 2)
        # Second layer takes the converse of the maximum responsibility (N x 1)
        weights = np.append(np.ones(N)[:,np.newaxis], 1 - np.amax(resp, axis=1)[:, np.newaxis], axis=1)

        # Conform shape of weights to shape of pre_whitened
        weights = np.transpose(np.tile(weights, (3, 1, 1)), (1, 0, 2))

        # Perform weighted-average to colours
        whitened_colour = np.average(pre_whitened, weights=weights, axis=2)

        # Return matrix of colour in hex form
        return _rgb_to_hex(whitened_colour)
    
    def calc_ellipse_coordinates(centres, variances):
        '''
        Create x- and y-coordinates for ellipses for each cluster
        Assummptions:
            Joint independence and equal marginal variances
            Dimension of data point is 2
        Returns:
            ellipse: x- and y-coordinates for K ellipses (N x K x D)
        '''
        # Create trace for region to encompass 95% of the points (using Chi-squared critical value)
        # Assuming joint independence and equal marginal variances
        
        # Chi-squared with df 2 and alpha=5%
        crit_val = 5.991
        
        # Obtain eigenvalues (K x D) and eigenvectors (K x D x D) for ellipses
        evalues, evectors = np.linalg.eig(variances)
        
        # Sort evalues and evectors in descending order of eigenvalue
        for k in range(len(evalues)):
            idx = np.argsort(evalues[k])[::-1]
            evalues[k] = evalues[k, idx]
            evectors[k] = evectors[k, :, idx]
        
        # Calculate axes length
        axis_lengths = np.sqrt(evalues * crit_val)
        
        # Calculate coordinates to trace ellipse
        t = np.arange(-np.pi, np.pi + np.pi / 50, np.pi / 50) # Parameter
        x = axis_lengths[:,0] * np.cos(t)[:, np.newaxis]
        y = axis_lengths[:,1] * np.sin(t)[:, np.newaxis]
        
        # Stack x- and y-coordinates along axis=2
        ellipse = np.stack([x, y], axis=2)
        
        # Define rotation angle (K x 1) and rotation matrix (K x 2 x 2)
        angles = np.arctan(evectors[:, 0, 1] / evectors[:, 0, 0])[:, np.newaxis]
        
        rot_mat = np.append(np.append(np.cos(angles), -np.sin(angles), axis=1)[:,:,np.newaxis], \
                            np.append(np.sin(angles), np.cos(angles), axis=1)[:,:,np.newaxis], axis=2)
        
        # Rotate ellipse
        ellipse = np.squeeze(np.matmul(np.expand_dims(ellipse, axis=2), np.expand_dims(rot_mat, axis=0)), axis=2)
        
        # Add centres to ellipse
        ellipse += centres[np.newaxis,:,:]
        
        return ellipse
    
    #######################
    ##  Function begins  ##
    #######################
    
    # Define K and divider between training and validation data
    K = result['K']
    divider = data.shape[0] * 2 / 3 # Anything before K is part of the training data. Anything after is part of validation data
    
    # Store cluster parameters and responsibility indices
    centres = result['cluster_centres']
    variances = result['cluster_variances']
    train_resp = result['train_resp']
    valid_resp = result['valid_resp']
    
    # Define colour list as per Plotly's default colour list
    colour_list = np.array(['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b'])
    
    # Define blank figure
    figure = {
        'data': [],
        'layout': {}
    }
    
    # Create layout
    figure['layout'] = {
        'width': 900,
        'height': 900,
        'xaxis': {'range': [-4, 4], 'autorange': False},
        'yaxis': {'range': [-5, 2], 'autorange': False},
        'title': 'MoG Clutering Visualisation (K = {})'.format(K),
        'showlegend': False
    }
    
    # Define slider step
    slider_steps = []
    
    # Define slider ticker labels
    slider_values = ['0%', '10%', '20%', '30%', '40%', '50%', '60%', '70%', '80%', '90%', '100%']
    
    for i in range(11):
        slider_step = {
            'args': [
                [slider_values[i]],
                {'frame': {'duration': 300, 'redraw': False},
                 'mode': 'immediate',
                 'transition': {'duration': 300}}
            ],
            'label': slider_values[i],
            'method': 'animate'
        }

        # Append slider stes to slider dictionary
        slider_steps.append(slider_step)
    
    # Create snapshots
    for i in range(11):
        # Clears figure data for new snapshot
        figure['data'] = []

        # Create trace for validation data points
        valid_data_trace = {
            'x': data[divider:][:,0],
            'y': data[divider:][:,1],
            'mode': 'markers',
            'hoverinfo': 'none',
            'marker': {
                'size': 4,
                'color': get_colour_gradient(valid_resp[i])
            }
        }

        # Append data traces
        figure['data'].append(valid_data_trace)

        for k in range(K):
            # Create trace for cluster centres
            centre_trace = {
                'x': np.round([centres[i][k][0]], 3),
                'y': np.round([centres[i][k][1]], 3),
                'name': 'Cluster {}'.format(k + 1),
                'mode': 'markers',
                'marker': {
                        'size': 12,
                        'symbol': 'diamond',
                        'color': colour_list[k],
                        'line': {'width': 3}
                    }   
            }

            # Create ellipse coordinates
            ellipse = calc_ellipse_coordinates(centres[i], variances[i])
    
            # Create trace for region encompassing 95% of data points
            variance_trace = {
                'x': ellipse[:,k,:][:,0],
                'y': ellipse[:,k,:][:,1],
                'hoverinfo': 'none',
                'mode': 'lines',
                'name': 'Cluster {}'.format(k + 1),
                'marker': {
                    'color': colour_list[k]
                }
            }

            # Add cluster trace
            for trace in [centre_trace, variance_trace]:
                figure['data'].append(trace)

        # Define slider dictionary
        slider_dict = {
            'active': i, # Slider knob's relative starting location
            'pad': {'b': 10, 't': 50}, # Bottom and top padding
            'len': 1, # Slider length
            'x': 0, # Slider x-position
            'y': 0, # Slider y-position
            'yanchor': 'top', 
            'xanchor': 'left',
            'currentvalue': { # Displays current value selected by slider
                'font': {'size': 20},
                'prefix': 'Training: ',
                'visible': True,
                'xanchor': 'right'
            },
            'transition': {'duration': 300, 'easing': 'cubic-in-out'},
            'steps': slider_steps
        }
    
        # Add sliders to layout
        figure['layout']['sliders'] = [slider_dict]
        
        # Save snapshots locally
        py.plotly.image.save_as(figure, filename='Q2.2.3_K={0}_gif_{1:02d}.png'.format(K, i), \
                                width=900, height=900, scale=1)

In [39]:
for snapshot in result:
    visualise_MoG_clusters(snapshot, data)

## Create Radial Basis Function Network (RBFN) TensorFlow Graph:

In [11]:
def build_RBFN(Mu, Sigma, device='gpu'):
    '''
    Builds TensorFlow graph for RBFN

    Input:
        Mu:      Cluster centres (K x D)
        Sigma:   Covariance matrix (K x D x D)
        device:  Computation mode ('cpu' or 'gpu')
    Internal variables:
        Y: Input data matrix (N x D)
        X: Target matrix (N x D)
        W: Weight matrix
        b: Bias
    '''
    
    def radial_basis_function(Y, Mu, Sigma):
        '''
        Activation method using radial basis function.
        
        Inputs:
            X:     Input to activation function (N x D)
            Mu:    Cluster centers (K x D)
            Sigma: Covariance matrix (K x D x D)
            
        Returns:
            Activation based on exponent of Mahalanobis distance.
        '''
        # Calculate distance of each point from each cluster centres (N x K x 1 x D)
        dist_from_mean = tf.subtract(tf.expand_dims(tf.expand_dims(Y, 1), 1), tf.expand_dims(tf.expand_dims(Mu, 1), 0), \
                                     name='dist_from_mean')
        
        # Shape Sigma for upcoming matmul
        Sigma_mod = tf.tile(tf.expand_dims(Sigma, 0), [tf.shape(Y)[0], 1, 1, 1])
        
        # Define Mahalanobis distance (N x K)
        m_dist = tf.squeeze(- 0.5 * tf.matmul(dist_from_mean, tf.matmul(Sigma_mod, \
                                                                       tf.transpose(dist_from_mean, [0, 1, 3, 2]))), \
                            axis=[2, 3], name='mahalanobis_dist')
        
        # Define activation 
        activation = tf.exp(m_dist, name='hidden_radial_activation')
        
        return activation
    
    def _add_histogram(vars_):
        '''
        Helper function to add histogram tag to variables
        Input:
            var: variable to be tagged with histogram summary
        '''
        for var in vars_:
            tf.summary.histogram(var.op.name, var)
        
        
        
    ###################
    # Function begins #
    ###################
    
    # Define computation device
    try:
        assert device == 'cpu' or device == 'gpu'
    except AssertionError:
        print 'Invalid device chosen. Please use \'cpu\' or \'gpu\'.'
        quit()
    device = '/' + device + ':0'
    
    # Infer dimensions
    K = Mu.shape[0]
    D = Mu.shape[1]
    
    with tf.device('/cpu:0'):
        # Define placeholder variables
        with tf.name_scope('placeholders'):
            Y = tf.placeholder(tf.float32, shape=[None, D], name='input_layer')
            X = tf.placeholder(tf.float32, shape=[None, D], name='target')
                                    
        # Define weights and biases connecting hidden & output layer
        with tf.name_scope('variables'):
            W = tf.Variable(tf.truncated_normal(shape=[K, D]), tf.float32, name='weights')
            b = tf.Variable(tf.truncated_normal(shape=[]), tf.float32, name='bias')
            
    with tf.device(device):
        # Define radial activation function
        with tf.name_scope('hidden_layer'):
            activations = radial_basis_function(Y, Mu, Sigma)
            
        # Define output layer
        with tf.name_scope('output_layer'):
            Xhat = tf.add(tf.matmul(activations, W), b, name='prediction')
        
        # Calculate metrics
        with tf.name_scope('metrics'):
            # Calculate loss function:
            with tf.name_scope('loss'):
                # MSE: Mean squared error
                MSE = tf.reduce_mean(tf.reduce_sum(tf.pow(Xhat - X, 2), axis=1), name='MSE')
                tf.summary.scalar('MSE', MSE)
                
        # Define optimizer
        with tf.name_scope('Adam'):
            optimizer = tf.train.AdamOptimizer().minimize(MSE)
    
    # Add histogram summaries
    with tf.device('/cpu:0'):
        _add_histogram([activations, W, b])
        
        # Merge all summaries
        merged = tf.summary.merge_all()
        
    return Y, X, MSE, optimizer, merged, activations, W

## Define training function for RBFN

In [20]:
def train_RBFN(data, target, Mu, Sigma, print_update=False, device='gpu'):
    '''
    Trains the RBFN on a leave-one out basis.
    Inputs:
        data:     Input values
        target:   Output values
        Mu:       Cluster centres (K x D)
        Sigma:    Covariance matrix (K x D x D)
        device:   Computation mode (cpu or gpu)
    Returns:
        Causality index, Rxy
    '''
    
    def leave_one_out(n, data, target):
        '''
        Returns a copy of data and target with the n-th element left out.
        Inputs:
            data:   Input values
            target: Output values
        Returns:
            A copy of data and target, with element n left out.
        '''
        assert n >= 0 and n < len(data)
        
        return np.delete(data, n, 0), np.delete(target, n, 0)
    
    def log_summaries_and_update_user(curr_iter, print_update=False):
        '''
        Log summaries into saved TF file and prints losses to user if print_update is True.
        Inputs:
            curr_iter:    Current Iteration
            print_update: Boolean
        '''
        err, summaries = sess.run([MSE, merged], feed_dict={Y: train_data, X: train_target})
        writer.add_summary(summaries, curr_iter)
        
        if print_update == True:
            print 'Iteration: {:5d} Training MSE: {:>8.2f}'.format(curr_iter, err)
        
        
        
    # Define constants
    N = len(data) # Total number of datapoints
    MAX_ITER = 1500 # Maximum number of training iterations
    CURR_TIME = '{:%b%d %H_%M_%S}'.format(datetime.datetime.now()) # Current time
    
    # Define array to store SSE for each pass of algorithm
    epsilon = np.array([])
    
    # Iterate through algorithm N times
    for n in range(N):
        # Leave one data point out in each pass of algorithm
        train_data, train_target = leave_one_out(n, data, target)
        
        # Dump previous graph definition (if it exists), and define RBFN graph
        tf.reset_default_graph()
        Y, X, MSE, optimizer, merged, activations, W = build_RBFN(Mu, Sigma, device)
        
        # Begin TensorFlow session
        with tf.Session(config=tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)) as sess:
            # Define summary writer
            writer = tf.summary.FileWriter(os.path.join('./Logs/RBFN', CURR_TIME), graph=sess.graph)
            
            # Initialise all TensorFlow variables
            tf.global_variables_initializer().run()
            
            # Define current iteration
            curr_iter = 0
            
            # Calculate initial values for summaries
            log_summaries_and_update_user(curr_iter + n * MAX_ITER + 1, print_update)
            
            # Begin training
            while curr_iter < MAX_ITER:
                # Perform gradient descent
                sess.run(optimizer, feed_dict={Y: train_data, X: train_target})
                
                # Log summaries every 100 iterations
                if (curr_iter + 1) % 200 == 0:
                    log_summaries_and_update_user(curr_iter + n * MAX_ITER + 1, print_update)
                    
                # Increment counter
                curr_iter += 1
                
            # Finished training
            print 'Training {} of {} complete!'.format(n + 1, N)
            
            # Calculate RMS for n-th element not used to train model, and append to array
            epsilon = np.append(epsilon, sess.run(MSE, feed_dict={Y: np.expand_dims(data[n,:], 0), \
                                                                  X: np.expand_dims(target[n,:], 0)}))
            
            # Close writers
            writer.close()
    
#     return epsilon
    
    # Calculate SSE, SST and delta
#     MSE = np.sqrt(np.mean(np.power(epsilon_xy, 2)))
#     MST = np.sqrt(np.mean(np.power(np.sum(np.power(target - np.mean(target, axis=0), 2), axis=1), 2)))
#     delta = MSE / MST
    
    SSE = np.sqrt(np.mean(epsilon))
    SST = np.sqrt(np.mean(np.power(target - np.mean(target, axis=0), 2)))
    delta = SSE / SST
    
    # Calculate causality index, R_xy
    sigma = 5
    R_xy = 1. / np.exp(delta / sigma)
    
    return R_xy

### Train data

In [13]:
Mu = result[-1]['cluster_centres'][-1].astype(np.float32)
Sigma = result[-1]['cluster_variances'][-1].astype(np.float32)

# Call train function
epsilon_xy = train_RBFN(data=data, target=target, Mu=Mu, Sigma=Sigma)

Training 1 of 21 complete!
Training 2 of 21 complete!
Training 3 of 21 complete!
Training 4 of 21 complete!
Training 5 of 21 complete!
Training 6 of 21 complete!
Training 7 of 21 complete!
Training 8 of 21 complete!
Training 9 of 21 complete!
Training 10 of 21 complete!
Training 11 of 21 complete!
Training 12 of 21 complete!
Training 13 of 21 complete!
Training 14 of 21 complete!
Training 15 of 21 complete!
Training 16 of 21 complete!
Training 17 of 21 complete!
Training 18 of 21 complete!
Training 19 of 21 complete!
Training 20 of 21 complete!
Training 21 of 21 complete!


In [14]:
# Calculate SSE, SST and delta
MSE = np.sqrt(np.mean(epsilon_xy))
MST = np.sqrt(np.mean(np.sum(np.power(target - np.mean(target, axis=0), 2), axis=1)))
delta = MSE / MST

# Calculate causality index, R_xy
sigma = 5
R_xy = 1. / np.exp(delta / sigma)

In [15]:
print 'MSE: {:.2f}, MST: {:.2f}, delta: {:.2f}, R_xy: {:.2f}'.format(MSE, MST, delta, R_xy)

MSE: 1.55, MST: 1.46, delta: 1.06, R_xy: 0.81


In [37]:
# Evaluate causality(x->y)
result = run_MoG(K_list=[1], data=data, D=2, has_valid=False)

Mu = result[-1]['cluster_centres'][-1].astype(np.float32)
Sigma = result[-1]['cluster_variances'][-1].astype(np.float32)

R_xy = train_RBFN(data=data, target=target, Mu=Mu, Sigma=Sigma)

# Evaluate causality(y->x)
result = run_MoG(K_list=[1], data=target, D=2, has_valid=False)

Mu = result[-1]['cluster_centres'][-1].astype(np.float32)
Sigma = result[-1]['cluster_variances'][-1].astype(np.float32)

R_yx = train_RBFN(data=target, target=data, Mu=Mu, Sigma=Sigma)

print R_xy
print R_yx

iter: 100
iter: 200
iter: 300
iter: 400
iter: 500
iter: 600
iter: 700
iter: 800
iter: 900
iter: 1000
iter: 1100
iter: 1200
iter: 1300
iter: 1400
iter: 1500
Max iteration reached
K:   1, duration: 6.1s

RUN COMPLETED
Training 1 of 21 complete!
Training 2 of 21 complete!
Training 3 of 21 complete!
Training 4 of 21 complete!
Training 5 of 21 complete!
Training 6 of 21 complete!
Training 7 of 21 complete!
Training 8 of 21 complete!
Training 9 of 21 complete!
Training 10 of 21 complete!
Training 11 of 21 complete!
Training 12 of 21 complete!
Training 13 of 21 complete!
Training 14 of 21 complete!
Training 15 of 21 complete!
Training 16 of 21 complete!
Training 17 of 21 complete!
Training 18 of 21 complete!
Training 19 of 21 complete!
Training 20 of 21 complete!
Training 21 of 21 complete!
iter: 100
iter: 200
iter: 300
iter: 400
iter: 500
iter: 600
iter: 700
iter: 800
iter: 900
iter: 1000
iter: 1100
iter: 1200
iter: 1300
iter: 1400
iter: 1500
Max iteration reached
K:   1, duration: 5.8s

RUN

In [17]:
# Calculate SSE, SST and delta
MSE = np.sqrt(np.mean(R_xy))
MST = np.sqrt(np.mean(np.sum(np.power(target - np.mean(target, axis=0), 2), axis=1)))
delta = MSE / MST

# Calculate causality index, R_xy
sigma = 5
causality_xz = 1. / np.exp(delta / sigma)

# Calculate SSE, SST and delta
MSE = np.sqrt(np.mean(R_yx))
MST = np.sqrt(np.mean(np.sum(np.power(data - np.mean(data, axis=0), 2), axis=1)))
delta = MSE / MST

# Calculate causality index, R_xy
sigma = 5
causality_zx = 1. / np.exp(delta / sigma)

In [18]:
print 'X->Z: {}, Z->X: {}'.format(causality_xz, causality_zx)

X->Z: 0.813298457885, Z->X: 0.800858698783
