# Detecting Causality in Eastman Benchmark Data
### Using the techniques developed so far, including:
- Probabilistic Graphical Models
- Convergent Cross Mapping (CCM) algorithm
- Cross Mapping Smoothness (CMS) algorithm

In [38]:
# Import relevant libraries
import numpy as np # For tensor manipulation in Python
import pandas as pd # Store dataset as a Panda dataframe
import csv # To load csv files

# Interactive plotting
import plotly
import plotly.offline as pyo # For offline interactive plotting
import plotly.graph_objs as go # For generating JSON plot objects in Plotly


# Activate Plotly Offline for Jupyter so that pyo.iplot() renders figures inline
# NOTE(review): connected=True presumably loads plotly.js from a CDN (needs internet access) - confirm
pyo.init_notebook_mode(connected=True)

# Set printing precision of numpy arrays to 2 (first positional argument is `precision`)
np.set_printoptions(2)

## Algorithm Definitions
### Convergent Cross Mapping (CCM)

In [3]:
def CCM(data, target, k, attractor_viz=False, prediction_corr_viz=False):
    '''
    Perform the convergent cross-mapping (CCM) algorithm.

    The k nearest neighbours of every row of `data` are located, and the rows of `target`
    with the same indices as those neighbours are averaged (with exponentially decaying
    weights) to predict the corresponding row of `target`.

    Inputs:
        data:                Data to perform k-NN on, a numpy array (N x P)
        target:              Target values to predict, a numpy array (N x P)
        k:                   Number of nearest neighbours (int, at least 1)
        attractor_viz:       If True, display an animated 3-D plot of the neighbours on both
                             attractors. Uses columns 0, 1 and 2 of data, target and predictions.
        prediction_corr_viz: If True, display a scatter plot of the last column of the
                             predictions against the last column of the target.
    Returns:
        predictions: Predicted target values (N x P)
        causality:   Correlation coefficient between the last columns of target and
                     predictions, rounded to 3 decimal places (float)
    Raises:
        ValueError: If k is smaller than 1 or data has fewer than two rows.
    '''
    def euclidean_dist(A, B=None):
        '''
        Calculate Euclidean distances from the rows of A.
        Inputs:
            A: A matrix (a x P)
            B: Optional tensor (a x k x P); B[i] holds the k points compared against A[i]
        Returns:
            If B is None, the distance matrix (a x a) between all pairs of rows of A.
            Otherwise a matrix (a x k) with the distance from A[i] to every row of B[i].
        '''
        if B is None:
            # Compare every row of A against every other row of A
            B = A[np.newaxis, :, :]
        return np.sqrt(np.sum(np.square(A[:, np.newaxis, :] - B), axis=2))

    def kNN(k, data):
        '''
        Return the nearest neighbours to each row in data in the form of a responsibility matrix.
        Inputs:
            k:    Number of nearest neighbours (scalar)
            data: Data to perform k-NN, a numpy array (N x P)
        Returns:
            A responsibility matrix (N x min(k, N - 1)), listing the indices of the nearest
            neighbours of each row in order of increasing distance
        '''
        distances = euclidean_dist(data)

        # Exclude every point from its own neighbour list explicitly. Relying on the point
        # sorting first (distance 0) breaks when the data contains duplicated rows.
        np.fill_diagonal(distances, np.inf)

        n_neighbours = min(k, distances.shape[0] - 1)
        return np.argsort(distances, axis=1)[:, :n_neighbours]

    def predict_target(data, target, responsibilities):
        '''
        Perform a prediction of the target based on a weighting of contemporaneous neighbours of data.
        Inputs:
            data:             Data values (N x P)
            target:           Target values to perform prediction (N x P)
            responsibilities: A responsibility matrix (N x k)
        Returns:
            A matrix of predicted target values (N x P)
        '''
        # Distance from each point to each of its neighbours (N x k), computed once
        neighbour_dist = euclidean_dist(data, data[responsibilities])

        # Distance to the nearest neighbour; floored so that duplicated points
        # (distance 0) do not produce a 0 / 0 = NaN weight
        nearest_dist = np.maximum(neighbour_dist[:, :1], np.finfo(float).eps)

        # Exponentially decaying weights, normalised to sum to one for each point
        numerator = np.exp(-np.divide(neighbour_dist, nearest_dist))
        weights = np.divide(numerator, np.sum(numerator, axis=1, keepdims=True))

        return np.sum(target[responsibilities] * weights[:, :, np.newaxis], axis=1)

    def visualise_attractor(data, target, responsibilities, predictions):
        '''
        Produce a Plotly animation with two 3-D scenes to visualise nearest neighbours of data and target.
        Only the first three columns of each input are plotted.
        Inputs:
            data:             Data values (N x P)
            target:           Target values (N x P)
            responsibilities: A responsibility matrix (N x k)
            predictions:      Predicted values for target (N x P)
        '''
        def scatter3d_trace(points, scene, marker, name=None):
            '''
            Build a 3-D marker trace from the first three columns of points.
            Traces without a name have their hover labels disabled.
            '''
            trace = {
                'x': points[:, 0],
                'y': points[:, 1],
                'z': points[:, 2],
                'mode': 'markers',
                'type': 'scatter3d',
                'scene': scene,
                'hoverinfo': 'none' if name is None else 'name',
                'marker': marker
            }
            if name is not None:
                trace['name'] = name
            return trace

        # Define colour list
        colour_list = np.array(['#b3b3b3', '#0f3957', '#1f77b4', '#ff7f0e'])

        # Define blank figure
        figure = {
            'data': [],
            'layout': {},
            'frames': []
        }

        # Create layout: source attractor on the left, target attractor on the right
        figure['layout'] = {
            'width': 1000,
            'height': 700,
            'scene1': {
                'domain': {
                    'x': [0, 0.45],
                    'y': [0., 1.]
                }
            },
            'scene2': {
                'domain': {
                    'x': [0.55, 1.],
                    'y': [0., 1.]
                }
            },
            'title': 'Visualising Nearest Neighbours on Attractors',
            'showlegend': False
        }

        # Define play / pause buttons
        figure['layout']['updatemenus'] = [
            {
                'buttons': [
                    {
                        'args': [None, {'frame': {'duration': 1000, 'redraw': False},
                                 'fromcurrent': True, 'transition': {'duration': 0, 'easing': 'quadratic-in-out'}}],
                        'label': 'Play',
                        'method': 'animate'
                    },
                    {
                        'args': [[None], {'frame': {'duration': 0, 'redraw': False}, 'mode': 'immediate',
                        'transition': {'duration': 0}}],
                        'label': 'Pause',
                        'method': 'animate'
                    }
                ],
                'direction': 'left',
                'pad': {'r': 10, 't': 87},
                'showactive': False,
                'type': 'buttons',
                'x': 0.1,
                'xanchor': 'right',
                'y': 0,
                'yanchor': 'top'
            }
        ]

        # Define slider dictionary
        slider_dict = {
            'active': 0, # Slider knob's relative starting location
            'pad': {'b': 10, 't': 50}, # Bottom and top padding
            'len': 0.9, # Slider length
            'x': 0.1, # Slider x-position
            'y': 0, # Slider y-position
            'yanchor': 'top',
            'xanchor': 'left',
            'currentvalue': { # Displays current value selected by slider
                'font': {'size': 20},
                'prefix': 'Time index: ',
                'visible': True,
                'xanchor': 'right'
            },
            'transition': {'duration': 300, 'easing': 'cubic-in-out'},
            'steps': []
        }

        # Background traces are identical in every frame, so build them once
        data_trace = scatter3d_trace(data, 'scene1', {'size': 4, 'color': colour_list[0]})
        target_trace = scatter3d_trace(target, 'scene2', {'size': 4, 'color': colour_list[0]})

        # Create one frame per time index
        for i in range(len(predictions)):
            # Frame name, used to connect each frame to its slider step
            frame_name = str(i + 1)
            neighbours = responsibilities[i, :]

            # Source point and its neighbours on the data attractor
            source_trace = scatter3d_trace(
                data[i:(i + 1)], 'scene1',
                {'size': 10, 'symbol': 'diamond', 'color': colour_list[2], 'line': {'width': 1}},
                name='Source'
            )
            source_neighbour_trace = scatter3d_trace(
                data[neighbours], 'scene1',
                {'size': 4, 'color': colour_list[2]},
                name='Source Neighbour'
            )

            # Contemporaneous neighbours, actual and predicted point on the target attractor
            destination_neighbour_trace = scatter3d_trace(
                target[neighbours], 'scene2',
                {'size': 4, 'color': colour_list[2]},
                name='Target Neighbours'
            )
            actual_destination_trace = scatter3d_trace(
                target[i:(i + 1)], 'scene2',
                {'size': 12, 'symbol': 'diamond', 'color': colour_list[3], 'line': {'width': 2}},
                name='Actual Target'
            )
            predicted_destination_trace = scatter3d_trace(
                predictions[i:(i + 1)], 'scene2',
                {'size': 10, 'symbol': 'diamond', 'color': colour_list[2], 'line': {'width': 2}},
                name='Predicted Target'
            )

            # Append frame to figure
            figure['frames'].append({
                'data': [data_trace, source_trace, source_neighbour_trace,
                         target_trace, destination_neighbour_trace,
                         actual_destination_trace, predicted_destination_trace],
                'name': frame_name
            })

            # Append slider step; it refers to the frame by its (string) name
            slider_dict['steps'].append({
                'args': [
                    [frame_name],
                    {'frame': {'duration': 300, 'redraw': False},
                     'mode': 'immediate',
                     'transition': {'duration': 0}}
                ],
                'label': frame_name,
                'method': 'animate'
            })

        # Add sliders to layout
        figure['layout']['sliders'] = [slider_dict]

        # The initial view is the first frame
        figure['data'] = figure['frames'][0]['data']

        # Display the animation in the notebook
        pyo.iplot(figure)

    def visualise_predictions(target, predictions):
        '''
        Create a scatterplot visualising the last column of predictions vs. the last column of target.
        Inputs:
            target:      Target values (N x P)
            predictions: Prediction values (N x P)
        '''
        actual = target[:, -1]
        predicted = predictions[:, -1]

        trace = go.Scatter(
            x = actual,
            y = predicted,
            mode = 'markers',
        )

        # 45 degree reference line spanning the plotted values
        # (a fixed [0, 1] line is only useful for data scaled to that range)
        lower = float(min(np.min(actual), np.min(predicted)))
        upper = float(max(np.max(actual), np.max(predicted)))
        line_trace = go.Scatter(
            x = [lower, upper],
            y = [lower, upper],
            mode = 'lines',
            hoverinfo = 'none',
            line = {
                'color': '#000000',
                'dash': 'dash',
                'width': 3
            }
        )

        layout = go.Layout(
            title = 'Correlation Plot (r = {})'\
                    .format(np.round(np.corrcoef(actual, predicted)[0,-1], 3)),
            showlegend = False,
            height = 800,
            width = 700,
            xaxis = {'title': 'Target',},
            yaxis = {'title': 'Prediction', 'scaleanchor': 'x'},
        )

        # A plain list of traces is accepted by go.Figure (go.Data is deprecated)
        figure = go.Figure(data=[trace, line_trace], layout=layout)
        pyo.iplot(figure)


    ###################
    # Function begins #
    ###################

    # Error checking
    if k < 1:
        raise ValueError('k must be at least 1.')
    if data.shape[0] < 2:
        raise ValueError('data must contain at least two rows to have any neighbours.')

    # Find indices of k-nearest neighbours
    responsibilities = kNN(k, data)
    print('k-NN complete!')

    # Calculate predicted target values
    predictions = predict_target(data, target, responsibilities)
    print('Predictions complete!')

    # Create interactive attractor animation
    if attractor_viz:
        visualise_attractor(data, target, responsibilities, predictions)

    # Create correlation plot
    if prediction_corr_viz:
        visualise_predictions(target, predictions)

    # Calculate causality based on correlation
    causality = np.round(np.corrcoef(target[:,-1], predictions[:,-1])[0,-1], 3)

    return predictions, causality

### Cross Mapping Smoothness (CMS)

In [4]:
def CMS_mod(data, target, coupling_parameter=None, scope=20,
            show_clusters=False, show_corr_plot=False, show_causality_time_series=False):
    '''
    Calculate the causality index of data -> target with cross mapping smoothness (CMS).

    A window of `scope` consecutive rows is slid over the time series. Inside each window a
    radial basis function network (RBFN) mapping data to target is evaluated by
    leave-one-out cross-validation, and the normalised prediction error is converted into
    a causality index.

    Inputs:
        data:                       Input values (N x P) (suspected 'effect' variable)
        target:                     Output values (N x P) (suspected 'cause' variable)
        coupling_parameter:         Optional sequence; its last len(causality) values are drawn
                                    on a secondary axis of the causality time series plot
        scope:                      Length of time series subset to perform causality calculations (int)
        show_clusters:              If True, scatter plot of data with the radial bases fitted to the
                                    first `scope` rows is shown (first two columns only) (Boolean)
        show_corr_plot:             If True, show target vs. leave-one-out predictions of the
                                    final window (Boolean)
        show_causality_time_series: If True, show a line plot of the causality indices (Boolean)
    Returns:
        An array of causality indices (N - scope + 1), one per window
    Raises:
        ValueError: If the time series is shorter than scope, or if data and target have
                    a different number of rows.
    '''
    # Factor applied to the squared mean pairwise distance to obtain the RBF variance
    RBF_VARIANCE_SCALE = 0.001

    # Divisor of the normalised error inside the exponential of the causality index
    CAUSALITY_DECAY = 5.

    def l2(A, axis=None):
        '''
        Calculates the L2-norm of a tensor, at a specified axis.
        Inputs:
            A:    A tensor.
            axis: Summation axis (None sums over all elements).
        Returns:
            L2-norm of tensor.
        '''
        return np.sqrt(np.sum(np.square(A), axis=axis))

    def RMS(A):
        '''
        Perform a root-mean square operation.
        Input:
            A: A 1-D array (N)
        Returns:
            Root mean square of A.
        '''
        return np.sqrt(np.mean(np.square(A)))

    def euclidean_dist(A, B=None):
        '''
        Calculate the euclidean distance for rows in matrix A and rows in matrix B.
        If B is None, calculates distances for rows between matrix A.
        Inputs:
            A: A matrix (a x P)
            B: A matrix (b x P)
        Returns:
            A distance matrix (a x b); entry (i, j) is the distance from A[i] to B[j].
        '''
        if B is None:
            B = A
        return l2(A[:, np.newaxis, :] - B[np.newaxis, :, :], axis=2)

    def find_cluster_params(data):
        '''
        Given datapoints, find centres and variances for each radial basis.
        Assumptions:
            Each datapoint is itself the centre of a radial basis function.
            Variances are assumed to be identical for all basis functions. Calculated as
                RBF_VARIANCE_SCALE times the squared average of the non-zero pairwise distances.
        Inputs:
            data: Data values (N x P)
        Returns:
            centres:  Centre coordinates for each radial basis (N x P)
            variance: Variance of radial basis (scalar); NaN when no two points differ
        '''
        # Calculate Euclidean distance matrix (N x N)
        distances = euclidean_dist(data)

        # Each pair is counted once: keep the strictly lower triangle, ignoring zero distances
        pairwise = distances[np.tril_indices(distances.shape[0], -1)]
        pairwise = pairwise[pairwise != 0]

        variance = RBF_VARIANCE_SCALE * np.mean(pairwise) ** 2

        return data, variance

    def calc_radial_basis_activations(data, centres, variances):
        '''
        Calculates the activations for the hidden radial basis layer.
        Inputs:
            data:      Data values (N x P)
            centres:   RBF centres (K x P)
            variances: RBF variance (scalar)
        Returns:
            Radial basis activations (N x K)
        '''
        # Gaussian of the squared distance between every data point and every centre
        return np.exp( - np.divide(euclidean_dist(data, centres)**2, 2 * variances))

    def train_RBFN_weights(data, target, centres, variances):
        '''
        Train the weights of a Radial Basis Function Network by solving for the parameter alpha.
        The columns of the activation matrix are normalised before solving for alpha,
            and alpha is re-normalised afterwards.
        The activation matrix must be square, i.e. there is one centre per data point.

        Inputs:
            data:      Data values (N x P)
            target:    Target values (N x P)
            centres:   Cluster centres (K x P)
            variances: Cluster variance (scalar)
        Returns:
            Trained weights, alpha (K x P)
        '''
        # Obtain activations
        activations = calc_radial_basis_activations(data, centres, variances)

        # L2-norm of each column of the activations (K)
        column_norms = l2(activations, axis=0)

        # Normalise activation values by their column norms
        actvn_norm = np.divide(activations, column_norms[np.newaxis, :])

        # Solve system of linear equations for alpha
        weights = np.linalg.solve(actvn_norm, target)

        # Undo the column scaling: row i of alpha is divided by the norm of column i
        return np.divide(weights, column_norms[:, np.newaxis])

    def RBFN_calc(data, centres, variances, weights):
        '''
        Calculate the output of the trained RBFN.
        Inputs:
            data:      Data values (N x P)
            centres:   RBF centres (K x P)
            variances: RBF variance (scalar)
            weights:   Trained weights (K x P)
        Returns:
            Predicted target value (N x P), assuming data and target have same dimensionality
        '''
        # Calculate radial basis activations
        actv = calc_radial_basis_activations(data, centres, variances)

        # Calculate and return predicted output
        return np.matmul(actv, weights)

    def visualise_clusters(data, centres, variances):
        '''
        Scatter plot of the data points together with every radial basis centre and its 95% region.
        Only the first two columns of data and centres are plotted.
        Inputs:
            data:      Data points (N x P)
            centres:   Coordinates of cluster centres (K x P)
            variances: Cluster variance (float)
        '''
        def calc_ellipse_coordinates(centres, variances):
            '''
            Create x- and y-coordinates for ellipses for each cluster
            Assumptions:
                Dimension of data point is 2
            Returns:
                ellipse: x- and y-coordinates for K ellipses (T x K x 2), T being the
                         number of steps of the tracing parameter
            '''
            # Create trace for region to encompass 95% of the points (using Chi-squared critical value)
            # Assuming joint independence and equal marginal variances

            # Chi-squared with df 2 and alpha=5%
            crit_val = 5.991

            # Calculate axes length
            axis_lengths = np.sqrt(variances * crit_val)

            # Calculate coordinates to trace ellipse
            t = np.arange(-np.pi, np.pi + np.pi / 50, np.pi / 50) # Parameter
            x = centres[:, 0][np.newaxis, :] + axis_lengths * np.cos(t)[:, np.newaxis]
            y = centres[:, 1][np.newaxis, :] + axis_lengths * np.sin(t)[:, np.newaxis]

            # Stack x- and y-coordinates along axis=2
            return np.stack([x, y], axis=2)

        #######################
        ##  Function begins  ##
        #######################

        # Create ellipse coordinates
        ellipse = calc_ellipse_coordinates(centres, variances)

        # All centres and regions share the first of Plotly's default colours
        cluster_colour = '#1f77b4'

        # Define blank figure
        figure = {
            'data': [],
            'layout': {}
        }

        # Create data trace
        figure['data'].append({
            'x': np.round(data[:, 0], 3),
            'y': np.round(data[:, 1], 3),
            'mode': 'markers',
            'hoverinfo': 'none',
            'marker': {
                'color': '#d3d3d3'
            }
        })

        for k in range(len(centres)):
            # Create trace for cluster centre
            centre_trace = {
                'x': np.round([centres[k][0]], 3),
                'y': np.round([centres[k][1]], 3),
                'hoverinfo': 'none',
                'mode': 'markers',
                'marker': {
                        'size': 12,
                        'symbol': 'diamond',
                        'color': cluster_colour,
                        'line': {'width': 3}
                    }
            }

            # Create trace for region encompassing 95% of data points
            variance_trace = {
                'x': ellipse[:, k, 0],
                'y': ellipse[:, k, 1],
                'hoverinfo': 'none',
                'mode': 'lines',
                'marker': {
                    'color': cluster_colour
                }
            }

            # Add cluster traces
            figure['data'].extend([centre_trace, variance_trace])

        # Generate figure layout
        figure['layout'] = go.Layout(
            width = 900,
            height = 900,
            showlegend = False,
            title = 'Visualisation of First {} Clusters'.format(len(centres)),
            xaxis = {'autorange': True},
            yaxis = {'autorange': True, 'scaleanchor': 'x'}
        )

        return pyo.iplot(figure)

    def correlation_plot(target, prediction):
        '''
        Produce a correlation plot between the last columns of target and prediction.
        Input:
            target:     Target values (n x P)
            prediction: Predicted target values (n x P), row-aligned with target
        '''
        actual = target[:, -1]
        predicted = prediction[:, -1]

        # Create trace for points to plot
        point_trace = go.Scatter(
            x = actual,
            y = predicted,
            mode = 'markers',
        )

        # Create a line trace for plotting a 45deg line across the range of the target
        lower = float(np.min(actual))
        upper = float(np.max(actual))
        line_trace = go.Scatter(
            x = [lower, upper],
            y = [lower, upper],
            mode = 'lines',
            line = {
                'dash': 'dash'
            }
        )

        # Define layout
        layout = go.Layout(
            width = 800,
            height = 900,
            showlegend = False,
            xaxis = { 'title': 'Target value' },
            yaxis = { 'title': 'Predicted target value', 'scaleanchor': 'x' },
            title = 'Correlation Plot between Target & Predicted Values'
        )

        # Define figure (a plain list of traces; go.Data is deprecated)
        figure = go.Figure(data=[point_trace, line_trace], layout=layout)

        # Plot figure offline
        pyo.iplot(figure)

    def causality_time_series(causality, coupling_parameter):
        '''
        Create a time series plot of the causality indices.
        Inputs:
            causality:          Causality indices (1-D array)
            coupling_parameter: Optional sequence drawn on a secondary axis, or None
        '''
        # Generate empty figure
        figure = {
            'data': [],
            'layout': {},
        }

        # Define colour palette
        colour_list = np.array(['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b'])

        time_index = list(range(len(causality)))

        # Define and append trace
        figure['data'].append({
            'x': time_index,
            'y': causality,
            'mode': 'lines',
            'name': 'Causality Index',
        })

        # Define layout
        figure['layout'] = {
            'title': 'Time Series Plot of Causality Index',
            'xaxis': {'title': 'Time Index'},
            'yaxis': {
                'title': 'Causality Index',
                'titlefont': { 'color': colour_list[0] },
                'tickfont': { 'color': colour_list[0] },
                'range': [0, 1],
            },
            'showlegend': False,
        }

        # Define trace for coupling parameter
        if coupling_parameter is not None:
            # Plot the last len(causality) values
            figure['data'].append({
                'x': time_index,
                'y': coupling_parameter[-len(causality):],
                'mode': 'lines',
                'name': 'Coupling Parameter',
                'yaxis': 'y2',
                'line': { 'dash': 'dash' }
            })

            # Add secondary axis to layout
            figure['layout']['yaxis2'] = {
                'title': 'Coupling parameter',
                'titlefont': { 'color': colour_list[1] },
                'tickfont': { 'color': colour_list[1] },
                'range': [0, np.max(coupling_parameter) * 1.1],
                'overlaying': 'y',
                'side': 'right',
                'showgrid': False,
            }

        # Display figure
        pyo.iplot(figure)


    ###################
    # Function Begins #
    ###################

    # Error checking: fail loudly instead of printing a message and carrying on
    N = data.shape[0]
    if N < scope:
        raise ValueError(
            'Time series is shorter than scope ({} < {}). '
            'Please ensure scope is not longer than the length of your time series.'.format(N, scope)
        )
    if target.shape[0] != N:
        raise ValueError(
            'data and target must have the same number of rows ({} != {}).'.format(N, target.shape[0])
        )

    # Define variables
    n_windows = N - scope + 1
    causality = np.zeros(n_windows)

    # Visualise clusters to see if centres and variances are appropriate
    if show_clusters:
        centres, variances = find_cluster_params(data=data[:scope])
        visualise_clusters(data, centres, variances)

    print('Calculating causality indices...')

    for i in range(n_windows):
        # Define subset of working data and target
        scoped_data = data[i:(scope + i),:]
        scoped_target = target[i:(scope + i),:]

        # Leave-one-out errors and predictions of this window
        error = np.zeros(scope)
        scoped_predictions = np.zeros(scoped_target.shape, dtype=float)

        for j in range(scope):
            # Leave-one-out scoped dataset (row j removed)
            loo_data = np.delete(scoped_data, j, 0)
            loo_target = np.delete(scoped_target, j, 0)

            # Obtain centres and variances of RBFs using leave-one-out scoped dataset
            centres, variances = find_cluster_params(data=loo_data)

            # Train weights of RBFN using leave-one-out scoped dataset
            weights = train_RBFN_weights(
                data = loo_data,
                target = loo_target,
                centres = centres,
                variances = variances
            )

            # Calculate predicted value of output for the left-out row (1 x P)
            prediction = RBFN_calc(
                data = scoped_data[j,:][np.newaxis, :],
                centres = centres,
                variances = variances,
                weights = weights
            )
            scoped_predictions[j] = prediction[0]

            # Calculate error
            error[j] = l2(prediction - scoped_target[j,:])

        # Calculate delta: RMS error relative to the RMS spread of the scoped data about its mean
        delta = RMS(error) / RMS(l2(scoped_data - np.mean(scoped_data, axis=0), axis=1))

        # Calculate causality index
        causality[i] = np.exp( - delta / CAUSALITY_DECAY)

    print('Causality calculations complete!')

    # If True, display a correlation plot for the final window. Target and predictions are
    # row-aligned; plotting the full target against a single prediction is meaningless.
    if show_corr_plot:
        correlation_plot(scoped_target, scoped_predictions)

    # If True, display a time series of calculated causality
    if show_causality_time_series:
        causality_time_series(causality, coupling_parameter)

    return causality

### Helper Methods

In [5]:
def generate_delayed_vector(data, embed_dim, delay=1):
    '''
    Generate a delayed embedding vector of the data.

    Row t of the embedding of a series x is
    [x[t], x[t + delay], ..., x[t + (embed_dim - 1) * delay]].

    Input:
        data:      A dictionary mapping names to 1-D time series of equal length
        embed_dim: Embedding dimensions (int, greater than 1)
        delay:     Number of samples between each time series point (int, at least 1)
    Returns:
        A dictionary with the same keys as data; each value is a numpy array of
        shape (N - (embed_dim - 1) * delay, embed_dim). Empty if data is empty.
    Raises:
        ValueError: If the series differ in length or are too short for the embedding.
    '''

    assert embed_dim > 1, 'embed_dim must be greater than 1'
    assert delay >= 1, 'delay must be at least 1'

    keys = list(data.keys())
    if not keys:
        return {}

    # Take the length from the first key. Picking a random key would consume the global
    # random state and gives non-reproducible results when lengths differ.
    N = len(data[keys[0]])
    for key in keys:
        if len(data[key]) != N:
            raise ValueError('All time series must have the same length.')

    N_embed = N - (embed_dim - 1) * delay
    if N_embed < 0:
        raise ValueError(
            'Time series of length {} is too short for embed_dim={} and delay={}.'
            .format(N, embed_dim, delay)
        )

    # Generate empty storage variable
    data_embed = {}

    for key in keys:
        # Initialize empty array for each key
        data_embed[key] = np.zeros((N_embed, embed_dim))
        # Generate phase-shifted vectors for each key
        for i in range(embed_dim):
            data_embed[key][:,i] = data[key][(i * delay):(N_embed + i * delay)]

    return data_embed

In [6]:
def visualise_data(data, ex):
    '''
    Visualise data through an interactive graph.

    One line trace is drawn per key, in sorted key order. The traces of the keys 'v1' and
    'v2' start hidden and can be enabled from the legend.

    Input:
        data: Dictionary of numpy time-series variables in order of X, Y, Z, v1 and v2
        ex: Example being plotted (1, 2, 3) (string)

    Returns:
        None; the interactive offline Plotly chart is displayed in the notebook
    '''
    # Keys whose traces are only shown on request
    noise_keys = ('v1', 'v2')

    # Define trace for each variable
    # Set visibility of noise variables to 'legendonly'
    traces = [
        go.Scatter(
            x = list(range(len(data[key]))),
            y = data[key],
            name = key,
            visible = 'legendonly' if key in noise_keys else True
        )
        for key in sorted(data)
    ]

    # Define JSON-like layout
    layout = go.Layout(
        title = 'Time-series Plots of Example ' + str(ex),
        xaxis = {'title': 'Sampling time'},
    )

    # Generate and plot figure; a plain list of traces replaces the deprecated go.Data
    # and avoids rebinding the `data` argument to a different kind of object
    figure = go.Figure(data=traces, layout=layout)
    pyo.iplot(figure)

In [7]:
def collapse_data(data, deg, ex, burn_in=0):
    '''
    Collapse an N x M time-series matrix into a (N - deg + 1) x (deg x M) matrix of lagged copies.

    Every variable contributes `deg` adjacent columns; copy i of a variable is the
    variable shifted by i sampling intervals.

    Inputs:
        data:    Matrix of M time-series each of length N (N x M)
        deg:     Degree of Bayesian network (number of shifted copies per variable)
        ex:      Example being processed (1, 2, 3) (string); '3' and '3_mod' select the
                 header prefixes r1_, r2_, y1_, y2_, anything else selects X, Y, Z
        burn_in: Initial number of sampling intervals ignored to ensure stationarity of
                 time-series; rows before index max(burn_in - 1, 0) are dropped

    Returns:
        Collapsed data matrix with the burn-in rows removed
        Headers of collapsed matrix (variable prefix followed by the shift)
    '''
    n_samples, n_vars = data.shape
    n_rows = n_samples - deg + 1

    # Pick the variable prefixes for the requested example
    if str(ex) in ('3', '3_mod'):
        prefixes = ['r1_', 'r2_', 'y1_', 'y2_']
    else:
        prefixes = ['X', 'Y', 'Z']

    collapsed = np.zeros((n_rows, deg * n_vars))
    header = []

    # Fill the shifted copies variable by variable, recording a header for each column
    for var_idx in range(n_vars):
        series = data[:, var_idx]
        for shift in range(deg):
            collapsed[:, var_idx * deg + shift] = series[shift:(shift + n_rows)]
            header.append(prefixes[var_idx] + str(shift))

    # Drop the burn-in rows
    first_row = max(burn_in - 1, 0)
    return collapsed[first_row:], header

In [303]:
def output_csv(df, deg, burn_in=0):
    '''
    Output phase-shifted data into a CSV file.

    Every column of df is duplicated deg times, copy i being shifted down by i rows and
    named '<column>_<i>'. Rows containing NaN (introduced by the shifting) and the first
    burn_in remaining rows are dropped before writing to
    './Data/Bayesian Network/eastman_deg<deg>.csv'.

    Input:
        df:      Panda dataframe (column names must be strings)
        deg:     Degree of Bayesian network
        burn_in: Initial number of sampling intervals ignored to ensure stationarity of time-series
    Raises:
        ValueError: If burn_in is not smaller than the number of rows of df.
    '''
    # Obtain rows of dataframe
    N = df.shape[0]

    # Fail before touching the file system instead of writing a file without data
    if N <= burn_in:
        raise ValueError(
            'Burn_in value ({}) must be smaller than the length of the dataframe ({}).'
            .format(burn_in, N)
        )

    # Create deg duplicates of each time-series, and shift duplicated time-series by one sampling interval.
    # Columns are accessed by position, which works across pandas versions
    # (DataFrame.iteritems was removed in pandas 2.0).
    shifted = []
    for position, field in enumerate(df.columns):
        series = df.iloc[:, position]
        for i in range(deg):
            shifted.append(series.shift(i).rename(field + '_' + str(i)))

    # Concatenate once rather than growing the dataframe inside the loop
    output = pd.concat(shifted, axis=1) if shifted else pd.DataFrame()

    # Removes rows containing NaN
    # Reset index to start from 0
    # Drop first burn_in rows
    # Output to csv
    (output
     .dropna()
     .reset_index(drop=True)
     .iloc[burn_in:]
     .to_csv(path_or_buf='./Data/Bayesian Network/eastman_deg{}.csv'.format(deg), index=False))

## Import Eastman Dataset

In [298]:
# Load the Eastman dataset from a path relative to the notebook's working directory
eastman = pd.read_csv('./Data/eastman.csv')
# Preview the first rows as the cell output
eastman.head()

Unnamed: 0,PC1.PV,FC3.PV,LC1.PV,FC1.PV,FC4.PV,TC1.PV,FC6.PV,PC2.PV,LC3.PV,FC5.PV,...,TI8.PV,TI7.PV,PI2.PV,FI3.PV,LI1.PV,FC3.SP,FC1.SP,FC6.SP,FC5.SP,FC8.SP
0,4.2034,6411.2,55.535,7.6151,13.42,108.38,1874.6,7.6552,42.375,4.7302,...,88.204,91.54,1.5884,2.4994,59.577,6393.9,7.5868,1888.3,4.6892,0.87133
1,4.2061,6372.4,55.247,7.6119,13.307,108.43,1838.6,7.7258,41.479,4.7358,...,88.204,91.54,1.5884,2.5317,59.577,6391.5,7.6717,1806.5,4.6809,0.87171
2,4.2256,6390.0,55.431,7.6151,13.302,108.47,1834.1,7.7018,41.556,4.7358,...,88.204,91.54,1.5946,2.5463,59.546,6372.4,7.6038,1809.1,4.6725,0.87196
3,4.2228,6370.3,55.219,7.6055,13.43,108.51,1851.4,7.6952,41.958,4.7136,...,88.204,91.54,1.6103,2.5168,59.546,6374.8,7.6717,1847.4,4.6649,0.86945
4,4.2089,6380.1,55.195,7.6151,13.323,108.59,1838.0,7.7435,41.826,4.7136,...,88.246,91.59,1.5978,2.5175,59.515,6384.3,7.6887,1837.2,4.6566,0.8697


## Output Phase-Shifted Eastman Dataset

In [304]:
# Write the phase-shifted dataset with deg=2 and burn_in=0 (no burn-in rows dropped);
# the trailing ';' suppresses the cell output
output_csv(eastman, 2, 0);