This notebook contains the code used to perform goodness-of-fit tests for functional data using KSD. The main part of this code is the KSD class, which facilitates the KSD computations. 

The data used for the tests is generated either by functions in this notebook or by other notebooks in the repository which build upon more advanced samplers. 

In [2]:
import csv
import multiprocessing
import time
from functools import partial
from multiprocessing import Pool

import matplotlib.pyplot as plt
import numpy as np
import scipy.interpolate
import tqdm
from scipy.stats import multinomial
from tqdm import tqdm_notebook as tqdm_ntb
from tqdm.auto import tqdm as tqdm_auto

The KSD class is the main object used in the tests.

It is calculated using a finite dimensional approximation of the data using the basis corresponding to the base Gaussian measure.

For example, if we are considering functions in $L^{2}([0,1])$ and the base Gaussian measure is Brownian motion, then the basis is the standard Brownian basis $e_{n}(t) = \sqrt{2}\sin((n-0.5)\pi t)$.

We approximate $L^{2}$ inner products by $\langle f,g\rangle = \sum_{n=1}^{\infty}\langle f,e_{n}\rangle\langle g,e_{n}\rangle \approx \sum_{n=1}^{n_{\mathrm{freqs}}}\langle f,e_{n}\rangle\langle g,e_{n}\rangle$ for some value `n_freqs`, standing for the number of frequencies.

So the data one inputs into KSD is the n_freqs coefficients with respect to a basis.

This is different from the projections used in functional data analysis, which aim to use a low number of frequencies to represent signals, often no more than 12-15. In contrast, we use a high number, n_freqs = 100 in our experiments, as it is purely a technique to make inner products easier to compute rather than for statistical efficiency.

# KSD Class

In [3]:
def form_distance_mat(x1,y1,x2,y2,A,B = None):
    """
    Description:
        Forms a distance matrix with ij-th entry <A(x1_i-y1_j),B(x2_i-y2_j)> where <.,.> is Euclidean inner product
        and x1_i = x1[:,i] is i-th column of data, analogous for x2_i, y1_j, y2_j. 
        If B = None then B becomes the identity.
    Arg:
        x1,x2: (d,n) matrix, data in columns
        y1,y2: (d,m) matrix, data in columns
        A: (d,d) matrix
        B: (d,d) matrix, or None for the identity
    Return:
        dist_mat: (n,m) matrix with ij-th entry <A(x1_i - y1_j), B(x2_i - y2_j)> 
    """  
    if B is None:
        B = np.eye(x1.shape[0])
    
    # Each transformed data matrix is needed twice below, so form it once
    Ax1, Ay1 = A @ x1, A @ y1
    Bx2, By2 = B @ x2, B @ y2
    
    # <A x1_i, B x2_i> depends on i only and <A y1_j, B y2_j> on j only
    inner_x1x2 = np.einsum("ji,ji -> i", Ax1, Bx2)
    inner_y1y2 = np.einsum("ji,ji -> i", Ay1, By2)
    
    # Cross terms <A x1_i, B y2_j> and <A y1_j, B x2_i>, both stored at position (i,j)
    mat_x1y2 = np.einsum("ji,jk -> ik", Ax1, By2)
    mat_y1x2 = np.einsum("jk,ji -> ik", Ay1, Bx2)
    
    # Expand the bilinear form; broadcasting spreads the i-only / j-only terms over the (n,m) grid
    dist_mat = inner_x1x2[:, None] + inner_y1y2[None, :] - mat_x1y2 - mat_y1x2
    
    return dist_mat

class KSD:
    """
    Description:
        Class to represent an instance of kernel Stein discrepancy.
        Has methods to output the Stein kernel evaluated on data
    """  
    def __init__(self,C,T,DU = 0,kernel_type = "SE", gamma = -1):
        """
        Arg:
            C: (d,d) matrix representing the covariance operator
            T: (d,d) matrix representing the hyperparameter
            DU: Function for the DU term in KSD, called on a (d,n) data matrix. 
                Default is DU = 0 which makes the DU term be 0.
            kernel_type: either "SE" or "IMQ"
            gamma: lengthscale, if -1 then median heuristic is employed on the first call.
                   NOTE(review): any other value is stored but never applied to T, so an
                   explicit lengthscale currently has to be folded into T by the caller.
        """  
        self.C = C
        self.T = T
        self.DU = DU
        self.kernel_type = kernel_type
        self.gamma = gamma
        

    def __call__(self, x, y):
        """
        Arg:
            x: (d,n) data matrix
            y: (d,m) data matrix
        Return:
            Stein_mat: (n,m) matrix with ij-th entry the Stein kernel h evaluated at x_i,y_j 
        Raise:
            ValueError: if kernel_type is neither "SE" nor "IMQ"
        """
        if self.kernel_type not in ("SE", "IMQ"):
            raise ValueError('kernel_type must be "SE" or "IMQ", got ' + repr(self.kernel_type))

        # ||T(x_i - y_j)||^2, the squared distance fed to the base kernel
        sqr_dist_mat = form_distance_mat(x,y,x,y,self.T,self.T)
            
        # median heuristic; gamma and the rescaled T are stored on the object,
        # so later calls on the same instance reuse the lengthscale found here
        if self.gamma == -1:
            self.gamma = np.sqrt(np.median(sqr_dist_mat[sqr_dist_mat > 0]))
            # changes the T which will be used later
            self.T = self.T/self.gamma
            # renormalises the squared distance matrix already computed that'll be used later
            sqr_dist_mat = sqr_dist_mat/(self.gamma**2)
        
        # introduces variable S to make calculations easier
        S = self.C @ np.transpose(self.T) @ self.T
        # form the CDU terms
        if self.DU == 0:
            CDUx = np.zeros(np.shape(x))
            CDUy = np.zeros(np.shape(y))
        else:
            CDUx = self.C @ self.DU(x)
            CDUy = self.C @ self.DU(y)
        
        # <x + CDU(x),y+CDU(y)> term
        term1 = np.einsum("ji,jk -> ik",x+CDUx,y + CDUy)
        # - <S(x-y),x-y> term
        term2 = -1 * form_distance_mat(x,y,x,y,S)
        # - <S(x-y),CDU(x)-CDU(y)> term
        term3 = -1 * form_distance_mat(x,y,CDUx,CDUy,S)
        # Tr(SC) term
        term4 = np.trace(S @ self.C)
        # - ||S(x-y)||^2 term
        term5 = -1 * form_distance_mat(x,y,x,y,S,S)
    
        # calculations are taken from example of Stein kernels in paper associated with SE and IMQ base kernels
        if self.kernel_type == "SE":
            SE_mat = np.exp(-0.5 * sqr_dist_mat)
            return SE_mat * (term1 + term2 + term3 + term4 + term5)
        
        IMQ_mat = (sqr_dist_mat + 1)**(-0.5)
        return (term1 * IMQ_mat)  + ((term2 + term3 + term4) * (IMQ_mat**3)) + (3 * term5 * (IMQ_mat**5))
            

In [4]:
class GoodnessOfFitTest:
    """
    Description: A single goodness-of-fit test which can produce a p-value given data and a KSD object
    """
    def __init__(self, discrepancy, x):
        """
        Args:
            discrepancy: A callable that returns a matrix of Stein kernel evaluations
            x: (d,n) matrix of data
        """
        self.d = discrepancy
        self.x = x
        self.n = x.shape[1]
        

    def compute_pvalue(self, nbootstrap, random_state = None):
        """
        Arg:
            nbootstrap: Number of bootstrap samples.
            random_state: optional seed for the bootstrap weights. If None the global
                          numpy random state is used.
        Return:
            bootstrap_stats: (nbootstrap,) array of bootstrapped test statistics
            test_stat: the test statistic calculated using observed data
            pvalue: p-value based on comparing test_stat with bootstrap_stats
        """
        # Form the test statistic from evaluations of the Stein kernel,
        # the diagonal is removed so that only pairs i != j contribute
        stein_matrix = self.d(self.x, self.x)
        u_matrix = stein_matrix - np.diag(np.diag(stein_matrix))
        test_stat = u_matrix.sum() / self.n / (self.n-1)
        
        # np.random and RandomState both expose multinomial, so the unseeded
        # default keeps drawing from the global state
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        uniform_probs = (1./self.n)*np.ones(self.n)
        
        # Obtain bootstrap samples using multi-nomial distribution
        bootstrap_stats = np.zeros(nbootstrap)
        for i in range(nbootstrap):
            W = rng.multinomial(self.n,uniform_probs)
            # centre the counts and rescale by the sample size
            W = (W-1)/self.n
            bootstrap_stats[i] = W @ u_matrix @ W
        
        # Calculate p-value as the fraction of bootstrap statistics exceeding the observed one
        pvalue = (bootstrap_stats > test_stat).mean()

        return (bootstrap_stats, test_stat, pvalue)
    

# Brownian goodness-of-fit tests

For each experiment we need a sampler that provides the data. 

As our implementation requires data projected onto the basis of the base Gaussian measure, the samplers will simulate the trajectories and then project them onto n_freqs basis elements of Brownian motion. 

In [5]:
def BM_basis(n_freqs,obs):
    """
    Description:
        Brownian motion basis used to project data onto.
    Arg:
        n_freqs: number of basis functions to evaluate
        obs: array of observation points
    Return:
        (n_freqs,len(obs)) matrix whose row n-1 is sqrt(2)*sin((n-0.5)*pi*obs)
    """
    angular_freqs = (np.arange(1,n_freqs+1) - 0.5)*np.pi
    # one row per frequency, one column per observation point
    return np.sqrt(2)*np.sin(angular_freqs[:,None]*obs)

def OU_sampler(N,grid_size,theta,mu=5,random_state = None):
    """
    Description:
        Generates Ornstein-Uhlenbeck trajectories with the recursion
        X_i = X_{i-1} + theta*(mu - X_{i-1})*dt + noise_i, where dt = 1/grid_size,
        every trajectory starts at 0 and the noise is Gaussian with variance dt.
    Arg:
        N: number of trajectories
        grid_size: number of points per trajectory
        theta, mu: coefficients of the drift term theta*(mu - X)
        random_state: seed passed to np.random.RandomState
    Return:
        (N,grid_size) matrix of trajectories, one per row
    """
    generator = np.random.RandomState(random_state)
    step = 1/grid_size
    # a full (N,grid_size) block is drawn although the first column is never used
    increments = generator.randn(N,grid_size)*np.sqrt(step)
    trajectories = np.zeros((N,grid_size))
    for k in range(grid_size-1):
        current = trajectories[:,k]
        trajectories[:,k+1] = current + theta*(mu-current)*step + increments[:,k+1]
    return trajectories

def BM_clip(N,n_freqs,clip_freq,grid_size = 100,random_state = None):
    """
    Description:
        Generates Brownian motion clipped to a certain frequency.
        Since the samples are computed using random variables against BM basis elements
        and we only use the coefficients in the computation of KSD, we can simulate 
        this data by simply simulating the random variable coefficients.
    Arg:
        N: number of samples
        n_freqs: number of coefficients per sample
        clip_freq: number of leading coefficients with non-zero variance 1/(pi*(n-0.5))^2
        grid_size: not used by this function
        random_state: seed passed to np.random.RandomState
    Return:
        (n_freqs,N) matrix of coefficients, one sample per column
    """
    generator = np.random.RandomState(random_state)
    # coefficients beyond clip_freq keep variance zero
    variances = np.zeros(n_freqs)
    variances[:clip_freq] = np.array([1/(np.pi * (k-0.5))**2 for k in range(1,clip_freq + 1)])
    draws = generator.multivariate_normal(mean = np.zeros(n_freqs),cov = np.diag(variances),size = N)
    return draws.T

def OU_freqs_sampler(N,n_freqs,theta,mu=5,sig=1,random_state = None):
    """
    Description:
        Generates OU trajectories projected to a specified number of frequencies of Brownian motion basis.
    Arg:
        N: number of trajectories
        n_freqs: number of basis functions to project onto
        theta, mu: passed on to OU_sampler
        sig: not used by this function
        random_state: seed passed on to OU_sampler
    Return:
        (n_freqs,N) matrix of basis coefficients, one trajectory per column
    """
    grid_size = 100
    # OU_sampler steps with dt = 1/grid_size, so its i-th value sits at time i/grid_size;
    # the basis has to be evaluated on that same (endpoint-free) grid
    obs = np.linspace(0,1,grid_size,endpoint=False)
    basis = BM_basis(n_freqs,obs)
    OU_vals = OU_sampler(N,grid_size,theta,mu,random_state)
    # Riemann sum approximation of the inner products with the basis functions
    return (1/grid_size)*np.dot(OU_vals,basis.T).T

def CA_sampler(N,grid_size,a_1,a_2,a_3,random_state = None):
    """
    Description:
        Generates samples from the referenced Cuesta-Albertos et al 2007 paper: 
        trajectories from OU_sampler with theta = 0 and mu = 0, multiplied pointwise by
        1 + a_1*t^2 + a_2*sin(2*pi*t) + a_3*exp(t).
    Arg:
        N: number of trajectories
        grid_size: number of points per trajectory
        a_1,a_2,a_3: coefficients of the deterministic factor
        random_state: seed passed on to OU_sampler
    Return:
        (N,grid_size) matrix of trajectories, one per row
    """
    times = np.linspace(0,1,grid_size,endpoint=False)
    quadratic_part = a_1*(times**2)
    periodic_part = a_2*np.sin(2*np.pi*times)
    exponential_part = a_3*np.exp(times)
    scaling = 1 + quadratic_part + periodic_part + exponential_part
    driftless_paths = OU_sampler(N,grid_size,theta=0,mu=0,random_state=random_state)
    return driftless_paths * scaling

def CA_freqs_sampler(N,n_freqs,a_1,a_2,a_3,random_state = None):
    """
    Description:
        Generates trajectories from CA_sampler projected to a specified number of 
        frequencies of Brownian motion basis.
    Arg:
        N: number of trajectories
        n_freqs: number of basis functions to project onto
        a_1,a_2,a_3: passed on to CA_sampler
        random_state: seed passed on to CA_sampler
    Return:
        (n_freqs,N) matrix of basis coefficients, one trajectory per column
    """
    n_grid = 100
    times = np.linspace(0,1,n_grid,endpoint=False)
    trajectories = CA_sampler(N,n_grid,a_1,a_2,a_3,random_state = random_state)
    # Riemann sum approximation of the inner products with the basis functions
    projections = np.dot(trajectories,BM_basis(n_freqs,times).T)
    return (1/n_grid)*projections.T

def Ditzhaus_freqs_sampler(N,n_freqs=100,a=1,b=0,random_state = None):
    """
    Description:
        Generates trajectories from Ditzhaus and Gaigall 2018 referenced paper projected to a 
        specified number of frequencies of Brownian motion basis. The trajectories are 
        a times the output of OU_sampler with theta = 0, plus the shift b*t*(t-1).
    Arg:
        N: number of trajectories
        n_freqs: number of basis functions to project onto
        a: scaling applied to the sampled trajectories
        b: coefficient of the shift t*(t-1)
        random_state: seed passed on to OU_sampler
    Return:
        (n_freqs,N) matrix of basis coefficients, one trajectory per column
    """
    n_grid = 100
    times = np.linspace(0,1,n_grid,endpoint=False)
    trajectories = a*OU_sampler(N,n_grid,theta = 0,random_state=random_state)
    trajectories += b*times*(times-1)
    # Riemann sum approximation of the inner products with the basis functions
    projections = np.dot(trajectories,BM_basis(n_freqs,times).T)
    return (1/n_grid)*projections.T

In [5]:
# Number of basis functions kept in the numerical approximation of inner products;
# with 100 we work in the space spanned by the first 100 basis functions of Brownian motion
n_freqs = 100

# Target covariance operator in matrix form with respect to those basis elements:
# a diagonal matrix holding the eigenvalues of the Brownian motion decomposition
C = np.diag([(1/((i-0.5)*np.pi))**(2) for i in np.arange(1,n_freqs+1)])

# Hyperparameters: T_1 is the identity, T_2 is the identity with its first
# n_adjust_freqs diagonal entries replaced by the inverse of those of C
n_adjust_freqs = 50
T_1, T_2 = np.eye(n_freqs), np.eye(n_freqs)
T_2[:n_adjust_freqs,:n_adjust_freqs] = np.diag(np.diag(C)[:n_adjust_freqs]**(-1))

# Set median heuristic
gamma = -1

# Test specification: number of repeated tests and of bootstrap samples per test
n_tests, n_bootstrap = 500, 2000

# Seed handed to the samplers
rng_X = 1234

# Kernel and hyperparameter specifications to loop over
kernel_list = ["SE","IMQ"]
T_list = [T_1,T_2]

In [18]:
# Available experiments, keyed by experiment number. Each entry pairs the number of
# samples used per test with the sampler that generates the projected data.
# The samplers are only called once selected, so listing all of them is cheap.
experiments = {
    1: (50, partial(BM_clip,n_freqs = n_freqs,clip_freq = n_freqs,random_state = rng_X)),
    2: (50, partial(BM_clip,n_freqs = n_freqs,clip_freq = 5,random_state = rng_X)),
    3: (25, partial(OU_freqs_sampler,n_freqs = n_freqs,theta = 0.5,mu=5,random_state = rng_X)),
    4: (50, partial(CA_freqs_sampler,n_freqs = n_freqs,a_1=1,a_2=0,a_3=0,random_state = rng_X)),
    5: (50, partial(CA_freqs_sampler,n_freqs = n_freqs,a_1=0,a_2=1,a_3=0,random_state = rng_X)),
    6: (25, partial(Ditzhaus_freqs_sampler,n_freqs = n_freqs,a=2,b=0,random_state = rng_X)),
    7: (25, partial(Ditzhaus_freqs_sampler,n_freqs = n_freqs,a=1,b=1.5,random_state = rng_X)),
}

# Change this number to whichever experiment it is you want to run
experiment_id = 1
n_samples, sampler = experiments[experiment_id]

# We sample n_samples * n_tests many samples so we have enough samples for all the tests 
# performed to calculate power
test_data = sampler(n_samples*n_tests)

In [19]:
# Level at which each individual test rejects
significance_level = 0.05

# Estimate the rejection rate of the test for every kernel / hyperparameter combination
for kernel in kernel_list:
    for i, T in enumerate(T_list):
        rej = 0
        for t in tqdm_auto(range(n_tests)):
            # A fresh KSD per test: the median heuristic stores gamma and the
            # rescaled T on the object the first time it is called
            my_KSD = KSD(C,T,kernel_type = kernel)
            # the t-th block of n_samples columns is the data of the t-th test
            data = test_data[:,t*n_samples:(t+1)*n_samples]
            single_test = GoodnessOfFitTest(my_KSD,data)
            _,_,pvalue = single_test.compute_pvalue(n_bootstrap)
            rej += (pvalue < significance_level)
        print(kernel," with T_", i+1, " n_samples ", n_samples, "has power ",rej/n_tests)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


  0%|          | 0/500 [00:00<?, ?it/s]

SE  with T_ 1  n_samples  25 has power  0.526


  0%|          | 0/500 [00:00<?, ?it/s]

SE  with T_ 2  n_samples  25 has power  0.99


  0%|          | 0/500 [00:00<?, ?it/s]

IMQ  with T_ 1  n_samples  25 has power  0.618


  0%|          | 0/500 [00:00<?, ?it/s]

IMQ  with T_ 2  n_samples  25 has power  0.868


# Gibbs goodness-of-fit tests

The following cells perform a goodness-of-fit test for Gibbs measures.

These cells are used in the paper to perform the goodness-of-fit tests where either the samples are perturbed by a mean function, or the samples are being taken over a larger terminal time and the performance of the sampler is being evaluated.

Code for the adaptation of the samplers used for these experiments can be found in this repository. 

In [6]:
def BB_basis(n_freqs,obs,TT):
    """
    Description:
        Evaluates the sine basis sqrt(2/TT)*sin(n*pi*t/TT), n = 1,...,n_freqs, at the observation points.
    Arg:
        n_freqs: number of basis functions to evaluate
        obs: array of observation points
        TT: terminal time
    Return:
        (n_freqs,len(obs)) matrix whose row n-1 holds the n-th basis function evaluated at obs
    """
    normaliser = np.sqrt(2) / np.sqrt(TT)
    wave_numbers = np.arange(1,n_freqs+1)*np.pi
    # one row per frequency, one column per observation point
    return normaliser * np.sin(wave_numbers[:,None]*obs/TT)

def sin_DU(x,obs, TT = 50.0, alpha = 0.7):
    """
    Description:
        DU term for the sin experiments, acting on basis coefficients. The trajectories are 
        reconstructed on the observation grid, alpha^2*sin*cos - (alpha/2)*sin is applied 
        pointwise and the result is projected back onto the basis.
    Arg:
        x: (d,n) matrix of basis coefficients, one trajectory per column
        obs: array of observation points
        TT: terminal time
        alpha: coefficient in the pointwise formula
    Return:
        DUx_freqs: (d,n) matrix of basis coefficients of the DU term
    """
    # the same basis matrix serves both the reconstruction and the projection
    basis = BB_basis(np.shape(x)[0],obs,TT)
    x_recon = np.dot(basis.T,x)
    DUx = ((alpha**2) * np.sin(x_recon)*np.cos(x_recon)) - ((alpha / 2)*np.sin(x_recon))
    # quadrature with weight TT/len(obs) per observation point
    DUx_freqs = (TT/len(obs))*np.dot(basis,DUx)
    return DUx_freqs

def sin_DU_M(x,obs,TT=50.0,u = -np.pi,v = 3*np.pi, alpha = 0.7):
    """
    Description:
        DU term for the sin experiments with a linear mean function, acting on basis coefficients.
        The trajectories are reconstructed on the observation grid, shifted by the line
        M(t) = u + (t/TT)*(v-u), then alpha^2*sin*cos - (alpha/2)*sin is applied pointwise 
        and the result is projected back onto the basis.
    Arg:
        x: (d,n) matrix of basis coefficients, one trajectory per column
        obs: array of observation points
        TT: terminal time
        u,v: values of the mean function at time 0 and time TT
        alpha: coefficient in the pointwise formula
    Return:
        DUx_freqs: (d,n) matrix of basis coefficients of the DU term
    """
    # the same basis matrix serves both the reconstruction and the projection
    basis = BB_basis(np.shape(x)[0],obs,TT)
    x_recon = np.dot(basis.T,x)
    # column vector so that the mean is added to every trajectory
    M = np.reshape(u + (obs/TT)*(v-u),(len(obs),1))
    shifted = x_recon + M
    DUx = ((alpha**2) * np.sin(shifted)*np.cos(shifted)) - ((alpha / 2)*np.sin(shifted))
    # quadrature with weight TT/len(obs) per observation point
    DUx_freqs = (TT/len(obs))*np.dot(basis,DUx)
    return DUx_freqs

def OU_DU(x,obs,TT=50.0,alpha = -5.0,beta = -1.0):
    """
    Description:
        DU term for the OU experiments, acting on basis coefficients. The trajectories are 
        reconstructed on the observation grid, beta^2*x + alpha*beta is applied pointwise 
        and the result is projected back onto the basis.
    Arg:
        x: (d,n) matrix of basis coefficients, one trajectory per column
        obs: array of observation points
        TT: terminal time
        alpha,beta: coefficients in the pointwise formula
    Return:
        DUx_freqs: (d,n) matrix of basis coefficients of the DU term
    """
    # the same basis matrix serves both the reconstruction and the projection
    basis = BB_basis(np.shape(x)[0],obs,TT)
    x_recon = np.dot(basis.T,x)
    DUx = beta**2 * x_recon + alpha * beta
    # quadrature with weight TT/len(obs) per observation point
    DUx_freqs = (TT/len(obs))*np.dot(basis,DUx)
    return DUx_freqs

def OU_DU_M(x,obs,TT=50.0,alpha = -5.0,beta = -1.0,u = -1.0,v = 2.0):
    """
    Description:
        DU term for the OU experiments with a linear mean function, acting on basis coefficients.
        The trajectories are reconstructed on the observation grid, shifted by the line
        M(t) = u + (t/TT)*(v-u), then beta^2*x + alpha*beta is applied pointwise and the 
        result is projected back onto the basis.
    Arg:
        x: (d,n) matrix of basis coefficients, one trajectory per column
        obs: array of observation points
        TT: terminal time
        alpha,beta: coefficients in the pointwise formula
        u,v: values of the mean function at time 0 and time TT
    Return:
        DUx_freqs: (d,n) matrix of basis coefficients of the DU term
    """
    # the same basis matrix serves both the reconstruction and the projection
    basis = BB_basis(np.shape(x)[0],obs,TT)
    x_recon = np.dot(basis.T,x)
    # column vector so that the mean is added to every trajectory
    M = np.reshape(u + (obs/TT)*(v-u),(len(obs),1))
    DUx = beta**2 * (x_recon + M) + alpha * beta
    # quadrature with weight TT/len(obs) per observation point
    DUx_freqs = (TT/len(obs))*np.dot(basis,DUx)
    return DUx_freqs

def interpolate_samples(X,inter_grid,original_grid):
    """
        Arg:
            X: (n_samples,n_points) data matrix, one trajectory per row
            inter_grid: the grid up to which the data shall be linearly interpolated
            original_grid: the grid of points the trajectories were generated over
        Return:
            Y: (n_samples,len(inter_grid)) matrix of trajectories that have been linearly interpolated up to inter_grid
    """
    n_paths = len(X)
    Y = np.zeros((n_paths,len(inter_grid)))
    # every trajectory is interpolated on its own, np.interp works on 1-d data
    for row in range(n_paths):
        Y[row,:] = np.interp(inter_grid,original_grid,X[row,:])
    return Y

In [40]:
# Load the dataset, set the discretisation parameter L, the perturbation parameter delta 
# and the terminal time TT 
# For all experiments L = 6 as is used in Bierkens et al. 
paths = np.load('sin_paths/PDMP_paths/sin_PDMP_GoF_Data.npy')
L = 6
delta = 0.0
TT = 50.0

# Set up observation grid
obs = np.linspace(0,TT,2**(L+1) + 1,endpoint=True)

# Set the start (u) and end (v) values
u = -np.pi
v = 3*np.pi
# u = -1
# v = 2

# n_inter_points = 2**(L+1) + 1
# inter_grid = np.linspace(0,TT,n_inter_points,endpoint = True)
# # paths = interpolate_samples(paths.T,inter_grid,obs).T

# Shuffle the trajectories (stored in columns) with a fixed seed, so that the
# split of the data into individual tests is the same on every run
shuffle_seed = 1234
paths = paths[:,np.random.RandomState(shuffle_seed).permutation(paths.shape[1])]

# Set correct choice of DU
DU_partial = partial(sin_DU_M,obs = obs, TT = TT,u=u,v=v)
# DU_partial = partial(OU_DU_M,obs = obs, TT = TT,u=u,v=v)

In [41]:
# Number of basis functions kept in the numerical approximation of inner products;
# with 100 we work in the space spanned by the first 100 basis functions of Brownian bridge
n_freqs = 100

# Target covariance operator in matrix form with respect to those basis elements:
# a diagonal matrix holding the eigenvalues of the Brownian bridge decomposition
C = np.diag([(1/(i*np.pi/TT))**(2) for i in np.arange(1,n_freqs + 1)])

# Hyperparameters: T_1 is the identity, T_2 is the identity with its first
# n_adjust_freqs diagonal entries replaced by the inverse of those of C
n_adjust_freqs = 50
T_1, T_2 = np.eye(n_freqs), np.eye(n_freqs)
T_2[:n_adjust_freqs,:n_adjust_freqs] = np.diag(np.diag(C)[:n_adjust_freqs]**(-1))

# Set median heuristic
gamma = -1

# Test specification: number of repeated tests, bootstrap samples per test
# and data samples per test
n_tests, n_bootstrap, n_samples = 100, 2000, 100

# Set random seed
rng_X = 1234

# Kernel and hyperparameter specifications to loop over
kernel_list = ["SE","IMQ"]
T_list = [T_1,T_2]

In [1]:
# Form the mean function M as a column vector, so it is subtracted from every path
M = u + (obs/TT)*(v-u)
M = np.reshape(M,(len(obs),1))
# Centre the paths; the result gets its own name so that re-running this cell
# does not centre the already centred paths a second time
paths_centred = paths - M
# Add the perturbation whose size is dictated by delta
paths_perturbed = paths_centred + (delta * M)
# Calculate the basis representation of the paths that is used for the KSD calculation
path_freqs = (TT/len(obs))*np.dot(BB_basis(n_freqs,obs,TT),paths_perturbed)

NameError: name 'u' is not defined

In [43]:
# Level at which each individual test rejects
significance_level = 0.05

# Estimate the rejection rate of the test for every kernel / hyperparameter combination
for kernel in kernel_list:
    for i, T in enumerate(T_list):
        rej = 0
        for t in tqdm_auto(range(n_tests)):
            # A fresh KSD per test: the median heuristic stores gamma and the
            # rescaled T on the object the first time it is called
            my_KSD = KSD(C,T,DU = DU_partial,kernel_type = kernel)
            # the t-th block of n_samples columns is the data of the t-th test
            data = path_freqs[:,t*n_samples:(t+1)*n_samples]
            single_test = GoodnessOfFitTest(my_KSD,data)
            _,_,pvalue = single_test.compute_pvalue(n_bootstrap)
            rej += (pvalue < significance_level)
        print(kernel," with T_", i+1, " n_samples ", n_samples, " has power ",rej/n_tests)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


  0%|          | 0/100 [00:00<?, ?it/s]

SE  with T_ 1  n_samples  100  has power  0.9


  0%|          | 0/100 [00:00<?, ?it/s]

SE  with T_ 2  n_samples  100  has power  0.91


  0%|          | 0/100 [00:00<?, ?it/s]

IMQ  with T_ 1  n_samples  100  has power  0.92


  0%|          | 0/100 [00:00<?, ?it/s]

IMQ  with T_ 2  n_samples  100  has power  0.92
