In [1]:
from tda import PD, PWGK, PL, PSSK

import tda
import numpy as np
import os
import random
import json
import math

from itertools import combinations

In [2]:
# compute MMDu
def n_mmd(mat_gram, unbias=True):
    n_total = mat_gram.shape[0]
    n = int(n_total / 2)
    mat_xx = mat_gram[0:n, 0:n]
    mat_yy = mat_gram[n:n_total, n:n_total]
    mat_xy = mat_gram[0:n, n:n_total]
    sum_xx = sum(sum(mat_xx))
    sum_yy = sum(sum(mat_yy))
    sum_xy = sum(sum(mat_xy))
    if unbias:
        sum_xx -= sum(np.diag(mat_xx))
        sum_yy -= sum(np.diag(mat_yy))
        sum_xy -= sum(np.diag(mat_xy))
        return (sum_xx + sum_yy - 2 * sum_xy) / (n - 1)
    else:
        return (sum_xx + sum_yy - 2 * sum_xy) / n


def hist_wchi(mat_gram, num_hist=int(1e+4)):
    n = len(mat_gram)

    # centered Gram matrix
    mat_center = np.empty((n, n))
    vec_gram = sum(mat_gram)
    val_total = sum(vec_gram)
    for i in range(n):
        for j in range(i + 1):
            mat_center[i, j] = (mat_gram[i, j]
                                - ((vec_gram[i] + vec_gram[j]) / n)
                                + (val_total / (n ** 2)))
            mat_center[j, i] = mat_center[i, j]

    # estimated eigenvalues
    vec_nu = np.sort(np.linalg.eigh(mat_center)[0])[::-1][0: - 1]
    vec_lambda = vec_nu / (n - 1)
    sum_lambda = sum(vec_lambda)

    # histogram of the null distribution (weighted chi square)
    vec_hist = np.empty(num_hist)
    for i in range(num_hist):
        vec_z = np.random.normal(0, np.sqrt(2), n - 1) ** 2
        vec_hist[i] = np.inner(vec_lambda, vec_z) - 2 * sum_lambda

    return np.sort(vec_hist)[::-1]


def extract_submat(mat_gram, num_m=None):
    n_total = mat_gram.shape[0]
    n = int(n_total / 2)
    if num_m is None:
        num_m = n - 1
    d = int(2 * num_m)
    mat = np.empty((d, d))
    idx_x = random.sample(range(0, n), num_m)
    idx_y = random.sample(range(n, n_total), num_m)
    idx_xy = idx_x + idx_y
    for i, a in enumerate(idx_xy):
        for j, b in enumerate(idx_xy):
            mat[i, j] = mat_gram[a, b]
    return mat


def two_sample_test(mat_gram, alpha=0.05, num_m=None, num_test=500):
    vec_wchi = hist_wchi(mat_gram)                    # null distribution of psi-hat
    vec_p_value = np.empty(num_test)
    for temp_test in range(num_test):                 # for l=1,...,N
        mat_reduced = extract_submat(mat_gram, num_m)  # resample m samples
        value_mmd = n_mmd(mat_reduced)                 # compute mMMDu
        vec_temp = np.where(vec_wchi > value_mmd)[0]   # how many psi-hat's are greater than mMMDu?
        vec_p_value[temp_test] = len(vec_temp) / len(vec_wchi)
    return vec_p_value, len(np.where(vec_p_value < alpha)[0]) / num_test


In [3]:
# import PD - json, each pds are saved as unnamed array

with open("/users/chulm/hypotest/sim_rev/beetlepd.json") as f:
    pd = json.load(f)

In [4]:
pd1dat = np.array(pd[0])
pd1mat = np.transpose( np.resize(pd1dat, (3,int(len(pd1dat)/3)) ) )
pd1mat1 = pd1mat[pd1mat[:,0]==1,1:]
pd1mat1

array([[19.6396, 40.5827],
       [20.458 , 27.1967],
       [22.3653, 27.6706],
       [29.5865, 34.2615],
       [14.1339, 18.1496],
       [ 9.5918, 12.7848],
       [23.3545, 25.4296],
       [14.7989, 16.3304],
       [13.6672, 15.0779],
       [ 5.3358,  6.6726],
       [ 8.2898,  9.4404],
       [10.8704, 11.9515],
       [17.067 , 17.7264],
       [ 7.9129,  8.5603],
       [17.9538, 18.3539],
       [ 3.2959,  3.5893],
       [19.0892, 19.278 ],
       [12.4488, 12.5745],
       [ 4.9354,  5.0584],
       [15.9173, 16.0238],
       [ 6.4543,  6.4836]])

In [5]:
pd1list=[]
for ii in range(len(pd)):
    pd1dat = np.array(pd[ii])
    pd1mat = np.transpose( np.resize(pd1dat, (3,int(len(pd1dat)/3)) ) )
    pd1mat1 = pd1mat[pd1mat[:,0]==1,1:]
    pd1mat1fin = pd1mat1.astype(np.float)
    pd1list.append(pd1mat1fin)

In [6]:
# functions for PWGK
func_kernel = tda.function_kernel("Gaussian", sigma=3 )
#func_weight = tda.function_weight("arctan", arc_c=0.5, arc_p=0.5, lin_el=1)
func_weight = tda.function_weight("none")

In [63]:
nrep=100
nset=20
ntotalset=4000
rr=0
(rr+1)*nset+ntotalset

4020

In [7]:
len(pd1list)

8000

In [8]:
pd1list[4001]

array([[ 9.2182, 14.4925],
       [11.6715, 14.5005],
       [ 5.1413,  7.9637],
       [17.5636, 20.3002],
       [ 6.5295,  9.0986],
       [ 5.317 ,  7.0545],
       [ 9.6348, 11.1544],
       [ 3.5198,  4.8807],
       [ 7.3698,  8.5967],
       [ 8.5008,  9.7071],
       [ 5.473 ,  6.6594],
       [ 5.129 ,  6.3065],
       [ 5.6332,  6.5844],
       [ 4.7933,  5.678 ],
       [ 5.274 ,  6.1332],
       [ 5.8132,  6.4948],
       [11.8653, 12.4861],
       [ 4.28  ,  4.8927],
       [ 3.5702,  4.1253],
       [16.2526, 16.7369],
       [ 5.6093,  5.8888],
       [13.7806, 14.035 ],
       [ 4.641 ,  4.8237],
       [ 5.4242,  5.5897],
       [20.0394, 20.1671],
       [ 4.3622,  4.4784],
       [ 5.6151,  5.6849]])

# between two

In [13]:
nrep=100
nset=20
ntotalset=4000
per=np.zeros(nrep)
for rr in range(nrep):
    # import dimension one PDs
    combpdlist = pd1list[rr*nset:(rr+1)*nset]
    combpdlist.extend(pd1list[(rr*nset+ntotalset):((rr+1)*nset+ntotalset)])
    # compute gram matrix
    pwgk = PWGK(combpdlist, func_kernel, func_weight, sigma=3,approx=True)
    mat_gaussian_pwgk = pwgk.gram_matrix()
    # define gram matrix
    name_rkhs = ["Linear", "Gaussian"][1]
    mat_gram_pwgk = tda.matrix_gram(mat_gaussian_pwgk, name_rkhs)[0]
    num_reject = two_sample_test(mat_gram_pwgk, alpha=0.05, num_m=20, num_test=500)[1]
    per[rr] = num_reject
1-np.sum(per<0.05)/nrep # linear

1.0

# between stable

In [10]:
nrep=100
nset=20
ntotalset=2000
per=np.zeros(nrep)
for rr in range(nrep):
    # import dimension one PDs
    combpdlist = pd1list[rr*nset:(rr+1)*nset]
    combpdlist.extend(pd1list[(rr*nset+ntotalset):((rr+1)*nset+ntotalset)])
    # compute gram matrix
    pwgk = PWGK(combpdlist, func_kernel, func_weight, sigma=3,approx=True)
    mat_gaussian_pwgk = pwgk.gram_matrix()
    # define gram matrix
    name_rkhs = ["Linear", "Gaussian"][1]
    mat_gram_pwgk = tda.matrix_gram(mat_gaussian_pwgk, name_rkhs)[0]
    per[rr] = num_reject
1-np.sum(per<0.05)/nrep # linear

0.0

In [52]:
per

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

# between aperiodic

In [11]:
nrep=100
nset=20
ntotalset=2000
per=np.zeros(nrep)
for rr in range(nrep):
    # import dimension one PDs
    combpdlist = pd1list[(rr*nset+4000):((rr+1)*nset+4000)]
    combpdlist.extend(pd1list[(rr*nset+ntotalset+4000):((rr+1)*nset+ntotalset+4000)])
    # compute gram matrix
    pwgk = PWGK(combpdlist, func_kernel, func_weight, sigma=3,approx=True)
    mat_gaussian_pwgk = pwgk.gram_matrix()
    # define gram matrix
    name_rkhs = ["Linear", "Gaussian"][1]
    mat_gram_pwgk = tda.matrix_gram(mat_gaussian_pwgk, name_rkhs)[0]
    num_reject = two_sample_test(mat_gram_pwgk, alpha=0.05, num_m=20, num_test=500)[1]
    per[rr] = num_reject
1-np.sum(per<0.05)/nrep # linear
per

array([0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.028, 1.   , 0.   , 0.   ,
       0.   , 0.   , 0.476, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.004,
       0.   ])

In [12]:
1-np.sum(per<0.05)/nrep # linear

0.020000000000000018

In [40]:
rr=0
combpdlist = pd1list[(rr*nset+4000):((rr+1)*nset+4000)]
combpdlist.extend(pd1list[(rr*nset+ntotalset+4000):((rr+1)*nset+ntotalset+4000)])
bd=combpdlist[0][0]

In [41]:
np.maximum( np.arctan(math.pow((bd[1] - bd[0]) / 0.5, 0.5)), 0.0)

0.3111162227139398

# LV60

In [31]:
# import dim 0 PDs
combpdlist = pd0list[3*27:4*27]
combpdlist.extend(pd0list[4*27:5*27])
# compute gram matrix
pwgk = PWGK(combpdlist, func_kernel, func_weight, sigma=1.5, approx=True)
mat_gaussian_pwgk = pwgk.gram_matrix()
# define gram matrix
name_rkhs = ["Linear", "Gaussian"][1]
mat_gram_pwgk = tda.matrix_gram(mat_gaussian_pwgk, name_rkhs)[0]
num_reject = two_sample_test(mat_gram_pwgk, alpha=0.05, num_m=27, num_test=1000)
num_reject[1]

1.0

In [32]:
# import dim 1 PDs
combpdlist = pd1list[3*27:4*27]
combpdlist.extend(pd1list[4*27:5*27])
# compute gram matrix
pwgk = PWGK(combpdlist, func_kernel, func_weight, sigma=1.5, approx=True)
mat_gaussian_pwgk = pwgk.gram_matrix()
# define gram matrix
name_rkhs = ["Linear", "Gaussian"][1]
mat_gram_pwgk = tda.matrix_gram(mat_gaussian_pwgk, name_rkhs)[0]
num_reject = two_sample_test(mat_gram_pwgk, alpha=0.05, num_m=27, num_test=1000)
num_reject[1]

1.0

In [33]:
# import dim 2 PDs
combpdlist = pd2list[3*27:4*27]
combpdlist.extend(pd2list[4*27:5*27])
# compute gram matrix
pwgk = PWGK(combpdlist, func_kernel, func_weight, sigma=1.5, approx=True)
mat_gaussian_pwgk = pwgk.gram_matrix()
# define gram matrix
name_rkhs = ["Linear", "Gaussian"][1]
mat_gram_pwgk = tda.matrix_gram(mat_gaussian_pwgk, name_rkhs)[0]
num_reject = two_sample_test(mat_gram_pwgk, alpha=0.05, num_m=27, num_test=1000)
num_reject[1]

1.0

# F42 vs LV60

In [34]:
# import dim 0 PDs
combpdlist = pd0list[1*27:2*27]
combpdlist.extend(pd0list[3*27:4*27])
# compute gram matrix
pwgk = PWGK(combpdlist, func_kernel, func_weight, sigma=1.5, approx=True)
mat_gaussian_pwgk = pwgk.gram_matrix()
# define gram matrix
name_rkhs = ["Linear", "Gaussian"][1]
mat_gram_pwgk = tda.matrix_gram(mat_gaussian_pwgk, name_rkhs)[0]
num_reject = two_sample_test(mat_gram_pwgk, alpha=0.05, num_m=27, num_test=1000)
num_reject[1]

1.0

In [35]:
# import dim 1 PDs
combpdlist = pd1list[1*27:2*27]
combpdlist.extend(pd1list[3*27:4*27])
# compute gram matrix
pwgk = PWGK(combpdlist, func_kernel, func_weight, sigma=1.5, approx=True)
mat_gaussian_pwgk = pwgk.gram_matrix()
# define gram matrix
name_rkhs = ["Linear", "Gaussian"][1]
mat_gram_pwgk = tda.matrix_gram(mat_gaussian_pwgk, name_rkhs)[0]
num_reject = two_sample_test(mat_gram_pwgk, alpha=0.05, num_m=27, num_test=1000)
num_reject[1]

1.0

In [36]:
# import dim 2 PDs
combpdlist = pd2list[1*27:2*27]
combpdlist.extend(pd2list[3*27:4*27])
# compute gram matrix
pwgk = PWGK(combpdlist, func_kernel, func_weight, sigma=1.5, approx=True)
mat_gaussian_pwgk = pwgk.gram_matrix()
# define gram matrix
name_rkhs = ["Linear", "Gaussian"][1]
mat_gram_pwgk = tda.matrix_gram(mat_gaussian_pwgk, name_rkhs)[0]
num_reject = two_sample_test(mat_gram_pwgk, alpha=0.05, num_m=27, num_test=1000)
num_reject[1]

1.0

In [38]:
nr=num_reject[0]

0.0

In [39]:
nr

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.