In [None]:
from tda import PD, PWGK, PL, PSSK

import tda
import numpy as np
import os
import random
import json

from itertools import combinations

In [None]:
# compute MMDu
def n_mmd(mat_gram, unbias=True):
    """Return the scaled MMD statistic of two equally sized samples.

    The Gram matrix is split into four blocks at ``n = n_total // 2``: the
    first ``n`` rows/columns are sample X, the remaining ones sample Y.

    Parameters
    ----------
    mat_gram : array_like, shape (n_total, n_total)
        Gram matrix of the pooled sample (X first, then Y).
    unbias : bool, default True
        If True, the diagonals of the XX, YY and XY blocks are excluded and
        the result is ``(sum_xx + sum_yy - 2 * sum_xy) / (n - 1)``.
        If False, all entries are kept and the divisor is ``n``.

    Returns
    -------
    numpy scalar
        The block sums combined as described above.

    Raises
    ------
    ValueError
        If the samples are too small for the chosen normalisation
        (``n < 2`` when ``unbias`` is True, ``n < 1`` otherwise); the
        division would otherwise silently produce nan/inf.
    """
    mat_gram = np.asarray(mat_gram)
    n_total = mat_gram.shape[0]
    n = n_total // 2

    n_min = 2 if unbias else 1
    if n < n_min:
        raise ValueError(
            "n_mmd needs at least %d point(s) per sample, got %d" % (n_min, n))

    mat_xx = mat_gram[0:n, 0:n]
    mat_yy = mat_gram[n:n_total, n:n_total]
    mat_xy = mat_gram[0:n, n:n_total]

    # np.sum / np.trace run in C instead of iterating the arrays in Python.
    sum_xx = np.sum(mat_xx)
    sum_yy = np.sum(mat_yy)
    sum_xy = np.sum(mat_xy)
    if unbias:
        # Drop the i == j terms of every block.
        sum_xx -= np.trace(mat_xx)
        sum_yy -= np.trace(mat_yy)
        sum_xy -= np.trace(mat_xy)
        return (sum_xx + sum_yy - 2 * sum_xy) / (n - 1)
    return (sum_xx + sum_yy - 2 * sum_xy) / n


def hist_wchi(mat_gram, num_hist=int(1e+4)):
    """Simulate a weighted chi-square null distribution from a Gram matrix.

    Steps:
    1. Centre the Gram matrix:
       ``C[i, j] = K[i, j] - (s[i] + s[j]) / n + total / n**2`` where ``s``
       holds the column sums of ``K`` and ``total`` their sum.
    2. Take the eigenvalues of ``C``, sort them in decreasing order, drop the
       smallest one and divide by ``n - 1``.
    3. Draw ``num_hist`` samples of ``sum_l lambda_l * (z_l**2 - 2)`` with
       ``z_l`` normal with mean 0 and standard deviation ``sqrt(2)``.

    Parameters
    ----------
    mat_gram : array_like, shape (n, n)
        Gram matrix; only the lower triangle of the centred matrix is used
        for the eigenvalues (``eigvalsh`` default ``UPLO='L'``).
    num_hist : int, default 10000
        Number of simulated values.

    Returns
    -------
    numpy.ndarray, shape (num_hist,)
        Simulated values sorted in decreasing order.

    Notes
    -----
    Random numbers come from NumPy's global RNG (``np.random.normal``), so
    seed it with ``np.random.seed`` for reproducible output.  All draws are
    made in one call, which needs ``num_hist * (n - 1)`` floats of memory.
    """
    mat_gram = np.asarray(mat_gram, dtype=float)
    n = len(mat_gram)

    # centered Gram matrix (vectorised; no Python double loop)
    vec_gram = mat_gram.sum(axis=0)
    val_total = vec_gram.sum()
    mat_center = (mat_gram
                  - np.add.outer(vec_gram, vec_gram) / n
                  + val_total / (n ** 2))

    # estimated eigenvalues: eigvalsh returns them in ascending order and
    # skips the eigenvectors, which are not needed here.
    vec_nu = np.linalg.eigvalsh(mat_center)[::-1][:-1]
    vec_lambda = vec_nu / (n - 1)
    sum_lambda = vec_lambda.sum()

    # histogram of the null distribution (weighted chi square); one row of
    # mat_z per simulated value, drawn in the same order as repeated calls.
    mat_z = np.random.normal(0, np.sqrt(2), (num_hist, n - 1)) ** 2
    vec_hist = mat_z @ vec_lambda - 2 * sum_lambda

    return np.sort(vec_hist)[::-1]


def extract_submat(mat_gram, num_m=None):
    """Return the Gram sub-matrix of a random subsample of both halves.

    The first half of the rows/columns (indices ``0 .. n-1`` with
    ``n = int(n_total / 2)``) and the second half (``n .. n_total-1``) are
    each subsampled without replacement using the ``random`` module.

    Parameters
    ----------
    mat_gram : numpy.ndarray
        Square Gram matrix of the pooled sample.
    num_m : int, optional
        Number of indices drawn from each half; defaults to ``n - 1``.

    Returns
    -------
    numpy.ndarray, shape (2 * num_m, 2 * num_m)
        Float matrix whose rows/columns follow the drawn indices, first-half
        picks first.
    """
    n_total = mat_gram.shape[0]
    half = int(n_total / 2)
    if num_m is None:
        num_m = half - 1
    side = int(2 * num_m)
    sub = np.empty((side, side))
    # Draw the first-half indices before the second-half ones so the
    # `random` stream is consumed in a fixed order.
    picked = random.sample(range(0, half), num_m)
    picked = picked + random.sample(range(half, n_total), num_m)
    # np.ix_ builds the open mesh picked x picked in one indexing step.
    sub[...] = mat_gram[np.ix_(picked, picked)]
    return sub


def two_sample_test(mat_gram, alpha=0.05, num_m=None, num_test=500):
    """Run repeated subsampled two-sample tests on one Gram matrix.

    The null histogram is simulated once from the full ``mat_gram`` with
    ``hist_wchi``.  Then, ``num_test`` times, a sub-matrix is drawn with
    ``extract_submat(mat_gram, num_m)``, its statistic is computed with
    ``n_mmd`` and the p-value is the share of null values strictly greater
    than that statistic.

    Parameters
    ----------
    mat_gram : numpy.ndarray
        Gram matrix of the pooled sample.
    alpha : float, default 0.05
        Threshold below which a p-value counts as a rejection.
    num_m : int, optional
        Subsample size per half, forwarded to ``extract_submat``.
    num_test : int, default 500
        Number of subsampled tests.

    Returns
    -------
    tuple
        ``(vec_p_value, rate)``: the array of ``num_test`` p-values and the
        proportion of them strictly below ``alpha``.
    """
    null_hist = hist_wchi(mat_gram)
    vec_p_value = np.empty(num_test)
    for idx_test in range(num_test):
        stat = n_mmd(extract_submat(mat_gram, num_m))
        exceeding = np.flatnonzero(null_hist > stat)
        vec_p_value[idx_test] = exceeding.size / null_hist.size
    rejected = np.flatnonzero(vec_p_value < alpha)
    return vec_p_value, rejected.size / num_test


# False positive rate

In [None]:
# simulation parameters
npc = 10    # width of each slice taken from a diagram list (diagrams per sample)
nset = 500  # number of two-sample tests per group (second axis of the perz_* arrays)
nsig = 4    # number of groups looped over (first index into the diagram lists)

In [None]:
# Load the persistence diagrams from JSON; each diagram is stored as a flat,
# unnamed numeric array inside a two-level nested list.

with open("twopd.json") as f:
    twopd = json.load(f)

twopdlist = []
for group in twopd:
    # Iterate over each group's own entries instead of assuming every group
    # has as many diagrams as the first one.
    twopdsublist = []
    for flat in group:
        twopddat = np.array(flat)
        # The flat array is read as three stacked blocks of equal length,
        # giving one 3-column row per point after the transpose
        # (presumably dimension, birth, death -- TODO confirm with the exporter).
        twopdmat = np.resize(twopddat, (3, len(twopddat) // 3)).T
        # Keep the rows whose first column equals 1 and drop that column.
        twopdsublist.append(twopdmat[twopdmat[:, 0] == 1, 1:])
    twopdlist.append(twopdsublist)

In [None]:
# All index pairs (i, j) with i < j among the nset sample indices.
comb = np.array(list(combinations(range(nset), 2)))
# Seed both generators: `random` drives the pair selection below and the
# subsampling in extract_submat, while hist_wchi draws from NumPy's global
# RNG. Seeding only `random` left the results non-reproducible.
random.seed(2)
np.random.seed(2)
# Pick nset distinct pairs (as row indices into comb).
ind = random.sample(range(len(comb)), nset)

In [None]:
# Kernel function handed to every PWGK instance below.
func_kernel = tda.function_kernel("Gaussian", sigma=0.5)

## constant weight

In [None]:
# No weight ("none"): both samples of every test come from twopdlist[ii].
func_weight = tda.function_weight("none")
# Second argument of tda.matrix_gram; the same for every test, so set it once.
name_rkhs = ["Linear", "Gaussian"][1]
perz_cons = np.zeros((nsig, nset))
for ii in range(nsig):
    print(ii)  # progress indicator
    for jj in range(nset):
        # import dimension one PDs: the two blocks of npc diagrams named by the jj-th pair
        first, second = comb[ind[jj]]
        combpdlist = (twopdlist[ii][npc * first:npc * (first + 1)]
                      + twopdlist[ii][npc * second:npc * (second + 1)])
        # compute gram matrix
        pwgk = PWGK(combpdlist, func_kernel, func_weight, sigma=0.5, approx=True)
        mat_gaussian_pwgk = pwgk.gram_matrix()
        # define gram matrix (first element of what tda.matrix_gram returns)
        mat_gram_pwgk = tda.matrix_gram(mat_gaussian_pwgk, name_rkhs)[0]
        # proportion of the 1000 subsampled tests with p-value below 0.05
        num_reject = two_sample_test(mat_gram_pwgk, alpha=0.05, num_m=10, num_test=1000)[1]
        perz_cons[ii, jj] = num_reject

In [None]:
# constant weight: per group, share of the nset tests whose rejection proportion is >= 0.05
1 - (perz_cons < 0.05).sum(axis=1) / nset

## arctangent weight

In [None]:
# Arctan weight: both samples of every test come from twopdlist[ii].
func_weight = tda.function_weight("arctan", arc_c=0.5, arc_p=0.5, lin_el=1)
# Second argument of tda.matrix_gram; the same for every test, so set it once.
name_rkhs = ["Linear", "Gaussian"][1]
perz_arctan = np.zeros((nsig, nset))
for ii in range(nsig):
    print(ii)  # progress indicator
    for jj in range(nset):
        # import dimension one PDs: the two blocks of npc diagrams named by the jj-th pair
        first, second = comb[ind[jj]]
        combpdlist = (twopdlist[ii][npc * first:npc * (first + 1)]
                      + twopdlist[ii][npc * second:npc * (second + 1)])
        # compute gram matrix
        pwgk = PWGK(combpdlist, func_kernel, func_weight, sigma=0.5, approx=True)
        mat_gaussian_pwgk = pwgk.gram_matrix()
        # define gram matrix (first element of what tda.matrix_gram returns)
        mat_gram_pwgk = tda.matrix_gram(mat_gaussian_pwgk, name_rkhs)[0]
        # proportion of the 1000 subsampled tests with p-value below 0.05
        num_reject = two_sample_test(mat_gram_pwgk, alpha=0.05, num_m=10, num_test=1000)[1]
        perz_arctan[ii, jj] = num_reject

In [None]:
# arctan weight: per group, share of the nset tests whose rejection proportion is >= 0.05
1 - (perz_arctan < 0.05).sum(axis=1) / nset

## linear weight

In [None]:
# Linear weight: both samples of every test come from twopdlist[ii].
func_weight = tda.function_weight("linear", lin_el=1)
# Second argument of tda.matrix_gram; the same for every test, so set it once.
name_rkhs = ["Linear", "Gaussian"][1]
perz_linear = np.zeros((nsig, nset))
for ii in range(nsig):
    print(ii)  # progress indicator
    for jj in range(nset):
        # import dimension one PDs: the two blocks of npc diagrams named by the jj-th pair
        first, second = comb[ind[jj]]
        combpdlist = (twopdlist[ii][npc * first:npc * (first + 1)]
                      + twopdlist[ii][npc * second:npc * (second + 1)])
        # compute gram matrix
        pwgk = PWGK(combpdlist, func_kernel, func_weight, sigma=0.5, approx=True)
        mat_gaussian_pwgk = pwgk.gram_matrix()
        # define gram matrix (first element of what tda.matrix_gram returns)
        mat_gram_pwgk = tda.matrix_gram(mat_gaussian_pwgk, name_rkhs)[0]
        # proportion of the 1000 subsampled tests with p-value below 0.05
        num_reject = two_sample_test(mat_gram_pwgk, alpha=0.05, num_m=10, num_test=1000)[1]
        perz_linear[ii, jj] = num_reject

In [None]:
# linear weight: per group, share of the nset tests whose rejection proportion is >= 0.05
1 - (perz_linear < 0.05).sum(axis=1) / nset

# Power

In [None]:
# Load the persistence diagrams from JSON; each diagram is stored as a flat,
# unnamed numeric array inside a two-level nested list.

with open("onepd.json") as f:
    onepd = json.load(f)
with open("twopd.json") as f:
    twopd = json.load(f)


def _dim1_diagrams(groups):
    """Convert nested flat arrays into lists of 2-column point arrays.

    Each flat array is read as three stacked blocks of equal length and
    transposed into rows of three values (presumably dimension, birth,
    death -- TODO confirm with the exporter).  Rows whose first value equals
    1 are kept and that first column is dropped.

    Every group is iterated over its own length, so groups may hold
    different numbers of diagrams.
    """
    diagrams = []
    for group in groups:
        sublist = []
        for flat in group:
            dat = np.array(flat)
            mat = np.resize(dat, (3, len(dat) // 3)).T
            sublist.append(mat[mat[:, 0] == 1, 1:])
        diagrams.append(sublist)
    return diagrams


# convert json (saved as unnamed array) to python list
onepdlist = _dim1_diagrams(onepd)
twopdlist = _dim1_diagrams(twopd)

In [None]:
# functions for PWGK
# Same kernel settings ("Gaussian", sigma 0.5) as the one built in the
# false-positive section; rebuilt here before the power experiments.
func_kernel = tda.function_kernel("Gaussian", sigma=0.5 )

## linear weight

In [None]:
# Linear weight: first sample from onepdlist[ii], second from twopdlist[ii].
func_weight = tda.function_weight("linear", lin_el=1)
# Second argument of tda.matrix_gram; the same for every test, so set it once.
name_rkhs = ["Linear", "Gaussian"][1]
perz_linear = np.zeros((nsig, nset))
for ii in range(nsig):
    print(ii)  # progress indicator
    for jj in range(nset):
        # import dimension one PDs: the jj-th block of npc diagrams from each list
        start, stop = npc * jj, npc * (jj + 1)
        combpdlist = onepdlist[ii][start:stop] + twopdlist[ii][start:stop]
        # compute gram matrix
        pwgk = PWGK(combpdlist, func_kernel, func_weight, sigma=0.5, approx=True)
        mat_gaussian_pwgk = pwgk.gram_matrix()
        # define gram matrix (first element of what tda.matrix_gram returns)
        mat_gram_pwgk = tda.matrix_gram(mat_gaussian_pwgk, name_rkhs)[0]
        # proportion of the 1000 subsampled tests with p-value below 0.05
        num_reject = two_sample_test(mat_gram_pwgk, alpha=0.05, num_m=10, num_test=1000)[1]
        perz_linear[ii, jj] = num_reject

In [None]:
# linear weight: per group, share of the nset tests whose rejection proportion is >= 0.05
1 - (perz_linear < 0.05).sum(axis=1) / nset

## arctangent weight

In [None]:
# Arctan weight: first sample from onepdlist[ii], second from twopdlist[ii].
func_weight = tda.function_weight("arctan", arc_c=0.5, arc_p=0.5, lin_el=1)
# Second argument of tda.matrix_gram; the same for every test, so set it once.
name_rkhs = ["Linear", "Gaussian"][1]
perz_arctan = np.zeros((nsig, nset))
for ii in range(nsig):
    print(ii)  # progress indicator
    for jj in range(nset):
        # import dimension one PDs: the jj-th block of npc diagrams from each list
        start, stop = npc * jj, npc * (jj + 1)
        combpdlist = onepdlist[ii][start:stop] + twopdlist[ii][start:stop]
        # compute gram matrix
        pwgk = PWGK(combpdlist, func_kernel, func_weight, sigma=0.5, approx=True)
        mat_gaussian_pwgk = pwgk.gram_matrix()
        # define gram matrix (first element of what tda.matrix_gram returns)
        mat_gram_pwgk = tda.matrix_gram(mat_gaussian_pwgk, name_rkhs)[0]
        # proportion of the 1000 subsampled tests with p-value below 0.05
        num_reject = two_sample_test(mat_gram_pwgk, alpha=0.05, num_m=10, num_test=1000)[1]
        perz_arctan[ii, jj] = num_reject

In [None]:
# arctan weight: per group, share of the nset tests whose rejection proportion is >= 0.05
1 - (perz_arctan < 0.05).sum(axis=1) / nset

## constant weight

In [None]:
# Constant weight: first sample from onepdlist[ii], second from twopdlist[ii].
func_weight = tda.function_weight("constant", lin_el=1)
# Second argument of tda.matrix_gram; the same for every test, so set it once.
name_rkhs = ["Linear", "Gaussian"][1]
perz_cons = np.zeros((nsig, nset))
for ii in range(nsig):
    print(ii)  # progress indicator
    for jj in range(nset):
        # import dimension one PDs: the jj-th block of npc diagrams from each list
        start, stop = npc * jj, npc * (jj + 1)
        combpdlist = onepdlist[ii][start:stop] + twopdlist[ii][start:stop]
        # compute gram matrix
        pwgk = PWGK(combpdlist, func_kernel, func_weight, sigma=0.5, approx=True)
        mat_gaussian_pwgk = pwgk.gram_matrix()
        # define gram matrix (first element of what tda.matrix_gram returns)
        mat_gram_pwgk = tda.matrix_gram(mat_gaussian_pwgk, name_rkhs)[0]
        # proportion of the 1000 subsampled tests with p-value below 0.05
        num_reject = two_sample_test(mat_gram_pwgk, alpha=0.05, num_m=10, num_test=1000)[1]
        perz_cons[ii, jj] = num_reject

In [None]:
# constant weight: per group, share of the nset tests whose rejection proportion is >= 0.05
1 - (perz_cons < 0.05).sum(axis=1) / nset