# Test power analysis for independence tests

In [None]:
import numpy as np
from tqdm.notebook import tqdm
import pickle
import os

from synthetic_data import generate_data
from independence import test_power
from plot import plot_samples, plot_power

### Test hyperparameters
We start by specifying some hyperparameters for all of our tests.

In [None]:
# --- shared settings for all tests in this notebook ---

# statistical significance level
alpha = 0.05

# number of trials and number of permutations
n_trials = 200
n_perms = 1000

# kernels compared in the independence tests
K_list = ['K_ID', 'K_dft']

# Fourier basis: number of basis functions and std of the normal distribution
# from which the coefficients are sampled
n_basis = 3
sd = 1

# sample sizes to evaluate
n_samples = [100, 150, 200]

# number of points at which the functional samples are (randomly) observed,
# and number of points at which they are discretised
n_obs = 100
n_preds = 100

# discretised period: n_preds evenly spaced points on [0, upper_limit]
upper_limit = 1
pred_points = np.linspace(0, upper_limit, num=n_preds)

In [None]:
# Create the folders used to save results: one sub-folder of 'results' per
# test type. os.makedirs creates the missing parent folder as well, and
# exist_ok=True makes the call a no-op when a folder is already there, so this
# cell can be re-run safely without a separate existence check.
for _test_name in ('marginal', 'joint', 'conditional'):
    os.makedirs(os.path.join('results', _test_name), exist_ok=True)

## Marginal independence test

In [None]:
# strengths of the historical dependence to evaluate
# (the higher a is, the easier the dependence is to detect)
a_list = [
    0,
    0.2,
    0.4,
    0.6,
    0.8,
    1,
]

# which test the following cells run; also used to build the results path
test = 'marginal'

We iterate over the sample sizes, the different kernels, and various values of $a$.

In [None]:
# Run the marginal test for every combination of sample size, kernel and
# dependence strength a, and collect the value returned by test_power.
#
# Result layout: type_II_errors[n_sample][K] is a list with one entry per
# value in a_list (same order).
# NOTE: despite its name, this dict stores what test_power returns (printed
# below as 'Test power'); the name is kept because later cells refer to it.
type_II_errors = {}

for n_sample in tqdm(n_samples):
    # normalise once so the key used to create an entry and the key used to
    # append to it are guaranteed to be the same object type
    n_sample = int(n_sample)
    print('Sample size:', n_sample)
    type_II_errors[n_sample] = {}
    for K in K_list:
        K = str(K)
        print('Kernel:', K)
        type_II_errors[n_sample][K] = []
        for a in a_list:
            print('a:', a)
            # generate synthetic data
            X, Y = generate_data(dep=test, n_samples=n_sample, n_trials=n_trials, n_obs=n_obs, n_preds=n_preds, a=a, upper_limit=upper_limit, n_basis=n_basis, sd=sd)

            # evaluate the test on the generated trials
            power = test_power(X=X, Y=Y, n_trials=n_trials, n_perms=n_perms, alpha=alpha, K=K, test=test)
            type_II_errors[n_sample][K].append(power)
            print('Test power:', power)
            print('----------')
    print('----------')

# Save the results; the context manager closes the file even if pickling fails.
with open('results/{}/test_power_hist_{}.pkl'.format(test, test), 'wb') as power_hist:
    pickle.dump(type_II_errors, power_hist)

In [None]:
# plot the first n entries of X
# X is whatever the most recently executed data-generation cell left behind;
# when the notebook is run top to bottom that is the final iteration of the
# marginal loop (last sample size, last kernel, last value of a).
n = 10
plot_samples(X[:n], pred_points, upper_limit)

In [None]:
# Reload the saved results for the current `test` from disk.
# The context manager makes sure the file handle is closed after loading.
# NOTE: unpickling can execute arbitrary code, so only load result files that
# were written by this notebook.
with open('results/{}/test_power_hist_{}.pkl'.format(test, test), 'rb') as power_hist:
    type_II_errors = pickle.load(power_hist)

In [None]:
# power over strength of dependence
# Pass n_trials (the trial count set with the hyperparameters), matching the
# plot_power calls for the joint and conditional tests.
plot_power(type_II_errors, n_samples, a_list, n_trials, test, K_list)

## Joint independence test

In [None]:
# strengths of the historical dependence to evaluate
# (the higher a is, the easier the dependence is to detect)
a_list = [
    0,
    0.2,
    0.4,
    0.6,
    0.8,
    1,
]

# number of variables in the network
n_vars = 4

# which test the following cells run; also used to build the results path
test = 'joint'

In [None]:
# Run the joint test for every combination of sample size, kernel and
# dependence strength a, and collect the value returned by test_power.
#
# Result layout: type_II_errors[n_sample][K] is a list with one entry per
# value in a_list (same order).
# NOTE: despite its name, this dict stores what test_power returns (printed
# below as 'Test power'); the name is kept because later cells refer to it.
type_II_errors = {}

for n_sample in tqdm(n_samples):
    # normalise once so the key used to create an entry and the key used to
    # append to it are guaranteed to be the same object type
    n_sample = int(n_sample)
    print('Sample size:', n_sample)
    type_II_errors[n_sample] = {}
    for K in K_list:
        K = str(K)
        print('Kernel:', K)
        type_II_errors[n_sample][K] = []
        for a in a_list:
            print('a:', a)
            # generate synthetic data for a network of n_vars variables
            edges_dict, X_dict = generate_data(dep=test, n_samples=n_sample, n_trials=n_trials, n_obs=n_obs, n_preds=n_preds, n_vars=n_vars, a=a, upper_limit=upper_limit, n_basis=n_basis, sd=sd)

            # evaluate the test on the generated trials
            power = test_power(X=X_dict, edges_dict=edges_dict, n_trials=n_trials, n_perms=n_perms, alpha=alpha, K=K, test=test)
            type_II_errors[n_sample][K].append(power)
            print('Test power:', power)
            print('----------')
    print('----------')

# Save the results; the context manager closes the file even if pickling fails.
with open('results/{}/test_power_hist_{}.pkl'.format(test, test), 'wb') as power_hist:
    pickle.dump(type_II_errors, power_hist)

In [None]:
# power over strength of dependence (values in a_list) for the joint test
plot_power(type_II_errors, n_samples, a_list, n_trials, test, K_list)

## Conditional independence test

In [None]:
# kernels to evaluate: run first only with K_ID and then later with K_dft
# (takes ~15 times longer)
K_list = ['K_ID']

# range of possible values for lambda
lambs = [
    1e-1,
    1e-2,
    1e-3,
    1e-4,
    1e-5,
    1e-6,
]

# strengths a' of the historical dependence between X and Y to evaluate
# (the higher a' is, the easier the dependence is to detect)
a_prime_list = [
    0,
    2,
    4,
    6,
    8,
    10,
]

# number of conditional variables
n_vars = 1

# which test the following cells run; also used to build the results path
test = 'conditional'

In [None]:
# Run the conditional test for every combination of sample size, kernel and
# dependence strength a', and collect the value returned by test_power.
#
# Result layout: type_II_errors[n_sample][K] is a list with one entry per
# value in a_prime_list (same order).
# NOTE: despite its name, this dict stores what test_power returns (printed
# below as 'Test power'); the name is kept because later cells refer to it.
type_II_errors = {}

for n_sample in tqdm(n_samples):
    # normalise once so the key used to create an entry and the key used to
    # append to it are guaranteed to be the same object type
    n_sample = int(n_sample)
    print('Sample size:', n_sample)
    type_II_errors[n_sample] = {}
    for K in K_list:
        K = str(K)
        print('Kernel:', K)
        type_II_errors[n_sample][K] = []
        for a_prime in a_prime_list:
            print("a':", a_prime)
            # generate synthetic data; a is held fixed at 1 while a' is varied
            # NOTE(review): n_vars from the configuration cell is not passed
            # here - confirm that generate_data's default matches it.
            X, Y, Z = generate_data(dep=test, n_samples=n_sample, n_trials=n_trials, n_obs=n_obs, n_preds=n_preds, a=1, a_prime=a_prime, upper_limit=upper_limit, n_basis=n_basis, sd=sd)

            # evaluate the test on the generated trials, passing the lambda candidates
            power = test_power(X=X, Y=Y, Z=Z, n_trials=n_trials, n_perms=n_perms, alpha=alpha, K=K, test=test, lambs=lambs)
            type_II_errors[n_sample][K].append(power)
            print('Test power:', power)
            print('----------')
    print('----------')

# Save the results; the context manager closes the file even if pickling fails.
with open('results/{}/test_power_hist_{}.pkl'.format(test, test), 'wb') as power_hist:
    pickle.dump(type_II_errors, power_hist)

In [None]:
# power over strength of dependence a'
# The conditional experiment sweeps a_prime_list (not a_list), so the results
# have one entry per value of a' and must be plotted against those values.
plot_power(type_II_errors, n_samples, a_prime_list, n_trials, test, K_list)