In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes
from mpl_toolkits.axes_grid1.inset_locator import mark_inset
import numpy as np
import pandas as pd
import pickle

__Keys__ of `dict_results` have the form `(dim, sample_size, power, mult, shift_par)`.

__Values__ of `dict_results` have the form `(statistic, threshold)`.


# HSIC
To estimate the statistical power, we take the 200 repetitions of every configuration listed in the keys and compute the empirical rejection rate, i.e. the probability of rejecting $H_0$ given that it is false.

$H_0: P_{XY} = P_X P_Y$

In [None]:
# Folder and file holding the HSIC simulation results.
path = 'high_HSIC'
# Short code used in the file name; the notebook pairs 'g' with 'Gamma' and
# 'p' with 'permutation'.
test_ = 'p'
# Derive the label shown in plot titles from `test_` so the two cannot drift
# apart (it used to be hard-coded to 'Gamma' while the 'p' file was loaded).
test__ = {'g': 'Gamma', 'p': 'permutation'}[test_]
dim = 25
file = 'dependencies_{}_[{}]'.format(test_, dim)

# NOTE: unpickling can execute arbitrary code; only load result files you
# produced yourself. The context manager closes the file handle after loading.
with open('{}/{}.pkl'.format(path, file), 'rb') as results_file:
    dict_results = pickle.load(results_file)

In [None]:
# For every configuration, count the trials whose statistic (entry 0) exceeds
# its threshold (entry 1) and express that count as a percentage of all trials.
rejections = {}
power = {}

for config, trials in dict_results.items():
    rejections[config] = sum(1 for trial in trials if trial[0] > trial[1])
    # rejection rate in percent
    power[config] = 100 * rejections[config] / len(trials)

## Visualisations

In [None]:
# Build a tidy frame: one row per configuration, one column per key entry.
power_df = pd.DataFrame.from_dict(data=power, orient='index').reset_index()
# Expand each key tuple of the 'index' column into separate integer-named columns.
power_df0 = power_df['index'].apply(pd.Series)
# Put the expanded key columns first and renumber all columns; position 4 is
# then dropped and position 5 becomes 'power' (assumes 4-element keys — verify).
power_df1 = pd.concat([power_df0, power_df], axis=1, ignore_index=True)
power_df1 = power_df1.drop(columns=[4])
hsic_column_names = {0: 'dimension', 1: 'sample_size', 2: 'shift', 3: 'test', 5: 'power'}
power_df2 = power_df1.rename(columns=hsic_column_names)
power_df2 = power_df2.sort_values(by=['sample_size', 'shift', 'test'])

def _rows_for(frame, test_name, lag):
    """Return the rows of `frame` whose 'test' equals `test_name` and whose 'shift' equals `lag`."""
    keep = (frame['test'] == test_name) & (frame['shift'] == lag)
    return frame[keep]


# 'shift' defines the lag; one subset per (test type, lag) combination
dep_linear0, dep_linear1, dep_linear2, dep_linear3 = [
    _rows_for(power_df2, 'dependence_linear', lag) for lag in range(4)
]
dep_sine0, dep_sine1, dep_sine2, dep_sine3 = [
    _rows_for(power_df2, 'dependence_sine', lag) for lag in range(4)
]
indep0, indep1, indep2, indep3 = [
    _rows_for(power_df2, 'independence', lag) for lag in range(4)
]
dep_cubic0, dep_cubic1, dep_cubic2, dep_cubic3 = [
    _rows_for(power_df2, 'dependence_cubic', lag) for lag in range(4)
]
dep_quartic0, dep_quartic1, dep_quartic2, dep_quartic3 = [
    _rows_for(power_df2, 'dependence_quartic', lag) for lag in range(4)
]

# Type I error and test power

- type I error: rejecting true $H_0$
- type II error: not rejecting false $H_0$
- $\text{power} = 1 - \text{type II error}$

Based on 200 trials for each sample size.

In [None]:
# independence / type I error
# One curve per lag ('shift'), plus a dotted reference line at y = 5.

# unique() returns an array; take its single entry so the title and the file
# name contain the plain test name rather than the array's repr.
test_name = indep0.test.unique()[0]

plt.figure(figsize=(12, 8))
plt.xlabel('Sample size per dimension', size=16)
plt.ylabel('type I error', size=16)
plt.ylim(-1, 11)
# Raw string: in a normal literal '\a' is the BEL control character, which
# would corrupt the '\alpha' math command.
plt.hlines(y=5, xmin=16, xmax=128, colors='k', linestyles='dotted', label=r'Significance $\alpha$')
plt.title('{}, d = {}, {}'.format(test_name, dim, test__), size=16)
for subset, colour in zip([indep0, indep1, indep2, indep3], ['b', 'g', 'r', 'm']):
    plt.plot(subset['sample_size'], subset['power'], colour)
plt.legend(['shift = 0', 'shift = 1', 'shift = 2', 'shift = 3', 'Significance'])
plt.savefig('visuals/{}_{}_{}.png'.format(test_name, dim, test_))
plt.show()

In [None]:
# linear dependence / power
# One curve per lag ('shift'), plus a dotted reference line at 100 %.

# unique() returns an array; take its single entry so the title and the file
# name contain the plain test name rather than the array's repr.
test_name = dep_linear0.test.unique()[0]

plt.figure(figsize=(12, 8))
plt.xlabel('Sample size per dimension', size=16)
plt.ylabel('Test power', size=16)
plt.ylim(-5, 105)
plt.hlines(y=100, xmin=16, xmax=128, colors='k', linestyles='dotted')
plt.title('{}, d = {}, {}'.format(test_name, dim, test__), size=16)
for subset, colour in zip([dep_linear0, dep_linear1, dep_linear2, dep_linear3], ['b', 'g', 'r', 'm']):
    plt.plot(subset['sample_size'], subset['power'], colour)
plt.legend(['shift = 0', 'shift = 1', 'shift = 2', 'shift = 3'])
plt.savefig('visuals/{}_{}_{}.png'.format(test_name, dim, test_))
plt.show()

In [None]:
# sine dependence / power
# One curve per lag ('shift'), plus a dotted reference line at 100 %.

# unique() returns an array; take its single entry so the title and the file
# name contain the plain test name rather than the array's repr.
test_name = dep_sine0.test.unique()[0]

plt.figure(figsize=(12, 8))
plt.xlabel('Sample size per dimension', size=16)
plt.ylabel('Test power', size=16)
plt.ylim(-5, 105)
plt.hlines(y=100, xmin=16, xmax=128, colors='k', linestyles='dotted')
plt.title('{}, d = {}, {}'.format(test_name, dim, test__), size=16)
for subset, colour in zip([dep_sine0, dep_sine1, dep_sine2, dep_sine3], ['b', 'g', 'r', 'm']):
    plt.plot(subset['sample_size'], subset['power'], colour)
plt.legend(['shift = 0', 'shift = 1', 'shift = 2', 'shift = 3'])
plt.savefig('visuals/{}_{}_{}.png'.format(test_name, dim, test_))
plt.show()

In [None]:
# cubic dependence / power
# One curve per lag ('shift'), plus a dotted reference line at 100 %.

# unique() returns an array; take its single entry so the title and the file
# name contain the plain test name rather than the array's repr.
test_name = dep_cubic0.test.unique()[0]

plt.figure(figsize=(12, 8))
plt.xlabel('Sample size per dimension', size=16)
plt.ylabel('Test power', size=16)
plt.ylim(-5, 105)
plt.hlines(y=100, xmin=16, xmax=128, colors='k', linestyles='dotted')
plt.title('{}, d = {}, {}'.format(test_name, dim, test__), size=16)
for subset, colour in zip([dep_cubic0, dep_cubic1, dep_cubic2, dep_cubic3], ['b', 'g', 'r', 'm']):
    plt.plot(subset['sample_size'], subset['power'], colour)
plt.legend(['shift = 0', 'shift = 1', 'shift = 2', 'shift = 3'])
plt.savefig('visuals/{}_{}_{}.png'.format(test_name, dim, test_))
plt.show()

In [None]:
# quartic dependence / power
# One curve per lag ('shift'), plus a dotted reference line at 100 %.

# unique() returns an array; take its single entry so the title and the file
# name contain the plain test name rather than the array's repr.
test_name = dep_quartic0.test.unique()[0]

plt.figure(figsize=(12, 8))
plt.xlabel('Sample size per dimension', size=16)
plt.ylabel('Test power', size=16)
plt.ylim(-5, 105)
plt.hlines(y=100, xmin=16, xmax=128, colors='k', linestyles='dotted')
plt.title('{}, d = {}, {}'.format(test_name, dim, test__), size=16)
for subset, colour in zip([dep_quartic0, dep_quartic1, dep_quartic2, dep_quartic3], ['b', 'g', 'r', 'm']):
    plt.plot(subset['sample_size'], subset['power'], colour)
plt.legend(['shift = 0', 'shift = 1', 'shift = 2', 'shift = 3'])
plt.savefig('visuals/{}_{}_{}.png'.format(test_name, dim, test_))
plt.show()

# MMD

## Mean shift

To estimate the statistical power, we take the 200 repetitions of every configuration listed in the keys and compute the empirical rejection rate, i.e. the probability of rejecting $H_0$ given that it is false.

$H_0: P_X = P_Y$

$\mathbb{E}[Y] = \mathbb{E}[X] + \delta \ \mathbb{E}[X^3]$

In [None]:
# Folder and file holding the MMD mean-shift simulation results.
path = 'high_MMD_mean'
# Short code used in the file name: 'g' (Gamma) or 'p' (permutation).
test_ = 'g'
# Label shown in plot titles, derived from `test_` so the two always agree.
test__ = {'g': 'Gamma', 'p': 'permutation'}[test_]
dim = 25
file = 'mean_shifts_{}_[{}]'.format(test_, dim)

# NOTE: unpickling can execute arbitrary code; only load result files you
# produced yourself. The context manager closes the file handle after loading.
with open('{}/{}.pkl'.format(path, file), 'rb') as results_file:
    dict_results = pickle.load(results_file)

In [None]:
# Grid of mean-shift values delta: 11 points on [0, 0.2] followed by
# 39 points on [0.25, 4]; plotted against their position in the grid.
deltas = np.concatenate((
    np.linspace(0, 0.2, 11),
    np.linspace(0.25, 4, 39),
))
plt.plot(np.arange(deltas.size), deltas);

In [None]:
# Rejection count and rejection rate (in percent) per configuration: a trial
# rejects when its statistic (entry 0) is larger than its threshold (entry 1).
rejections = {}
power = {}

for setting, runs in dict_results.items():
    rejections[setting] = sum(1 for run in runs if run[0] > run[1])
    # statistical power in percent
    power[setting] = 100 * rejections[setting] / len(runs)

## Visualisations

In [None]:
# Build a tidy frame: one row per configuration, one column per key entry.
power_df = pd.DataFrame.from_dict(data=power, orient='index').reset_index()
# Expand each key tuple of the 'index' column into separate integer-named columns.
power_df0 = power_df['index'].apply(pd.Series)
# Put the expanded key columns first and renumber all columns; position 4 is
# then dropped and position 5 becomes 'power' (assumes 4-element keys — verify).
power_df1 = pd.concat([power_df0, power_df], axis=1, ignore_index=True)
power_df1 = power_df1.drop(columns=[4])
mean_column_names = {0: 'dimension', 1: 'sample_size', 2: 'delta', 3: 'shift', 5: 'power'}
power_df2 = power_df1.rename(columns=mean_column_names)
power_df2 = power_df2.sort_values(by=['sample_size', 'delta'])

In [None]:
# One sub-frame per sample size, in increasing order.
sample_64, sample_128, sample_256, sample_512 = [
    power_df2[power_df2['sample_size'] == size] for size in (64, 128, 256, 512)
]

In [None]:
# One power-vs-delta figure per sample size, each saved under visuals/.
for test in [sample_64, sample_128, sample_256, sample_512]:
    # unique() returns an array; take its single entry as a plain int so the
    # title and the file name show e.g. 64 rather than the array's repr.
    sample_size = int(test.sample_size.unique()[0])

    plt.figure(figsize=(12, 8))
    # Raw string: '\d' is not a valid escape sequence and triggers a
    # SyntaxWarning on recent Python versions.
    plt.xlabel(r'Mean shift $\delta$', size=16)
    plt.ylabel('Test power', size=16)
    plt.ylim(0, 105)
    plt.hlines(y=100, xmin=0, xmax=4, colors='k', linestyles='dotted')
    plt.title('Mean shift: sample size = {}, dim = {}, {}'.format(sample_size, dim, test__), size=16)
    plt.plot(test['delta'], test['power'])
    plt.savefig('visuals/mean_{}_{}_{}.png'.format(sample_size, dim, test_))
    plt.show()

## Variance shift

To estimate the statistical power, we take the 200 repetitions of every configuration listed in the keys and compute the empirical rejection rate, i.e. the probability of rejecting $H_0$ given that it is false.

$H_0: P_X = P_Y$

$\text{Var}[Y] = \text{Var}[X] + \gamma$

In [None]:
# Folder and file holding the MMD variance-shift simulation results.
path = 'high_MMD_var'
# Short code used in the file name: 'g' (Gamma) or 'p' (permutation).
test_ = 'g'
# Label shown in plot titles, derived from `test_` so the two always agree.
test__ = {'g': 'Gamma', 'p': 'permutation'}[test_]
dim = 25
file = 'var_shifts_{}_[{}]'.format(test_, dim)

# NOTE: unpickling can execute arbitrary code; only load result files you
# produced yourself. The context manager closes the file handle after loading.
with open('{}/{}.pkl'.format(path, file), 'rb') as results_file:
    dict_results = pickle.load(results_file)

In [None]:
# Grid of variance-shift values gamma: 11 points on [0, 0.005] followed by
# 39 points on [0.008, 1]; plotted against their position in the grid.
gams = np.concatenate((
    np.linspace(0, 0.005, 11),
    np.linspace(0.008, 1, 39),
))
plt.plot(np.arange(gams.size), gams);

In [None]:
# Per configuration: number of trials with statistic (entry 0) above the
# threshold (entry 1), and that number as a percentage of all trials.
rejections = {}
power = {}

for scenario, outcomes in dict_results.items():
    rejections[scenario] = sum(1 for outcome in outcomes if outcome[0] > outcome[1])
    # statistical power in percent
    power[scenario] = 100 * rejections[scenario] / len(outcomes)

## Visualisations

In [None]:
# Build a tidy frame: one row per configuration, one column per key entry.
power_df = pd.DataFrame.from_dict(data=power, orient='index').reset_index()
# Expand each key tuple of the 'index' column into separate integer-named columns.
power_df0 = power_df['index'].apply(pd.Series)
# Put the expanded key columns first and renumber all columns; position 4 is
# then dropped and position 5 becomes 'power' (assumes 4-element keys — verify).
power_df1 = pd.concat([power_df0, power_df], axis=1, ignore_index=True)
power_df1 = power_df1.drop(columns=[4])
var_column_names = {0: 'dimension', 1: 'sample_size', 2: 'gamma', 3: 'shift', 5: 'power'}
power_df2 = power_df1.rename(columns=var_column_names)
power_df2 = power_df2.sort_values(by=['sample_size', 'gamma'])

In [None]:
# One sub-frame per sample size, in increasing order.
sample_64, sample_128, sample_256, sample_512 = [
    power_df2[power_df2['sample_size'] == size] for size in (64, 128, 256, 512)
]

In [None]:
# One power-vs-gamma figure per sample size, each saved under visuals/.
for test in [sample_64, sample_128, sample_256, sample_512]:
    # unique() returns an array; take its single entry as a plain int so the
    # title and the file name show e.g. 64 rather than the array's repr.
    sample_size = int(test.sample_size.unique()[0])

    plt.figure(figsize=(12, 8))
    # Raw string: '\g' is not a valid escape sequence and triggers a
    # SyntaxWarning on recent Python versions.
    plt.xlabel(r'Variance shift $\gamma$', size=16)
    plt.ylabel('Test power', size=16)
    plt.ylim(0, 105)
    plt.hlines(y=100, xmin=0, xmax=1, colors='k', linestyles='dotted')
    plt.title('Variance shift: sample size = {}, dim = {}, {}'.format(sample_size, dim, test__), size=16)
    plt.plot(test['gamma'], test['power'])
    plt.savefig('visuals/var_{}_{}_{}.png'.format(sample_size, dim, test_))
    plt.show()