# Description

Generates a distribution of p-values under the null hypothesis of no association.

This notebook runs CCC on one gene pair at a time and parallelizes the permutations used to compute each p-value.

# Modules loading

In [1]:
import numpy as np
from scipy.spatial.distance import squareform
from sklearn.metrics import pairwise_distances

from ccc.coef import ccc
from ccc import conf

# Settings

In [2]:
# Seeded generator: the random data drawn from `rs` below is the same on every run.
rs = np.random.RandomState(seed=0)

In [3]:
# Shape of the random data matrix: rows are the objects that get compared
# pairwise, columns are the features of each object.
DATA_N_OBJS = 100
DATA_N_FEATURES = 1000

# Number of permutations requested from CCC for each p-value.
PVALUE_N_PERMS = 1000

# NOTE(review): the outputs saved in this notebook show data.shape == (10, 1000)
# and 45 pairs, and the p-values are multiples of 1/11 (which looks like 10
# permutations). They appear to come from a run with smaller settings than the
# ones above -- re-run the notebook or confirm the intended values.

# Paths

In [4]:
# All results of this notebook are written under this folder; it is created
# (including missing parents) if it does not exist yet.
OUTPUT_DIR = conf.RESULTS_DIR / "ccc_null-pvalues"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [5]:
# Show the resolved output directory.
OUTPUT_DIR

PosixPath('/opt/data/results/ccc_null-pvalues')

# Generate random data

In [6]:
# Random matrix with one row per object and one column per feature, drawn from
# the seeded generator.
data_shape = (DATA_N_OBJS, DATA_N_FEATURES)
data = rs.random_sample(data_shape)

In [7]:
# Should equal (DATA_N_OBJS, DATA_N_FEATURES).
data.shape

(10, 1000)

# Run CCC

In [8]:
def ccc_single(x, y):
    """
    Run CCC on a single pair of vectors, requesting a permutation-based p-value.

    The call forwards n_jobs=1, PVALUE_N_PERMS permutations, and
    conf.GENERAL["N_JOBS"] as pvalue_n_jobs -- presumably so that only the
    permutations run in parallel (confirm against ccc's documentation).

    Returns whatever ccc returns for these arguments; the caller in this
    notebook unpacks it as a (coefficient, p-value) pair.
    """
    ccc_kwargs = {
        "n_jobs": 1,
        "pvalue_n_perms": PVALUE_N_PERMS,
        "pvalue_n_jobs": conf.GENERAL["N_JOBS"],
    }
    return ccc(x, y, **ccc_kwargs)

In [9]:
# Run CCC on every unordered pair of rows (row_a < row_b), in row-major order.
n_rows = data.shape[0]
pair_results = [
    ccc_single(data[row_a], data[row_b])
    for row_a in range(n_rows - 1)
    for row_b in range(row_a + 1, n_rows)
]

# Split each (coefficient, p-value) result into two parallel lists.
cm_values = [coef for coef, _ in pair_results]
cm_pvalues = [pvalue for _, pvalue in pair_results]

In [10]:
# Sanity check: one coefficient and one p-value per unordered pair of objects,
# i.e. n * (n - 1) / 2 results in total.
n_expected_pairs = DATA_N_OBJS * (DATA_N_OBJS - 1) // 2
assert len(cm_pvalues) == len(cm_values)
assert len(cm_values) == n_expected_pairs

In [11]:
# Convert both result lists into NumPy arrays (needed for np.save below).
cm_values, cm_pvalues = (
    np.array(results) for results in (cm_values, cm_pvalues)
)

In [12]:
# One coefficient per pair of rows in `data`.
cm_values.shape

(45,)

In [13]:
# Inspect the CCC coefficients obtained on the random data.
cm_values

array([0.00254684, 0.00104179, 0.00320558, 0.00018284, 0.00186997,
       0.00147106, 0.00177705, 0.00194291, 0.00049431, 0.00425941,
       0.00148615, 0.00019465, 0.00363023, 0.0056535 , 0.00274262,
       0.00522602, 0.0022903 , 0.00320755, 0.00099358, 0.00532412,
       0.00253045, 0.00149274, 0.00629346, 0.00221865, 0.00627013,
       0.00389841, 0.00138057, 0.00221203, 0.00417506, 0.00241475,
       0.00504645, 0.00137032, 0.00529612, 0.00326284, 0.00375165,
       0.00377352, 0.00323483, 0.00277389, 0.00797598, 0.0026016 ,
       0.00238008, 0.00171082, 0.00084283, 0.0051361 , 0.00122446])

In [14]:
# One p-value per pair of rows in `data`.
cm_pvalues.shape

(45,)

In [15]:
# Inspect the permutation p-values obtained on the random data.
cm_pvalues

array([0.45454545, 0.81818182, 0.36363636, 1.        , 0.81818182,
       0.72727273, 0.90909091, 0.81818182, 1.        , 0.18181818,
       0.90909091, 1.        , 0.09090909, 0.09090909, 0.45454545,
       0.27272727, 0.54545455, 0.36363636, 0.90909091, 0.09090909,
       0.63636364, 0.90909091, 0.09090909, 0.72727273, 0.09090909,
       0.27272727, 0.90909091, 0.81818182, 0.27272727, 0.72727273,
       0.27272727, 1.        , 0.09090909, 0.54545455, 0.36363636,
       0.36363636, 0.45454545, 0.36363636, 0.09090909, 0.45454545,
       0.54545455, 0.90909091, 0.90909091, 0.09090909, 0.81818182])

# Save

In [16]:
# Write the CCC coefficients to disk in NumPy's binary .npy format.
output_file = OUTPUT_DIR.joinpath("gene_pairs-cm_values.npy")
display(output_file)

np.save(output_file, cm_values)

PosixPath('/opt/data/results/ccc_null-pvalues/gene_pairs-cm_values.npy')

In [17]:
# Write the permutation p-values to disk in NumPy's binary .npy format.
output_file = OUTPUT_DIR.joinpath("gene_pairs-cm_pvalues.npy")
display(output_file)

np.save(output_file, cm_pvalues)