# Description

This notebook generates random variables of varying sizes to compare the time taken by CCC and MIC; Pearson and Spearman are timed as well.

This notebook uses 3 CPU cores.

# Modules loading

In [1]:
%env CM_N_JOBS=3
%env NUMBA_NUM_THREADS=3
%env MKL_NUM_THREADS=3
%env OPEN_BLAS_NUM_THREADS=3
%env NUMEXPR_NUM_THREADS=3
%env OMP_NUM_THREADS=3

env: CM_N_JOBS=3
env: NUMBA_NUM_THREADS=3
env: MKL_NUM_THREADS=3
env: OPEN_BLAS_NUM_THREADS=3
env: NUMEXPR_NUM_THREADS=3
env: OMP_NUM_THREADS=3


In [2]:
import os
from time import perf_counter, time

import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr

from ccc import conf
from ccc.coef import ccc
from ccc.methods import mic

# Settings

In [3]:
# Number of parallel jobs, taken from the CM_N_JOBS environment variable set in
# the first cell (raises KeyError if the variable is not defined).
N_JOBS = int(os.environ["CM_N_JOBS"])
display(N_JOBS)

3

In [4]:
# Name of the pickle file (inside OUTPUT_DIR) that timings are loaded from and
# saved back to.
OUTPUT_FILENAME = "time_test.pkl"

In [5]:
# Lengths of the random vectors to benchmark, from 10^2 up to 10^7 elements.
DATA_SIZES = [
    100, 500,
    1_000, 5_000,
    10_000, 50_000,
    100_000, 1_000_000, 10_000_000,
]

# Default number of timed repetitions for each method and data size.
N_REPS = 10

In [6]:
# Seed NumPy's global random state so the np.random.rand calls below generate
# the same data on every run.
np.random.seed(0)

# Paths

In [7]:
# Folder holding the timing results; it is created (including any missing
# parent folders) if it does not exist yet.
OUTPUT_DIR = conf.RESULTS_DIR / "time_test"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

PosixPath('/opt/data/results/time_test')

# Functions

In [8]:
# append to previous run: the results file must already exist in OUTPUT_DIR,
# otherwise read_pickle fails here. New measurements are added as extra rows.
# NOTE(review): presumably the file is written by an earlier run of this
# benchmark with a different number of cores - confirm.
time_results = pd.read_pickle(OUTPUT_DIR / OUTPUT_FILENAME)

In [9]:
# (rows, columns) loaded from the previous run, before adding new measurements
time_results.shape

(5330, 4)

In [10]:
def run_method(func, method_name, size):
    """
    Times a method on pairs of random vectors and records each measurement.

    For every repetition, two vectors of uniform random numbers are generated,
    `func` is called on them, and one row is appended to the global
    `time_results` dataframe with: the data size, the method name, the elapsed
    time in seconds and the value returned by `func`.

    Args:
        func: callable that receives two 1D numpy arrays and returns the value
            to store alongside the timing.
        method_name: label stored with each measurement.
        size: number of elements of each random vector.

    Returns:
        None. `time_results` is modified in place.
    """
    # sizes below 500 use 1000 repetitions instead of N_REPS (presumably to
    # average out the noise of very short timings)
    n_reps = 1000 if size < 500 else N_REPS

    for _ in range(n_reps):
        # data generation is intentionally left out of the timed region
        d1 = np.random.rand(size)
        d2 = np.random.rand(size)

        # perf_counter is monotonic and uses the highest-resolution clock
        # available, so short durations are not distorted by system clock
        # adjustments as they could be with time()
        start_time = perf_counter()
        sim = func(d1, d2)
        met_time = perf_counter() - start_time

        # append as a new row; this assumes the index of time_results is
        # 0..n-1, otherwise an existing row could be overwritten
        idx = time_results.shape[0]
        time_results.loc[idx] = [d1.shape[0], method_name, met_time, sim]

# Run

In [11]:
# initialize methods: call ccc once on small random data before timing starts
# (presumably so one-time initialization costs are not included in the
# measurements - confirm)
ccc(np.random.rand(100), np.random.rand(100))

0.02047038824894002

In [12]:
# Every method label carries the number of cores used as a suffix (for
# example "p-3"), so rows from runs with a different CM_N_JOBS can be told
# apart in the shared results file. It is derived from N_JOBS so the labels
# cannot get out of sync with the configuration.
method_suffix = f"-{N_JOBS}"

# Largest data sizes for which each MIC variant is run (presumably because
# they become too slow beyond these sizes).
MIC_E_MAX_SIZE = 50000
MIC_MAX_SIZE = 10000

for s in DATA_SIZES:
    print(f"Size: {s}")

    print("  p")
    run_method(lambda x, y: pearsonr(x, y)[0], f"p{method_suffix}", s)

    print("  s")
    run_method(lambda x, y: spearmanr(x, y)[0], f"s{method_suffix}", s)

    print("  cm")
    run_method(lambda x, y: ccc(x, y, n_jobs=N_JOBS), f"cm{method_suffix}", s)

    if s <= MIC_E_MAX_SIZE:
        print("  mic_e")
        run_method(
            lambda x, y: mic(x, y, estimator="mic_e"), f"mic_e{method_suffix}", s
        )

    if s <= MIC_MAX_SIZE:
        print("  mic")
        run_method(lambda x, y: mic(x, y), f"mic{method_suffix}", s)

    # save after every data size so partial results survive an interruption
    print("Saving to pickle")
    time_results.to_pickle(OUTPUT_DIR / OUTPUT_FILENAME)

    print("\n")

Size: 100
  p
  s
  cm
  mic_e
  mic
Saving to pickle


Size: 500
  p
  s
  cm
  mic_e
  mic
Saving to pickle


Size: 1000
  p
  s
  cm
  mic_e
  mic
Saving to pickle


Size: 5000
  p
  s
  cm
  mic_e
  mic
Saving to pickle


Size: 10000
  p
  s
  cm
  mic_e
  mic
Saving to pickle


Size: 50000
  p
  s
  cm
  mic_e
Saving to pickle


Size: 100000
  p
  s
  cm
Saving to pickle


Size: 1000000
  p
  s
  cm
Saving to pickle


Size: 10000000
  p
  s
  cm
Saving to pickle




# Summary of results

In [13]:
# (rows, columns) after this run's measurements were appended
time_results.shape

(10660, 4)

In [14]:
# preview of the first rows of the results
time_results.head()

Unnamed: 0,data_size,method,time,sim
0,100,p-1,9.6e-05,-0.014058
1,100,p-1,3.8e-05,-0.019761
2,100,p-1,4.1e-05,-0.274154
3,100,p-1,3.4e-05,-0.071098
4,100,p-1,3.2e-05,0.016989
