# Description

Compares two different CCC implementations: one using the fully optimized CPU version of `ccc`, and the other using the new CUDA-implemented `get_contingency_matrix`.

# Remove pycache dir

In [21]:
!echo ${CODE_DIR}




In [22]:
!find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print

In [23]:
!find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -prune -exec rm -rf {} \;

In [24]:
!find ${CODE_DIR} -regex '^.*\(__pycache__\)$' -print

# Modules

In [25]:
import numpy as np

from ccc.coef import ccc

# Settings

In [26]:
N_REPS = 10

In [27]:
np.random.seed(0)

# Setup

In [28]:
# Warm-up call: run ccc once on tiny random inputs so that numba compiles all
# the code before any timing/profiling cell runs (otherwise compilation would
# be included in the first measurement).
ccc(np.random.rand(10), np.random.rand(10))

0.15625

# Run with `n_samples` small

## `n_samples=50`

In [29]:
N_SAMPLES = 50

In [30]:
x = np.random.rand(N_SAMPLES)
y = np.random.rand(N_SAMPLES)

In [31]:
def func():
    """Benchmark workload: call ``ccc(x, y)`` ``N_REPS`` times.

    Reads the notebook-level globals ``x``, ``y`` and ``N_REPS`` at call
    time; the coefficient returned by each call is discarded.
    """
    for _ in range(N_REPS):
        ccc(x, y)

In [32]:
%%timeit func()
func()

8.2 ms ± 262 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [33]:
%%prun -s cumulative -l 20 -T 01-n_samples_small_50.txt
func()

 
*** Profile printout saved to text file '01-n_samples_small_50.txt'. 


## `n_samples=100`

In [34]:
N_SAMPLES = 100

In [35]:
x = np.random.rand(N_SAMPLES)
y = np.random.rand(N_SAMPLES)

In [36]:
def func():
    """Benchmark workload: call ``ccc(x, y)`` ``N_REPS`` times.

    Reads the notebook-level globals ``x``, ``y`` and ``N_REPS`` at call
    time; the coefficient returned by each call is discarded.
    """
    for _ in range(N_REPS):
        ccc(x, y)

In [37]:
%%timeit func()
func()

18.4 ms ± 405 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [38]:
%%prun -s cumulative -l 20 -T 10-n_samples_small_100.txt
func()

 
*** Profile printout saved to text file '10-n_samples_small_100.txt'. 


## `n_samples=500`

In [39]:
N_SAMPLES = 500

In [40]:
x = np.random.rand(N_SAMPLES)
y = np.random.rand(N_SAMPLES)

In [41]:
def func():
    """Benchmark workload: call ``ccc(x, y)`` ``N_REPS`` times.

    Reads the notebook-level globals ``x``, ``y`` and ``N_REPS`` at call
    time; the coefficient returned by each call is discarded.
    """
    for _ in range(N_REPS):
        ccc(x, y)

In [42]:
%%timeit func()
func()

29.7 ms ± 216 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [43]:
%%prun -s cumulative -l 20 -T 10-n_samples_small_500.txt
func()

 
*** Profile printout saved to text file '10-n_samples_small_500.txt'. 


## `n_samples=1000`

In [44]:
N_SAMPLES = 1000

In [45]:
x = np.random.rand(N_SAMPLES)
y = np.random.rand(N_SAMPLES)

In [46]:
def func():
    """Benchmark workload: call ``ccc(x, y)`` ``N_REPS`` times.

    Reads the notebook-level globals ``x``, ``y`` and ``N_REPS`` at call
    time; the coefficient returned by each call is discarded.
    """
    for _ in range(N_REPS):
        ccc(x, y)

In [47]:
%%timeit func()
func()

43.3 ms ± 1.02 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [48]:
%%prun -s cumulative -l 20 -T 10-n_samples_small_1000.txt
func()

 
*** Profile printout saved to text file '10-n_samples_small_1000.txt'. 


# Run with `n_samples` large

## `n_samples=50000`

In [49]:
N_SAMPLES = 50000

In [50]:
x = np.random.rand(N_SAMPLES)
y = np.random.rand(N_SAMPLES)

In [51]:
def func():
    """Benchmark workload: call ``ccc(x, y)`` ``N_REPS`` times.

    Reads the notebook-level globals ``x``, ``y`` and ``N_REPS`` at call
    time; the coefficient returned by each call is discarded.
    """
    for _ in range(N_REPS):
        ccc(x, y)

In [52]:
%%timeit func()
func()

2.4 s ± 15.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [53]:
%%prun -s cumulative -l 20 -T 10-n_samples_large_50000.txt
func()

 
*** Profile printout saved to text file '10-n_samples_large_50000.txt'. 


## `n_samples=100000`

In [54]:
N_SAMPLES = 100000

In [55]:
x = np.random.rand(N_SAMPLES)
y = np.random.rand(N_SAMPLES)

In [56]:
def func():
    """Benchmark workload: call ``ccc(x, y)`` ``N_REPS`` times.

    Reads the notebook-level globals ``x``, ``y`` and ``N_REPS`` at call
    time; the coefficient returned by each call is discarded.
    """
    for _ in range(N_REPS):
        ccc(x, y)

In [57]:
%%timeit func()
func()

4.92 s ± 31.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [58]:
%%prun -s cumulative -l 20 -T 10-n_samples_large_100000.txt
func()

 
*** Profile printout saved to text file '10-n_samples_large_100000.txt'. 


# Profile with `cProfile`

In [62]:
from cProfile import Profile
from pstats import SortKey, Stats

def func():
    """Workload to profile: call ``ccc(x, y)`` ``N_REPS`` times.

    Reads the notebook-level globals ``x``, ``y`` and ``N_REPS`` at call
    time; the coefficient returned by each call is discarded.
    """
    for _ in range(N_REPS):
        ccc(x, y)

# Profile only the workload: the context manager enables the profiler on
# entry and disables it on exit.
with Profile() as profile:
    func()

# Build and print the report after the profiling context has exited, so that
# constructing the `Stats` object is not itself recorded in the profile.
# The trailing `;` stops the notebook from echoing the `Stats` object that
# `print_stats()` returns, now that this is the cell's last expression.
(
    Stats(profile)
    .strip_dirs()
    .sort_stats(SortKey.CUMULATIVE)
    .print_stats()
);

         8339 function calls in 5.036 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.014    0.014    5.036    5.036 2445792793.py:4(func)
       10    0.008    0.001    5.022    0.502 impl.py:307(ccc)
      200    0.001    0.000    4.991    0.025 threading.py:280(wait)
      790    4.990    0.006    4.990    0.006 {method 'acquire' of '_thread.lock' objects}
       10    0.001    0.000    3.150    0.315 impl.py:492(compute_coef)
       10    0.000    0.000    3.149    0.315 impl.py:485(cdist_func)
       10    0.001    0.000    3.149    0.315 impl.py:192(cdist_parts_parallel)
      100    0.001    0.000    3.141    0.031 _base.py:201(as_completed)
      100    0.000    0.000    3.140    0.031 threading.py:563(wait)
      100    0.000    0.000    1.851    0.019 _base.py:418(result)
       20    0.000    0.000    1.851    0.093 _base.py:602(result_iterator)
       50    0.010    0.000    0.010    0.000 {buil