In [1]:
from significance_of_mean_cuda import significance_of_mean_cuda
from utils import significance_of_mean
import numpy as np
import time
import multiprocessing
import concurrent.futures as cf
import pandas as pd

# Import experimental data

In [9]:
# Load the tab-separated experimental table and split its columns into two
# groups according to the marker embedded in each column name.
data = pd.read_csv("./exp/tcga/eig_stat_tcga.csv", delimiter="\t")
a_pattern = '_T_'
b_pattern = '_R_'
a_cols = [col for col in data.columns if a_pattern in col]
b_cols = [col for col in data.columns if b_pattern in col]

# Select each group's columns in one vectorized step instead of iterating row
# by row with iterrows(). Each result is a 2-D float64 array with one row per
# CSV row; ascontiguousarray keeps the row-major layout that stacking a list
# of per-row arrays used to produce.
AL = np.ascontiguousarray(data[a_cols].to_numpy(dtype='float64'))
BL = np.ascontiguousarray(data[b_cols].to_numpy(dtype='float64'))

# Bin count handed to both significance-of-mean implementations.
bins = 200

## Compare the parallelized version with the non-parallelized one to examine the time difference and to check that they produce matching p-values (if not, the value types of the parallelized version have to be re-tuned).

In [18]:
def run_parallelized(a,b):
    """Time the CUDA implementation on paired samples and return its result.

    A new ``significance_of_mean_cuda`` object is created with the
    module-level ``bins`` setting on every call, so its construction is part
    of the measured time. The elapsed seconds are printed.

    Args:
        a: samples of the first group, forwarded unchanged to ``SGM.run``.
        b: samples of the second group, forwarded unchanged to ``SGM.run``.

    Returns:
        Whatever ``SGM.run(a, b)`` returns.
    """
    # perf_counter is a monotonic, high-resolution clock; time.time() can jump
    # when the system clock is adjusted and is not meant for measuring intervals.
    start = time.perf_counter()
    SGM = significance_of_mean_cuda(bins,dtype_v=np.uint32,dtype_A=np.float64)
    p = SGM.run(a,b)
    end = time.perf_counter()
    print(end - start)
    return p

In [43]:
def run_non_parallelized(a,b):
    """Time the non-parallelized implementation, one sample pair at a time.

    ``significance_of_mean`` is called once per ``(a_s, b_s)`` pair with the
    module-level ``bins`` setting, and the first element of each result is
    kept. The elapsed seconds are printed.

    Args:
        a: iterable of first-group samples, one entry per comparison.
        b: iterable of second-group samples, paired with ``a`` via ``zip``.

    Returns:
        list: element ``[0]`` of every ``significance_of_mean`` result, in
        input order.
    """
    # perf_counter is a monotonic, high-resolution clock; time.time() can jump
    # when the system clock is adjusted and is not meant for measuring intervals.
    start = time.perf_counter()
    P = [significance_of_mean(a_s, b_s, bins)[0] for a_s, b_s in zip(a, b)]
    end = time.perf_counter()
    print(end - start)
    return P

### It was not possible to parallelize the non-parallelized version over samples due to a memory error.

In [26]:
# Benchmark subset: rows i1 (inclusive) to i2 (exclusive) of both groups.
i1, i2 = 0, 5
Ap = AL[i1:i2]
Bp = BL[i1:i2]

In [27]:
# Time the CUDA implementation on the selected subset and keep its p-values.
PC = run_parallelized(Ap, Bp)

37.410171031951904


In [28]:
# Run the same subset through the non-parallelized loop for comparison.
P = run_non_parallelized(Ap, Bp)

400.55655813217163


### Same output and ~10 times faster

In [38]:
# Speed-up factor: non-parallelized seconds / parallelized seconds, both
# pasted from the timings printed by the two runs above.
round(400.55655813217163  / 37.410171031951904,1)

10.7

In [29]:
# Check that both implementations agree within np.allclose's default tolerances.
np.allclose(PC,P)

True

### It is perhaps fairer to the non-parallelized version to compare runtime over one sample.

In [30]:
# Benchmark subset: only the first row of each group (kept 2-D by slicing).
i1, i2 = 0, 1
Ap = AL[i1:i2]
Bp = BL[i1:i2]

In [31]:
# Time the CUDA implementation on the single-row subset.
PC = run_parallelized(Ap, Bp)

5.436479091644287


In [32]:
# Time the non-parallelized implementation on the same single-row subset.
P = run_non_parallelized(Ap, Bp)

76.9169590473175


### Same output and ~14 times faster

In [40]:
# Speed-up factor for one row: non-parallelized seconds / parallelized
# seconds, both pasted from the timings printed by the two runs above.
round(76.9169590473175  / 5.436479091644287,1)

14.1

In [41]:
# Check that both implementations agree within np.allclose's default tolerances.
np.allclose(PC,P)

True

#### One can conclude that the parallelized version is at least ten times faster.

## Run the parallelized version over all samples. It took approximately a week to calculate on the non-parallelized version.

In [6]:
def chunks(l, n):
    """Generate consecutive slices of ``l``, each holding up to ``n`` items.

    The last slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    yield from (l[offset:offset + n] for offset in range(0, len(l), n))

In [7]:
# Run the CUDA implementation over every row in fixed-size chunks and time
# the whole sweep.
CHUNK_SIZE = 18  # number of rows handed to SGM.run per call
PVAL = list()
# perf_counter is monotonic, so a system clock adjustment during this
# multi-hour run cannot distort the measured duration.
start = time.perf_counter()
SGM = significance_of_mean_cuda(bins,dtype_v=np.uint32,dtype_A=np.float64)
for p in chunks(list(range(0,AL.shape[0])), CHUNK_SIZE):
    # p is a list of row indices; PVAL collects one SGM.run result per chunk.
    PVAL.append(SGM.run(AL[p],BL[p]))
end = time.perf_counter()
print(end - start)

20197.49423289299


In [42]:
# Convert the full-run time printed above from seconds to hours.
round(20197.49423289299 / 3600,1)

5.6

# It took ~5.6h for the parallelized version.

In [47]:
# Estimated non-parallelized time in hours for all rows: row count times the
# single-row timing measured earlier (pasted in seconds), divided by 3600.
round((BL.shape[0] * 76.9169590473175) / 3600, 1)

42.6

In [49]:
# Estimated overall speed-up: estimated non-parallelized hours / measured
# parallelized hours, using the rounded values from the two cells above.
round(42.6 / 5.6, 1)

7.6

### Approximately 7.6 times faster than the non-parallelized version.