In [1]:
import glob
from itertools import product
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sourmash import signature as sig
from tqdm import tqdm

%matplotlib inline

## Info about the kmer hashes loaded

In [2]:
ksizes = 21, 27, 33, 51
ksize = ','.join(map(str, ksizes))
num_hashes = int(1e6)
dna = True
protein = True


In [3]:
output_prefix = 's3://olgabot-maca/facs/sourmash_compute_tissue_subset/'


In [6]:
# annotations = pd.read_csv("/home/olga/tabula-muris/00_data_ingest/18_global_annotation_csv/annotations_facs.csv")
# annotations['cell'] = annotations['cell'].str.replace('.', '-')
# print(annotations.shape)
# annotations.head()

In [7]:
# ! aws s3 ls $output_prefix | wc -l

### Copy the files to ndnd

In [8]:
# ! aws s3 sync $output_prefix /home/olga/scratch-pure/maca/facs/sourmash_compute_tissue_subset/

In [10]:
folder = '/data_lg/olga/maca/facs/sourmash_compute_tissue_subset/'

filenames = glob.glob(f'{folder}/*.sig')
downloaded_cells = [os.path.basename(f).split('.')[0] for f in filenames]
print(len(downloaded_cells))
downloaded_cells[:5]

1000


['B18-MAA000907-3_11_M-1-1',
 'K12-D041914-3_8_M-1-1',
 'O14-B001717-3_38_F-1-1',
 'L19-B001750-3_38_F-1-1',
 'E6-D041914-3_8_M-1-1']

In [None]:
annotations_signatures = annotations.query('cell in @downloaded_cells')
annotations_signatures.head()

In [None]:
sizes = annotations_signatures.groupby(['tissue', 'cell_ontology_class']).size()
sizes

In [None]:
100 * sizes/200

In [None]:
most_common_classes = sizes.groupby(level=0).nlargest(1)
most_common_classes

In [None]:
most_common_classes.index.levels[-1]

In [None]:
annotations_signatures_class_subset = annotations_signatures.query('cell_ontology_class in @most_common_classes.index.levels[-1]')
print(annotations_signatures_class_subset.shape)
annotations_signatures_class_subset.head()

In [None]:
annotations_signatures.cell_ontology_class.mode()[0]

In [None]:
annotations_signatures.sample()

In [None]:
n = 10

def filter_by_top_classes(df, n, col='cell_ontology_class'):
    most_common_class = df[col].mode()[0]
    df = df.query(f'{col} == @most_common_class')
    return df.sample(n, random_state=0)
    

annotations_random_subset = annotations_signatures.groupby('tissue').apply(filter_by_top_classes, n=n)
print(annotations_random_subset.shape)
annotations_random_subset.head()

In [None]:
annotations_random_subset.groupby(['tissue', 'cell_ontology_class']).size()

## Read signatures

In [None]:
range(3)

### Read only a random subset of the signatures

In [None]:

folder = '/data_lg/olga/maca/facs/sourmash_compute_tissue_subset/'

filenames = folder + annotations_random_subset['cell'] + '.sig'
print(f'len(filenames): {len(filenames)}')

siglist_subset = []

for filename in tqdm(filenames):
    loaded = sig.load_signatures(filename)
    siglist_subset.extend(loaded)

In [None]:
len(siglist_subset)

In [None]:
siglist_subset[:5]

#### Write the pickle file

In [None]:
%%time

import pickle

pickle_filename = f'{folder}/siglist_subset_npergroup{n}.pickle'

with open(pickle_filename, 'wb') as f:
    pickle.dump(siglist_subset, f)

In [None]:
! ls -lha $folder/*pickle

In [4]:
range(3)

range(0, 3)

### Read ALL signatures

In [None]:
%%time

folder = '/data_lg/olga/maca/facs/sourmash_compute_tissue_subset/'

filenames = glob.glob(f'{folder}/*.sig')


siglist = []

for filename in tqdm(filenames):
    loaded = sig.load_signatures(filename)
    siglist.extend(loaded)



  0%|          | 0/1000 [00:00<?, ?it/s][A[A

  0%|          | 1/1000 [02:32<42:25:55, 152.91s/it][A[A

  0%|          | 2/1000 [05:06<42:26:52, 153.12s/it][A[A

  0%|          | 3/1000 [07:40<42:26:44, 153.26s/it][A[A

  0%|          | 4/1000 [10:12<42:21:28, 153.10s/it][A[A

  0%|          | 5/1000 [12:45<42:14:52, 152.86s/it][A[A

  1%|          | 6/1000 [15:18<42:17:00, 153.14s/it][A[A

  1%|          | 7/1000 [17:56<42:34:22, 154.34s/it][A[A

  1%|          | 8/1000 [20:30<42:32:37, 154.39s/it][A[A

  1%|          | 9/1000 [23:05<42:34:30, 154.66s/it][A[A

  1%|          | 10/1000 [25:40<42:31:15, 154.62s/it][A[A

  1%|          | 11/1000 [28:14<42:25:06, 154.41s/it][A[A

  1%|          | 12/1000 [30:47<42:15:41, 153.99s/it][A[A

  1%|▏         | 13/1000 [33:24<42:30:40, 155.06s/it][A[A

  1%|▏         | 14/1000 [35:59<42:26:12, 154.94s/it][A[A

  2%|▏         | 15/1000 [38:33<42:17:35, 154.57s/it][A[A

  2%|▏         | 16/1000 [41:05<42:05:41, 15

 26%|██▌       | 257/1000 [10:37:35<30:27:58, 147.62s/it][A[A

 26%|██▌       | 258/1000 [10:40:10<30:49:25, 149.55s/it][A[A

 26%|██▌       | 259/1000 [10:42:51<31:31:06, 153.13s/it][A[A

 26%|██▌       | 260/1000 [10:45:31<31:55:43, 155.33s/it][A[A

 26%|██▌       | 261/1000 [10:48:05<31:45:34, 154.72s/it][A[A

 26%|██▌       | 262/1000 [10:50:40<31:45:40, 154.93s/it][A[A

 26%|██▋       | 263/1000 [10:53:13<31:36:07, 154.37s/it][A[A

 26%|██▋       | 264/1000 [10:55:47<31:31:47, 154.22s/it][A[A

 26%|██▋       | 265/1000 [10:58:21<31:28:08, 154.13s/it][A[A

 27%|██▋       | 266/1000 [11:00:54<31:20:20, 153.71s/it][A[A

 27%|██▋       | 267/1000 [11:03:29<31:22:23, 154.08s/it][A[A

 27%|██▋       | 268/1000 [11:06:05<31:26:26, 154.63s/it][A[A

 27%|██▋       | 269/1000 [11:08:42<31:34:29, 155.50s/it][A[A

 27%|██▋       | 270/1000 [11:11:16<31:24:36, 154.90s/it][A[A

 27%|██▋       | 271/1000 [11:13:54<31:35:07, 155.98s/it][A[A

 27%|██▋       | 272/1000

In [17]:
range(2)

range(0, 2)

In [19]:
len(filenames)

1000

In [18]:
len(siglist)

7948

In [23]:
siglist[0]

SourmashSignature('B18-MAA000907-3_11_M-1-1|tissue:Liver|subtissue:Non-hepatocytes|cell_ontology_class:endothelial_cell_of_hepatic_sinusoid|free_annotation:NA', eac1bcca)

In [24]:
cat jaccard_utils.py


from joblib import Parallel, delayed
import itertools

def jaccard_sigs(i, j, siglist):
    return siglist[i].jaccard(siglist[j])

def jaccard_sigs_idf(i, j, siglist, idf, mean_idf_per_cell):
    i_hashes = filter_idf(siglist[i].get_mins(), inverse_document_frequency, mean_idf_per_cell)
    j_hashes = filter_idf(siglist[j].get_mins(), inverse_document_frequency, mean_idf_per_cell)
    return jacard(i_hashes, j_hashes)


def jaccard(sample1, sample2):
    """Jaccard similarity between two sets"""
    intersection = len(sample1.intersection(sample2))
    union = len(sample1.union(sample2))
    return intersection/union

#### Write the pickle file

In [20]:
%%time

import pickle

pickle_filename = f'{folder}/siglist.pickle'

with open(pickle_filename, 'wb') as f:
    pickle.dump(siglist, f)

CPU times: user 1h 6min 56s, sys: 2min 42s, total: 1h 9min 38s
Wall time: 1h 9min 33s


In [21]:
range(3)

range(0, 3)

In [22]:
ls -lha $pickle_filename

-rw-rw-r-- 1 olga olga 87G Feb 12 20:01 /data_lg/olga/maca/facs/sourmash_compute_tissue_subset//siglist.pickle


In [25]:
def filter_siglist(siglist, ksize, moltype):
    if moltype == 'protein':
        molfilter = lambda x: x.minhash.is_protein
    else:
        molfilter = lambda x: not x.minhash.is_protein
    
    return [s for s in siglist if molfilter(s) and (s.minhash.ksize == ksize)]


In [27]:
%time siglist_k27_dna = filter_siglist(siglist, 27, "DNA")
print(len(siglist_k27_dna))
siglist_k27_dna[:5]

CPU times: user 11.2 ms, sys: 0 ns, total: 11.2 ms
Wall time: 10.8 ms
974


[SourmashSignature('B18-MAA000907-3_11_M-1-1|tissue:Liver|subtissue:Non-hepatocytes|cell_ontology_class:endothelial_cell_of_hepatic_sinusoid|free_annotation:NA', 42691629),
 SourmashSignature('K12-D041914-3_8_M-1-1|tissue:Bladder|subtissue:NA|cell_ontology_class:bladder_cell|free_annotation:Bladder_mesenchymal_cell', 933d809f),
 SourmashSignature('O14-B001717-3_38_F-1-1|tissue:Kidney|subtissue:NA|cell_ontology_class:endothelial_cell|free_annotation:NA', 38ddea47),
 SourmashSignature('L19-B001750-3_38_F-1-1|tissue:Spleen|subtissue:NA|cell_ontology_class:B_cell|free_annotation:NA', c04cf989),
 SourmashSignature('E6-D041914-3_8_M-1-1|tissue:Bladder|subtissue:NA|cell_ontology_class:bladder_cell|free_annotation:Bladder_mesenchymal_cell', 14b26b76)]

In [28]:
cat jaccard_utils.py


from joblib import Parallel, delayed
import itertools

def jaccard_sigs(i, j, siglist):
    return siglist[i].jaccard(siglist[j])

def jaccard_sigs_idf(i, j, siglist, idf, mean_idf_per_cell):
    i_hashes = filter_idf(siglist[i].get_mins(), inverse_document_frequency, mean_idf_per_cell)
    j_hashes = filter_idf(siglist[j].get_mins(), inverse_document_frequency, mean_idf_per_cell)
    return jacard(i_hashes, j_hashes)


def jaccard(sample1, sample2):
    """Jaccard similarity between two sets"""
    intersection = len(sample1.intersection(sample2))
    union = len(sample1.union(sample2))
    return intersection/union

In [83]:
import itertools

import numpy as np
from scipy.spatial.distance import squareform
from joblib import Parallel, delayed


def _compare_serial(siglist, iterator):
    n = len(siglist)
    values = np.ones((n, n))
    
    for i, j in iterator:
        jaccard = siglist[i].jaccard(siglist[j])
        
        values[i, j] = jaccard
        values[j, i] = jaccard
        
    return values


def compare_all_pairs(siglist, n_jobs=None, verbose=0, **kws):
    n = len(siglist)
    
    # Combinations makes all unique sets of pairs, e.g. (A, B) but not (B, A)
    iterator = itertools.combinations(range(n), 2)
    sig_iterator = itertools.combinations(siglist, 2)
    
    if n_jobs is None:
        values = _compare_serial(siglist, iterator)
    else:
        # This creates a condensed distance matrix
        condensed = Parallel(n_jobs=n_jobs, verbose=verbose, **kws)(delayed(sig1.jaccard)(sig2) for sig1, sig2 in sig_iterator)
        values = squareform(condensed)
#         values = condensed
        
    return values

## Serial vs parallel comparison

In [57]:
%time compare_all_pairs(siglist_k27_dna[:5])

CPU times: user 4.71 s, sys: 40 ms, total: 4.75 s
Wall time: 4.74 s


array([[1.      , 0.052594, 0.036896, 0.032664, 0.05257 ],
       [0.052594, 1.      , 0.048739, 0.044368, 0.104441],
       [0.036896, 0.048739, 1.      , 0.034484, 0.046098],
       [0.032664, 0.044368, 0.034484, 1.      , 0.0453  ],
       [0.05257 , 0.104441, 0.046098, 0.0453  , 1.      ]])

In [58]:
n = 6.26
n*(n-1)/2

16.4638

In [67]:
import cProfile
import re
cProfile.run('compare_all_pairs(siglist_k27_dna[:5], n_jobs=2, verbose=0)')

         669 function calls in 242.735 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000  242.735  242.735 <ipython-input-65-564599965297>:21(compare_all_pairs)
        4    0.000    0.000    0.000    0.000 <ipython-input-65-564599965297>:32(<genexpr>)
        1    0.000    0.000  242.735  242.735 <string>:1(<module>)
        4    0.000    0.000    0.000    0.000 _base.py:312(__init__)
       10    0.000    0.000    0.000    0.000 _base.py:382(__get_result)
        4    0.000    0.000    0.000    0.000 _base.py:388(add_done_callback)
       10    0.000    0.000  242.510   24.251 _base.py:405(result)
        1    0.000    0.000    0.000    0.000 _collections_abc.py:657(get)
        1    0.000    0.000    0.000    0.000 _memmapping_reducer.py:122(_get_temp_dir)
        2    0.000    0.000    0.000    0.000 _memmapping_reducer.py:279(__init__)
        1    0.000    0.000    0.000    0.000 _memmapping_

In [62]:
%time compare_all_pairs(siglist_k27_dna[:5], n_jobs=2, verbose=0)

CPU times: user 4min 1s, sys: 1.19 s, total: 4min 2s
Wall time: 4min 4s


[0.052594,
 0.036896,
 0.032664,
 0.05257,
 0.048739,
 0.044368,
 0.104441,
 0.034484,
 0.046098,
 0.0453]

In [66]:
%time compare_all_pairs(siglist_k27_dna[:5], n_ jobs=2, verbose=0, backend='threading')

CPU times: user 4.69 s, sys: 50.1 ms, total: 4.74 s
Wall time: 4.83 s


[0.052594,
 0.036896,
 0.032664,
 0.05257,
 0.048739,
 0.044368,
 0.104441,
 0.034484,
 0.046098,
 0.0453]

In [63]:
%time compare_all_pairs(siglist_k27_dna[:5], n_jobs=2, verbose=10)

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   26.5s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:  1.6min


CPU times: user 4min 2s, sys: 170 ms, total: 4min 2s
Wall time: 4min 5s


[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed:  4.1min finished


[0.052594,
 0.036896,
 0.032664,
 0.05257,
 0.048739,
 0.044368,
 0.104441,
 0.034484,
 0.046098,
 0.0453]

In [48]:
(4*60 + 2)/6

40.333333333333336

In [40]:
%time compare_all_pairs(siglist_k27_dna[:10])

CPU times: user 29.3 s, sys: 190 ms, total: 29.5 s
Wall time: 29.5 s


array([[1.      , 0.052594, 0.036896, 0.032664, 0.05257 , 0.044477,
        0.026633, 0.032755, 0.034058, 0.041419],
       [0.052594, 1.      , 0.048739, 0.044368, 0.104441, 0.075196,
        0.031727, 0.043431, 0.04097 , 0.046796],
       [0.036896, 0.048739, 1.      , 0.034484, 0.046098, 0.043704,
        0.024897, 0.035947, 0.028144, 0.039598],
       [0.032664, 0.044368, 0.034484, 1.      , 0.0453  , 0.041797,
        0.030011, 0.049822, 0.044296, 0.038339],
       [0.05257 , 0.104441, 0.046098, 0.0453  , 1.      , 0.073886,
        0.035624, 0.044594, 0.045683, 0.048744],
       [0.044477, 0.075196, 0.043704, 0.041797, 0.073886, 1.      ,
        0.032788, 0.047804, 0.036424, 0.041845],
       [0.026633, 0.031727, 0.024897, 0.030011, 0.035624, 0.032788,
        1.      , 0.026536, 0.032024, 0.02673 ],
       [0.032755, 0.043431, 0.035947, 0.049822, 0.044594, 0.047804,
        0.026536, 1.      , 0.038092, 0.03525 ],
       [0.034058, 0.04097 , 0.028144, 0.044296, 0.045683, 0.0364

In [64]:
%time compare_all_pairs(siglist_k27_dna[:10], n_jobs=2)

CPU times: user 17min 58s, sys: 2 s, total: 18min
Wall time: 18min 2s


[0.052594,
 0.036896,
 0.032664,
 0.05257,
 0.044477,
 0.026633,
 0.032755,
 0.034058,
 0.041419,
 0.048739,
 0.044368,
 0.104441,
 0.075196,
 0.031727,
 0.043431,
 0.04097,
 0.046796,
 0.034484,
 0.046098,
 0.043704,
 0.024897,
 0.035947,
 0.028144,
 0.039598,
 0.0453,
 0.041797,
 0.030011,
 0.049822,
 0.044296,
 0.038339,
 0.073886,
 0.035624,
 0.044594,
 0.045683,
 0.048744,
 0.032788,
 0.047804,
 0.036424,
 0.041845,
 0.026536,
 0.032024,
 0.02673,
 0.038092,
 0.03525,
 0.033602]

In [37]:
%time compare_all_pairs(siglist_k27_dna[:20])

CPU times: user 1min 39s, sys: 640 ms, total: 1min 40s
Wall time: 1min 40s


array([[1.      , 0.052594, 0.036896, 0.032664, 0.05257 , 0.044477,
        0.026633, 0.032755, 0.034058, 0.041419, 0.028012, 0.038435,
        0.056022, 0.031442, 0.050267, 0.044169, 0.034657, 0.026591,
        0.043057, 0.041743],
       [0.052594, 1.      , 0.048739, 0.044368, 0.104441, 0.075196,
        0.031727, 0.043431, 0.04097 , 0.046796, 0.034856, 0.053615,
        0.111691, 0.039028, 0.104456, 0.055113, 0.039543, 0.041071,
        0.073189, 0.057099],
       [0.036896, 0.048739, 1.      , 0.034484, 0.046098, 0.043704,
        0.024897, 0.035947, 0.028144, 0.039598, 0.02737 , 0.036212,
        0.046941, 0.027451, 0.045927, 0.041513, 0.035812, 0.0268  ,
        0.036648, 0.036744],
       [0.032664, 0.044368, 0.034484, 1.      , 0.0453  , 0.041797,
        0.030011, 0.049822, 0.044296, 0.038339, 0.033337, 0.032502,
        0.045438, 0.043237, 0.036435, 0.03777 , 0.027723, 0.023143,
        0.03404 , 0.046541],
       [0.05257 , 0.104441, 0.046098, 0.0453  , 1.      , 0.073886,


In [38]:
%time compare_all_pairs(siglist_k27_dna[:20], n_jobs=2)

TypeError: 'function' object is not iterable

## Try memory mapping??

```
>>> import tempfile
>>> import os
>>> from joblib import load, dump

>>> temp_folder = tempfile.mkdtemp()
>>> filename = os.path.join(temp_folder, 'joblib_test.mmap')
>>> if os.path.exists(filename): os.unlink(filename)
>>> _ = dump(large_array, filename)
>>> large_memmap = load(filename, mmap_mode='r+')
```

In [68]:
folder

'/data_lg/olga/maca/facs/sourmash_compute_tissue_subset/'

In [70]:
from joblib import load, dump

filename = os.path.join(folder, 'siglist_first5.memmap')
if os.path.exists(filename): 
    os.unlink(filename)
_ = dump(siglist_k27_dna[:5], filename)
siglist_k27_dna_memmap = load(filename, mmap_mode='r+')

In [72]:
%time compare_all_pairs(siglist_k27_dna_memmap[:5], n_jobs=2, verbose=0)



CPU times: user 4min 4s, sys: 1.15 s, total: 4min 5s
Wall time: 4min 7s


[0.052594,
 0.036896,
 0.032664,
 0.05257,
 0.048739,
 0.044368,
 0.104441,
 0.034484,
 0.046098,
 0.0453]

In [81]:
%time compare_all_pairs(siglist_k27_dna[:5], n_jobs=1, require='sharedmem')

CPU times: user 5.12 s, sys: 40 ms, total: 5.16 s
Wall time: 5.14 s


[0.052594,
 0.036896,
 0.032664,
 0.05257,
 0.048739,
 0.044368,
 0.104441,
 0.034484,
 0.046098,
 0.0453]

In [80]:
%time compare_all_pairs(siglist_k27_dna_memmap[:5], n_jobs=1, require='sharedmem')

CPU times: user 5.08 s, sys: 69.7 ms, total: 5.14 s
Wall time: 5.13 s


[0.052594,
 0.036896,
 0.032664,
 0.05257,
 0.048739,
 0.044368,
 0.104441,
 0.034484,
 0.046098,
 0.0453]

In [79]:
%time compare_all_pairs(siglist_k27_dna_memmap[:5], n_jobs=2, require='sharedmem')

CPU times: user 5.02 s, sys: 30.1 ms, total: 5.05 s
Wall time: 5.13 s


[0.052594,
 0.036896,
 0.032664,
 0.05257,
 0.048739,
 0.044368,
 0.104441,
 0.034484,
 0.046098,
 0.0453]

In [78]:
%time compare_all_pairs(siglist_k27_dna[:5], n_jobs=2, require='sharedmem')

CPU times: user 5.16 s, sys: 30.1 ms, total: 5.19 s
Wall time: 5.27 s


[0.052594,
 0.036896,
 0.032664,
 0.05257,
 0.048739,
 0.044368,
 0.104441,
 0.034484,
 0.046098,
 0.0453]

In [77]:
%time compare_all_pairs(siglist_k27_dna[:5], n_jobs=4, require='sharedmem')

CPU times: user 5.17 s, sys: 40 ms, total: 5.21 s
Wall time: 5.28 s


[0.052594,
 0.036896,
 0.032664,
 0.05257,
 0.048739,
 0.044368,
 0.104441,
 0.034484,
 0.046098,
 0.0453]

In [84]:
%time compare_all_pairs(siglist_k27_dna[:10], n_jobs=4, require='sharedmem')

CPU times: user 22.9 s, sys: 230 ms, total: 23.2 s
Wall time: 23.1 s


array([[0.      , 0.052594, 0.036896, 0.032664, 0.05257 , 0.044477,
        0.026633, 0.032755, 0.034058, 0.041419],
       [0.052594, 0.      , 0.048739, 0.044368, 0.104441, 0.075196,
        0.031727, 0.043431, 0.04097 , 0.046796],
       [0.036896, 0.048739, 0.      , 0.034484, 0.046098, 0.043704,
        0.024897, 0.035947, 0.028144, 0.039598],
       [0.032664, 0.044368, 0.034484, 0.      , 0.0453  , 0.041797,
        0.030011, 0.049822, 0.044296, 0.038339],
       [0.05257 , 0.104441, 0.046098, 0.0453  , 0.      , 0.073886,
        0.035624, 0.044594, 0.045683, 0.048744],
       [0.044477, 0.075196, 0.043704, 0.041797, 0.073886, 0.      ,
        0.032788, 0.047804, 0.036424, 0.041845],
       [0.026633, 0.031727, 0.024897, 0.030011, 0.035624, 0.032788,
        0.      , 0.026536, 0.032024, 0.02673 ],
       [0.032755, 0.043431, 0.035947, 0.049822, 0.044594, 0.047804,
        0.026536, 0.      , 0.038092, 0.03525 ],
       [0.034058, 0.04097 , 0.028144, 0.044296, 0.045683, 0.0364

In [86]:
%time compare_all_pairs(siglist_k27_dna_memmap[:10], n_jobs=4, require='sharedmem')

CPU times: user 5.12 s, sys: 120 ms, total: 5.24 s
Wall time: 5.3 s


array([[0.      , 0.052594, 0.036896, 0.032664, 0.05257 ],
       [0.052594, 0.      , 0.048739, 0.044368, 0.104441],
       [0.036896, 0.048739, 0.      , 0.034484, 0.046098],
       [0.032664, 0.044368, 0.034484, 0.      , 0.0453  ],
       [0.05257 , 0.104441, 0.046098, 0.0453  , 0.      ]])

In [92]:
%time compare_all_pairs(siglist_k27_dna_memmap[:10], n_jobs=4)



CPU times: user 4min 1s, sys: 1.28 s, total: 4min 2s
Wall time: 4min 5s


array([[0.      , 0.052594, 0.036896, 0.032664, 0.05257 ],
       [0.052594, 0.      , 0.048739, 0.044368, 0.104441],
       [0.036896, 0.048739, 0.      , 0.034484, 0.046098],
       [0.032664, 0.044368, 0.034484, 0.      , 0.0453  ],
       [0.05257 , 0.104441, 0.046098, 0.0453  , 0.      ]])

In [85]:
%time compare_all_pairs(siglist_k27_dna[:20], n_jobs=8, require='sharedmem')

CPU times: user 1min 32s, sys: 1.42 s, total: 1min 34s
Wall time: 1min 33s


array([[0.      , 0.052594, 0.036896, 0.032664, 0.05257 , 0.044477,
        0.026633, 0.032755, 0.034058, 0.041419, 0.028012, 0.038435,
        0.056022, 0.031442, 0.050267, 0.044169, 0.034657, 0.026591,
        0.043057, 0.041743],
       [0.052594, 0.      , 0.048739, 0.044368, 0.104441, 0.075196,
        0.031727, 0.043431, 0.04097 , 0.046796, 0.034856, 0.053615,
        0.111691, 0.039028, 0.104456, 0.055113, 0.039543, 0.041071,
        0.073189, 0.057099],
       [0.036896, 0.048739, 0.      , 0.034484, 0.046098, 0.043704,
        0.024897, 0.035947, 0.028144, 0.039598, 0.02737 , 0.036212,
        0.046941, 0.027451, 0.045927, 0.041513, 0.035812, 0.0268  ,
        0.036648, 0.036744],
       [0.032664, 0.044368, 0.034484, 0.      , 0.0453  , 0.041797,
        0.030011, 0.049822, 0.044296, 0.038339, 0.033337, 0.032502,
        0.045438, 0.043237, 0.036435, 0.03777 , 0.027723, 0.023143,
        0.03404 , 0.046541],
       [0.05257 , 0.104441, 0.046098, 0.0453  , 0.      , 0.073886,


In [90]:
%timeit compare_all_pairs(siglist_k27_dna_memmap[:20], n_jobs=8, require='sharedmem')

5.17 s ± 124 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [89]:
%timeit compare_all_pairs(siglist_k27_dna_memmap[:20], n_jobs=8, require='sharedmem', backend='threading')

5.34 s ± 211 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [91]:
%timeit compare_all_pairs(siglist_k27_dna_memmap[:20], n_jobs=8)

4min 5s ± 2.81 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [126]:
import glob
import itertools
import os
import sys
import tempfile
import time

import click
from joblib import Parallel, delayed, load, dump
import numpy as np
import pandas as pd
from scipy.spatial.distance import squareform
from sourmash import signature as sig
from tqdm import tqdm


KSIZES = 21, 27, 33, 51
MOLECULES = 'dna', 'protein'
LOG2_NUM_HASHES = 8, 9, 10, 11, 12, 13, 14, 15, 16

def seconds_to_formatted_time(seconds):
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    return '{:d}:{:02d}:{:02d}'.format(h, m, s)


def filter_siglist(siglist, ksize, moltype):
    if moltype == 'protein':
        molfilter = lambda x: x.minhash.is_protein
    else:
        molfilter = lambda x: not x.minhash.is_protein

    return [s for s in siglist if molfilter(s) and (s.minhash.ksize == ksize)]


def load_signatures(filenames):

    siglist = []

    for filename in tqdm(filenames):
        loaded = sig.load_signatures(filename)
        siglist.extend(loaded)
    return siglist


def _compare_serial(siglist, iterator):
    n = len(siglist)
    values = np.ones((n, n))

    for i, j in iterator:
        jaccard = siglist[i].jaccard(siglist[j])

        values[i, j] = jaccard
        values[j, i] = jaccard

    return values


def compare_all_pairs(siglist, n_jobs=None):
    n = len(siglist)

    # Combinations makes all unique sets of pairs, e.g. (A, B) but not (B, A)
    iterator = itertools.combinations(range(n), 2)
    sig_iterator = itertools.combinations(siglist, 2)

    if n_jobs is None or n_jobs == 1:
        values = _compare_serial(siglist, iterator)
    else:
        # This creates a condensed distance matrix
        condensed = Parallel(n_jobs=n_jobs, require='sharedmem',
                             backend='threading')(
            delayed(sig1.jaccard)(sig2) for sig1, sig2 in sig_iterator)
        values = squareform(condensed)

    return values


def _memmap_siglist(siglist):
    """Write a memory-mapped array of signatures"""
    temp_folder = tempfile.mkdtemp()
    filename = os.path.join(temp_folder, 'joblib_test.mmap')
    if os.path.exists(filename): os.unlink(filename)
    _ = dump(siglist, filename)
    large_memmap = load(filename, mmap_mode='r+')
    return large_memmap



def downsample_and_compare(signatures, log2_num_hash, molecule, ksize, n_jobs=None):
    t0 = time.time()
    
    filtered = filter_siglist(signatures, ksize, molecule)
    t1 = time.time()
    
    num_hash = 2**log2_num_hash
    downsampled = [s.minhash.downsample_n(num_hash) for s in filtered]
    t2 = time.time()
    
    memmapped = _memmap_siglist(downsampled)
    t3 = time.time()
    
    values = compare_all_pairs(memmapped, n_jobs=n_jobs)
    t4 = time.time()

    names = [s.name().split('|')[0] for s in filtered]
    t5 = time.time()

    df = pd.DataFrame(values, index=names, columns=names)
    t6 = time.time()

    print(f"--- num_hash: {num_hash}, molecule: {molecule}, "
          f"ksize: {ksize} ---")
    print(f"Time to filter on ksize and molecule: {seconds_to_formatted_time(t1-t0)}")
    print(f"Time to downsample on num_hash: {seconds_to_formatted_time(t2-t1)}")
    print(f"Time to write memory-mapped array: {seconds_to_formatted_time(t3-t2)}")
    print(f"Time to compare all pairs: {seconds_to_formatted_time(t4-t3)}")
    print(f"Time to get names of samples: {seconds_to_formatted_time(t5-t4)}")
    print(f"Time to create dataframe: {seconds_to_formatted_time(t6-t5)}")
    return df

In [96]:


iterable = itertools.product(log2_num_hashes, molecules, ksizes)

In [103]:
range(2)

range(0, 2)

## Do comparisons in serial

In [114]:
names = [s.name() for s in siglist] 
names[:5]

['B18-MAA000907-3_11_M-1-1|tissue:Liver|subtissue:Non-hepatocytes|cell_ontology_class:endothelial_cell_of_hepatic_sinusoid|free_annotation:NA',
 'B18-MAA000907-3_11_M-1-1|tissue:Liver|subtissue:Non-hepatocytes|cell_ontology_class:endothelial_cell_of_hepatic_sinusoid|free_annotation:NA',
 'B18-MAA000907-3_11_M-1-1|tissue:Liver|subtissue:Non-hepatocytes|cell_ontology_class:endothelial_cell_of_hepatic_sinusoid|free_annotation:NA',
 'B18-MAA000907-3_11_M-1-1|tissue:Liver|subtissue:Non-hepatocytes|cell_ontology_class:endothelial_cell_of_hepatic_sinusoid|free_annotation:NA',
 'B18-MAA000907-3_11_M-1-1|tissue:Liver|subtissue:Non-hepatocytes|cell_ontology_class:endothelial_cell_of_hepatic_sinusoid|free_annotation:NA']

In [112]:
s = siglist[0]

In [113]:
s_downsampled = s.minhash.downsample_n(100)
s_downsampled

<sourmash._minhash.MinHash at 0x7ffd634763c8>

In [None]:
s_downsampled.j

In [None]:
s.ja

In [119]:
%%time

n_jobs = 4

log2_num_hashes = 8, 9, 10
molecules = 'DNA', 'protein'
ksizes = 21, 27

similarities = []

iterable = itertools.product(log2_num_hashes, molecules, ksizes)

for log2_num_hash, molecule, ksize in iterable:
    print(f'--- log2_num_hash: {log2_num_hash}, molecule: {molecule}, ksize: {ksize} ---')
    %time similarity = downsample_and_compare(siglist, log2_num_hash, molecule, ksize, n_jobs)
    similarities.append(similarity)

--- log2_num_hash: 8, molecule: DNA, ksize: 21 ---
--- num_hash: 256, molecule: DNA, ks§ize: 21 ---
Time to filter on ksize and molecule: 0.0057s
Time to downsample on num_hash: 5.3e+02s
Time to write memory-mapped array: 5.5s
Time to compare all pairs: 1.2e+02s
CPU times: user 10min 44s, sys: 7.24 s, total: 10min 51s
Wall time: 10min 50s
--- log2_num_hash: 8, molecule: DNA, ksize: 27 ---
--- num_hash: 256, molecule: DNA, ks§ize: 27 ---
Time to filter on ksize and molecule: 0.0074s
Time to downsample on num_hash: 5.2e+02s
Time to write memory-mapped array: 5.4s
Time to compare all pairs: 1.1e+02s
CPU times: user 10min 28s, sys: 10.8 s, total: 10min 39s
Wall time: 10min 40s
--- log2_num_hash: 8, molecule: protein, ksize: 21 ---
--- num_hash: 256, molecule: protein, ks§ize: 21 ---
Time to filter on ksize and molecule: 0.0081s
Time to downsample on num_hash: 5.2e+02s
Time to write memory-mapped array: 5.5s
Time to compare all pairs: 1.2e+02s
CPU times: user 10min 32s, sys: 11.1 s, total: 

In [121]:
import xarray as xr

In [123]:
iterable = itertools.product(log2_num_hashes, molecules, ksizes)

data_arrays = {}

DIMS = ('cell1', 'cell2')


for (log2_num_hash, molecule, ksize), similarity in zip(iterable, similarities):
    name = f'molecule={molecule}_ksize={ksize}_log2numhash={log2_num_hash}'
    
    coords = {'cell1': similarity.index, 'cell2': similarity.columns}
    data_array = xr.DataArray(similarity, coords=coords, dims=DIMS)
    data_arrays[name] = data_array
    
dataset = xr.Dataset(data_arrays)
dataset

<xarray.Dataset>
Dimensions:                                   (cell1: 1000, cell2: 1000)
Coordinates:
  * cell1                                     (cell1) object 'A1-B002764-3_38_F-1-1' ... 'P9-MAA000487-3_10_M-1-1'
  * cell2                                     (cell2) object 'A1-B002764-3_38_F-1-1' ... 'P9-MAA000487-3_10_M-1-1'
Data variables:
    molecule=DNA_ksize=21_log2numhash=8       (cell1, cell2) float64 0.0 ... 0.0
    molecule=DNA_ksize=27_log2numhash=8       (cell1, cell2) float64 0.0 ... 0.0
    molecule=protein_ksize=21_log2numhash=8   (cell1, cell2) float64 0.0 ... 0.0
    molecule=protein_ksize=27_log2numhash=8   (cell1, cell2) float64 0.0 ... 0.0
    molecule=DNA_ksize=21_log2numhash=9       (cell1, cell2) float64 0.0 ... 0.0
    molecule=DNA_ksize=27_log2numhash=9       (cell1, cell2) float64 0.0 ... 0.0
    molecule=protein_ksize=21_log2numhash=9   (cell1, cell2) float64 0.0 ... 0.0
    molecule=protein_ksize=27_log2numhash=9   (cell1, cell2) float64 0.0 ... 0.0
   

In [124]:
! tree -d /data_lg/olga

/data_lg/olga
└── maca
    └── facs
        └── sourmash_compute_tissue_subset
            └── dask

4 directories


### Write the array!!!

In [132]:
folder = "/data_lg/olga/maca/facs/sourmash_compute_tissue_subset/xarray/"
! mkdir $folder
basename = "similarities_ksizes=21,27_molecules=DNA,protein_log2numhashes=8,9,10.netcdf"

filename = f"{folder}/{basename}"

dataset.to_netcdf(filename)

mkdir: cannot create directory ‘/data_lg/olga/maca/facs/sourmash_compute_tissue_subset/xarray/’: File exists


In [133]:
ls -lha $folder

total 8.2M
drwxrwxr-x 2 olga olga 4.0K Feb 27 11:19 [0m[38;5;27m.[0m/
drwxrwxr-x 4 olga olga 128K Feb 27 11:18 [38;5;27m..[0m/
-rw-rw-r-- 1 olga olga  92M Feb 27 11:19 similarities_ksizes=21,27_molecules=DNA,protein_log2numhashes=8,9,10.netcdf


In [131]:
ls -lha $filename

ls: cannot access /data_lg/olga/maca/facs/sourmash_compute_tissue_subset/xarray//similarities_ksizes=21,27_molecules=DNA,protein_log2numhashes=8,9,10.netcdf: No such file or directory


## Pre-memory-map the signature list and compare in serial just because

In [134]:
%%time

log2_num_hashes = 8, 9, 10
molecules = 'DNA', 'protein'
ksizes = 21, 27

similarities = []

%time siglist_memmaped = _memmap_siglist(siglist)

iterable = itertools.product(log2_num_hashes, molecules, ksizes)

for log2_num_hash, molecule, ksize in iterable:
    print(f'--- log2_num_hashes: {log2_num_hashes}, molecule: {molecule}, ksize: {ksize} ---')
    similarity = downsample_and_compare(siglist_memmaped, log2_num_hash, molecule, ksize, n_jobs)
    similarities.append(similarity)

KeyboardInterrupt: 

--- log2_num_hashes: (8, 9, 10), molecule: DNA, ksize: 21 ---



KeyboardInterrupt



In [135]:
range(3)

range(0, 3)

## Parallelize the comparisons

In [None]:

iterable = itertools.product(log2_num_hashes, molecules, ksizes)
%time similarities = Parallel(n_jobs=n_jobs)(delayed(downsample_and_compare) (siglist, log2_num_hash, molecule, ksize, n_jobs) for log2_num_hashes, molecule, ksize in iterable)

## Pre-memory-map the signature and do the parallelization

In [None]:
%%time

%time siglist_memmaped = _memmap_siglist(siglist)

iterable = itertools.product(log2_num_hashes, molecules, ksizes)

%time similarities = Parallel(n_jobs=n_jobs)(delayed(downsample_and_compare) (siglist_memmaped, log2_num_hash, molecule, ksize, n_jobs) for log2_num_hashes, molecule, ksize in iterable)

In [None]:
range(2)