# Introduction

Which is faster for checking whether a cell barcode is one we're interested in: my dictionary-tree search or a simple compiled regular expression?

In [15]:
import csv
import os
import re
from collections import namedtuple
from functools import partial
from glob import glob

import pysam

In [16]:
tenx_root = '/woldlab/loxcyc/home/diane/proj/brian-2018-01-10x/'

# Map each library number to its position-sorted BAM under tenx_root.
# Note that there is no entry for number 2.
tenx_bam = {
    number: os.path.join(
        tenx_root,
        'Wold10x-{}-encode-count-cells10000'.format(number),
        'outs',
        'possorted_genome_bam.bam',
    )
    for number in (1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)
}

# Report (but do not fail on) any BAM file that is not present on disk.
for cluster, bam_path in tenx_bam.items():
    if not os.path.exists(bam_path):
        print('missing {}'.format(bam_path))

In [3]:
CellBarcode = namedtuple('CellBarcode', ['run', 'stage', 'barcode', 'cluster'])


def parse_barcode_csv(filename, limit=None):
    """Yield a CellBarcode for each data row of a barcode-to-cluster CSV.

    The first CSV row is treated as a header and skipped.  Column 0 of every
    following row is split purely by position:

    * ``run``     -- the characters from index 4 up to the first ``_``.
    * ``stage``   -- the text after the first ``_`` up to and including the
      character that follows the second ``_``; ``_`` is replaced by ``.``.
    * ``barcode`` -- the remaining text with its last two characters removed,
      followed by ``-`` and the final character of the field.

    Column 1 is passed through unchanged as ``cluster``.  All fields are
    strings.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    limit : int or None
        Maximum number of data rows to yield; ``None`` yields every row.

    Yields
    ------
    CellBarcode
    """
    # newline='' is the csv module's recommended way to open its input.
    with open(filename, newline='') as barcodes:
        reader = csv.reader(barcodes)
        # Skip the header.  The default keeps an empty file from raising
        # StopIteration inside the generator (a RuntimeError for the caller).
        next(reader, None)
        for i, row in enumerate(reader):
            # Test before yielding so that exactly `limit` rows are produced.
            if limit is not None and i >= limit:
                break
            field0 = row[0]
            run_end = field0.find('_')
            stage_start = run_end + 1
            # +2 to be one after the trailing 0 or 5
            stage_end = field0.find('_', stage_start) + 2

            run = field0[4:run_end]
            stage = field0[stage_start:stage_end].replace('_', '.')
            barcode = field0[stage_end:-2] + '-' + field0[-1:]
            cluster = row[1]
            yield CellBarcode(run, stage, barcode, cluster)

In [4]:
def build_cluster_recognizer(filename):
    """Load a barcode CSV into nested per-cluster, per-run character trees.

    The result maps ``cluster -> run -> tree``.  Each tree is a dict of dicts
    with one level per barcode character, so a barcode is stored as the path
    obtained by following its characters from the root.  The node reached by
    the last character carries no terminal marker.
    """
    recognizers = {}

    for entry in parse_barcode_csv(filename):
        # Find (or create) the root of the tree for this cluster and run.
        node = recognizers.setdefault(entry.cluster, {}).setdefault(entry.run, {})
        # Descend one level per character, creating nodes as needed.
        for symbol in entry.barcode:
            node = node.setdefault(symbol, {})

    return recognizers

In [9]:
def is_recognized_barcode(barcode_tree, cell_barcode):
    """Return True if ``cell_barcode`` can be walked from the root of the tree.

    ``barcode_tree`` is a dict-of-dicts keyed by single characters.  The walk
    follows one key per character of ``cell_barcode`` and fails on the first
    character that has no matching child.

    Note: the result is True as soon as every character was found, so any
    prefix of a stored barcode (including the empty string) is accepted.
    """
    node = barcode_tree
    try:
        for symbol in cell_barcode:
            node = node[symbol]
    except KeyError:
        # No child for this character: the barcode is not in the tree.
        return False
    return True

In [8]:
# Build the cluster -> run -> barcode-tree lookup from the barcode-to-cluster CSV.
clusters = build_cluster_recognizer('../monocle/mouse/barcodes-to-cluster.csv')

In [None]:
def get_cluster_barcodes(cluster):
    """List the per-run barcode files of one cluster found in the current directory.

    Files are located with the glob ``barcodes-10x-cluster-<cluster>-run-*.txt``
    (relative to the working directory).  Names whose run part is not a number
    are ignored.

    Parameters
    ----------
    cluster : int or str
        Cluster identifier as it appears in the file names.

    Returns
    -------
    list of (int, str)
        ``(run, filename)`` pairs sorted by run number.
    """
    # Raw string so \d and \. reach the regex engine untouched; the cluster is
    # escaped so it is always matched literally.
    run_re = re.compile(
        r'barcodes-10x-cluster-{cluster}-run-(?P<run>\d+)\.txt'.format(
            cluster=re.escape(str(cluster))))
    pattern = 'barcodes-10x-cluster-{cluster}-run-*.txt'.format(cluster=cluster)

    results = []
    for filename in glob(pattern):
        match = run_re.match(filename)
        if match is None:
            # The glob is looser than the regex (e.g. "run-x.txt"); skip it.
            continue
        results.append((int(match.group('run')), filename))
    return sorted(results)

In [12]:
def build_barcode_re(barcode_filename):
    """Compile one regular expression that matches any barcode listed in a file.

    Each non-blank line is split on ``:`` and its third field is taken as the
    barcode (presumably SAM tag text such as ``CB:Z:<barcode>`` -- confirm
    against the files).  Every barcode becomes its own parenthesised
    alternative of the returned pattern.

    Parameters
    ----------
    barcode_filename : str
        Path of the text file to read.

    Returns
    -------
    re.Pattern
        Alternation of all barcodes.  If the file lists no barcodes the
        pattern never matches.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than three ``:``-separated fields.
    """
    barcodes = []
    with open(barcode_filename, 'rt') as instream:
        for line in instream:
            line = line.rstrip()
            if not line:
                # Tolerate blank lines (e.g. at the end of the file).
                continue
            barcode = line.split(':')[2]
            # Escape so the barcode is always matched as literal text.
            barcodes.append('(' + re.escape(barcode) + ')')
    if not barcodes:
        # '|'.join([]) is '', which would match every input; use an empty
        # negative lookahead, which can never match, instead.
        return re.compile(r'(?!)')
    return re.compile('|'.join(barcodes))

# Compiled alternation regex for the barcode file of cluster 12, run 1 (per the file name).
barcode_re = build_barcode_re('barcodes-10x-cluster-12-run-1.txt')

In [13]:
def is_re_recognized_barcode(barcode_re, cell_barcode):
    """Return True if ``barcode_re`` matches at the start of ``cell_barcode``.

    ``barcode_re`` is a compiled pattern; ``match`` anchors only the start,
    so trailing characters after a matched barcode are not rejected.
    """
    return barcode_re.match(cell_barcode) is not None

In [14]:
def count_matched_barcodes(filename, recognizer):
    """Count the reads of a BAM file whose ``CB`` tag is accepted by ``recognizer``.

    Parameters
    ----------
    filename : str
        Path of the BAM file, opened with mode ``'rb'``.
    recognizer : callable
        Called with the value of each read's ``CB`` tag; a truthy result
        counts the read.

    Returns
    -------
    int
        Number of reads that carry a ``CB`` tag accepted by ``recognizer``.
        Reads without a ``CB`` tag are skipped.
    """
    count = 0
    # The context manager closes the BAM handle even if the recognizer raises;
    # previously the file was left open.
    with pysam.AlignmentFile(filename, 'rb') as bam:
        for read in bam:
            if read.has_tag('CB') and recognizer(read.get_tag('CB')):
                count += 1
    return count

In [20]:
%timeit count_matched_barcodes(tenx_bam[1], partial(is_recognized_barcode, clusters['12']["1"]))

1 loop, best of 5: 23min 7s per loop


In [21]:
%timeit count_matched_barcodes(tenx_bam[1], partial(is_re_recognized_barcode, barcode_re))

1 loop, best of 5: 43min 6s per loop


Try running with grep, external to the notebook

<pre>
time cat <(samtools view -H $RUN1/outs/possorted_genome_bam.bam) \
<(grep -f barcodes-10x-cluster-12-run-1.txt <(samtools view $RUN1/outs/possorted_genome_bam.bam)) | wc -l
8254835

real    24m50.883s
user    0m10.560s
sys     1m10.827s
</pre>


In [22]:
%timeit is_recognized_barcode(clusters['12']["1"], "TTTGTCAAGCGGATCA-1")

The slowest run took 29.86 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 5: 529 ns per loop


In [25]:
%timeit is_recognized_barcode(clusters['12']["1"], "AACACGTGTAGAGTGC-1")

The slowest run took 20.97 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 5: 891 ns per loop


In [23]:
%timeit is_re_recognized_barcode(barcode_re, "TTTGTCAAGCGGATCA-1")

The slowest run took 4.05 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 5: 3.2 µs per loop


In [24]:
%timeit is_re_recognized_barcode(barcode_re, "AACACGTGTAGAGTGC-1")

The slowest run took 8.06 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 5: 602 ns per loop


In [26]:
# Compiled alternation regex for the barcode file of cluster 0, run 3 (per the file name).
barcode_0_run_3re = build_barcode_re('barcodes-10x-cluster-0-run-3.txt')

In [27]:
%timeit is_re_recognized_barcode(barcode_0_run_3re, 'TTTGTCATCTGGTGTA-1')


10 loops, best of 5: 111 ms per loop


In [29]:
%timeit is_recognized_barcode(clusters['0']["3"], "TTTGTCATCTGGTGTA-1")

The slowest run took 10.68 times longer than the fastest. This could mean that an intermediate result is being cached.
1000000 loops, best of 5: 912 ns per loop
