# Introduction

My first attempt to remove colliding cellular barcodes was painfully slow.

My hunch is that the Python regular expression library is O(N), while my dictionary-based barcode recognizer is O(log N). My barcode recognizer was slightly faster than grep (though that compares the best of 5 trials for Python against a single trial for the shell).

I think the other speed-up from Peng's original run came from having multiple greps running at once.

In [1]:
import os
import re
from collections import namedtuple
import pysam
import csv
from functools import partial
from distributed import Client, Pub, Sub

In [2]:
# Directory that holds one 'Wold10x-<run>-encode-count-cells10000' tree per run.
tenx_root = '/woldlab/loxcyc/home/diane/proj/brian-2018-01-10x/'

# Run id -> path of that run's possorted_genome_bam.bam.
# Note that the list of runs skips '2'.
tenx_bam = {
    run_id: os.path.join(
        tenx_root,
        'Wold10x-{}-encode-count-cells10000'.format(run_id),
        'outs',
        'possorted_genome_bam.bam',
    )
    for run_id in ('1', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13')
}

# Report every expected BAM file that is not present on disk.
for cluster in tenx_bam:
    bam_path = tenx_bam[cluster]
    if not os.path.exists(bam_path):
        print('missing {}'.format(bam_path))

In [3]:
# One parsed row of the barcode-to-cluster CSV.
CellBarcode = namedtuple('CellBarcode', ['run', 'stage', 'barcode', 'cluster'])


def parse_barcode_csv(filename, limit=None):
    """Yield a CellBarcode for each data row of a barcode-to-cluster CSV.

    The first line of the file is treated as a header and skipped.  In every
    following row, column 0 is a composite cell id and column 1 is the
    cluster label.  The cell id is split purely by position:

    * the first four characters are discarded,
    * ``run`` is everything from there up to the first ``_``,
    * ``stage`` runs from after that ``_`` to one character past the second
      ``_`` (so ``e10_5`` becomes ``e10.5``),
    * ``barcode`` is the remainder with its second-to-last character replaced
      by ``-`` (``ACGT_1`` becomes ``ACGT-1``).

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    limit : int, optional
        Maximum number of data rows to yield.  ``None`` (the default) yields
        every row.

    Yields
    ------
    CellBarcode
        The parsed ``run``, ``stage``, ``barcode`` and ``cluster`` strings.
    """
    # newline='' lets the csv module do its own line-ending handling.
    with open(filename, newline='') as barcodes:
        reader = csv.reader(barcodes)
        # Skip the header.  The default keeps an empty file from raising
        # StopIteration inside this generator (a RuntimeError under PEP 479).
        next(reader, None)
        for i, row in enumerate(reader):
            # Check before yielding so that exactly `limit` rows are produced.
            if limit is not None and i >= limit:
                break

            cell_id = row[0]
            run_end = cell_id.find('_')
            stage_start = run_end + 1
            # +2 to be one after the trailing 0 or 5
            stage_end = cell_id.find('_', stage_start) + 2

            run = cell_id[4:run_end]
            stage = cell_id[stage_start:stage_end].replace('_', '.')
            barcode = cell_id[stage_end:-2] + '-' + cell_id[-1:]
            yield CellBarcode(run, stage, barcode, row[1])

In [4]:
def build_cluster_recognizer(filename):
    """Load a barcode CSV into nested dictionaries keyed by cluster and run.

    The result maps ``cluster -> run -> barcode tree``.  A barcode tree is a
    dictionary of dictionaries with one level per barcode character, so
    following a barcode character by character walks down from the root.

    Parameters
    ----------
    filename : str
        Path handed to ``parse_barcode_csv``.

    Returns
    -------
    dict
        The ``cluster -> run -> barcode tree`` mapping.
    """
    recognizer = {}

    for cell in parse_barcode_csv(filename):
        runs_in_cluster = recognizer.setdefault(cell.cluster, {})
        node = runs_in_cluster.setdefault(cell.run, {})
        # Descend one level per character, creating missing levels on the way.
        for letter in cell.barcode:
            node = node.setdefault(letter, {})

    return recognizer

In [5]:
def is_recognized_barcode(barcode_tree, cell_barcode):
    """Return True when ``cell_barcode`` can be walked through ``barcode_tree``.

    ``barcode_tree`` is a dictionary of dictionaries with one level per
    barcode character.  The walk only has to succeed for every character of
    ``cell_barcode``; it does not have to end on a leaf, so any prefix of a
    stored barcode (including the empty string) is also reported as
    recognized.

    Parameters
    ----------
    barcode_tree : dict
        Root of the nested-dictionary barcode tree.
    cell_barcode : str
        Barcode to look up.

    Returns
    -------
    bool
        False as soon as a character has no matching child, True otherwise.
    """
    node = barcode_tree
    try:
        for letter in cell_barcode:
            node = node[letter]
    except KeyError:
        return False
    return True

In [6]:
# Build the cluster -> run -> barcode-tree lookup from the barcode/cluster CSV.
clusters = build_cluster_recognizer('../monocle/mouse/barcodes-to-cluster.csv')

In [7]:
# Dask distributed client; the filter functions below submit their tasks through it.
c = Client()

Dask needs bokeh >= 0.13.0 for the dashboard.
Continuing without the dashboard.


In [8]:
Pub?

In [9]:
def copy_filtered_reads(possorted_filename, barcodes, topic, limit=None):
    """Publish the de-duplicated reads of recognized cells from one BAM file.

    A read is published on ``topic`` when it carries both a ``CB`` and a
    ``UB`` tag, its ``CB`` value is accepted by ``is_recognized_barcode``,
    and its ``(CB, UB, reference_name, reference_start)`` combination has not
    been seen yet on the current reference sequence.

    Parameters
    ----------
    possorted_filename : str
        Path of the BAM file to read.
    barcodes : dict
        Barcode tree handed to ``is_recognized_barcode``.
    topic : str
        Name of the ``Pub`` topic the accepted reads are put on.
    limit : int, optional
        Maximum number of reads to examine.  ``None`` (the default) reads the
        whole file.

    Returns
    -------
    int
        Number of reads that were published.
    """
    possorted_bam = pysam.AlignmentFile(possorted_filename, 'rb')
    try:
        pub = Pub(topic)
        current_reference = None
        seen = set()
        published = 0

        for i, read in enumerate(possorted_bam):
            # Check first so that exactly `limit` reads are examined.
            if limit is not None and i >= limit:
                break
            if not (read.has_tag('CB') and read.has_tag('UB')):
                continue
            cb = read.get_tag('CB')
            if not is_recognized_barcode(barcodes, cb):
                continue

            # reset seen set for each new chromosome/reference sequence
            # NOTE(review): this keeps `seen` small but presumably relies on
            # the input being sorted by position -- confirm.
            if read.reference_name != current_reference:
                seen = set()
                current_reference = read.reference_name

            ub = read.get_tag('UB')
            unique_key = (cb, ub, read.reference_name, read.reference_start)
            if unique_key not in seen:
                # NOTE(review): confirm that reads can be sent over Pub as-is;
                # if not, publish a serializable form of the read instead.
                pub.put(read)
                seen.add(unique_key)
                published += 1
    finally:
        # Close the BAM file even when reading or publishing fails.
        possorted_bam.close()

    # Return normally.  This is a plain function, not a generator, so raising
    # StopIteration here would just make every call fail.
    # NOTE(review): subscribers get no end-of-stream marker -- confirm how
    # consumers are meant to learn that this publisher is finished.
    return published

In [10]:
def is_running(futures):
    """Return True if at least one future reports that it is running.

    Each element's ``running()`` method is called in order, stopping at the
    first truthy answer.  An empty collection gives False.

    NOTE(review): confirm that the futures passed in actually provide a
    ``running()`` method.
    """
    return any(future.running() for future in futures)

In [None]:
def sequential_filter(c, output, barcodes):
    """Count the reads published for cluster '12' across the 10x BAM files.

    One ``copy_filtered_reads`` task is submitted per run that has barcodes
    in cluster '12'.  Each task examines at most 10000 reads and publishes
    the accepted ones on the topic named after the cluster.  The published
    reads are then counted until no task reports that it is running.

    Parameters
    ----------
    c : distributed.Client
        Client used to submit the tasks.
    output
        Currently unused.
    barcodes : dict
        ``cluster -> run -> barcode tree`` mapping.

    Returns
    -------
    int
        Number of reads received from the subscription.
    """
    cluster = '12'
    futures = []
    for run in tenx_bam:
        if run in barcodes[cluster]:
            barcode_tree = barcodes[cluster][run]
            # Submit the BAM path of this run, not the whole tenx_bam mapping.
            futures.append(
                c.submit(copy_filtered_reads, tenx_bam[run], barcode_tree, cluster, 10000))

    # NOTE(review): the subscriber is created after the tasks are submitted,
    # and the loop below only re-checks the futures after a read arrives.
    # Confirm that early messages cannot be missed and that the loop cannot
    # wait forever when nothing more is published.
    reads = Sub(cluster)
    count = 0
    for _ in reads:
        count += 1
        if not is_running(futures):
            break
    return count

In [11]:
def parallel_filter(c, output, barcodes):
    """Count the reads published for cluster '12' across the 10x BAM files.

    One ``copy_filtered_reads`` task is submitted per run that has barcodes
    in cluster '12'.  Each task examines at most 10000 reads and publishes
    the accepted ones on the topic named after the cluster.  The published
    reads are then counted until no task reports that it is running.

    Parameters
    ----------
    c : distributed.Client
        Client used to submit the tasks.
    output
        Currently unused.
    barcodes : dict
        ``cluster -> run -> barcode tree`` mapping.

    Returns
    -------
    int
        Number of reads received from the subscription.
    """
    cluster = '12'
    futures = []
    for run in tenx_bam:
        if run in barcodes[cluster]:
            barcode_tree = barcodes[cluster][run]
            # Submit the BAM path of this run, not the whole tenx_bam mapping.
            futures.append(
                c.submit(copy_filtered_reads, tenx_bam[run], barcode_tree, cluster, 10000))

    # NOTE(review): the subscriber is created after the tasks are submitted,
    # and the loop below only re-checks the futures after a read arrives.
    # Confirm that early messages cannot be missed and that the loop cannot
    # wait forever when nothing more is published.
    reads = Sub(cluster)
    count = 0
    for _ in reads:
        count += 1
        if not is_running(futures):
            break
    return count

In [12]:
# Run the filter on the dask client; 'foo' fills the `output` argument, which parallel_filter never reads.
parallel_filter(c, 'foo', clusters)

KeyboardInterrupt: 



In [None]:
%debug

In [None]:
# List the runs that have barcodes assigned to cluster '12'.
clusters['12'].keys()

In [None]:
# Create a publisher on the 'test' topic for interactive inspection.
test_pub = Pub('test')

In [None]:
# Display the publisher's `subscribers` attribute.
test_pub.subscribers