# Introduction

Barbara would like to generate one bigWig coverage track per 10x cluster.

In [99]:
import pandas
import os
import csv
import sys
import numpy
from scipy.sparse import dok_matrix
from collections import namedtuple
import pysam

In [27]:
# One parsed CSV row: run id, stage label, cell barcode and cluster label.
CellBarcode = namedtuple('CellBarcode', 'run stage barcode cluster')

In [34]:
def parse_barcode_csv(filename, limit=None):
    """Yield one CellBarcode per data row of a barcode-to-cluster CSV file.

    The first CSV row is treated as a header and discarded.  For every
    other non-empty row, column 0 is a combined cell identifier and
    column 1 is the cluster label.  The identifier is split purely by
    position:

    * run     -- characters from index 4 up to the first underscore
    * stage   -- from just after that underscore through one character
                 past the second underscore, with '_' rewritten as '.'
    * barcode -- everything after the stage except the final two
                 characters, then '-' and the identifier's last character

    Illustrative example (the 4-character prefix is a placeholder):
    'XXXX12_E10_5AACACGTAGAGTGAGA.1' gives run '12', stage 'E10.5' and
    barcode 'AACACGTAGAGTGAGA-1'.

    Identifiers are not validated; one without two underscores yields
    whatever the slices happen to produce.

    Parameters
    ----------
    filename : path of the CSV file to read.
    limit : optional maximum number of records to yield; None means all.

    Yields
    ------
    CellBarcode(run, stage, barcode, cluster), every field a string.
    """
    # newline='' is the mode the csv module documents for reader input.
    with open(filename, newline='') as barcodes:
        reader = csv.reader(barcodes)
        # Skip the header; the default keeps an empty file from raising
        # StopIteration inside this generator.
        next(reader, None)
        emitted = 0
        for row in reader:
            # Checked before yielding so that limit=N gives exactly N records.
            if limit is not None and emitted >= limit:
                break
            if not row:
                # csv.reader returns [] for a blank line.
                continue

            field0 = row[0]
            first_underscore = field0.find('_')
            run = field0[4:first_underscore]

            stage_start = first_underscore + 1
            # +2 to be one after the trailing 0 or 5
            stage_end = field0.find('_', stage_start) + 2
            stage = field0[stage_start:stage_end].replace('_', '.')

            barcode = field0[stage_end:-2] + '-' + field0[-1:]
            cluster = row[1]
            yield CellBarcode(run, stage, barcode, cluster)
            emitted += 1

In [48]:
def build_cluster_recognizer(filename):
    """Index the barcodes of a barcode-to-cluster CSV for fast lookup.

    Returns a nested dict ``{cluster: {run: trie}}`` where ``trie`` is a
    dict-of-dicts keyed by successive barcode characters.  No
    end-of-barcode marker is stored; the node reached after the last
    character of a barcode is simply a dict (empty unless a longer
    barcode continues through it).
    """
    recognizer = {}

    for cell in parse_barcode_csv(filename):
        # Find (or create) the trie root for this cluster/run pair ...
        node = recognizer.setdefault(cell.cluster, {}).setdefault(cell.run, {})
        # ... then descend one level per barcode character.
        for symbol in cell.barcode:
            node = node.setdefault(symbol, {})

    return recognizer

In [49]:
# Nested dict: cluster label -> run id -> character trie of cell barcodes.
clusters = build_cluster_recognizer('monocle/mouse/barcodes-to-cluster.csv')

In [50]:
# Number of distinct cluster labels found in the CSV.
len(clusters)

25

In [51]:
# How many runs contribute at least one barcode to each cluster.
{cluster_id: len(runs) for cluster_id, runs in clusters.items()}

{'3': 8,
 '2': 9,
 '5': 9,
 '6': 7,
 '1': 9,
 '9': 9,
 '24': 7,
 '0': 9,
 '13': 9,
 '4': 8,
 '17': 8,
 '22': 6,
 '8': 9,
 '14': 8,
 '21': 8,
 '12': 7,
 '11': 6,
 '7': 8,
 '18': 9,
 '10': 8,
 '20': 7,
 '15': 9,
 '23': 8,
 '19': 8,
 '16': 5}

In [52]:
# For cluster 17: size of each run's trie root, i.e. the number of
# distinct first barcode characters seen in that run.
{run_id: len(trie) for run_id, trie in clusters['17'].items()}

{'12': 4, '13': 4, '1': 4, '4': 4, '5': 4, '6': 4, '7': 1, '8': 4}

Experiment: work out how to use a sparse matrix to count per-base read coverage.

In [76]:
# Empty 1 x 61431566 sparse row vector to experiment with.
# NOTE(review): the width is presumably the mm10 chr19 length — confirm.
chr19 = dok_matrix((1,61431566))

In [77]:
# Add one unit of coverage to columns 100-149 (the slice end is exclusive).
chr19[0, 100:150] += 1

In [78]:
# Add an overlapping interval, columns 125-174; columns 125-149 should now hold 2.
chr19[0, 125:175] += 1

In [95]:
# Print every stored (column, value) pair to check that the overlap was summed.
columns = chr19.nonzero()[1]
for column in columns:
    print(column, chr19[0, column])

100 1.0
101 1.0
102 1.0
103 1.0
104 1.0
105 1.0
106 1.0
107 1.0
108 1.0
109 1.0
110 1.0
111 1.0
112 1.0
113 1.0
114 1.0
115 1.0
116 1.0
117 1.0
118 1.0
119 1.0
120 1.0
121 1.0
122 1.0
123 1.0
124 1.0
125 2.0
126 2.0
127 2.0
128 2.0
129 2.0
130 2.0
131 2.0
132 2.0
133 2.0
134 2.0
135 2.0
136 2.0
137 2.0
138 2.0
139 2.0
140 2.0
141 2.0
142 2.0
143 2.0
144 2.0
145 2.0
146 2.0
147 2.0
148 2.0
149 2.0
150 1.0
151 1.0
152 1.0
153 1.0
154 1.0
155 1.0
156 1.0
157 1.0
158 1.0
159 1.0
160 1.0
161 1.0
162 1.0
163 1.0
164 1.0
165 1.0
166 1.0
167 1.0
168 1.0
169 1.0
170 1.0
171 1.0
172 1.0
173 1.0
174 1.0


In [97]:
def build_sparse_genome(filename):
    """Create one empty sparse coverage track per reference sequence.

    ``filename`` is a whitespace-delimited text file whose first two
    columns are a sequence name and its length.  Blank lines are skipped
    and any columns after the second are ignored.

    Returns a dict mapping each name to a ``dok_matrix`` of shape
    ``(1, length)``.

    Raises ValueError if a non-blank line has fewer than two columns or
    its second column is not an integer.
    """
    chroms = {}
    with open(filename, 'rt') as chromInfo:
        for line in chromInfo:
            fields = line.split()
            if not fields:
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                continue
            # Only the first two columns are needed; extras are ignored.
            name, length = fields[:2]
            chroms[name] = dok_matrix((1, int(length)))
    return chroms
        

In [166]:
def is_recognized_barcode(barcode_tree, cell_barcode):
    """Report whether ``cell_barcode`` can be traced through ``barcode_tree``.

    ``barcode_tree`` is a dict-of-dicts trie keyed by single characters.
    The walk follows one key per character of ``cell_barcode`` and gives
    up on the first missing key.

    Note that the walk only checks that each character is present, so
    any prefix of a stored barcode (including the empty string) is also
    reported as recognized.
    """
    node = barcode_tree
    try:
        for symbol in cell_barcode:
            node = node[symbol]
    except KeyError:
        # A character with no matching branch: not one of ours.
        return False
    return True

In [167]:
# Sanity check: look up one barcode in the cluster 17 / run 12 trie.
is_recognized_barcode(clusters['17']['12'], 'AACACGTAGAGTGAGA-1')

True

In [165]:
# Walk the same barcode through the trie by hand to see what the final node holds.
clusters['17']['12']['A']['A']['C']['A']['C']['G']['T']['A']['G']['A']['G']['T']['G']['A']['G']['A']['-']['1']

{}

In [145]:
def update_sparse_wiggle(wiggle, bam, barcode_tree, limit=None):
    """Add per-base coverage from recognized cells' reads to ``wiggle``.

    Parameters
    ----------
    wiggle : mapping from reference name to a track object that supports
        ``track[0, pos] += 1`` (for example the dict returned by
        build_sparse_genome).  It is modified in place, so calling this
        twice on the same mapping accumulates counts.
    bam : iterable of reads exposing ``is_unmapped``, ``has_tag``,
        ``get_tag``, ``get_reference_positions`` and ``reference_name``
        (the cell below passes a pysam.AlignmentFile).
    barcode_tree : barcode trie passed through to is_recognized_barcode.
    limit : optional maximum number of reads to examine, counting reads
        that end up being skipped; None means the whole input.

    A read contributes only if it is mapped, carries a ``CB`` tag, and
    that tag is recognized by ``barcode_tree``.  Returns None.
    """
    for i, read in enumerate(bam):
        # Checked before any work so that limit=N examines exactly N reads.
        if limit is not None and i >= limit:
            break
        if read.is_unmapped or not read.has_tag('CB'):
            continue
        if not is_recognized_barcode(barcode_tree, read.get_tag('CB')):
            continue
        chrom = read.reference_name
        for pos in read.get_reference_positions():
            wiggle[chrom][0, pos] += 1

In [177]:
# One empty sparse track per sequence listed in the chrNameLength file.
chroms = build_sparse_genome(os.path.expanduser('~/proj/genome/mm10-M4-male/chrNameLength.txt'))

In [178]:
# Open the position-sorted BAM to read from; the handle is not closed anywhere in this notebook.
# NOTE(review): the path suggests this is 10x run 12 — confirm it matches the run key used below.
possort = pysam.AlignmentFile(os.path.expanduser('~/proj/brian-2018-01-10x/Wold10x-12-encode-count-cells10000/outs/possorted_genome_bam.bam'))

In [179]:
# Accumulate coverage for reads whose CB tag is in the cluster 17 / run 12 trie.
# NOTE: this modifies chroms in place; re-running it without rebuilding chroms
# adds to the existing counts.
update_sparse_wiggle(chroms, possort, clusters['17']['12'])

In [180]:
# Show each track's repr, which includes its number of stored elements.
chroms

{'chr10': <1x130694993 sparse matrix of type '<class 'numpy.float64'>'
 	with 3525343 stored elements in Dictionary Of Keys format>,
 'chr11': <1x122082543 sparse matrix of type '<class 'numpy.float64'>'
 	with 5006272 stored elements in Dictionary Of Keys format>,
 'chr12': <1x120129022 sparse matrix of type '<class 'numpy.float64'>'
 	with 3030416 stored elements in Dictionary Of Keys format>,
 'chr13': <1x120421639 sparse matrix of type '<class 'numpy.float64'>'
 	with 3071452 stored elements in Dictionary Of Keys format>,
 'chr14': <1x124902244 sparse matrix of type '<class 'numpy.float64'>'
 	with 2822347 stored elements in Dictionary Of Keys format>,
 'chr15': <1x104043685 sparse matrix of type '<class 'numpy.float64'>'
 	with 2637571 stored elements in Dictionary Of Keys format>,
 'chr16': <1x98207768 sparse matrix of type '<class 'numpy.float64'>'
 	with 2526726 stored elements in Dictionary Of Keys format>,
 'chr17': <1x94987271 sparse matrix of type '<class 'numpy.float64'>'


In [175]:
# Inspect the chr10 track on its own.
chroms['chr10']

<1x130694993 sparse matrix of type '<class 'numpy.float64'>'
	with 2037495 stored elements in Dictionary Of Keys format>

In [176]:
# Print the first 1002 stored positions on chr10 together with their counts.
chr10_track = chroms['chr10']
for y in chr10_track.nonzero()[1][:1002]:
    print(y, chr10_track[0, y])

3165723 1.0
3165724 1.0
3165725 1.0
3165726 1.0
3165727 1.0
3165728 1.0
3165729 1.0
3165730 1.0
3165731 1.0
3165732 1.0
3165733 1.0
3165734 1.0
3165735 1.0
3165736 1.0
3165737 1.0
3165738 1.0
3165739 1.0
3165740 1.0
3165741 1.0
3165742 1.0
3165743 1.0
3165744 1.0
3165745 1.0
3165746 1.0
3165747 1.0
3165748 1.0
3165749 1.0
3165750 1.0
3165751 1.0
3165752 1.0
3165753 1.0
3165754 1.0
3165755 1.0
3165756 1.0
3165757 1.0
3165758 1.0
3165759 1.0
3165760 1.0
3165761 1.0
3165762 1.0
3165763 1.0
3165764 1.0
3165765 1.0
3165766 1.0
3165767 1.0
3165768 1.0
3165769 1.0
3165770 1.0
3165771 1.0
3165772 1.0
3165773 1.0
3165774 1.0
3165775 1.0
3165776 1.0
3165777 1.0
3165778 2.0
3165779 2.0
3165780 2.0
3165781 2.0
3165782 2.0
3165783 2.0
3165784 2.0
3165785 2.0
3165786 2.0
3165787 2.0
3165788 2.0
3165789 2.0
3165790 2.0
3165791 2.0
3165792 2.0
3165793 2.0
3165794 2.0
3165795 2.0
3165796 2.0
3165797 2.0
3165798 2.0
3165799 2.0
3165800 2.0
3165801 2.0
3165802 2.0
3165803 2.0
3165804 2.0
3165805 2.0
3165

# Build per-run barcode pattern lists

In [191]:
def create_cluster_run_barcode_lists(filename, cluster, output_template):
    """Write one barcode pattern file per run for a single cluster.

    Every row of the barcode CSV whose cluster equals ``cluster`` is
    written to the file for its run as a line ``CB:Z:<barcode>``.

    Parameters
    ----------
    filename : barcode-to-cluster CSV, read with parse_barcode_csv.
    cluster : cluster label to select; compared to the CSV value as a
        string.
    output_template : ``str.format`` template for the output path,
        filled in with ``cluster`` and ``run`` keyword fields.

    Each output file is opened (and truncated) once, the first time its
    run is seen.  All opened files are closed even if an error
    interrupts the loop.
    """
    run_streams = {}
    try:
        for row in parse_barcode_csv(filename):
            if row.cluster != cluster:
                continue
            outstream = run_streams.get(row.run)
            if outstream is None:
                # Open lazily and only once per run.  Passing open(...) as the
                # default to dict.setdefault would evaluate it for every row
                # and re-truncate a file that is still being written.
                output = output_template.format(cluster=row.cluster, run=row.run)
                outstream = run_streams[row.run] = open(output, 'wt')
            # Text mode already translates '\n' to the platform line ending.
            outstream.write('CB:Z:' + row.barcode + '\n')
    finally:
        for outstream in run_streams.values():
            outstream.close()

In [192]:
# Write the per-run CB:Z: pattern files for cluster 17.
create_cluster_run_barcode_lists(
    filename='monocle/mouse/barcodes-to-cluster.csv',
    cluster='17',
    output_template='10x_tracks/barcodes-10x-cluster-{cluster}-run-{run}.txt',
)

In [193]:
# Write the per-run CB:Z: pattern files for cluster 12.
create_cluster_run_barcode_lists(
    filename='monocle/mouse/barcodes-to-cluster.csv',
    cluster='12',
    output_template='10x_tracks/barcodes-10x-cluster-{cluster}-run-{run}.txt',
)

In [194]:
# Write the per-run CB:Z: pattern files for cluster 4.
create_cluster_run_barcode_lists(
    filename='monocle/mouse/barcodes-to-cluster.csv',
    cluster='4',
    output_template='10x_tracks/barcodes-10x-cluster-{cluster}-run-{run}.txt',
)

In [195]:
# Write the per-run CB:Z: pattern files for cluster 2.
create_cluster_run_barcode_lists(
    filename='monocle/mouse/barcodes-to-cluster.csv',
    cluster='2',
    output_template='10x_tracks/barcodes-10x-cluster-{cluster}-run-{run}.txt',
)

In [196]:
# Write the per-run CB:Z: pattern files for cluster 0.
create_cluster_run_barcode_lists(
    filename='monocle/mouse/barcodes-to-cluster.csv',
    cluster='0',
    output_template='10x_tracks/barcodes-10x-cluster-{cluster}-run-{run}.txt',
)

In [197]:
# Write the per-run CB:Z: pattern files for cluster 7.
create_cluster_run_barcode_lists(
    filename='monocle/mouse/barcodes-to-cluster.csv',
    cluster='7',
    output_template='10x_tracks/barcodes-10x-cluster-{cluster}-run-{run}.txt',
)