# Introduction

No spike-in reads should be detected in these samples, but because we aligned against the spike-in sequences, Barbara was curious whether any of them would show up.

In [1]:
import numpy
import pandas
import sys
import os
import seaborn
from scipy.io import mmread
from matplotlib import pyplot
from glob import glob
import dask
import dask.dataframe
import dask.bag
from dask.distributed import Client, LocalCluster

In [2]:
from woldrnaseq.models import load_gtf_cache
from woldrnaseq.gtfcache import protein_coding_gene_ids

In [3]:
# Load the cached GTF annotation (HDF5 file for the mm10-M4-male build) from
# user diane's home directory; the spike-in gene ids are pulled from this table.
annotation = load_gtf_cache(os.path.expanduser('~diane/proj/genome/mm10-M4-male/mm10-M4-male.h5'))

In [4]:
def spikein_gene_ids(annotation):
    """Return the gene ids of annotation records whose source is 'spikein'.

    Parameters
    ----------
    annotation : pandas.DataFrame
        Annotation table with at least ``source`` and ``gene_id`` columns.

    Returns
    -------
    pandas.Series
        The ``gene_id`` values of the matching rows; the rows keep the index
        labels they had in ``annotation``.
    """
    is_spikein = annotation['source'] == 'spikein'
    return annotation.loc[is_spikein, 'gene_id']

In [5]:
# gene_id values of every annotation record whose source is 'spikein';
# used below to pick the spike-in rows out of each 10x matrix.
spikeins = spikein_gene_ids(annotation)

In [6]:
def load_10x_filtered_features(root, gene_list=None):
    """Load a 10x filtered feature-barcode matrix as a dense DataFrame.

    Reads ``matrix.mtx.gz``, ``features.tsv.gz`` and ``barcodes.tsv.gz`` from
    ``<root>/outs/filtered_feature_bc_matrix``.

    When ``gene_list`` is a collection of gene ids (or a boolean mask), the
    requested rows are selected while the matrix is still sparse, so only
    those rows are converted to a dense array rather than the whole matrix.

    Parameters
    ----------
    root : str
        Directory that contains ``outs/filtered_feature_bc_matrix``.
    gene_list : list-like, optional
        Row selector with the same meaning as ``DataFrame.loc[gene_list]`` on
        the full matrix: rows come back in the order requested. ``None``
        (the default) returns every row.

    Returns
    -------
    pandas.DataFrame
        Rows are indexed by the first column of ``features.tsv.gz`` (index
        name ``gene_id``); columns are the barcodes (column name ``barcode``).

    Raises
    ------
    ValueError
        If the matrix shape does not match the number of features/barcodes.
    KeyError
        If ``gene_list`` names a gene id that is not in ``features.tsv.gz``
        (raised by the pandas label lookup).
    """
    mmdir = os.path.join(root, 'outs', 'filtered_feature_bc_matrix')
    sparse = mmread(os.path.join(mmdir, 'matrix.mtx.gz'))

    # Only the first column (the gene id) of the features table is needed.
    features = pandas.read_csv(
        os.path.join(mmdir, 'features.tsv.gz'),
        header=None,
        names=['gene_id'],
        usecols=[0],
        sep='\t')
    gene_ids = pandas.Index(features['gene_id'])

    barcodes = pandas.read_csv(
        os.path.join(mmdir, 'barcodes.tsv.gz'),
        header=None,
        names=['barcode'],
        sep='\t')
    barcode_ids = pandas.Index(barcodes['barcode'])

    # The sparse path below never builds the full frame, so check the labels
    # against the matrix explicitly instead of relying on DataFrame to do it.
    if sparse.shape != (len(gene_ids), len(barcode_ids)):
        raise ValueError(
            'matrix shape {} does not match {} features x {} barcodes'.format(
                sparse.shape, len(gene_ids), len(barcode_ids)))

    wants_label_subset = (
        gene_list is not None
        and pandas.api.types.is_list_like(gene_list)
        and not isinstance(gene_list, tuple)
    )
    if not wants_label_subset:
        # No selector, or a selector (scalar, slice, tuple) whose .loc result
        # is not a plain row subset: keep the dense frame + .loc behaviour.
        df = pandas.DataFrame(
            sparse.toarray(), index=gene_ids, columns=barcode_ids)
        return df if gene_list is None else df.loc[gene_list]

    # Resolve labels to row numbers with pandas' own .loc so ordering,
    # duplicate labels and missing-label errors match a DataFrame lookup.
    row_numbers = pandas.Series(numpy.arange(len(gene_ids)), index=gene_ids)
    selected = row_numbers.loc[gene_list]

    # COO matrices cannot be indexed; CSR supports cheap row selection.
    subset = sparse.tocsr()[selected.to_numpy(), :]
    return pandas.DataFrame(
        subset.toarray(), index=selected.index, columns=barcode_ids)


In [7]:
# Connect to a dask scheduler listening on the IPv6 loopback address, port 8786.
# NOTE(review): this assumes a scheduler was started separately on this host.
dask_client = Client('[::1]:8786')

In [8]:
# Find every "encode-count" run directory under the project folder.
encode_dirs = glob(os.path.expanduser('~diane/proj/brian-2018-01-10x/Wold10x-*-encode-count-cells10000'))

# Submit one load per run to the dask cluster; each task returns only the
# spike-in rows of that run's filtered matrix. Futures stay in glob order.
futures = [
    dask_client.submit(load_10x_filtered_features, dirname, spikeins)
    for dirname in encode_dirs
]

# Kept as the cell's final bare expression so the notebook displays the
# gathered frames and IPython binds the list to `_`.
dask_client.gather(futures)

[barcode              AAACCTGAGAAGCCCA-1  AAACCTGAGCGTAGTG-1  \
 gene_id                                                       
 gSpikein_ERCC-00002                   0                   0   
 gSpikein_ERCC-00003                   0                   0   
 gSpikein_ERCC-00004                   0                   0   
 gSpikein_ERCC-00007                   0                   0   
 gSpikein_ERCC-00009                   0                   0   
 gSpikein_ERCC-00012                   0                   0   
 gSpikein_ERCC-00013                   0                   0   
 gSpikein_ERCC-00014                   0                   0   
 gSpikein_ERCC-00016                   0                   0   
 gSpikein_ERCC-00017                   0                   0   
 gSpikein_ERCC-00018                   0                   0   
 gSpikein_ERCC-00019                   0                   0   
 gSpikein_ERCC-00022                   0                   0   
 gSpikein_ERCC-00023                   0

In [9]:
# `_` is IPython's "last output" variable. This captures the list displayed by
# the gather cell only if no other cell produced output in between.
results = _

In [17]:
# (rows, columns) of the first gathered frame: selected gene ids x barcodes.
results[0].shape

(96, 14364)

In [19]:
# Join the per-run frames side by side (axis=1): rows are aligned on the
# gene_id index and each run contributes its own barcode columns.
# NOTE(review): barcode strings may repeat across runs, which would give
# duplicate column labels -- confirm whether that matters downstream.
df = pandas.concat(results, axis=1)

In [20]:
# (rows, columns) of the combined frame across all runs.
df.shape

(96, 141000)

In [18]:
pandas.concat?

In [24]:
# Values <= 0.1 are masked to NaN, then count() tallies the remaining entries
# per column: for each barcode, the number of rows with a value above 0.1.
any_expression = df[df > 0.1].count()

In [25]:
# List the barcodes where more than one row exceeded the 0.1 threshold.
# NOTE(review): `> 1` skips barcodes with exactly one such row; if the goal is
# "any spike-in detected at all", the comparison should probably be `> 0`.
any_expression[any_expression > 1]

Series([], dtype: int64)

In [34]:
# Keep only the zero entries (everything else becomes NaN) and drop rows that
# are NaN in every column, i.e. rows that contain no zero at all.
# NOTE(review): an unchanged shape only shows each row has at least one zero;
# it does not show that every value is zero (`(df == 0).all().all()` would).
df[df == 0].dropna(how='all').shape

(96, 141000)