In [1]:
import pandas
import numpy
import os
import time
import collections

In [2]:
# sets up python path to include htsworkflow
import curation_common

In [3]:
import save_rnaseq_madqc

In [4]:
# Search that selects every RNA-seq experiment, and the shelf file handed to
# the loader alongside it (presumably used as an on-disk cache -- confirm in
# save_rnaseq_madqc).
cache_name = 'rnaseq-experiments.shelf'
query_url = 'search/?type=experiment&assay_term_name=RNA-seq'
cache = save_rnaseq_madqc.caching_encoded_experiment_loader(
    query_url,
    cache_name,
)

Reading 140 of 1407 records in 0.028576807002536952 seconds
Reading 280 of 1407 records in 0.01707311300560832 seconds
Reading 420 of 1407 records in 0.020378804998472333 seconds
Reading 560 of 1407 records in 0.030186828997102566 seconds
Reading 700 of 1407 records in 0.01871005800785497 seconds
Reading 840 of 1407 records in 0.03688787898863666 seconds
Reading 980 of 1407 records in 0.10598395000852179 seconds
Reading 1120 of 1407 records in 0.1309164619888179 seconds
Reading 1260 of 1407 records in 0.12005515900091268 seconds
Reading 1400 of 1407 records in 0.11660880601266399 seconds
Read 1407 records in 0.6253778680111282 seconds


In [5]:
def remove_blacklist_spikes(spikes):
    """Drop blacklisted spike-in references from ``spikes`` in place.

    ``spikes`` is a mutable container supporting ``in`` and ``.remove()``
    (for example a list or a set).  Each blacklisted reference is removed at
    most once; nothing is returned.
    """
    # Caltech spike set that was not aligned against.
    unaligned_references = ('/references/ENCSR013YHQ/',)

    for reference in unaligned_references:
        if reference not in spikes:
            continue
        spikes.remove(reference)
            
def load_spike_rsems(cache, limit=None):
    """Yield the spike-in FPKM rows of every RSEM file found in ``cache``.

    Each experiment in ``cache`` is scanned with
    ``save_rnaseq_madqc.find_rsem``; every file it returns is downloaded from
    www.encodeproject.org and reduced to the rows whose index label starts
    with ``gSpike``.

    This is a generator: nothing is downloaded until it is iterated, and the
    closing summary line is only printed once it has been fully consumed.

    Parameters
    ----------
    cache : mapping
        Maps experiment ids to experiment records.  Each record must provide
        a ``'files'`` entry.
    limit : int, optional
        Maximum number of experiments to read.  ``None`` (or ``0``) means
        no limit.

    Yields
    ------
    tuple
        ``(file.spikes_used, spike_fpkms)`` where ``spike_fpkms`` is a
        single-column DataFrame whose column is named after
        ``file.library_id``.
    """
    total = len(cache)
    # Report roughly ten times per run.  Clamp to 1 so that caches holding
    # fewer than ten experiments do not trigger a modulo-by-zero.
    chunk = max(1, total // 10)
    tzero = time.monotonic()
    tprev = tzero
    # Counted explicitly so the summary works for an empty cache and reports
    # the number of experiments read rather than the last loop index.
    processed = 0

    for experiment_id in cache:
        # Stop before touching the next experiment so that exactly ``limit``
        # experiments are read.
        if limit and processed >= limit:
            break
        experiment = cache[experiment_id]

        for rsem_file in save_rnaseq_madqc.find_rsem(experiment['files']):
            url = 'https://www.encodeproject.org' + rsem_file.href
            # Column 0 becomes the index; column 6 is presumably the FPKM
            # column of the RSEM table -- TODO confirm against the file format.
            fpkms = pandas.read_csv(url, usecols=[0, 6], sep='\t', index_col=0)
            is_spike = fpkms.index.str.startswith('gSpike')
            # Copy so renaming the column never touches the parsed table.
            spike_fpkms = fpkms.loc[is_spike].copy()
            spike_fpkms.columns = [rsem_file.library_id]

            yield (rsem_file.spikes_used, spike_fpkms)

        processed += 1
        if processed % chunk == 0:
            tnow = time.monotonic()
            print("{} of {} in {:.2f} sec".format(
                processed, total, tnow - tprev))
            tprev = tnow

    print('read {} in {:.2f}'.format(processed, time.monotonic() - tzero))

In [6]:
# Group the downloaded spike-in tables by the spike set each file reports.
# The generator is drained into a list first, so every download finishes
# before any grouping happens.
fpkms_by_spike = {}

for spike, spike_fpkms in list(load_spike_rsems(cache)):
    if spike in fpkms_by_spike:
        fpkms_by_spike[spike].append(spike_fpkms)
    else:
        fpkms_by_spike[spike] = [spike_fpkms]

144 of 1453 in 92.93 sec
289 of 1453 in 121.03 sec
434 of 1453 in 95.91 sec
579 of 1453 in 149.48 sec
724 of 1453 in 100.71 sec
869 of 1453 in 83.38 sec
1014 of 1453 in 34.05 sec
1159 of 1453 in 46.49 sec
1304 of 1453 in 67.24 sec
1449 of 1453 in 82.20 sec
read 1452 in 876.88


In [7]:
# Merge each spike set's per-library tables into one DataFrame with one
# column per table.  Values that are no longer lists are skipped, so
# re-running this cell is harmless instead of raising TypeError from
# pandas.concat on an already-merged DataFrame.
for spike in fpkms_by_spike:
    tables = fpkms_by_spike[spike]
    if isinstance(tables, list):
        fpkms_by_spike[spike] = pandas.concat(tables, axis=1)

In [8]:
# Number of columns (one per library) collected for each spike-in reference.
[(spike_ref, table.shape[1]) for spike_ref, table in fpkms_by_spike.items()]

[('/references/ENCSR884LPM/', 100),
 (None, 12),
 ('/references/ENCSR402QNO/', 41),
 ('/references/ENCSR133ALU/', 70),
 ('/references/ENCSR449DXG/', 18),
 ('/references/ENCSR470JZL/', 80),
 ('/references/ENCSR156CIL/', 191)]

In [9]:
# Flag the libraries whose 'gSpikein_ERCC-00002' value exceeds 100 within the
# ENCSR449DXG spike set.
dxg_spike_fpkms = fpkms_by_spike['/references/ENCSR449DXG/']
v = dxg_spike_fpkms.loc['gSpikein_ERCC-00002'].gt(100)

In [10]:
# Index labels of ``v`` (the library ids) whose flag is set.
list(v.loc[v].index)

['ENCLB267WCY',
 'ENCLB079COW',
 'ENCLB039ZZZ',
 'ENCLB040ZZZ',
 'ENCLB041ZZZ',
 'ENCLB042ZZZ',
 'ENCLB059ZZZ',
 'ENCLB060ZZZ',
 'ENCLB822JYE',
 'ENCLB374EGQ',
 'ENCLB257SKY',
 'ENCLB240JZT',
 'ENCLB188STW']

In [11]:
# Save every per-spike table twice: as an entry of one HDF5 store and as a
# CSV named after the spike accession.
#
# ``complib`` is the HDFStore keyword that selects the compressor; the
# previous ``compression=`` keyword is not an HDFStore argument, so blosc was
# never selected.  ``format`` is not an HDFStore constructor argument either
# (current pandas raises ValueError for it), so it is dropped and the tables
# are written with the store's default format, as before.
# The ``with`` block closes the store even if one of the writes fails.
with pandas.HDFStore('all-rna-spikes.h5', 'w', complevel=9, complib='blosc') as store:
    for name in fpkms_by_spike:
        if name:
            # '/references/ENCSR884LPM/' -> 'ENCSR884LPM'; tolerate a missing
            # trailing slash instead of chopping the last character.
            spike_accession = os.path.basename(name.rstrip('/'))
        else:
            # Files that reported no spike set are grouped under None.
            spike_accession = "None"
        store[str(name)] = fpkms_by_spike[name]
        fpkms_by_spike[name].to_csv(spike_accession + '_spikein_by_library.csv')

