#Introduction

Brian wanted FPKMs for some of the Gingeras datasets that fail MAD.

In [1]:
import pandas
import os
import sys

In [2]:
ROOT=os.path.expanduser("~diane/proj/encode3-curation")
if ROOT not in sys.path:
    sys.path.append(ROOT)

In [3]:
from curation_common import *
import save_rnaseq_madqc

In [4]:
query_url = 'search/?type=experiment&assay_term_name=RNA-seq'
cache_name = os.path.join(ROOT, 'compare-mad', 'rnaseq-experiments.shelf')
cache = save_rnaseq_madqc.caching_encoded_experiment_loader(query_url, cache_name)

Reading 139 of 1395 records in 0.022363889998814557 seconds
Reading 278 of 1395 records in 0.09035903599942685 seconds
Reading 417 of 1395 records in 0.01148256099986611 seconds
Reading 556 of 1395 records in 0.06382555600066553 seconds
Reading 695 of 1395 records in 0.017234585000551306 seconds
Reading 834 of 1395 records in 0.017057584998838138 seconds
Reading 973 of 1395 records in 0.0387104969995562 seconds
Reading 1112 of 1395 records in 0.2025324880014523 seconds
Reading 1251 of 1395 records in 0.13401064299978316 seconds
Reading 1390 of 1395 records in 0.36714254999969853 seconds
Read 1395 records in 0.9647193909986527 seconds


In [5]:
experiment_qc = pandas.read_csv(os.path.join(ROOT, 'compare-mad', 'experiment-mad-qc.csv'))

In [6]:
gingeras = experiment_qc[experiment_qc['lab'] == 'Thomas Gingeras, CSHL']

In [17]:
gingeras.to_csv(os.path.expanduser('~/public_html/compare-mad/gingeras_fpkms/gingeras-experiments.tsv'), sep='\t')

In [8]:
gingeras_fail_mad = gingeras[(gingeras['MAD'] > 0.5)]

In [18]:
gingeras_fail_mad.to_csv(os.path.expanduser('~/public_html/compare-mad/gingeras-fail-mad.tsv'), sep='\t')

In [9]:
def load_spike_rsems(cache, keys, quantification_name='FPKM', limit=None):
    column_map = {
        'TPM': 5,
        'FPKM': 6,
    }
    column = column_map[quantification_name]
    keys = list(keys)
    total = len(keys)
    chunk = max(total // 10, 1)
    tzero = time.monotonic()
    tprev = tzero
    
    for i, experiment_id in enumerate(keys):
        experiment = cache[experiment_id]
        fpkms = []
        for file in save_rnaseq_madqc.find_rsem(experiment['files']):
            url = 'https://www.encodeproject.org' + file.href
            fpkm = pandas.read_csv(url, usecols=[0,column], sep='\t', index_col=0)
            fpkm.columns = [file.library_id]
            fpkms.append(fpkm)

        if fpkms:
            yield (experiment_id, pandas.concat(fpkms, axis=1))
            
        if (i + 1) % chunk == 0:
            tnow = time.monotonic()
            print("{} of {} in {:.2f} sec".format(
                i, total, tnow-tprev))
            tprev = tnow

        if limit and i > limit:
            return

In [11]:
for experiment_id, fpkms in load_spike_rsems(
      cache,
      keys=list(gingeras['experiment']),
      quantification_name='FPKM'
    ):
    filename = os.path.join(os.path.expanduser('~/public_html/compare-mad/gingeras_fpkms/'), experiment_id + '_library_fpkms.csv')
    fpkms.to_csv(filename)

20 of 215 in 58.12 sec
41 of 215 in 60.79 sec
62 of 215 in 61.83 sec
83 of 215 in 71.31 sec
104 of 215 in 78.22 sec
125 of 215 in 72.53 sec
146 of 215 in 62.82 sec
167 of 215 in 50.15 sec
188 of 215 in 40.27 sec
209 of 215 in 51.78 sec


In [12]:
for experiment_id, fpkms in load_spike_rsems(
      cache,
      keys=list(gingeras['experiment']),
      quantification_name='TPM'
    ):
    filename = os.path.join(os.path.expanduser('~/public_html/compare-mad/gingeras_fpkms/'), experiment_id + '_library_tpms.csv')
    fpkms.to_csv(filename)

20 of 215 in 106.96 sec
41 of 215 in 133.50 sec
62 of 215 in 111.85 sec
83 of 215 in 142.79 sec
104 of 215 in 138.98 sec
125 of 215 in 141.64 sec
146 of 215 in 124.89 sec
167 of 215 in 90.78 sec
188 of 215 in 90.41 sec
209 of 215 in 81.94 sec


In [7]:
wold = experiment_qc[experiment_qc['lab'] == 'Barbara Wold, Caltech']

In [19]:
wold.to_csv(os.path.expanduser('~/public_html/compare-mad/wold_gene_quantifications/wold-experiments.tsv'), sep='\t')

In [15]:
for experiment_id, fpkms in load_spike_rsems(
      cache,
      keys=list(wold['experiment']),
      quantification_name='FPKM'
    ):
    filename = os.path.join(os.path.expanduser('~/public_html/compare-mad/wold_gene_quantifications/'), experiment_id + '_library_fpkms.csv')
    fpkms.to_csv(filename)

9 of 100 in 53.67 sec
19 of 100 in 51.65 sec
29 of 100 in 52.58 sec
39 of 100 in 52.62 sec
49 of 100 in 44.67 sec
59 of 100 in 43.99 sec
69 of 100 in 48.57 sec
79 of 100 in 47.76 sec
89 of 100 in 51.10 sec
99 of 100 in 54.53 sec


In [10]:
for experiment_id, fpkms in load_spike_rsems(
      cache,
      keys=list(wold['experiment']),
      quantification_name='TPM'
    ):
    filename = os.path.join(os.path.expanduser('~/public_html/compare-mad/wold_gene_quantifications/'), experiment_id + '_library_tpms.csv')
    fpkms.to_csv(filename)

9 of 100 in 44.55 sec
19 of 100 in 44.52 sec
29 of 100 in 46.69 sec
39 of 100 in 46.85 sec
49 of 100 in 50.05 sec
59 of 100 in 48.00 sec
69 of 100 in 42.28 sec
79 of 100 in 42.91 sec
89 of 100 in 48.22 sec
99 of 100 in 44.94 sec
