# Introduction

Brian wanted FPKMs for some of the Gingeras datasets that fail MAD.

In [1]:
import pandas
import os
import sys

In [2]:
# Locate the encode3-curation checkout and make sure it is importable
# from this notebook (added to sys.path at most once).
ROOT = os.path.expanduser("~diane/proj/encode3-curation")
if sys.path.count(ROOT) == 0:
    sys.path.append(ROOT)

In [3]:
from curation_common import *
import save_rnaseq_madqc

In [4]:
# Search query passed to the experiment loader (type=experiment,
# assay_term_name=RNA-seq).
query_url = 'search/?type=experiment&assay_term_name=RNA-seq'

# Cache file path handed to the loader, under the compare-mad directory of
# the project checkout.
cache_name = os.path.join(ROOT, 'compare-mad', 'rnaseq-experiments.shelf')

# `cache` is indexed by experiment ID further down in this notebook.
cache = save_rnaseq_madqc.caching_encoded_experiment_loader(
    query_url,
    cache_name,
)

Reading 140 of 1406 records in 0.019692344998475164 seconds
Reading 280 of 1406 records in 0.10706269499496557 seconds
Reading 420 of 1406 records in 0.007282147998921573 seconds
Reading 560 of 1406 records in 0.08088527400104795 seconds
Reading 700 of 1406 records in 0.02871862699976191 seconds
Reading 840 of 1406 records in 0.02242868600296788 seconds
Reading 980 of 1406 records in 0.022624830991844647 seconds
Reading 1120 of 1406 records in 0.1411977000097977 seconds
Reading 1260 of 1406 records in 0.24879714100097772 seconds
Reading 1400 of 1406 records in 0.3323524149891455 seconds
Read 1406 records in 1.0110418619879056 seconds


In [5]:
# Load experiment-mad-qc.csv from the project's compare-mad directory.
experiment_qc_path = os.path.join(ROOT, 'compare-mad', 'experiment-mad-qc.csv')
experiment_qc = pandas.read_csv(experiment_qc_path)

In [6]:
# Keep only the rows whose 'lab' column names the Gingeras lab.
from_gingeras_lab = experiment_qc['lab'] == 'Thomas Gingeras, CSHL'
gingeras = experiment_qc[from_gingeras_lab]

In [7]:
# Publish the Gingeras subset as CSV under the user's public_html.
gingeras_csv = os.path.expanduser('~/public_html/compare-mad/gingeras-experiments.csv')
gingeras.to_csv(gingeras_csv)

In [8]:
# Rows whose 'MAD' value is above 0.5 are the ones treated as failing here.
mad_above_threshold = gingeras['MAD'] > 0.5
gingeras_fail_mad = gingeras[mad_above_threshold]

In [9]:
# Publish the MAD > 0.5 subset as CSV under the user's public_html.
gingeras_fail_mad_csv = os.path.expanduser('~/public_html/compare-mad/gingeras-fail-mad.csv')
gingeras_fail_mad.to_csv(gingeras_fail_mad_csv)

In [10]:
def load_spike_rsems(cache, keys, quantification_name='FPKM', limit=None):
    """Yield one quantification table per experiment, one column per library.

    For every experiment ID in ``keys`` the experiment record is read from
    ``cache``; each file returned by ``save_rnaseq_madqc.find_rsem`` for the
    record's ``'files'`` entry is downloaded from www.encodeproject.org as a
    tab-separated table, and the requested quantification column is kept,
    indexed by the table's first column and labelled with the file's
    ``library_id``. A progress line is printed roughly every tenth of the
    work.

    Parameters
    ----------
    cache : mapping
        Maps experiment ID to an experiment record with a ``'files'`` entry.
    keys : iterable
        Experiment IDs to process, in order.
    quantification_name : str
        ``'TPM'`` (column 5 of the table) or ``'FPKM'`` (column 6).
    limit : int or None
        Maximum number of experiment IDs to process. ``None`` processes all
        of ``keys``; zero or a negative value processes none.

    Yields
    ------
    (experiment_id, pandas.DataFrame)
        Experiments for which ``find_rsem`` returned no files are skipped.

    Raises
    ------
    KeyError
        If ``quantification_name`` is not a known column name. Because this
        is a generator, the error surfaces on the first iteration.
    """
    # Imported here so the function does not depend on `time` leaking into
    # the namespace through a star import.
    import time

    column_map = {
        'TPM': 5,
        'FPKM': 6,
    }
    column = column_map[quantification_name]

    keys = list(keys)
    if limit is not None:
        # Truncate up front so exactly `limit` experiments are processed and
        # the progress totals describe the work actually being done.
        keys = keys[:max(limit, 0)]
    total = len(keys)
    chunk = max(total // 10, 1)
    tprev = time.monotonic()

    for i, experiment_id in enumerate(keys):
        experiment = cache[experiment_id]
        fpkms = []
        for rsem_file in save_rnaseq_madqc.find_rsem(experiment['files']):
            url = 'https://www.encodeproject.org' + rsem_file.href
            fpkm = pandas.read_csv(
                url, usecols=[0, column], sep='\t', index_col=0)
            fpkm.columns = [rsem_file.library_id]
            fpkms.append(fpkm)

        if fpkms:
            yield (experiment_id, pandas.concat(fpkms, axis=1))

        done = i + 1
        if done % chunk == 0:
            tnow = time.monotonic()
            # Report the number of experiments completed, not the 0-based index.
            print("{} of {} in {:.2f} sec".format(
                done, total, tnow - tprev))
            tprev = tnow

In [12]:
# Write one CSV of per-library FPKM values for each experiment.
# NOTE(review): this iterates every row of `gingeras`, not only
# `gingeras_fail_mad` -- confirm that exporting all of them is intended.
fpkm_output_dir = os.path.expanduser('~/public_html/compare-mad/')
for experiment_id, fpkms in load_spike_rsems(
    cache,
    keys=list(gingeras['experiment']),
    quantification_name='FPKM',
):
    filename = os.path.join(fpkm_output_dir, experiment_id + '_library_fpkms.csv')
    fpkms.to_csv(filename)

KeyboardInterrupt: 

In [None]:
# Write one CSV of per-library TPM values for each experiment.
# The loop variable keeps the name `fpkms` although it holds TPM tables here.
# NOTE(review): this iterates every row of `gingeras`, not only
# `gingeras_fail_mad` -- confirm that exporting all of them is intended.
tpm_output_dir = os.path.expanduser('~/public_html/compare-mad/')
for experiment_id, fpkms in load_spike_rsems(
    cache,
    keys=list(gingeras['experiment']),
    quantification_name='TPM',
):
    filename = os.path.join(tpm_output_dir, experiment_id + '_library_tpms.csv')
    fpkms.to_csv(filename)