# Introduction

Ken wanted FPKMs for the ENCODE experiments in a form he can easily import.

In [32]:
import pandas
import collections
import sys
import os
import numpy
import pprint
import time

ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
if ROOT not in sys.path:
    sys.path.append(ROOT)
import save_rnaseq_madqc

In [2]:
query_url = 'search/?type=experiment&assay_term_name=RNA-seq'
cache_name = os.path.join(ROOT, 'rnaseq-experiments.shelf')
cache = save_rnaseq_madqc.caching_encoded_experiment_loader(query_url, cache_name)

Reading 140 of 1408 records in 0.008967859001131728 seconds
Reading 280 of 1408 records in 0.004839356988668442 seconds
Reading 420 of 1408 records in 0.006519144008052535 seconds
Reading 560 of 1408 records in 0.0071131360018625855 seconds
Reading 700 of 1408 records in 0.014910015001078136 seconds
Reading 840 of 1408 records in 0.003114136998192407 seconds
Reading 980 of 1408 records in 0.04728731499926653 seconds
Reading 1120 of 1408 records in 0.047442299997783266 seconds
Reading 1260 of 1408 records in 0.02373923300183378 seconds
Reading 1400 of 1408 records in 0.005228327994700521 seconds
Read 1408 records in 0.16916082399256993 seconds


In [3]:
def filter_experiments_by_size(cache):
    for i, experiment_id in enumerate(cache):
        experiment = cache[experiment_id]
        for replicate in experiment['replicates']:
            library = replicate['library']

            starting_amount = float(library.get('nucleic_acid_starting_quantity', numpy.nan))
            starting_amount_units = library.get('nucleic_acid_starting_quantity_units')
            if starting_amount >= 10 and starting_amount_units == 'ng':
                yield experiment_id

In [4]:
len(list(filter_experiments_by_size(cache)))

207

In [5]:
def filter_experiments_by_13pg(cache):
    for i, experiment_id in enumerate(cache):
        experiment = cache[experiment_id]
        for replicate in experiment['replicates']:
            library = replicate['library']

            starting_amount = float(library.get('nucleic_acid_starting_quantity', numpy.nan))
            starting_amount_units = library.get('nucleic_acid_starting_quantity_units')
            if starting_amount == 13 and starting_amount_units == 'pg':
                yield experiment_id

In [6]:
len(list(filter_experiments_by_13pg(cache)))

22

In [7]:
def load_spike_rsems(cache, keys, limit=None):
    keys = list(keys)
    total = len(keys)
    chunk = max(total // 10, 1)
    tzero = time.monotonic()
    tprev = tzero
    
    for i, experiment_id in enumerate(keys):
        experiment = cache[experiment_id]
        fpkms = []
        for file in save_rnaseq_madqc.find_rsem(experiment['files']):
            url = 'https://www.encodeproject.org' + file.href
            fpkm = pandas.read_csv(url, usecols=[0,6], sep='\t', index_col=0)
            fpkm.columns = [file.library_id]
            fpkms.append(fpkm)

        if fpkms:
            yield (experiment_id, pandas.concat(fpkms, axis=1))
            
        if (i + 1) % chunk == 0:
            tnow = time.monotonic()
            print("{} of {} in {:.2f} sec".format(
                i, total, tnow-tprev))
            tprev = tnow

        if limit and i > limit:
            return

In [8]:
for experiment_id, fpkms in load_spike_rsems(cache,
                                            filter_experiments_by_size(cache)):
    filename = os.path.join('/home/diane/tmp/encode-10ng/', experiment_id + '_library_fpkms.csv')
    fpkms.to_csv(filename)

19 of 207 in 41.68 sec
39 of 207 in 44.68 sec
59 of 207 in 35.54 sec
79 of 207 in 33.52 sec
99 of 207 in 40.74 sec
119 of 207 in 39.22 sec
139 of 207 in 20.98 sec
159 of 207 in 22.65 sec
179 of 207 in 32.32 sec
199 of 207 in 34.25 sec


In [9]:
for experiment_id, fpkms in load_spike_rsems(
      cache,
      keys=filter_experiments_by_13pg(cache)
    ):
    filename = os.path.join('/home/diane/tmp/encode-13pg/', experiment_id + '_library_fpkms.csv')
    fpkms.to_csv(filename)

1 of 22 in 5.14 sec
3 of 22 in 6.27 sec
5 of 22 in 4.41 sec
7 of 22 in 4.59 sec
9 of 22 in 5.35 sec
11 of 22 in 7.05 sec
13 of 22 in 6.76 sec
15 of 22 in 5.16 sec
17 of 22 in 6.72 sec
19 of 22 in 4.49 sec
21 of 22 in 4.70 sec


In [10]:
# master
# https://www.encodeproject.org/biosamples/ENCBS195IGI/
    
biosamples = set(['/biosamples/ENCBS356XKT/',
'/biosamples/ENCBS280SII/',
'/biosamples/ENCBS950FXN/',
'/biosamples/ENCBS036HXX/',
'/biosamples/ENCBS638AZC/',
'/biosamples/ENCBS879WMA/',
'/biosamples/ENCBS158BMT/',
'/biosamples/ENCBS901DKW/',
'/biosamples/ENCBS381VEV/',
'/biosamples/ENCBS913NDQ/',
'/biosamples/ENCBS135VMF/',
'/biosamples/ENCBS487GLA/',
'/biosamples/ENCBS236ZAD/',
'/biosamples/ENCBS934CYZ/',
'/biosamples/ENCBS383ZNJ/',])

'/biosamples/ENCBS487GLA/' in biosamples

True

In [11]:
def filter_experiments_gm12878paper(cache, limit=None):
    for i, experiment_id in enumerate(cache):
        experiment = cache[experiment_id]
        for replicate in experiment['replicates']:
            library = replicate['library']
            biosample = library['biosample']
            
            if biosample['@id'] in biosamples:
                print(experiment_id, experiment['description'])
                yield experiment_id
                break
        
        if limit and i > limit:
            break
            
list(filter_experiments_gm12878paper(cache))

ENCSR000AJD RNA-seq of GM12878 bulk prep of 10 ngs
ENCSR000AJG RNA-seq of a GM12878 10-11 cell pool


['ENCSR000AJD', 'ENCSR000AJG']

In [12]:
for experiment_id, fpkms in load_spike_rsems(
      cache,
      keys=filter_experiments_gm12878paper(cache)
    ):
    filename = os.path.join('/home/diane/tmp/encode-gm12878paper/', experiment_id + '_library_fpkms.csv')
    fpkms.to_csv(filename)

ENCSR000AJD RNA-seq of GM12878 bulk prep of 10 ngs
ENCSR000AJG RNA-seq of a GM12878 10-11 cell pool
0 of 2 in 2.31 sec
1 of 2 in 2.27 sec


In [13]:
query_url = 'search/?type=experiment&assay_term_name=single+cell+isolation+followed+by+RNA-seq'
cache_name = 'rnaseq-singlecell-experiments.shelf'
single_cache = save_rnaseq_madqc.caching_encoded_experiment_loader(query_url, cache_name)

Reading 8 of 88 records in 0.00132876199495513 seconds
Reading 16 of 88 records in 0.0014024400006746873 seconds
Reading 24 of 88 records in 0.010549728001933545 seconds
Reading 32 of 88 records in 0.0051778860070044175 seconds
Reading 40 of 88 records in 0.004348331989604048 seconds
Reading 48 of 88 records in 0.002396138006588444 seconds
Reading 56 of 88 records in 0.0012176989985164255 seconds
Reading 64 of 88 records in 0.001245064995600842 seconds
Reading 72 of 88 records in 0.0012096749996999279 seconds
Reading 80 of 88 records in 0.0014264680066844448 seconds
Reading 88 of 88 records in 0.0011516079975990579 seconds
Read 88 records in 0.03145380099886097 seconds


In [14]:
for experiment_id, fpkms in load_spike_rsems(
      single_cache,
      keys=filter_experiments_gm12878paper(single_cache)
    ):
    filename = os.path.join('/home/diane/tmp/encode-gm12878paper/', experiment_id + '_library_fpkms.csv')
    fpkms.to_csv(filename)



ENCSR000AIY RNA-seq of a single GM12878 cell
ENCSR000AJH RNA-seq of a single GM12878 cell
ENCSR673UIY RNA-seq on single cell equivalents of 10 isolated and homogenized GM12878 cells
ENCSR000AJA RNA-seq of a single GM12878 cell
ENCSR000AJE RNA-seq of a single GM12878 cell
ENCSR184CWK RNA-seq of a single GM12878 cell
ENCSR000AJB RNA-seq of a single GM12878 cell
ENCSR000AJF RNA-seq of a single GM12878 cell
ENCSR625DPZ RNA-seq of a single GM12878 cell
ENCSR000AJC RNA-seq of a single GM12878 cell
ENCSR767SOH RNA-seq of a single GM12878 cell
0 of 11 in 1.16 sec
1 of 11 in 1.17 sec
2 of 11 in 11.02 sec
3 of 11 in 1.15 sec
4 of 11 in 1.17 sec
5 of 11 in 1.10 sec
6 of 11 in 1.32 sec
7 of 11 in 1.06 sec
8 of 11 in 1.11 sec
9 of 11 in 1.13 sec
10 of 11 in 1.17 sec


In [None]:
def make_metadata_dataframe(cache, keys):
    """Return experiment metadata
    """
    for i, experiment_id in enumerate(cache):
        experiment = cache[experiment_id]
        description = experiment['description']
        for replicate in experiment['replicates']:
            library = replicate['library']
            biosample = library['biosample']
                        


In [16]:
df = save_rnaseq_madqc.make_experiment_df({x: cache[x] for x in filter_experiments_by_size(cache)})

In [23]:
df.to_csv('/home/diane/tmp/encode-10ng/experiment-metadata.csv', index=False)

In [19]:
df13pg = save_rnaseq_madqc.make_experiment_df({x: cache[x] for x in filter_experiments_by_13pg(cache)})

In [24]:
df13pg.to_csv('/home/diane/tmp/encode-13pg/experiment-metadata.csv', index=False)

In [20]:
gm12878df = save_rnaseq_madqc.make_experiment_df({x: cache[x] for x in filter_experiments_gm12878paper(cache)})

ENCSR000AJD RNA-seq of GM12878 bulk prep of 10 ngs
ENCSR000AJG RNA-seq of a GM12878 10-11 cell pool


In [25]:
gm12878df.to_csv('/home/diane/tmp/encode-gm12878paper/experiment-metadata-bulk.csv', index=False)

In [31]:
# Note no mad scores so this function doesn't work
#gm12878df = save_rnaseq_madqc.make_experiment_df({x: single_cache[x] for x in filter_experiments_gm12878paper(single_cache)})

In [40]:
starting = collections.Counter()
labs = collections.Counter()
for experiment_id in filter_experiments_by_size(cache):
    experiment = cache[experiment_id]
    labs[experiment['lab']['@id']] += 1
    for replicate in experiment['replicates']:
        library = replicate['library']
        biosample = library['biosample']
        starting_amount = library.get('nucleic_acid_starting_quantity')
        starting_amount_units = library.get('nucleic_acid_starting_quantity_units')
        starting_label = '{} {}'.format(starting_amount, starting_amount_units)
        starting[starting_label] += 1

In [41]:
dict(starting)

{'10 ng': 152,
 '10.0 ng': 112,
 '100 ng': 8,
 '1000 ng': 22,
 '10000 ng': 1,
 '1217 ng': 1,
 '12449 ng': 1,
 '1320 ng': 1,
 '1326 ng': 1,
 '1538 ng': 1,
 '1558 ng': 2,
 '1634 ng': 1,
 '1704 ng': 1,
 '1756 ng': 1,
 '1837 ng': 1,
 '1877 ng': 1,
 '200 ng': 1,
 '2000 ng': 5,
 '2419 ng': 1,
 '2436 ng': 1,
 '2584 ng': 1,
 '2640 ng': 3,
 '2680 ng': 1,
 '2876 ng': 1,
 '2970 ng': 1,
 '3000 ng': 5,
 '3051 ng': 1,
 '3565 ng': 1,
 '3848 ng': 1,
 '3883 ng': 1,
 '4000 ng': 3,
 '4171 ng': 1,
 '4216 ng': 1,
 '4252 ng': 1,
 '4255 ng': 1,
 '4652 ng': 1,
 '4777 ng': 1,
 '4830 ng': 1,
 '5054 ng': 1,
 '5280 ng': 2,
 '6068 ng': 1,
 '833 ng': 1,
 '930 ng': 1,
 'None None': 1}

In [42]:
dict(labs)

{'/labs/barbara-wold/': 136,
 '/labs/bing-ren/': 1,
 '/labs/joseph-costello/': 70}

In [45]:
starting = collections.Counter()
for experiment_id in filter_experiments_by_size(cache):
    experiment = cache[experiment_id]
    lab = experiment['lab']
    if lab['@id'] == '/labs/joseph-costello/':
        for replicate in experiment['replicates']:
            library = replicate['library']
            biosample = library['biosample']
            starting_amount = library.get('nucleic_acid_starting_quantity')
            starting_amount_units = library.get('nucleic_acid_starting_quantity_units')
            starting_label = '{} {}'.format(starting_amount, starting_amount_units)
            starting[starting_label] += 1

dict(starting)

{'1000 ng': 22,
 '10000 ng': 1,
 '1217 ng': 1,
 '12449 ng': 1,
 '1320 ng': 1,
 '1326 ng': 1,
 '1538 ng': 1,
 '1558 ng': 2,
 '1634 ng': 1,
 '1704 ng': 1,
 '1756 ng': 1,
 '1837 ng': 1,
 '1877 ng': 1,
 '2000 ng': 5,
 '2419 ng': 1,
 '2436 ng': 1,
 '2584 ng': 1,
 '2640 ng': 3,
 '2680 ng': 1,
 '2876 ng': 1,
 '2970 ng': 1,
 '3000 ng': 5,
 '3051 ng': 1,
 '3565 ng': 1,
 '3848 ng': 1,
 '3883 ng': 1,
 '4000 ng': 3,
 '4171 ng': 1,
 '4216 ng': 1,
 '4252 ng': 1,
 '4255 ng': 1,
 '4652 ng': 1,
 '4777 ng': 1,
 '4830 ng': 1,
 '5054 ng': 1,
 '5280 ng': 2,
 '6068 ng': 1,
 '833 ng': 1,
 '930 ng': 1}