# Introduction

Brian wants "all of the spikes data from the mouse embryo series (e11.5, e13.5, e14.5, e15.5, e16.5, but not P0) and combine it in to 1 table of FPKM values"

In [1]:
import pandas
import numpy
import os
import sys
import time
import collections

In [2]:
# Later cells call save_rnaseq_madqc.caching_encoded_experiment_loader and
# save_rnaseq_madqc.find_rsem, so they will fail with NameError if this import
# did not succeed.
# NOTE(review): presumably a project-local module that the `startnb` launcher
# puts on the import path -- TODO confirm.
try:
    import save_rnaseq_madqc
except ImportError as e:
    # The ImportError is not re-raised; only this hint is shown.
    print("Launch notebook server with startnb")

In [3]:
# Search query (RNA-seq experiments) passed to the loader below.
query_url = 'search/?type=experiment&assay_term_name=RNA-seq'
# NOTE(review): presumably the file name of a local shelve cache -- TODO confirm.
cache_name = 'rnaseq-experiments.shelf'
# `cache` is used below as a mapping: iterating it gives experiment ids and
# indexing by id gives the experiment record.
cache = save_rnaseq_madqc.caching_encoded_experiment_loader(query_url, cache_name)

Reading 140 of 1408 records in 0.09593840200068371 seconds
Reading 280 of 1408 records in 0.042379406999316416 seconds
Reading 420 of 1408 records in 0.04536383199956617 seconds
Reading 560 of 1408 records in 0.033109973001046455 seconds
Reading 700 of 1408 records in 0.06831830400005856 seconds
Reading 840 of 1408 records in 0.024399958998401416 seconds
Reading 980 of 1408 records in 0.31791950300066674 seconds
Reading 1120 of 1408 records in 0.25477535000027274 seconds
Reading 1260 of 1408 records in 0.034925001998999505 seconds
Reading 1400 of 1408 records in 0.06976228500025172 seconds
Read 1408 records in 0.9868920169992634 seconds


In [97]:
def find_mouse_embryo_experiments(cache, limit=None):
    """Find Wold-lab mouse embryo libraries whose starting quantity is in ng.

    Walks the experiments in ``cache``. For every experiment whose lab
    ``@id`` is ``/labs/barbara-wold/`` each replicate's library is inspected,
    and the library is kept when its biosample has ``life_stage ==
    'embryonic'``, its organism name is ``'mouse'`` and its
    ``nucleic_acid_starting_quantity_units`` is ``'ng'``. Every match is also
    printed as one line of progress output.

    Parameters
    ----------
    cache : mapping
        Iterating yields experiment ids; indexing by id yields the
        experiment record (nested dicts).
    limit : int, optional
        Maximum number of cache records to examine. ``None`` (or ``0``)
        means examine everything.

    Returns
    -------
    list of tuple
        One ``(pretty_age, biosample_term_name, experiment_id, library_id)``
        tuple per matching replicate, where ``pretty_age`` is
        ``"<age> <age_units>"``.

    Raises
    ------
    KeyError
        If a record is missing one of the keys read with ``[...]``
        (``lab``, ``replicates``, ``library``, ``biosample``, ``life_stage``,
        ``organism``, ``name`` or ``@id``).
    """
    experiments = []
    for i, experiment_id in enumerate(cache):
        experiment = cache[experiment_id]
        experiment_lab = experiment['lab']

        if experiment_lab['@id'] == '/labs/barbara-wold/':
            for replicate in experiment['replicates']:
                library = replicate['library']
                library_id = library['@id']
                # Optional fields are read with .get() and may be None.
                starting_quantity = library.get('nucleic_acid_starting_quantity')
                starting_units = library.get('nucleic_acid_starting_quantity_units')
                biosample = library['biosample']
                biosample_id = biosample['@id']
                life_stage = biosample['life_stage']
                age = biosample.get('age')
                age_units = biosample.get('age_units')
                biosample_term_name = biosample.get('biosample_term_name')
                organism = biosample['organism']
                species = organism['name']

                if life_stage == 'embryonic' and species == 'mouse' and starting_units == 'ng':
                    pretty_age = "{} {}".format(age, age_units)
                    experiments.append((pretty_age, biosample_term_name, experiment_id, library_id))
                    print(experiment_id, biosample_id, age, age_units, starting_quantity, starting_units, biosample_term_name)

        # i is 0-based, so i + 1 records have been examined at this point;
        # stop once exactly `limit` of them have been seen.
        if limit and i + 1 >= limit:
            break

    return experiments
    
# One tuple per matching replicate library; the call also prints one line per match.
experiments = find_mouse_embryo_experiments(cache)
#grid
# NOTE(review): later cells read a variable named `grid`, but no visible cell
# defines it -- TODO confirm where it is built so Restart & Run All works.

ENCSR347SQR /biosamples/ENCBS896ZED/ 13.5 day 10 ng limb
ENCSR347SQR /biosamples/ENCBS176PGP/ 13.5 day 10 ng limb
ENCSR541XZK /biosamples/ENCBS066IEL/ 11.5 day 10 ng limb
ENCSR541XZK /biosamples/ENCBS681XPR/ 11.5 day 10 ng limb
ENCSR906YQZ /biosamples/ENCBS355TAQ/ 15.5 day 10 ng stomach
ENCSR906YQZ /biosamples/ENCBS915SZN/ 15.5 day 10 ng stomach
ENCSR792RJV /biosamples/ENCBS185VNZ/ 13.5 day 10 ng midbrain
ENCSR792RJV /biosamples/ENCBS725LVX/ 13.5 day 10 ng midbrain
ENCSR343YLB /biosamples/ENCBS825LGT/ 14.5 day 10.0 ng midbrain
ENCSR343YLB /biosamples/ENCBS849BXE/ 14.5 day 10.0 ng midbrain
ENCSR760TOE /biosamples/ENCBS199NDR/ 11.5 day 10 ng hindbrain
ENCSR760TOE /biosamples/ENCBS976DPO/ 11.5 day 10 ng hindbrain
ENCSR823VEE /biosamples/ENCBS903HZW/ 14.5 day 10.0 ng embryonic facial prominence
ENCSR823VEE /biosamples/ENCBS776GWF/ 14.5 day 10.0 ng embryonic facial prominence
ENCSR992WBR /biosamples/ENCBS377RBN/ 16.5 day 10 ng lung
ENCSR992WBR /biosamples/ENCBS833ZFI/ 16.5 day 10 ng lung
EN

In [99]:
# Tuples compare element by element: age string first, then tissue name,
# then experiment id, then library id. Ages are compared as text, not numbers.
sorted(experiments)

[('11.5 day',
  'embryonic facial prominence',
  'ENCSR848HOX',
  '/libraries/ENCLB331VAA/'),
 ('11.5 day',
  'embryonic facial prominence',
  'ENCSR848HOX',
  '/libraries/ENCLB761PCA/'),
 ('11.5 day', 'forebrain', 'ENCSR160IIN', '/libraries/ENCLB471QMM/'),
 ('11.5 day', 'forebrain', 'ENCSR160IIN', '/libraries/ENCLB835LVO/'),
 ('11.5 day', 'heart', 'ENCSR691OPQ', '/libraries/ENCLB347FRI/'),
 ('11.5 day', 'heart', 'ENCSR691OPQ', '/libraries/ENCLB601XLL/'),
 ('11.5 day', 'hindbrain', 'ENCSR760TOE', '/libraries/ENCLB061TDP/'),
 ('11.5 day', 'hindbrain', 'ENCSR760TOE', '/libraries/ENCLB454ZUS/'),
 ('11.5 day', 'limb', 'ENCSR541XZK', '/libraries/ENCLB495FUZ/'),
 ('11.5 day', 'limb', 'ENCSR541XZK', '/libraries/ENCLB979DZW/'),
 ('11.5 day', 'liver', 'ENCSR284AMY', '/libraries/ENCLB055KYV/'),
 ('11.5 day', 'liver', 'ENCSR284AMY', '/libraries/ENCLB357KYA/'),
 ('11.5 day', 'midbrain', 'ENCSR307BCA', '/libraries/ENCLB426JKF/'),
 ('11.5 day', 'midbrain', 'ENCSR307BCA', '/libraries/ENCLB659UFY/'),


In [62]:
# Build a table with one experiment id per (tissue, age) cell: the outer dict
# keys (ages) become columns and the inner dict keys (tissues) become rows.
# Only the first element of each grid[age][tissue] collection is kept.
# NOTE(review): `grid` is not defined in any visible cell -- presumably a
# nested mapping age -> tissue -> collection of experiment ids; TODO confirm.
experiment_grid = pandas.DataFrame(
    { age: { tissue: list(grid[age][tissue])[0] for tissue in grid[age]} for age in grid }
)
experiment_grid

Unnamed: 0,11.5,13.5,14.5,15.5,16.5
embryonic facial prominence,ENCSR848HOX,ENCSR538WYL,ENCSR823VEE,ENCSR636CWO,
forebrain,ENCSR160IIN,ENCSR970EWM,ENCSR185LWM,ENCSR752RGN,ENCSR080EVZ
heart,ENCSR691OPQ,ENCSR284YKY,ENCSR727FHP,ENCSR597UZW,ENCSR020DGG
hindbrain,ENCSR760TOE,ENCSR921PRX,ENCSR559TRB,ENCSR401BSG,ENCSR285WZV
intestine,,,ENCSR932TRU,ENCSR370SFB,ENCSR848GST
kidney,,,ENCSR504GEG,ENCSR062VTB,ENCSR537GNQ
limb,ENCSR541XZK,ENCSR347SQR,ENCSR216NEG,ENCSR830IVQ,
liver,ENCSR284AMY,ENCSR448MXQ,ENCSR867YNV,ENCSR611PTP,ENCSR826HIQ
lung,,,ENCSR039ADS,ENCSR457RRW,ENCSR992WBR
midbrain,ENCSR307BCA,ENCSR792RJV,ENCSR343YLB,ENCSR557RMA,ENCSR367ZPZ


In [96]:
def load_spike_rsems(cache, keys, limit=None):
    """Yield one table of RSEM values per experiment, one column per library.

    For each experiment id in ``keys`` the experiment's ``files`` entry is
    passed to ``save_rnaseq_madqc.find_rsem``. Every file it returns is
    downloaded from ``https://www.encodeproject.org`` and read as a
    tab-separated table, keeping column 0 as the index and column 6 as the
    values (FPKM in RSEM's standard ``*.results`` layout). The per-file
    columns are named after ``file.library_id`` and joined side by side.

    This is a generator: nothing is downloaded until it is iterated.
    A progress line is printed roughly every 10% of ``keys``.

    Parameters
    ----------
    cache : mapping
        Indexed by experiment id; each record must have a ``'files'`` key.
    keys : iterable
        Experiment ids to load, in order. Materialised into a list up front.
    limit : int, optional
        Maximum number of experiments to process. ``None`` (or ``0``) means
        process all of ``keys``.

    Yields
    ------
    tuple
        ``(experiment_id, DataFrame)`` for every experiment that had at least
        one RSEM file. Experiments with none are skipped silently.
    """
    keys = list(keys)
    total = len(keys)
    # Report progress about ten times over the run, and at least once per key
    # when there are fewer than ten keys.
    chunk = max(total // 10, 1)
    tprev = time.monotonic()

    # `done` counts experiments processed so far (1-based) so that both the
    # progress message and the limit check use the true count.
    for done, experiment_id in enumerate(keys, start=1):
        experiment = cache[experiment_id]
        fpkms = []
        for file in save_rnaseq_madqc.find_rsem(experiment['files']):
            url = 'https://www.encodeproject.org' + file.href
            fpkm = pandas.read_csv(url, usecols=[0,6], sep='\t', index_col=0)
            # Name the single value column after its library so that columns
            # stay distinguishable after concatenation.
            fpkm.columns = [file.library_id]
            fpkms.append(fpkm)

        if fpkms:
            yield (experiment_id, pandas.concat(fpkms, axis=1))

        if done % chunk == 0:
            tnow = time.monotonic()
            # Elapsed time is measured since the previous progress line.
            print("{} of {} in {:.2f} sec".format(
                done, total, tnow-tprev))
            tprev = tnow

        if limit and done >= limit:
            return

In [95]:
# Print the first experiment id for every (age, tissue) pair, ordered by age
# and then by tissue.
# NOTE(review): relies on `grid`, which no visible cell defines -- TODO confirm
# where it is built.
for age in sorted(grid):
    row = grid[age]
    for tissue in sorted(row):
        # Only the first element of each collection is shown.
        print(age,  tissue, list(row[tissue])[0])

11.5 embryonic facial prominence ENCSR848HOX
11.5 forebrain ENCSR160IIN
11.5 heart ENCSR691OPQ
11.5 hindbrain ENCSR760TOE
11.5 limb ENCSR541XZK
11.5 liver ENCSR284AMY
11.5 midbrain ENCSR307BCA
11.5 neural tube ENCSR337FYI
13.5 embryonic facial prominence ENCSR538WYL
13.5 forebrain ENCSR970EWM
13.5 heart ENCSR284YKY
13.5 hindbrain ENCSR921PRX
13.5 limb ENCSR347SQR
13.5 liver ENCSR448MXQ
13.5 midbrain ENCSR792RJV
13.5 neural tube ENCSR115TWD
14.5 embryonic facial prominence ENCSR823VEE
14.5 forebrain ENCSR185LWM
14.5 heart ENCSR727FHP
14.5 hindbrain ENCSR559TRB
14.5 intestine ENCSR932TRU
14.5 kidney ENCSR504GEG
14.5 limb ENCSR216NEG
14.5 liver ENCSR867YNV
14.5 lung ENCSR039ADS
14.5 midbrain ENCSR343YLB
14.5 neural tube ENCSR928OXI
14.5 stomach ENCSR290RRR
15.5 embryonic facial prominence ENCSR636CWO
15.5 forebrain ENCSR752RGN
15.5 heart ENCSR597UZW
15.5 hindbrain ENCSR401BSG
15.5 intestine ENCSR370SFB
15.5 kidney ENCSR062VTB
15.5 limb ENCSR830IVQ
15.5 liver ENCSR611PTP
15.5 lung ENCSR457