One annoying thing about dealing with ENCODED is that it returns a large tree of information associated with a requested ID. Additional requests for other IDs can and will return duplicate information.

For example, a set of experiments will certainly have a common lab, common awards, and frequently common biosamples.

In [122]:
import pandas
import numpy
import collections
import odo

from curation_common import *

In [3]:
# Handle used by every later cell to query www.encodeproject.org.
# NOTE(review): ENCODED is presumably provided by the star import from
# curation_common, and load_netrc() presumably reads credentials from the
# user's netrc file -- confirm both against that module.
server = ENCODED('www.encodeproject.org')
server.load_netrc()

In [93]:
def is_encoded_object(obj):
    """Return True if obj looks like an ENCODE object record.

    A record is a mapping (a decoded JSON object) that carries both an
    '@id' and an '@type' key.  Anything else -- None, numbers, strings,
    lists -- is not a record, even if it happens to contain those tokens.
    """
    # collections.Iterable was removed in Python 3.10; the ABCs live in
    # collections.abc.  Imported locally so the function does not depend on
    # the submodule having been loaded as a side effect of another import.
    from collections.abc import Mapping

    # Requiring a mapping (rather than any iterable) keeps strings such as
    # "@id @type" and lists such as ['@id', '@type'] from being mistaken
    # for records; callers go on to index the object with obj['@id'].
    return isinstance(obj, Mapping) and '@id' in obj and '@type' in obj

def parse_attachment(obj, tables):
    """Register an attachment record in the shared 'Attachment' table.

    Attachments carry no @id/@type pair, so they are never picked up as
    ordinary objects; their href is used as the table key instead.  The
    first attachment stored under a given href is kept, and the href is
    returned so the caller can store it in place of the attachment.
    """
    attachment_table = tables.setdefault('Attachment', {})
    key = obj['href']
    # setdefault leaves an attachment that is already stored untouched
    attachment_table.setdefault(key, obj)
    return key

def parse_embedded(obj, tables=None):
    """Break an embedded object tree up into per-type tables of records.

    The DCC defaults to returning an object with its related objects
    embedded inside it.  This walks that tree and stores each object once,
    as tables[<first @type>][<@id>], replacing embedded objects (and lists
    of embedded objects) with their '@id' values.

    :param obj: dict with '@id' and '@type' keys; the first element of
        '@type' is used as the table name.
    :param tables: dict of {type name: {@id: record}} to add to.  A new
        dict is created when omitted.
    :returns: the tables dict (the one passed in, or the new one), so a
        call that omits ``tables`` does not lose its result.
    :raises KeyError: if obj has no '@type' or '@id' key.
    """
    if tables is None:
        tables = {}

    obj_type = obj['@type'][0]
    obj_id = obj['@id']

    table = tables.setdefault(obj_type, {})
    if obj_id in table:
        # Already stored: the first copy seen is kept as is.
        return tables

    # Register the record before descending, so that reaching the same @id
    # again further down this tree hits the check above instead of
    # parsing it a second time.
    record = table[obj_id] = {}

    for key, value in obj.items():
        # FIXME: this should generate a linking table... not a list
        if key == 'attachment':
            record[key] = parse_attachment(value, tables)
        elif key == '@type':
            # lets ignore the inherited type tree
            record[key] = value[0]
        elif isinstance(value, list) and value and is_encoded_object(value[0]):
            # Only the first element is inspected; the rest of the list is
            # treated as the same kind of thing.
            for item in value:
                parse_embedded(item, tables)
            record[key] = [item['@id'] for item in value]
        elif is_encoded_object(value):
            parse_embedded(value, tables)
            record[key] = value['@id']
        else:
            record[key] = value

    return tables

def cache_dcc_metadata(query):
    """Fetch every record listed in a search result and flatten it into tables.

    Each entry of query['@graph'] is requested again by its '@id' through
    the module-level ``server`` and handed to parse_embedded, so all
    records accumulate in one shared set of tables.  A progress line is
    printed roughly every tenth of the records, and a total at the end.

    :param query: search result dict with an '@graph' list whose entries
        each have an '@id'.
    :returns: dict of {type name: {@id: record}}.
    """
    # Imported here because the notebook's visible import cell never
    # imports time explicitly.
    import time

    records = query['@graph']
    total = len(records)
    tables = {}
    tzero = time.monotonic()
    tprev = tzero
    # Clamp to 1: with fewer than ten records total // 10 is 0, and the
    # modulo below would raise ZeroDivisionError.
    progress = max(1, total // 10)
    for count, record in enumerate(records, start=1):
        extended_experiment = server.get_json(record['@id'])

        parse_embedded(extended_experiment, tables)

        if count % progress == 0:
            tnow = time.monotonic()
            print("Reading {} of {} records in {} seconds".format(
                  count,
                  total,
                  tnow - tprev))
            tprev = tnow
    # Take the end time here rather than reusing the last progress
    # timestamp: that one is unset for an empty result and otherwise
    # leaves out the records fetched after the final progress line.
    print("Read {} records in {} seconds".format(
        total, time.monotonic() - tzero))
    return tables


In [94]:
# Search for RNA-seq experiments on brain with the mm10 assembly, then fetch
# each hit and split it into per-type tables kept in t.
query = server.get_json("search/?type=Experiment&assay_term_name=RNA-seq&organ_slims=brain&assembly=mm10")
t = cache_dcc_metadata(query)

Reading 3 of 31 records in 1.7768760769995424 seconds
Reading 6 of 31 records in 0.9961536210003032 seconds
Reading 9 of 31 records in 1.2863293899999917 seconds
Reading 12 of 31 records in 1.3446275369997238 seconds
Reading 15 of 31 records in 0.9617928000006941 seconds
Reading 18 of 31 records in 1.002926765999291 seconds
Reading 21 of 31 records in 0.9781768540005942 seconds
Reading 24 of 31 records in 0.9704227449992686 seconds
Reading 27 of 31 records in 1.0515457720002814 seconds
Reading 30 of 31 records in 0.965291026000159 seconds
Read 31 records in 11.33414258799985 seconds


In [97]:
# Tally how many MadQualityMetric records carry each field name.
c = collections.Counter(
    field
    for metric in t['MadQualityMetric'].values()
    for field in metric
)
dict(c)

{'@id': 33,
 '@type': 33,
 'MAD of log ratios': 33,
 'Pearson correlation': 33,
 'SD of log ratios': 33,
 'Spearman correlation': 33,
 'aliases': 33,
 'assay_term_id': 33,
 'assay_term_name': 33,
 'attachment': 33,
 'date_created': 33,
 'quality_metric_of': 33,
 'schema_version': 33,
 'status': 33,
 'step_run': 33,
 'submitted_by': 33,
 'uuid': 33}

In [123]:
# Ask odo for the datashape of the MadQualityMetric frame.
# NOTE: mad is built in the In [114] cell that appears further down; going by
# the execution counts, that cell was run before this one (In [123]).
odo.discover(mad)

dshape("""33 * {
  'SD of log ratios': ?float64,
  '@type': ?string,
  step_run: ?string,
  submitted_by: ?string,
  schema_version: ?string,
  quality_metric_of: ?string,
  uuid: ?string,
  status: ?string,
  assay_term_id: ?string,
  '@id': ?string,
  aliases: ?string,
  'Pearson correlation': ?float64,
  assay_term_name: ?string,
  attachment: ?string,
  'Spearman correlation': ?float64,
  'MAD of log ratios': ?float64,
  date_created: ?string
  }""")

In [114]:
# One row per record: orient='index' turns the table keys (the '@id' values)
# into the row index and the record fields into columns.
mad = pandas.DataFrame.from_dict(t['MadQualityMetric'], orient='index')
files = pandas.DataFrame.from_dict(t['File'], orient='index')

In [113]:
# Files this quality metric was computed from (a list of File @ids).
mad.loc['/mad-quality-metrics/fc509dab-9d28-43a2-9f89-fbb510c564f9/', 'quality_metric_of']

['/files/ENCFF887QYY/', '/files/ENCFF487ALN/']

In [117]:
# Look up one of the file ids returned by quality_metric_of in the File table.
files.loc['/files/ENCFF887QYY/']

award                                                 /awards/U41HG006992/
@type                                                                 File
biological_replicates                                                  [1]
accession                                                      ENCFF887QYY
file_type                                                              tsv
aliases                           [dnanexus:file-Bj6zF8j0vp7GjG3j55xvP7Xg]
output_category                                             quantification
uuid                                  09d6c75a-fa58-49fe-afa5-fccefc1ba558
dataset                                          /experiments/ENCSR752RGN/
status                                                            released
quality_metrics          [/mad-quality-metrics/fc509dab-9d28-43a2-9f89-...
notes                    {"software_versions": [{"version": {"quant-rse...
@id                                                    /files/ENCFF887QYY/
dbxrefs                  

In [7]:
# Names of every table collected so far, in alphabetical order.
sorted(t)

['AnalysisStep',
 'AnalysisStepAnalysisStepVersion',
 'AnalysisStepPipeline',
 'AnalysisStepRun',
 'AnalysisStepVersion',
 'AnalysisStepVersionSoftwareVersion',
 'Attachment',
 'Award',
 'Biosample',
 'Document',
 'Experiment',
 'ExperimentFile',
 'ExperimentReplicate',
 'File',
 'FileFile',
 'FileMadQualityMetric',
 'FileStarQualityMetric',
 'Lab',
 'Library',
 'LibraryDocument',
 'LibraryReference',
 'MadQualityMetric',
 'MouseDonor',
 'Organism',
 'Pipeline',
 'Platform',
 'Reference',
 'Replicate',
 'Software',
 'SoftwareVersion',
 'Source',
 'StarQualityMetric',
 'User']

In [120]:
# Three searches over ENCODE-project hg19 experiments from the
# "Thomas Gingeras, CSHL" lab (lab.title is URL-encoded below):
#   long_rna_query  -- RNA-seq, library size_range ">200" (%3E200)
#   rampage_query   -- RAMPAGE, library size_range ">200" (%3E200)
#   small_rna_query -- RNA-seq, library size_range "<200" (%3C200)
long_rna_query  = "https://www.encodeproject.org/search/?type=Experiment&assay_term_name=RNA-seq&award.project=ENCODE&assembly=hg19&replicates.library.size_range=%3E200&lab.title=Thomas+Gingeras%2C+CSHL"
rampage_query   = "https://www.encodeproject.org/search/?type=Experiment&award.project=ENCODE&assembly=hg19&replicates.library.size_range=%3E200&lab.title=Thomas+Gingeras%2C+CSHL&assay_term_name=RAMPAGE"
small_rna_query = "https://www.encodeproject.org/search/?type=Experiment&award.project=ENCODE&assembly=hg19&lab.title=Thomas+Gingeras%2C+CSHL&assay_term_name=RNA-seq&replicates.library.size_range=%3C200"

In [121]:
# Run the long RNA-seq search and flatten every hit into per-type tables.
long_rna_tables = cache_dcc_metadata(server.get_json(long_rna_query))

Reading 20 of 200 records in 4.75295107599959 seconds
Reading 40 of 200 records in 5.112724122000145 seconds
Reading 60 of 200 records in 6.2564897889997155 seconds
Reading 80 of 200 records in 6.209476519000418 seconds
Reading 100 of 200 records in 6.807286764999844 seconds
Reading 120 of 200 records in 7.095306315000016 seconds
Reading 140 of 200 records in 8.318428931999733 seconds
Reading 160 of 200 records in 6.889881656000398 seconds
Reading 180 of 200 records in 6.020223762999194 seconds
Reading 200 of 200 records in 5.512095469000997 seconds
Read 200 records in 62.97486440600005 seconds


In [25]:
# Run the RAMPAGE search and flatten every hit into per-type tables.
rampage = cache_dcc_metadata(server.get_json(rampage_query))

Reading 3 of 38 records in 1.1219779939997352 seconds
Reading 6 of 38 records in 1.1398926669999128 seconds
Reading 9 of 38 records in 1.0878116260000752 seconds
Reading 12 of 38 records in 1.0641096809999908 seconds
Reading 15 of 38 records in 1.0899548390002565 seconds
Reading 18 of 38 records in 1.05747558999974 seconds
Reading 21 of 38 records in 1.052698982000038 seconds
Reading 24 of 38 records in 1.0472028089998275 seconds
Reading 27 of 38 records in 0.7504600900001606 seconds
Reading 30 of 38 records in 1.529810240000188 seconds
Reading 33 of 38 records in 1.0648237209998115 seconds
Reading 36 of 38 records in 1.0069230570002219 seconds
Read 38 records in 13.013141295999958 seconds


In [26]:
# Number of records collected in each RAMPAGE table.
[(table_name, len(records)) for table_name, records in rampage.items()]

[('SoftwareVersion', 7),
 ('Document', 48),
 ('Organism', 1),
 ('StarQualityMetric', 63),
 ('Replicate', 136),
 ('Source', 9),
 ('IdrSummaryQualityMetric', 30),
 ('Experiment', 76),
 ('FileIdrSummaryQualityMetric', 58),
 ('Library', 69),
 ('AnalysisStepAnalysisStepVersion', 3),
 ('FileStarQualityMetric', 63),
 ('Award', 3),
 ('FileFile', 1278),
 ('AnalysisStep', 6),
 ('Biosample', 69),
 ('AnalysisStepVersion', 7),
 ('User', 10),
 ('Lab', 2),
 ('LibraryDocument', 49),
 ('HumanDonor', 28),
 ('File', 933),
 ('ExperimentExperiment', 38),
 ('AnalysisStepRun', 123),
 ('MadQualityMetric', 30),
 ('Pipeline', 2),
 ('Software', 6),
 ('Platform', 1),
 ('AnalysisStepPipeline', 3),
 ('AnalysisStepVersionSoftwareVersion', 10),
 ('ExperimentReplicate', 69),
 ('ExperimentFile', 960),
 ('FileMadQualityMetric', 58),
 ('Attachment', 171),
 ('ExperimentDocument', 38)]

In [27]:
# Run the small RNA-seq search and flatten every hit into per-type tables.
small_rna = cache_dcc_metadata(server.get_json(small_rna_query))

Reading 12 of 121 records in 2.4122363960000257 seconds
Reading 24 of 121 records in 2.1826849160001984 seconds
Reading 36 of 121 records in 2.64339040699997 seconds
Reading 48 of 121 records in 2.084336104999693 seconds
Reading 60 of 121 records in 2.3382598800003507 seconds
Reading 72 of 121 records in 2.788500609000039 seconds
Reading 84 of 121 records in 2.36841025800004 seconds
Reading 96 of 121 records in 2.249588119999771 seconds
Reading 108 of 121 records in 2.759853344000021 seconds
Reading 120 of 121 records in 2.1502437230001306 seconds
Read 121 records in 23.97750375800024 seconds


In [31]:
# NOTE(review): make_df is not defined in the visible part of this notebook;
# presumably it comes from curation_common (or a cell not shown) and builds a
# DataFrame from the named table -- confirm before relying on it.
make_df(rampage, 'MadQualityMetric')

Unnamed: 0,@id,@type,MAD of log ratios,Pearson correlation,SD of log ratios,Spearman correlation,aliases,assay_term_id,assay_term_name,attachment,date_created,quality_metric_of,schema_version,status,step_run,submitted_by,uuid
/mad-quality-metrics/345b78dc-dfbc-48a3-9091-a074f2d96c8d/,/mad-quality-metrics/345b78dc-dfbc-48a3-9091-a...,MadQualityMetric,0.867,0.944957,1.036,0.943047,[dnanexus:qc.mad.job-Bj642480J6Z59GzjXGB9g2G3],OBI:0001864,RAMPAGE,@@download/attachment/ENCFF703QJX-ENCFF244ERL_...,2015-10-07T16:44:42.283055+00:00,"[/files/ENCFF868NMX/, /files/ENCFF166PAG/]",1,released,/analysis-step-runs/9128c594-c58f-451f-95d5-96...,/users/a00d89e0-a488-4adf-9738-25149afc4087/,345b78dc-dfbc-48a3-9091-a074f2d96c8d
/mad-quality-metrics/f0d994b4-9782-4b55-aaf4-b373d347c044/,/mad-quality-metrics/f0d994b4-9782-4b55-aaf4-b...,MadQualityMetric,3.164,0.85976,2.456,0.854473,[dnanexus:qc.mad.job-Bgx5Gj00J6Z0QQXZF0z414Zj],OBI:0001864,RAMPAGE,@@download/attachment/ENCFF624BCW-ENCFF164QXT_...,2015-10-07T18:02:12.451320+00:00,"[/files/ENCFF171HWG/, /files/ENCFF667EFB/]",1,released,/analysis-step-runs/9745deff-c0d1-42c1-844e-8f...,/users/a00d89e0-a488-4adf-9738-25149afc4087/,f0d994b4-9782-4b55-aaf4-b373d347c044
/mad-quality-metrics/395be675-57a6-493b-b144-64145ba89b78/,/mad-quality-metrics/395be675-57a6-493b-b144-6...,MadQualityMetric,1.483,0.883738,1.389,0.877815,[dnanexus:qc.mad.job-Bgx5J6j0J6ZGjFg36Gv3KgV9],OBI:0001864,RAMPAGE,@@download/attachment/ENCFF110DHT-ENCFF144SWZ_...,2015-10-07T18:40:04.963471+00:00,"[/files/ENCFF566WCO/, /files/ENCFF288EJM/]",1,released,/analysis-step-runs/bc084c0d-1e50-44d1-af36-b1...,/users/a00d89e0-a488-4adf-9738-25149afc4087/,395be675-57a6-493b-b144-64145ba89b78
/mad-quality-metrics/60e06c3a-2b69-4833-bc96-3693c7c842ef/,/mad-quality-metrics/60e06c3a-2b69-4833-bc96-3...,MadQualityMetric,0.867,0.940562,1.02,0.939964,[dnanexus:qc.mad.job-Bgx5Gj80J6Z57bvJB4fjvyVY],OBI:0001864,RAMPAGE,@@download/attachment/ENCFF359GLF-ENCFF565JCQ_...,2015-10-07T08:24:01.798660+00:00,"[/files/ENCFF468KVO/, /files/ENCFF679NGX/]",1,released,/analysis-step-runs/8025cc83-4f16-464b-ac5c-06...,/users/a00d89e0-a488-4adf-9738-25149afc4087/,60e06c3a-2b69-4833-bc96-3693c7c842ef
/mad-quality-metrics/1c7a5b93-f8cd-47fe-b3ab-abd8d4fb08a6/,/mad-quality-metrics/1c7a5b93-f8cd-47fe-b3ab-a...,MadQualityMetric,0.784,0.949171,0.967,0.945197,[dnanexus:qc.mad.job-Bgx5Gjj0J6ZK3877PkQ90y5p],OBI:0001864,RAMPAGE,@@download/attachment/ENCFF282UIM-ENCFF683BLP_...,2015-10-07T16:37:55.884338+00:00,"[/files/ENCFF872OMM/, /files/ENCFF160TPF/]",1,released,/analysis-step-runs/24cec468-8ef9-4bb1-b6ce-3b...,/users/a00d89e0-a488-4adf-9738-25149afc4087/,1c7a5b93-f8cd-47fe-b3ab-abd8d4fb08a6
/mad-quality-metrics/8d97d972-5688-4518-b99f-9c962f993943/,/mad-quality-metrics/8d97d972-5688-4518-b99f-9...,MadQualityMetric,0.933,0.93704,1.137,0.934913,[dnanexus:qc.mad.job-Bgx5J7j0J6Z0QQXZF0z414b7],OBI:0001864,RAMPAGE,@@download/attachment/ENCFF452ZET-ENCFF568PTY_...,2015-10-07T16:44:01.739563+00:00,"[/files/ENCFF936WXI/, /files/ENCFF969SCC/]",1,released,/analysis-step-runs/25e680bc-e05a-445c-a403-39...,/users/a00d89e0-a488-4adf-9738-25149afc4087/,8d97d972-5688-4518-b99f-9c962f993943
/mad-quality-metrics/c555bca5-65c9-4d4e-8b09-11268bb69a9f/,/mad-quality-metrics/c555bca5-65c9-4d4e-8b09-1...,MadQualityMetric,2.472,0.856604,2.066,0.847162,[dnanexus:qc.mad.job-Bgx5J700J6Z33bVzJbQJ0QFP],OBI:0001864,RAMPAGE,@@download/attachment/ENCFF091MAK-ENCFF263SXU_...,2015-10-07T16:20:32.161275+00:00,"[/files/ENCFF924AZS/, /files/ENCFF549JWI/]",1,released,/analysis-step-runs/a1094bab-d0f7-4f8c-95d5-8d...,/users/a00d89e0-a488-4adf-9738-25149afc4087/,c555bca5-65c9-4d4e-8b09-11268bb69a9f
/mad-quality-metrics/07e7f939-2b39-4fa0-94b7-16dc548e5df3/,/mad-quality-metrics/07e7f939-2b39-4fa0-94b7-1...,MadQualityMetric,3.638,0.791816,2.825,0.784411,[dnanexus:qc.mad.job-Bgx5J7Q0J6Z3KbkXqVPVZxQ5],OBI:0001864,RAMPAGE,@@download/attachment/ENCFF704TLR-ENCFF336TLI_...,2015-10-07T16:37:36.398938+00:00,"[/files/ENCFF400GTY/, /files/ENCFF970BSE/]",1,released,/analysis-step-runs/b3293c58-0f63-4de5-97eb-41...,/users/a00d89e0-a488-4adf-9738-25149afc4087/,07e7f939-2b39-4fa0-94b7-16dc548e5df3
/mad-quality-metrics/4121b76a-f3bb-4d41-81e7-91c837fb4dcc/,/mad-quality-metrics/4121b76a-f3bb-4d41-81e7-9...,MadQualityMetric,1.444,0.866517,1.571,0.855769,[dnanexus:qc.mad.job-BgvvK280J6Z8vKkvKbQ33YZ3],OBI:0001864,RAMPAGE,@@download/attachment/ENCFF001YXU-ENCFF001YXV_...,2015-10-07T08:23:37.470309+00:00,"[/files/ENCFF967RIZ/, /files/ENCFF617EQU/]",1,released,/analysis-step-runs/e994942d-abd2-4bb6-b9ec-b1...,/users/a00d89e0-a488-4adf-9738-25149afc4087/,4121b76a-f3bb-4d41-81e7-91c837fb4dcc
/mad-quality-metrics/2ee57fbe-b1dc-4de8-be54-cf8039f9be0e/,/mad-quality-metrics/2ee57fbe-b1dc-4de8-be54-c...,MadQualityMetric,0.752,0.954973,1.011,0.955341,[dnanexus:qc.mad.job-Bgx5JPQ0J6Z3F6707GzF7z3K],OBI:0001864,RAMPAGE,@@download/attachment/ENCFF002BEN-ENCFF002BEO_...,2015-10-07T08:31:48.138331+00:00,"[/files/ENCFF799HZJ/, /files/ENCFF578GIW/]",1,released,/analysis-step-runs/509bae02-37ad-4a58-bbf8-b1...,/users/a00d89e0-a488-4adf-9738-25149afc4087/,2ee57fbe-b1dc-4de8-be54-cf8039f9be0e
