The DCC has made available QC metrics for the experiments submitted. We need to try to draw some conclusions from them.



In [1]:
import pandas
import numpy
import pprint
import collections

In [2]:
from curation_common import *

In [3]:
# Create a client for the ENCODE portal host. ENCODED is not defined in this
# notebook; presumably it is provided by the `curation_common` star import.
server = ENCODED('www.encodeproject.org')
# NOTE(review): judging by the name, this loads credentials from the user's
# netrc file — confirm against curation_common.
server.load_netrc()

In [4]:
# Search the portal for experiment records filtered by lab title.
# The matching records are read from query['@graph'] in the next cell.
# NOTE(review): the query string contains unescaped spaces and a comma;
# presumably get_json (or the HTTP layer beneath it) URL-encodes it — confirm.
query = server.get_json('search/?type=experiment&lab.title=Barbara Wold, Caltech')
# Extra query parameter, currently not applied:
# &award.rfa=ENCODE3

In [5]:
# Fetch the full record for every search hit and index it by accession.
experiments = {}
search_hits = query['@graph']
for i, record in enumerate(search_hits):
    experiment_path = record['@id']
    # drop the first len('/experiments/') characters and the final character
    accession = experiment_path[len('/experiments/'):-1]
    experiments[accession] = server.get_json(experiment_path)
    # report progress on every tenth record
    if not i % 10:
        print(i, len(search_hits))

0 299
10 299
20 299
30 299
40 299
50 299
60 299
70 299
80 299
90 299
100 299
110 299
120 299
130 299
140 299
150 299
160 299
170 299
180 299
190 299
200 299
210 299
220 299
230 299
240 299
250 299
260 299
270 299
280 299
290 299


In [8]:
def format_organism(url):
    """Return the last non-empty '/'-separated component of ``url``.

    Raises IndexError when ``url`` has no non-empty component.
    """
    components = list(filter(None, url.split('/')))
    return components[-1]

def aliases_to_wold_id(aliases):
    """Convert a library's alias list into a single Wold lab library id.

    Parameters
    ----------
    aliases : list of str
        Alias strings attached to a library record.

    Returns
    -------
    str
        The only alias with any 'barbara-wold:' text removed, or '' when
        the list is empty.

    Raises
    ------
    RuntimeError
        If more than one alias is present; picking one is not handled yet.
    """
    if len(aliases) > 1:
        # Interpolate the aliases into the message. Passing them as a second
        # argument to RuntimeError would leave a literal '%s' in the text.
        raise RuntimeError(
            "You need to do more work. Several library aliases: %s" % (aliases,))
    if len(aliases) == 1:
        return aliases[0].replace('barbara-wold:', '')
    return ''
    

In [22]:
# Column order of the QC table. It is declared up front so that building the
# frame does not depend on the loop variable `record`: if no MAD metric is
# found, `record` would be undefined here or still hold a value left over
# from an earlier cell.
qc_columns = [
    # things to identify a particular experiment replicate
    'experiment', 'organism', 'biosample', 'age',
    'starting', 'starting_units',
    'bio_rep', 'tech_rep', 'library_id',
    # the qc metrics
    'Pearson', 'Spearman', 'MAD', 'SD',
]

rows = []
for accession, detail in experiments.items():
    # For all the files in an experiment
    for f in detail['files']:
        # look at files that have quality control metrics attached
        if len(f['quality_metrics']) > 0:
            qc = f['quality_metrics']
            replicate = f['replicate']
            library = replicate['library']
            biosample = library['biosample']
            alias = aliases_to_wold_id(library['aliases'])
            # make sure there's only one QC metric attached to a file
            assert len(qc) == 1
            # we only want to look at the MAD scores right now.
            # we're ignoring the STAR & RSEM scores and just
            # investigating Rafa's MAD QC output.
            if 'MadQualityMetric' in qc[0]['@type']:
                record = collections.OrderedDict(
                    # long list of things to identify a particular experiment replicate
                    (('experiment', accession),
                     ('organism', format_organism(biosample['organism'])),
                     ('biosample', biosample['biosample_term_name']),
                     ('age', biosample['age'] + ' ' + biosample.get('age_units', '')),
                     # starting quantity is optional; NaN marks a missing value
                     ('starting', library.get('nucleic_acid_starting_quantity', numpy.nan)),
                     ('starting_units', library.get('nucleic_acid_starting_quantity_units', numpy.nan)),
                     ('bio_rep', replicate['biological_replicate_number']),
                     ('tech_rep', replicate['technical_replicate_number']),
                     ('library_id', alias),
                     # the qc metrics
                     ('Pearson', qc[0]['Pearson correlation']),
                     ('Spearman', qc[0]['Spearman correlation']),
                     ('MAD', qc[0]['MAD of log ratios']),
                     ('SD', qc[0]['SD of log ratios']),
                    ))
                rows.append(record)
# One row per file carrying a MAD metric; yields an empty frame with the
# expected columns when no such file exists.
experiment_qc = pandas.DataFrame(rows, columns=qc_columns)

Show (some of) the data. 

In [23]:
# Preview only the first rows rather than rendering the whole table.
experiment_qc.head(10)

Unnamed: 0,experiment,organism,biosample,age,starting,starting_units,bio_rep,tech_rep,library_id,Pearson,Spearman,MAD,SD
0,ENCSR830IVQ,mouse,limb,15.5 day,10,ng,1,1,16134,0.986437,0.987857,0.186,0.338
1,ENCSR830IVQ,mouse,limb,15.5 day,10,ng,2,1,16135,0.986437,0.987857,0.186,0.338
2,ENCSR538WYL,mouse,embryonic facial prominence,13.5 day,10,ng,1,1,16106,0.978883,0.981664,0.199,0.444
3,ENCSR538WYL,mouse,embryonic facial prominence,13.5 day,10,ng,2,1,16107,0.978883,0.981664,0.199,0.444
4,ENCSR932TRU,mouse,intestine,14.5 day,10.0,ng,2,1,15091,0.985479,0.988529,0.181,0.353
5,ENCSR932TRU,mouse,intestine,14.5 day,10.0,ng,1,1,15090,0.985479,0.988529,0.181,0.353
6,ENCSR080EVZ,mouse,forebrain,16.5 day,10,ng,1,1,16140,0.985635,0.987653,0.223,0.350
7,ENCSR080EVZ,mouse,forebrain,16.5 day,10,ng,2,1,16141,0.985635,0.987653,0.223,0.350
8,ENCSR636CWO,mouse,embryonic facial prominence,15.5 day,10,ng,1,1,16136,0.981865,0.983680,0.161,0.390
9,ENCSR636CWO,mouse,embryonic facial prominence,15.5 day,10,ng,2,1,16137,0.981865,0.983680,0.161,0.390


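We have 10-, 11-, 30-, and 50-cell sets (some 30-cell libraries are recorded as "cell-equivalent"), plus 13 pg, 10 ng, and 100 ng data. Some libraries have no starting quantity recorded (`nan nan`), and the same quantity can appear as both an integer and a float (e.g. `10 ng` vs. `10.0 ng`).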

In [45]:
# Distinct "<quantity> <units>" labels present in the QC table.
quantity_columns = experiment_qc[['starting', 'starting_units']]
{'%s %s' % (quantity, units) for quantity, units in quantity_columns.values}

{'10 cells',
 '10 ng',
 '10.0 ng',
 '100 ng',
 '11 cells',
 '13 pg',
 '30 cell-equivalent',
 '30 cells',
 '30.0 cell-equivalent',
 '30.0 cells',
 '50.0 cells',
 'nan nan'}

Save data to CSV

In [21]:
# Write the QC table to disk, omitting the positional index column.
mad_qc_csv = '/tmp/experiment-mad-qc.csv'
experiment_qc.to_csv(mad_qc_csv, index=False)