# Introduction

Ken gave me a list of experiment IDs and would like to find the corresponding RSEM files for them.

In [1]:
import pandas
import os
import sys

In [2]:
# Local source checkouts that hold the project modules imported further down.
ROOTS = [
    os.path.expanduser(checkout)
    for checkout in ('~/proj/encode3-curation', '~/proj/htsworkflow')
]
# Append (rather than prepend) each checkout, and only once, so that
# re-running this cell does not keep growing sys.path.
for root in ROOTS:
    if root in sys.path:
        continue
    sys.path.append(root)

In [3]:
from curation_common import *

In [4]:
# Client object for the ENCODE portal. `ENCODED` is not imported by name in
# this notebook; presumably it arrives via the star import above — confirm.
server = ENCODED('www.encodeproject.org')
# NOTE(review): presumably reads credentials for this host from ~/.netrc —
# confirm against the ENCODED class.
server.load_netrc()

In [5]:
import save_rnaseq_madqc

In [6]:
# Fetch a single experiment record directly from the portal as a spot check.
# NOTE(review): this dict is named `experiment_db` (singular); the later cells
# in this notebook read `experiments_db` (plural) from the cached loader, and
# nothing visible reads this variable again.
experiment_db = {}
experiment_db['ENCSR919QJT'] = server.get_json('https://www.encodeproject.org/experiments/ENCSR919QJT/')

In [7]:
# Load every experiment matching the RNA-seq search into `experiments_db`,
# which later cells index by experiment accession.
# NOTE(review): the loader name and the `.shelf` file name suggest results are
# cached locally between runs — confirm in save_rnaseq_madqc.
query_url = 'search/?type=experiment&assay_term_name=RNA-seq'
cache_name = 'rnaseq-experiments.shelf'
experiments_db = save_rnaseq_madqc.caching_encoded_experiment_loader(query_url, cache_name)


Reading 150 of 1504 records in 0.008860171001288109 seconds
Reading 300 of 1504 records in 0.0060497469967231154 seconds
Reading 450 of 1504 records in 0.005841838006745093 seconds
Reading 600 of 1504 records in 0.008946743997512385 seconds
Reading 750 of 1504 records in 0.006682738006929867 seconds
Reading 900 of 1504 records in 0.007477486986317672 seconds
Reading 1050 of 1504 records in 0.039147881005192176 seconds
Reading 1200 of 1504 records in 0.05147456799750216 seconds
Reading 1350 of 1504 records in 0.026019398006610572 seconds
Reading 1500 of 1504 records in 0.06485720399359707 seconds
Read 1504 records in 0.22535777599841822 seconds


In [8]:
# File ids we are trying to locate. Defined once, outside the loop, so the
# list is not rebuilt per file and is always bound for the later cell that
# filters on it (even if the experiment has no files).
wewant = [
    '/files/ENCFF114MWR/',
    '/files/ENCFF331TCQ/',
    '/files/ENCFF780SJS/',
    '/files/ENCFF555SHC/',
]

# For each file of the experiment, print one line per piece of software
# recorded on its analysis step version: file id, whether it is one of the
# wanted files, file format, output type and software name.
for f in experiments_db['ENCSR919QJT']['files']:
    analysis_step_version = f.get('analysis_step_version')
    software_versions = []
    if analysis_step_version:
        # `.get` returns None when the key is absent; fall back to an empty
        # list so the loop below is simply skipped instead of raising.
        software_versions = analysis_step_version.get('software_versions') or []

    for version in software_versions:
        software = version['software']
        print(f["@id"], f['@id'] in wewant, f['file_format'], f['output_type'], software['name'])


/files/ENCFF815WBQ/ False bam alignments rna-pipelines
/files/ENCFF815WBQ/ False bam alignments lrna-align-tophat-pe
/files/ENCFF815WBQ/ False bam alignments concat-fastqs
/files/ENCFF815WBQ/ False bam alignments tophat
/files/ENCFF815WBQ/ False bam alignments bowtie
/files/ENCFF815WBQ/ False bam alignments tophat_bam_xsA_tag_fix.pl
/files/ENCFF183FCB/ False bam alignments rna-pipelines
/files/ENCFF183FCB/ False bam alignments lrna-align-tophat-pe
/files/ENCFF183FCB/ False bam alignments concat-fastqs
/files/ENCFF183FCB/ False bam alignments tophat
/files/ENCFF183FCB/ False bam alignments bowtie
/files/ENCFF183FCB/ False bam alignments tophat_bam_xsA_tag_fix.pl
/files/ENCFF544QSS/ False bigWig minus strand signal of all reads rna-pipelines
/files/ENCFF544QSS/ False bigWig minus strand signal of all reads lrna-signals-stranded
/files/ENCFF544QSS/ False bigWig minus strand signal of all reads star
/files/ENCFF544QSS/ False bigWig minus strand signal of all reads bedgraphtobigwig
/files/E

In [9]:
# Collect the RSEM file records that find_all_rsem yields for this experiment
# (materialised into a list so it can be displayed and indexed).
# Note: a later cell rebinds `rsems` to a list of plain dicts.
rsems = list(save_rnaseq_madqc.find_all_rsem(experiments_db['ENCSR919QJT']['files']))
rsems

[FileInfo(date_created='2015-08-24T20:23:06.215398+00:00', file_id='/files/ENCFF114MWR/', library_id='ENCLB738UVO', experiment_id='ENCSR919QJT', assembly='hg19', genome_annotation='V19', spikes_used=['ENCSR156CIL'], href='/files/ENCFF114MWR/@@download/ENCFF114MWR.tsv'),
 FileInfo(date_created='2015-08-24T20:24:56.110443+00:00', file_id='/files/ENCFF780SJS/', library_id='ENCLB652BFV', experiment_id='ENCSR919QJT', assembly='hg19', genome_annotation='V19', spikes_used=['ENCSR156CIL'], href='/files/ENCFF780SJS/@@download/ENCFF780SJS.tsv')]

In [10]:
# View the first record as a dict; this is the form later cells extend with
# additional keys.
rsems[0]._asdict()

OrderedDict([('date_created', '2015-08-24T20:23:06.215398+00:00'),
             ('file_id', '/files/ENCFF114MWR/'),
             ('library_id', 'ENCLB738UVO'),
             ('experiment_id', 'ENCSR919QJT'),
             ('assembly', 'hg19'),
             ('genome_annotation', 'V19'),
             ('spikes_used', ['ENCSR156CIL']),
             ('href', '/files/ENCFF114MWR/@@download/ENCFF114MWR.tsv')])

In [11]:
# Inspect the replicate record(s) of the experiment whose library is
# ENCLB738UVO, to see which metadata fields are available on them.
[ x for x in experiments_db['ENCSR919QJT']['replicates'] if x['library']['@id'] == '/libraries/ENCLB738UVO/']

[{'@id': '/replicates/d7d2bb3a-37cc-4c98-8e4b-ded1f086b98f/',
  '@type': ['Replicate', 'Item'],
  'aliases': [],
  'biological_replicate_number': 1,
  'date_created': '2014-04-22T20:09:02.562461+00:00',
  'experiment': '/experiments/ENCSR919QJT/',
  'library': {'@id': '/libraries/ENCLB738UVO/',
   '@type': ['Library', 'Item'],
   'accession': 'ENCLB738UVO',
   'aliases': ['thomas-gingeras:291691'],
   'alternate_accessions': [],
   'award': '/awards/U54HG007004/',
   'biosample': {'@id': '/biosamples/ENCBS381KGR/',
    '@type': ['Biosample', 'Item'],
    'accession': 'ENCBS381KGR',
    'age': '37',
    'age_display': '37 year',
    'age_units': 'year',
    'aliases': ['john-stamatoyannopoulos:24825'],
    'alternate_accessions': [],
    'award': '/awards/U54HG007010/',
    'biosample_synonyms': [],
    'biosample_term_id': 'EFO:0002184',
    'biosample_term_name': 'H4',
    'biosample_type': 'immortalized cell line',
    'characterizations': [],
    'constructs': [],
    'date_created'

In [12]:
# Keep only the experiment's files whose id is in `wewant`.
# `wewant` comes from the earlier file-listing cell, so that cell must have
# been run first.
gene_quants = [ x for x in experiments_db['ENCSR919QJT']['files'] if x['@id'] in wewant ]

In [13]:
# How many of the wanted file ids were actually found on the experiment.
len(gene_quants)

2

In [14]:
# Total number of file records on the experiment, for comparison.
len(experiments_db['ENCSR919QJT']['files'])

30

In [15]:
def add_library_info(experiments_db, rsem):
    """Annotate an RSEM file record with library, biosample and replicate metadata.

    The experiment named by ``rsem['experiment_id']`` is looked up in
    ``experiments_db``; among its replicates, the first one whose library
    ``@id`` equals ``/libraries/<rsem['library_id']>/`` supplies the values
    that are copied into ``rsem``.

    Parameters
    ----------
    experiments_db : mapping
        Experiment id -> experiment object. The experiment must provide
        ``replicates``, ``lab`` (with an ``@id``) and ``replication_type``.
    rsem : dict
        Record containing at least ``experiment_id`` and ``library_id``.
        It is modified in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If the experiment is not in ``experiments_db`` or a required field
        is missing from the experiment, replicate, library or biosample.
    IndexError
        If no replicate of the experiment uses the requested library.
    """
    experiment_id = rsem['experiment_id']
    library_id = rsem['library_id']
    library_obj_id = '/libraries/{}/'.format(library_id)
    experiment = experiments_db[experiment_id]

    # Take the first replicate that uses this library. Scanning lazily stops
    # at the first match instead of building a throw-away list.
    replicate = next(
        (x for x in experiment['replicates']
         if x['library']['@id'] == library_obj_id),
        None,
    )
    if replicate is None:
        # Same exception type an empty-list [0] lookup would raise, but with
        # enough context to identify the offending record.
        raise IndexError(
            'no replicate with library {} in experiment {}'.format(
                library_id, experiment_id))

    library = replicate['library']
    biosample = library['biosample']

    rsem['lab'] = save_rnaseq_madqc.url_end(experiment['lab']['@id'])
    # Optional on the library: stored as None when absent.
    rsem['starting'] = library.get('nucleic_acid_starting_quantity')
    rsem['starting_units'] = library.get('nucleic_acid_starting_quantity_units')
    rsem['biosample_id'] = save_rnaseq_madqc.url_end(biosample['@id'])
    rsem['organism'] = biosample['organism']['name']
    rsem['age'] = biosample['age']
    # Optional on the biosample: stored as None when absent.
    rsem['age_units'] = biosample.get('age_units')
    # Unlike the experiment's lab, the biosample's lab is passed to url_end
    # directly rather than through an '@id' lookup.
    rsem['biosample_lab'] = save_rnaseq_madqc.url_end(biosample['lab'])
    rsem['biological_replicate_number'] = replicate['biological_replicate_number']
    rsem['technical_replicate_number'] = replicate['technical_replicate_number']
    rsem['biosample_type'] = biosample['biosample_type']
    rsem['replication_type'] = experiment['replication_type']

    

In [16]:
# Build one dict per RSEM file for every experiment id listed (one per line)
# in kens-ids.txt, with an absolute download URL and library metadata added.
with open('kens-ids.txt') as instream:
    rsems = []
    for line in instream:
        experiment_id = line.strip()
        if not experiment_id:
            # A blank line would otherwise become a lookup of '' and raise
            # KeyError; skip it.
            continue
        # An id that is not in experiments_db still raises KeyError, so a
        # wrong id is noticed rather than silently dropped.
        experiment = experiments_db[experiment_id]
        for rsem in save_rnaseq_madqc.find_all_rsem(experiment['files']):
            # Convert the record to a dict so further keys can be added.
            rsem = rsem._asdict()
            # href is a site-relative path; make it a full URL.
            rsem['href'] = 'https://www.encodeproject.org' + rsem['href']
            add_library_info(experiments_db, rsem)
            rsems.append(rsem)

Looking for fields:
,experiment,lab,rfa,description,organism,biosample,biosample_lab,age,starting,library_id,spikeins_used,Pearson,Spearman,MAD,SD


In [17]:
# Number of RSEM file records collected across all listed experiments.
len(rsems)

492

In [18]:
# Flatten each record's list of spike-in ids into one comma-separated string
# so it fits in a single table cell.
for rsem in rsems:
    spikes = rsem['spikes_used']
    # Only join when the value is not already a string: re-running this cell
    # would otherwise join the individual characters of the previous result.
    if not isinstance(spikes, str):
        rsem['spikes_used'] = ','.join(spikes)

In [19]:
# Field names of a finished record. `rsem` is whatever the preceding loop
# left bound, i.e. the last element of `rsems`.
rsem.keys()

odict_keys(['date_created', 'file_id', 'library_id', 'experiment_id', 'assembly', 'genome_annotation', 'spikes_used', 'href', 'lab', 'starting', 'starting_units', 'biosample_id', 'organism', 'age', 'age_units', 'biosample_lab', 'biological_replicate_number', 'technical_replicate_number', 'biosample_type', 'replication_type'])

In [20]:
# One row per RSEM record. `columns=` takes the column set and order from the
# first record's keys; this raises IndexError if `rsems` is empty.
df = pandas.DataFrame(rsems, columns=rsems[0].keys())

In [21]:
# Export as tab-separated text. With to_csv's default index=True, the row
# index is written too, as a first column with an empty header.
# NOTE(review): the output path is a hard-coded location under /tmp.
df.to_csv('/tmp/rsem_urls.tsv', sep='\t')

In [22]:
# Show the assembled table for a visual check.
df

Unnamed: 0,date_created,file_id,library_id,experiment_id,assembly,genome_annotation,spikes_used,href,lab,starting,starting_units,biosample_id,organism,age,age_units,biosample_lab,biological_replicate_number,technical_replicate_number,biosample_type,replication_type
0,2015-06-25T01:16:17.139561+00:00,/files/ENCFF042WGS/,ENCLB555ALW,ENCSR000CUL,hg19,V19,ENCSR470JZL,https://www.encodeproject.org/files/ENCFF042WG...,thomas-gingeras,,,ENCBS472ENC,human,unknown,,thomas-gingeras,1,1,primary cell,anisogenic
1,2015-06-25T01:18:19.581724+00:00,/files/ENCFF708OYI/,ENCLB555ANB,ENCSR000CUL,hg19,V19,ENCSR470JZL,https://www.encodeproject.org/files/ENCFF708OY...,thomas-gingeras,,,ENCBS471ENC,human,unknown,,thomas-gingeras,2,1,primary cell,anisogenic
2,2015-12-18T01:46:21.022710+00:00,/files/ENCFF881JHI/,ENCLB426JGB,ENCSR379YAE,hg19,V19,ENCSR156CIL,https://www.encodeproject.org/files/ENCFF881JH...,thomas-gingeras,,,ENCBS806KLM,human,unknown,,john-stamatoyannopoulos,1,1,in vitro differentiated cells,isogenic
3,2015-12-18T01:47:52.152484+00:00,/files/ENCFF254RPS/,ENCLB387BDY,ENCSR379YAE,hg19,V19,ENCSR156CIL,https://www.encodeproject.org/files/ENCFF254RP...,thomas-gingeras,,,ENCBS343AKO,human,unknown,,john-stamatoyannopoulos,2,1,in vitro differentiated cells,isogenic
4,2015-05-28T02:02:45.092076+00:00,/files/ENCFF370KKZ/,ENCLB755HPY,ENCSR897KTO,hg19,V19,ENCSR156CIL,https://www.encodeproject.org/files/ENCFF370KK...,thomas-gingeras,,,ENCBS916BSF,human,unknown,,thomas-gingeras,2,1,primary cell,anisogenic
5,2015-05-28T02:01:06.991856+00:00,/files/ENCFF096DQV/,ENCLB088PVJ,ENCSR897KTO,hg19,V19,ENCSR156CIL,https://www.encodeproject.org/files/ENCFF096DQ...,thomas-gingeras,,,ENCBS542LAX,human,21,week,thomas-gingeras,1,1,primary cell,anisogenic
6,2015-01-06T23:51:07.759378+00:00,/files/ENCFF211LIT/,ENCLB595PNG,ENCSR290RRR,mm10,M4,ENCSR884LPM,https://www.encodeproject.org/files/ENCFF211LI...,barbara-wold,10.0,ng,ENCBS987ARB,mouse,14.5,day,barbara-wold,2,1,tissue,isogenic
7,2015-01-06T23:51:22.374497+00:00,/files/ENCFF501QKH/,ENCLB409LKR,ENCSR290RRR,mm10,M4,ENCSR884LPM,https://www.encodeproject.org/files/ENCFF501QK...,barbara-wold,10.0,ng,ENCBS715ONU,mouse,14.5,day,barbara-wold,1,1,tissue,isogenic
8,2015-10-02T18:53:30.846721+00:00,/files/ENCFF887QYY/,ENCLB373ANW,ENCSR752RGN,mm10,M4,ENCSR884LPM,https://www.encodeproject.org/files/ENCFF887QY...,barbara-wold,10,ng,ENCBS811RMD,mouse,15.5,day,len-pennacchio,1,1,tissue,isogenic
9,2015-10-02T18:55:27.775310+00:00,/files/ENCFF487ALN/,ENCLB567JQE,ENCSR752RGN,mm10,M4,ENCSR884LPM,https://www.encodeproject.org/files/ENCFF487AL...,barbara-wold,10,ng,ENCBS841GER,mouse,15.5,day,len-pennacchio,2,1,tissue,isogenic
