Investigate https://www.encodeproject.org/experiments/ENCSR000AJR/

In [8]:
import pandas
import os
import pprint
import django
import validate_encode3_aliases

In [2]:
from curation_common import *

In [3]:
# Initialise Django before any model module is touched.
# NOTE(review): presumably required by the later `import samples.models`;
# where the settings module is configured is not visible here — confirm.
django.setup()

In [57]:
def format_aliases(aliases):
    """Collapse a list of aliases into a single bare identifier.

    Returns '' when ``aliases`` is empty. When it holds exactly one entry,
    returns that entry with every 'barbara-wold:' occurrence removed.

    Raises:
        RuntimeError: if more than one alias is supplied.
    """
    count = len(aliases)
    # Anything beyond a single alias is ambiguous, so refuse it up front.
    if count > 1:
        raise RuntimeError("unexpected number of aliases {}".format(','.join(aliases)))
    if count == 1:
        only_alias = aliases[0]
        return only_alias.replace('barbara-wold:', '')
    return ''

In [37]:
import samples.models

In [4]:
# Client object for the ENCODE portal, used by the report cells below.
# ENCODED is not imported by name in this notebook, so it presumably arrives
# via `from curation_common import *` — TODO confirm.
server = ENCODED('www.encodeproject.org')
# NOTE(review): presumably loads the portal credentials from the user's
# netrc file — confirm against the ENCODED implementation.
server.load_netrc()

# Experiment with stub library

In [64]:
# For every FASTQ file attached to the experiments below, record which
# library/biosample it is linked to together with the first read of the file,
# so the read headers can be compared side by side.
experiment_ids = ['ENCSR000AJR', 'ENCSR000AJT']
found = []
for experiment_id in experiment_ids:
    experiment = server.get_json(experiment_id)
    for f in experiment['files']:
        if f['file_type'] == 'fastq':
            rep = f['replicate']
            # A replicate may have no library attached; report it as None.
            library = rep.get('library')
            library_id = library['@id'] if library else None
            biosample_id = library['biosample']['@id'] if library else None
            url = 'https://www.encodeproject.org' + f['href']
            # Only the first record is needed. Pull a single item from the
            # iterator instead of collecting every read of the file in a list
            # (assuming fastq_read yields lazily, this also avoids reading the
            # rest of the file — TODO confirm).
            read = next(iter(validate_encode3_aliases.fastq_read(url)), None)
            if read is None:
                raise RuntimeError("no reads found in {}".format(url))
            found.append({
                'experiment': experiment_id,
                'library': library_id,
                'biosample': biosample_id,
                'file': f['@id'],
                'header': read[0],
                'sequence': read[1],
            })

reads = pandas.DataFrame(
    found,
    columns=['experiment', 'library', 'biosample', 'file', 'header', 'sequence']
)

reads

ENCSR000AJR /files/ENCFF001IER/
ENCSR000AJR /files/ENCFF001IET/
ENCSR000AJR /files/ENCFF001IEY/
ENCSR000AJR /files/ENCFF001IES/
ENCSR000AJR /files/ENCFF001IEW/
ENCSR000AJR /files/ENCFF001IEZ/
ENCSR000AJR /files/ENCFF518AZL/
ENCSR000AJT /files/ENCFF001IEM/
ENCSR000AJT /files/ENCFF001IEO/
ENCSR000AJT /files/ENCFF001IEU/


Unnamed: 0,experiment,library,biosample,file,header,sequence
0,ENCSR000AJR,/libraries/ENCLB373ZZZ/,/biosamples/ENCBS127ENC/,/files/ENCFF001IEY/,b'@HWI-ST501_0039:1:1:1433:1980#TGACCA/1',b'NACAGAGTCTTCCTGTCTCTGCCTATGCCCAGAGCTGATCCTGT...
1,ENCSR000AJR,,,/files/ENCFF001IEZ/,b'@HWI-ST501_0039:1:1:1280:1999#GATCAG/1',b'NGAAAGGCCATAACTGAATTTTTTTTTTTTTTTTTGGTTTCCAA...
2,ENCSR000AJT,/libraries/ENCLB373ZZZ/,/biosamples/ENCBS127ENC/,/files/ENCFF001IEU/,b'@61MH7AAXX_HWI-EAS229_0009:5:1:995:20977/1',b'NTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'


# Biosample report

In [33]:
# Experiments whose replicates are summarised in the biosample report.
experiment_ids = [
    'ENCSR000EYP',
    'ENCSR000CWQ',
    'ENCSR000AIB',
    'ENCSR000CWN',
    'ENCSR000CWR',
    'ENCSR000EYT',
    'ENCSR000AJD',
    'ENCSR000AJR',
]

In [41]:
# Build one row per replicate that has a library attached, combining the
# portal metadata with the local Library record looked up by the alias.
records = []
for experiment_id in experiment_ids:
    experiment = server.get_json(experiment_id)
    for rep in experiment['replicates']:
        library = rep.get('library')
        if not library:
            # Replicates without a library contribute no row.
            continue

        aliases = format_aliases(library['aliases'])
        if not aliases:
            # No alias means there is no key to look up locally.
            made_by = None
            libname = None
        else:
            jumpgate = samples.models.Library.objects.get(pk=aliases)
            made_by = jumpgate.made_by
            libname = jumpgate.library_name

        biosample = library['biosample']
        row = {
            'experiment': experiment['@id'],
            'library': library['@id'],
            'aliases': aliases,
            'bio_rep': rep['biological_replicate_number'],
            'tech_rep': rep['technical_replicate_number'],
            'made_by': made_by,
            'biosample': biosample['@id'],
            'description': biosample['description'],
            'libname': libname,
        }
        records.append(row)

biosamples = pandas.DataFrame(
    records,
    columns=[
        'experiment', 'library', 'aliases', 'biosample',
        'bio_rep', 'tech_rep', 'made_by', 'description', 'libname',
    ],
)

biosamples

Unnamed: 0,experiment,library,aliases,biosample,bio_rep,tech_rep,made_by,description,libname
0,/experiments/ENCSR000EYP/,/libraries/ENCLB227WEO/,11286.0,/biosamples/ENCBS716AAA/,1,1,Brian,embryonic stem cells,Paired ends 174 hESC rep1 RNEasy run2 200
1,/experiments/ENCSR000EYP/,/libraries/ENCLB221REA/,11289.0,/biosamples/ENCBS051SJH/,3,1,Brian,embryonic stem cells,Paired ends 177 hESC rep2 RNEAsy run2 200
2,/experiments/ENCSR000EYP/,/libraries/ENCLB211VHB/,10874.0,/biosamples/ENCBS716AAA/,4,1,Brian Williams,embryonic stem cells,Paired ends 63 ES rep1 RNEasy 300
3,/experiments/ENCSR000EYP/,/libraries/ENCLB764TBI/,11288.0,/biosamples/ENCBS051SJH/,2,1,Brian,embryonic stem cells,Paired ends 176 hESC rep2 mirvana run2 200
4,/experiments/ENCSR000CWQ/,/libraries/ENCLB445LWQ/,11582.0,/biosamples/ENCBS254AYH/,2,1,Brian,"mammary gland, adenocarcinoma. (PMID: 4357757)...",Paired ends 248 MCF7 rep2
5,/experiments/ENCSR000CWQ/,/libraries/ENCLB794RXE/,11581.0,/biosamples/ENCBS155VQC/,1,1,Brian,"mammary gland, adenocarcinoma. (PMID: 4357757)...",Paired ends 247 MCF7 rep1
6,/experiments/ENCSR000CWQ/,/libraries/ENCLB849ZXB/,12098.0,/biosamples/ENCBS254AYH/,3,1,Brian,"mammary gland, adenocarcinoma. (PMID: 4357757)...",Paired ends 271 MCF7 rep2 mixed spikes
7,/experiments/ENCSR000AIB/,/libraries/ENCLB344ZZZ/,11653.0,/biosamples/ENCBS127ENC/,1,1,Lorian,Myoblast cell line derived from thigh muscle o...,C2C12 60h CEBPb 5/4/10 (adaptor 1) a
8,/experiments/ENCSR000AIB/,/libraries/ENCLB345ZZZ/,11847.0,/biosamples/ENCBS127ENC/,2,1,Lorian,Myoblast cell line derived from thigh muscle o...,C2C12 60h C/EBPb 6/22/10 (adaptor 6) a
9,/experiments/ENCSR000CWN/,/libraries/ENCLB866FVU/,11584.0,/biosamples/ENCBS326AAA/,1,1,Brian,skeletal muscle myoblasts,Paired ends 250 HSMM 17


In [None]:
# DataFrame.sort() no longer exists in pandas; sort_values() is its
# replacement and likewise returns a new, sorted frame for display.
biosamples.sort_values(['experiment', 'bio_rep', 'tech_rep'])

In [39]:
# Displays the `aliases` variable that the biosample report loop assigns.
# NOTE(review): looks like a debugging leftover; its execution count predates
# the report cell's, so the shown value may come from an earlier run —
# consider removing this cell.
aliases

''