One of our todos at http://redmine.encodedcc.org/issues/2572#Diane-Brian-updates was to validate how the fastqs in ENCSR000AIZ were generated. The metadata said they were paired, but didn't include which fastq was attached to which end.

This code downloads the first read of each fastq and parses its read id to see which flowcell / lane / end combination the sequencer recorded for that file.

In [1]:
import pandas
from curation_common import *
from pprint import pprint

In [2]:
# Handle for the ENCODE portal; the later cells fetch experiment JSON-LD through it.
server = ENCODED('www.encodeproject.org')
# NOTE(review): presumably loads the portal credentials from the user's netrc
# file -- confirm against the ENCODED class.
server.load_netrc()

In [3]:
# Collect the JSON-LD of every experiment under review into a single model
# so that they can all be queried together.
model = get_model(use_contexts=False)
experiments = [
    'ENCSR000AEG',
    'ENCSR000AEH',
    'ENCSR000AEQ',
    'ENCSR000AEP',
    'ENCSR000AIZ',  # already done
]
for accession in experiments:
    experiment_jsonld = server.get_jsonld('/experiments/{}'.format(accession))
    load_jsonld_into_model(model, experiment_jsonld)

In [4]:
%%sparql -m model -c -o files 
# Each row describes one fastq file: the experiment and library it belongs to,
# the library alias, the biological / technical replicate numbers, and the
# file's href.
select ?experiment ?library ?alias ?bio ?tech ?file ?href
where {
    ?experiment a experiment:experiment ;
         experiment:replicates ?rep ;
         experiment:files ?file .
    # Reusing ?rep ties each file to a replicate of the same experiment.
    ?file file:replicate ?rep ;
          file:href ?href ;
          file:file_type ?file_type .
    ?rep replicate:library ?library ;
         replicate:biological_replicate_number ?bio ;
         replicate:technical_replicate_number ?tech .
    ?library library:aliases ?alias .
    # The pattern is unanchored, so any file_type containing "fastq" matches.
    filter(regex(?file_type, "fastq"))
}

Found 28 rows.


In [5]:
# Display the query results held in `files`.
files

0,1,2,3,4,5,6
experiment,library,alias,bio,tech,file,href
experiments:ENCSR000AEG/,libraries:ENCLB044ZZZ/,barbara-wold:13712,2,1,files:ENCFF001RVW/,files:ENCFF001RVW/@@download/ENCFF001RVW.fastq.gz
experiments:ENCSR000AEG/,libraries:ENCLB044ZZZ/,barbara-wold:13712,2,1,files:ENCFF001RVS/,files:ENCFF001RVS/@@download/ENCFF001RVS.fastq.gz
experiments:ENCSR000AEG/,libraries:ENCLB043ZZZ/,barbara-wold:13711,1,1,files:ENCFF001RVY/,files:ENCFF001RVY/@@download/ENCFF001RVY.fastq.gz
experiments:ENCSR000AEG/,libraries:ENCLB043ZZZ/,barbara-wold:13711,1,1,files:ENCFF001RVR/,files:ENCFF001RVR/@@download/ENCFF001RVR.fastq.gz
experiments:ENCSR000AEH/,libraries:ENCLB046ZZZ/,barbara-wold:13714,2,1,files:ENCFF001RVX/,files:ENCFF001RVX/@@download/ENCFF001RVX.fastq.gz
experiments:ENCSR000AEH/,libraries:ENCLB046ZZZ/,barbara-wold:13714,2,1,files:ENCFF001RWB/,files:ENCFF001RWB/@@download/ENCFF001RWB.fastq.gz
experiments:ENCSR000AEH/,libraries:ENCLB045ZZZ/,barbara-wold:13713,1,1,files:ENCFF001RVZ/,files:ENCFF001RVZ/@@download/ENCFF001RVZ.fastq.gz
experiments:ENCSR000AEH/,libraries:ENCLB045ZZZ/,barbara-wold:13713,1,1,files:ENCFF001RVT/,files:ENCFF001RVT/@@download/ENCFF001RVT.fastq.gz
experiments:ENCSR000AEQ/,libraries:ENCLB063ZZZ/,barbara-wold:13717,1,1,files:ENCFF001RWF/,files:ENCFF001RWF/@@download/ENCFF001RWF.fastq.gz


In [6]:
# Importing this module logs a "Pysam not available" error when pysam is
# missing; that message only concerns bam reading, and this notebook only
# uses the module's fastq read-id helpers.
import validate_encode3_aliases
# Checker instance bound to the portal connection; used below for parse_read_id().
FlowcellLookup = validate_encode3_aliases.CheckDCCWoldAlias(server)

ERROR:validate_encode3_aliases:Pysam not available, bam reading wont work


In [7]:
def _accession(node):
    """Return the 11 characters that precede the final character of ``str(node)``.

    The experiment, library and file nodes render as ``.../<accession>/``,
    so this strips the trailing slash and keeps the 11-character accession.
    """
    return str(node)[-12:-1]


# One record per fastq: the identifiers from the query plus the
# flowcell / lane / end parsed out of the first read id of the file.
fastq_records = []
for row in files:
    raw_id = validate_encode3_aliases.fastq_read_id(str(row['href']))
    read_id = FlowcellLookup.parse_read_id(raw_id)
    fastq_records.append({
        'experiment': _accession(row['experiment']),
        'library': _accession(row['library']),
        'file': _accession(row['file']),
        # Drop the lab prefix so only the numeric library id remains.
        'lib_id': str(row['alias'])[len('barbara-wold:'):],
        # The read id and its parsed fields are bytes; decode them for display.
        'raw_id': raw_id.decode('ascii'),
        'flowcell': read_id['fc'].decode('ascii'),
        'lane': read_id['lane'].decode('ascii'),
        'end': read_id['end'].decode('ascii'),
    })

# `flowcells` is only ever bound to the DataFrame; the intermediate list
# keeps its own name so the variable never changes type between cells.
flowcells = pandas.DataFrame(
    fastq_records,
    columns=['experiment', 'library', 'lib_id', 'flowcell', 'lane', 'end', 'file', 'raw_id'])



In [8]:
# Display the per-fastq flowcell / lane / end table.
flowcells

Unnamed: 0,experiment,library,lib_id,flowcell,lane,end,file,raw_id
0,ENCSR000AEG,ENCLB044ZZZ,13712,C2812ACXX,2,2,ENCFF001RVW,@HWI-ST354R:434:C2812ACXX:2:1101:1494:2172 2:N...
1,ENCSR000AEG,ENCLB044ZZZ,13712,C2812ACXX,2,1,ENCFF001RVS,@HWI-ST354R:434:C2812ACXX:2:1101:1494:2172 1:N...
2,ENCSR000AEG,ENCLB043ZZZ,13711,C2812ACXX,2,1,ENCFF001RVY,@HWI-ST354R:434:C2812ACXX:2:1101:1282:2086 1:N...
3,ENCSR000AEG,ENCLB043ZZZ,13711,C2812ACXX,2,2,ENCFF001RVR,@HWI-ST354R:434:C2812ACXX:2:1101:1282:2086 2:N...
4,ENCSR000AEH,ENCLB046ZZZ,13714,C2812ACXX,2,1,ENCFF001RVX,@HWI-ST354R:434:C2812ACXX:2:1101:1480:2139 1:N...
5,ENCSR000AEH,ENCLB046ZZZ,13714,C2812ACXX,2,2,ENCFF001RWB,@HWI-ST354R:434:C2812ACXX:2:1101:1480:2139 2:N...
6,ENCSR000AEH,ENCLB045ZZZ,13713,C2812ACXX,2,2,ENCFF001RVZ,@HWI-ST354R:434:C2812ACXX:2:1101:1617:2065 2:N...
7,ENCSR000AEH,ENCLB045ZZZ,13713,C2812ACXX,2,1,ENCFF001RVT,@HWI-ST354R:434:C2812ACXX:2:1101:1617:2065 1:N...
8,ENCSR000AEQ,ENCLB063ZZZ,13717,C2812ACXX,2,1,ENCFF001RWF,@HWI-ST354R:434:C2812ACXX:2:1101:1473:2097 1:N...
9,ENCSR000AEQ,ENCLB063ZZZ,13717,C2812ACXX,2,2,ENCFF001RWC,@HWI-ST354R:434:C2812ACXX:2:1101:1473:2097 2:N...


In [9]:
# `os` is imported here explicitly rather than relying on it leaking in
# through `from curation_common import *`.
import os

import django

# Use the htsworkflow settings unless the environment already names a
# settings module, then initialise Django so its models can be imported.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'htsworkflow.settings.myrada')
django.setup()

In [10]:
from experiments import models

In [11]:
# Look up every flowcell id found in the read ids in the FlowCell table and
# report whether it is recorded as a paired-end run. Iterating in sorted
# order keeps the printed report identical from one kernel to the next
# (plain set iteration order is not stable for strings).
for flowcell_id in sorted(set(flowcells['flowcell'])):
    try:
        fc_model = models.FlowCell.objects.get(flowcell_id=flowcell_id)
    except models.FlowCell.DoesNotExist:
        # No matching FlowCell row for this flowcell id.
        print(flowcell_id, "is not found")
    else:
        print(flowcell_id, 'is_paired_end', fc_model.paired_end)

D0549ACXX is_paired_end True
HA00MADXX is_paired_end False
H9L41ADXX is_paired_end False
C2812ACXX is not found


# Discussion

I thought I'd double-check what our database says about the various flowcells discovered from the read ids. HA00MADXX and H9L41ADXX were both single-ended runs, and D0549ACXX was run paired-ended. C2812ACXX was not sequenced by us, but it does appear to be paired-ended.