Whenever I upload stuff to the DCC, Cricket always comes back with things that are wrong.

This notebook is a collection of cleanup scripts for data uploaded by y3q3.

In [1]:
import pandas
import numpy
import RDF
import urllib.parse

from curation_common import *

In [2]:
# Client object for the ENCODE portal; every later cell talks to the DCC through it.
server = ENCODED('www.encodeproject.org')
# NOTE(review): presumably reads the submitter credentials from the user's
# netrc file -- confirm against the ENCODED class.
server.load_netrc()

In [3]:
# Search the ENCODE portal for experiments of the single-cell RNA-seq assay type.
# The assay name is kept human-readable here and percent-encoded when the URL
# is assembled (spaces become %20).
assay_term_name = 'single cell isolation followed by RNA-seq'
search_url = 'https://www.encodeproject.org/search/?type=experiment&assay_term_name={}'.format(
    urllib.parse.quote(assay_term_name))
single_cell_search = server.get_json(search_url)

In [4]:
# Peek at which fields the search returns for each hit (first result only).
single_cell_search['@graph'][0].keys()

dict_keys(['files', 'lab', '@type', 'biosample_term_name', 'description', 'assay_term_name', 'accession', 'status', 'award', 'replicates', '@id', 'audit'])

In [5]:
# RDF model that the following cells load JSON-LD into and query with %%sparql.
# NOTE(review): the effect of use_contexts=False is defined in get_model,
# which is not visible in this notebook -- confirm there.
model = get_model(use_contexts=False)

In [6]:
# Fetch the JSON-LD record of every experiment found by the search and add it
# to the RDF model, logging each experiment id as it is requested.
for row in single_cell_search['@graph']:
    experiment_id = row['@id']
    print('Loading:', experiment_id)
    experiment_jsonld = server.get_jsonld(experiment_id)
    load_jsonld_into_model(model, experiment_jsonld)

Loading: /experiments/ENCSR734SOQ/
Loading: /experiments/ENCSR909YEQ/
Loading: /experiments/ENCSR298YHA/
Loading: /experiments/ENCSR220HFQ/
Loading: /experiments/ENCSR650MMK/
Loading: /experiments/ENCSR953TKB/
Loading: /experiments/ENCSR295UJP/
Loading: /experiments/ENCSR323FHJ/
Loading: /experiments/ENCSR195BCE/
Loading: /experiments/ENCSR782WIL/
Loading: /experiments/ENCSR138VRG/
Loading: /experiments/ENCSR390KQJ/
Loading: /experiments/ENCSR427FOY/
Loading: /experiments/ENCSR846FYI/
Loading: /experiments/ENCSR927IOO/
Loading: /experiments/ENCSR739YUO/
Loading: /experiments/ENCSR100KRK/
Loading: /experiments/ENCSR198EIH/
Loading: /experiments/ENCSR746LUP/
Loading: /experiments/ENCSR637NLF/
Loading: /experiments/ENCSR320PXZ/
Loading: /experiments/ENCSR027CXR/
Loading: /experiments/ENCSR662UPQ/
Loading: /experiments/ENCSR521YLY/
Loading: /experiments/ENCSR559QTE/
Loading: /experiments/ENCSR527DQV/
Loading: /experiments/ENCSR167AWS/
Loading: /experiments/ENCSR708EHJ/
Loading: /experiment

In [7]:
%%sparql -m model 
# List every predicate/object pair stored for one example experiment
# (ENCSR000AIY) to see which properties are available to query.
select ?p ?o
where {
    <https://www.encodeproject.org/experiments/ENCSR000AIY/> ?p ?o .
}

0,1
p,o
rdf:type,experiment:experiment
rdf:type,experiment:dataset
rdf:type,experiment:item
rdf:description,RNA-seq of a single GM12878 cell
experiment:accession,ENCSR000AIY
experiment:aliases,barbara-wold:single-cell-13285-exp
experiment:assay_term_id,NTR:0003082
experiment:assay_term_name,single cell isolation followed by RNA-seq
experiment:audit,_:b0


In [8]:
%%sparql -m model
# List every predicate/object pair stored for one example replicate, to see
# how a replicate links to its experiment and library.
select ?p ?o
where {
    <https://www.encodeproject.org/replicates/9c95d61a-5dc8-4ea1-834c-f5777f50ef3b/> ?p ?o .
}

0,1
p,o
rdf:type,replicate:replicate
rdf:type,replicate:item
replicate:biological_replicate_number,1
replicate:date_created,2014-04-17T22:59:18.076704+00:00
replicate:experiment,experiments:ENCSR000AIY/
replicate:library,libraries:ENCLB391GZT/
replicate:notes,{}
replicate:schema_version,5
replicate:status,released


In [9]:
%%sparql -m model
# List every predicate/object pair stored for one example library
# (ENCLB391GZT).
select ?p ?o
where {
    <https://www.encodeproject.org/libraries/ENCLB391GZT/> ?p ?o .
}

0,1
p,o
rdf:type,library:library
rdf:type,library:item
library:accession,ENCLB391GZT
library:aliases,barbara-wold:13285
library:award,https://www.encodeproject.org/awards/U54HG006998/
library:biosample,https://www.encodeproject.org/biosamples/ENCBS638AZC/
library:date_created,2014-04-17T22:46:57.942156+00:00
library:extraction_method,
library:fragmentation_method,chemical (Nextera tagmentation)


In [10]:
%%sparql -m model
# List every predicate/object pair stored for one example file (ENCFF002BHS).
select ?p ?o
where {
    <https://www.encodeproject.org/files/ENCFF002BHS/> ?p ?o .
}

0,1
p,o
rdf:type,file:file
rdf:type,file:item
file:accession,ENCFF002BHS
file:award,https://www.encodeproject.org/awards/U54HG006998/
file:content_md5sum,7aa07fc9d479d17cb211efc8206f23ff
file:dataset,experiments:ENCSR000AIY/
file:date_created,2014-05-13
file:file_format,fastq
file:file_size,3637739902


In [11]:
%%sparql -m model
# Follow file:flowcell_details from one file to the flowcell id(s) recorded
# on its details node(s).
select ?flowcell
where {
    <https://www.encodeproject.org/files/ENCFF002BHS/> file:flowcell_details ?details .
    ?details file:flowcell ?flowcell .
}

0
flowcell
HBE4EADXX
H00EWBCXX
D1L1DACXX
D1EWHACXX
D1THKACXX


In [14]:
%%sparql -m model -c -o pool_splits
# Find experiments whose description matches "pool.*split" and, for each
# library reached through their replicates, return the starting nucleic acid
# quantity, its units and the library aliases.
# The result is consumed as `pool_splits` by the next code cell.
# NOTE(review): -c presumably produces the "Found N rows." summary instead of
# rendering the table -- confirm against the %%sparql magic.
select ?exp ?description ?library ?starting ?units ?aliases
where {
    ?exp a experiment:experiment ;
         rdf:description ?description ;
         experiment:replicates ?replicate .
    ?replicate replicate:library ?library .
    ?library library:nucleic_acid_starting_quantity ?starting ;
             library:nucleic_acid_starting_quantity_units ?units ;
             library:aliases ?aliases .
    filter(regex(?description, "pool.*split"))
}

Found 15 rows.


In [13]:
# Report pool/split libraries whose starting quantity is recorded as exactly
# one "cells" unit; these are the candidates for the (currently disabled)
# unit correction below.
for row in pool_splits:
    library = urllib.parse.urlparse(str(row['library'])).path
    description, starting, units = (
        str(row[field]) for field in ('description', 'starting', 'units'))
    # NOTE(review): this is a string comparison, so a quantity serialised as
    # "1" rather than "1.0" would not match -- confirm the literal form.
    is_single_cell = (starting == '1.0') and (units == 'cells')
    if not is_single_cell:
        continue
    print(description, library, starting, units)
    #server.patch_json(library, {'nucleic_acid_starting_quantity_units': 'cell-equivalent'})

In [14]:
%%sparql -m model -c -o library_read_length
# For every file attached to a replicate, return its library, the library
# aliases and -- when present -- the file's read_length. The OPTIONAL clause
# keeps files that have no read_length, leaving that column unbound.
# The result is consumed as `library_read_length` by later cells.
select ?library ?aliases ?file ?read_length
where {
    ?file a file:file ;
          file:replicate ?replicate .
    ?replicate replicate:library ?library .
    ?library library:aliases ?aliases ;
             a library:library .
    OPTIONAL { ?file file:read_length ?read_length . }
}

Found 224 rows.


In [15]:
# Display the query result; rows with a blank read_length column are files
# for which the model holds no read_length value.
library_read_length

0,1,2,3
library,aliases,file,read_length
libraries:ENCLB282TJI/,barbara-wold:15357,files:ENCFF076SVO/,
libraries:ENCLB861EZL/,barbara-wold:15359,files:ENCFF091CQF/,
libraries:ENCLB679XGZ/,barbara-wold:15356,files:ENCFF145MTO/,
libraries:ENCLB774TWW/,barbara-wold:15304,files:ENCFF178ZIW/,
libraries:ENCLB977FBH/,barbara-wold:15308,files:ENCFF185FYP/,
libraries:ENCLB774TWW/,barbara-wold:15304,files:ENCFF191RVE/,
libraries:ENCLB301VGJ/,barbara-wold:15362,files:ENCFF200EPQ/,
libraries:ENCLB231DLC/,barbara-wold:15307,files:ENCFF276ABU/,
libraries:ENCLB945FTP/,barbara-wold:15309,files:ENCFF282ATV/,


In [16]:
# Configure Django so the htsworkflow ORM models can be imported from this
# notebook. `os` is imported explicitly here instead of relying on it leaking
# in through `from curation_common import *`.
import os

import django

# Respect a settings module already chosen in the environment; otherwise fall
# back to the local htsworkflow settings.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'htsworkflow.settings.myrada')
django.setup()

In [17]:
from experiments.models import FlowCell

In [22]:
# Sanity check: look up one known flowcell id through the Django ORM.
fc = FlowCell.objects.get(flowcell_id='H00EWBCXX')

In [23]:
# Read length stored on that flowcell record; this is the value used to
# back-fill files that lack read_length.
fc.read_length

100

In [27]:
# Back-fill the `read_length` property on submitted files that lack it, using
# the read length recorded for the file's flowcell in the local FlowCell table.
#
# For each file the current record is re-fetched from the server (rather than
# trusting the RDF model) so a file that was already fixed is never patched
# again. The loop stops at the first failed PATCH so a systematic problem
# (permissions, schema rejection) is not repeated for every remaining file.
seen_files = set()
for row in library_read_length:
    # The last five characters of the alias are used as the library id label.
    library_id = str(row['aliases'])[-5:]
    file = urllib.parse.urlparse(str(row['file'])).path

    # The query returns one row per file/alias combination, so a library with
    # several aliases would repeat the same file; fetch each file only once.
    if file in seen_files:
        continue
    seen_files.add(file)

    current_file = server.get_json(file)
    if 'read_length' in current_file:
        continue

    # Only files that actually need patching require flowcell details; skip
    # (instead of crashing) when the record has none.
    flowcell_details = current_file.get('flowcell_details') or []
    if not flowcell_details:
        print(library_id, file, 'has no flowcell_details; skipping')
        continue
    flowcell = flowcell_details[0]['flowcell']
    lane = flowcell_details[0]['lane']

    try:
        fc = FlowCell.objects.get(flowcell_id=flowcell)
    except FlowCell.DoesNotExist:
        print(library_id, flowcell, lane, 'flowcell not found locally; skipping')
        continue

    print(library_id, flowcell, lane, fc.read_length)
    try:
        print(server.patch_json(file, {'read_length': fc.read_length}))
    except Exception as e:
        # Report the server's complaint and stop rather than hammering the
        # DCC with requests that are likely to fail the same way.
        print(e)
        break


15357 H00EWBCXX 1 100
{'status': 'success', '@type': ['result'], '@graph': [{'md5sum': '305fd3c96e3233771d699b0e8a964e9e', 'submitted_file_name': '15357-LC_814_C57Bl6_layer_V_pyramidal_neuron_pool-split/15357_H00EWBCXX_c116_l1.fastq.gz', 'aliases': [], 'submitted_by': '/users/bc5b62f7-ce28-4a1e-b6b3-81c9c5a86d7a/', 'replicate': '/replicates/a8e9cf16-6d13-4875-9b4c-a4247b51c493/', 'file_size': 353989625, 'lab': '/labs/barbara-wold/', 'qc_metrics': [], '@id': '/files/ENCFF076SVO/', 'award': '/awards/U54HG006998/', 'alternate_accessions': [], 'href': '/files/ENCFF076SVO/@@download/ENCFF076SVO.fastq.gz', 'title': 'ENCFF076SVO', 'output_type': 'reads', 'platform': '/platforms/OBI%3A0002002/', 'schema_version': '5', 'flowcell_details': [{'machine': 'http://jumpgate.caltech.edu/sequencer/8', 'lane': '1', 'flowcell': 'H00EWBCXX', 'barcode': 'N710-N501:CGAGGCTG-TAGATCGC'}], '@type': ['file', 'item'], 'uuid': 'e0b27f88-70c7-4906-b465-6807a31c04af', 'content_md5sum': '007ee7c822f7e3d91bf2d2f0985d

In [20]:
# Inspect the fields of `current_file`, i.e. whichever file record the
# read-length loop fetched last.
current_file.keys()

dict_keys(['md5sum', '@id', 'audit', 'aliases', 'submitted_by', 'replicate', 'read_length_units', 'lab', 'qc_metrics', 'submitted_file_name', 'award', 'alternate_accessions', 'href', 'title', 'output_type', 'flowcell_details', 'file_format', 'platform', '@type', 'uuid', 'content_md5sum', 'file_size', 'dataset', 'file_type', 'read_length', 'run_type', 'status', 'accession', 'schema_version', 'dbxrefs', 'date_created', 'output_category'])