Prepare submissions for year 3 quarter 3.

Our first block of submissions is work that was started in Y3Q1 but was put on hold pending validation.

In [1]:
import contextlib
import importlib
import posixpath

import gcat
import pandas
import paramiko

In [2]:
from curation_common import *

In [3]:
# Client object for www.encodeproject.org, used by every submission cell below.
# NOTE(review): ENCODED is not imported by name; presumably it arrives through
# the curation_common star import — confirm.
# NOTE(review): load_netrc() presumably reads credentials from the user's
# netrc file — confirm against the ENCODED class.
encode = ENCODED('www.encodeproject.org')
encode.load_netrc()

In [4]:
# Relative directory of this quarter's submission; the make_manifest calls
# further down pass the same path as a string literal.
submission_root = 'proj/submission/encode-y3q3'
# NOTE(review): no visible cell reads map_name — the make_manifest calls pass
# 'mice.txt' and 'human-liver.txt' directly. Confirm whether this is stale.
map_name = 'castaneus.txt'

# create biosamples

In [42]:
# Fetch the "Brian's Mice" spreadsheet through gcat and parse the sheet
# named 'Sheet 1'; the result is posted to /biosamples/ in the next cell.
# NOTE(review): spreadsheet_name is rebound to a different spreadsheet in a
# later cell, so re-running this cell changes what the later cells fetch.
spreadsheet_name = "Brian's Mice"
mice_book = gcat.get_file(spreadsheet_name, fmt='pandas_excel')
mice = mice_book.parse('Sheet 1')

In [34]:
# Hand the mice sheet to post_sheet for the /biosamples/ collection.
# NOTE(review): dry_run=True presumably keeps post_sheet from creating
# anything on the server — confirm before relying on `created`.
created = encode.post_sheet('/biosamples/', mice, verbose=True, dry_run=True)

In [33]:
# Save the uuid, accession and first alias of every created biosample to an
# Excel file; nothing is written when `created` is empty.
if created:
    update_columns = ['uuid', 'biosample_accession', 'aliases']
    updates = [
        {
            'uuid': biosample['uuid'],
            'biosample_accession': biosample['accession'],
            'aliases': biosample['aliases'][0],
        }
        for biosample in created
    ]
    updatesdf = pandas.DataFrame(updates, columns=update_columns)
    updatesdf.to_excel('/tmp/update.xlsx')

# point at y3q3 spreadsheet

In [46]:
# Switch spreadsheet_name to the Y3Q3 submission spreadsheet and fetch it;
# the cells below parse their sheets from this workbook (or re-fetch it by
# the same name).
spreadsheet_name = "ENCODE Submission for Y3Q3"
y3q3_book = gcat.get_file(spreadsheet_name, fmt='pandas_excel')

# create experiments

In [41]:
# Parse the 'Experiments' sheet and hand it to post_sheet for /experiments/.
# NOTE(review): dry_run=True presumably means nothing is created on the
# server — confirm against ENCODED.post_sheet.
experiments = y3q3_book.parse('Experiments')
created = encode.post_sheet('/experiments/', experiments, dry_run=True, verbose=True)

In [40]:
# Save the accession and description of every created experiment.
# The export is skipped when nothing was created (as the biosample, library
# and replicate cells do) so that an empty result, e.g. from a dry run, does
# not overwrite an earlier /tmp/experiments.xlsx.
if created:
    updatesdf = pandas.DataFrame(created, columns=['accession','description'])
    updatesdf.to_excel('/tmp/experiments.xlsx')

# create libraries

In [91]:
# Fetch the workbook again, parse the 'Libraries' sheet and hand it to
# post_sheet for /libraries/.
# NOTE(review): the re-fetch presumably picks up spreadsheet edits made since
# the previous cell ran — confirm it is intentional.
y3q3_book = gcat.get_file(spreadsheet_name, fmt='pandas_excel')
libraries = y3q3_book.parse('Libraries')
created = encode.post_sheet('/libraries/', libraries, dry_run=True, verbose=True)

In [90]:
# Save the uuid, accession and first alias of every created library to an
# Excel file; nothing is written when `created` is empty.
if created:
    update_columns = ['uuid', 'library_accession', 'aliases']
    updates = [
        {
            'uuid': library['uuid'],
            'library_accession': library['accession'],
            'aliases': library['aliases'][0],
        }
        for library in created
    ]
    updatesdf = pandas.DataFrame(updates, columns=update_columns)
    updatesdf.to_excel('/tmp/libraries.xlsx')

# create replicates

In [99]:
# Fetch the workbook again, parse the 'Replicates' sheet and hand it to
# post_sheet for /replicates/.
# NOTE(review): dry_run=True presumably means nothing is created on the
# server — confirm against ENCODED.post_sheet.
y3q3_book = gcat.get_file(spreadsheet_name, fmt='pandas_excel')
replicates = y3q3_book.parse('Replicates')
created = encode.post_sheet('/replicates/', replicates, dry_run=True, verbose=True)

In [98]:
# Save the uuid, experiment and library of every created replicate to an
# Excel file; nothing is written when `created` is empty.
if created:
    updatesdf = pandas.DataFrame(created, columns=['uuid', 'experiment', 'library'])
    updatesdf.to_excel('/tmp/replicates.xlsx')

# Create spike-in dataset

In [79]:
# Lab and award identifiers attached to the ERCC spike-in dataset built below.
lab = 'barbara-wold'
award = 'U54HG006998'

In [68]:
import imp
import htsworkflow.submission.encoded

In [69]:
# Reload the module so the Document import in the next cell sees the current
# source file. importlib.reload replaces imp.reload: the imp module is
# deprecated and was removed in Python 3.12.
# NOTE(review): the `import imp` in the previous cell is now unused.
importlib.reload(htsworkflow.submission.encoded)

<module 'htsworkflow.submission.encoded' from '/home/diane/proj/htsworkflow/htsworkflow/submission/encoded.py'>

In [70]:
from htsworkflow.submission.encoded import Document

In [71]:
# Wrap the ERCC concentration PDF in a Document object for submission.
# NOTE(review): the three positional arguments look like (file path,
# document type, description) — confirm against
# htsworkflow.submission.encoded.Document.
# NOTE(review): the absolute path under /home/diane ties this cell to one
# machine.
profile_6_concentration = Document(
  "/home/diane/proj/encode3-curation/ERCC 1000X sequences for Diane June25_2015.pdf",
  "spike-in concentrations",
  "ERCC-only concentrations that are 1000x single cell spike-ins"
)

In [72]:
# Despite its name, concentration_uuid holds a '/documents/<uuid>/' path
# rather than a bare uuid.
# NOTE(review): create_if_needed presumably returns the existing record at
# that path and only uploads the document when it is missing — confirm.
concentration_uuid = '/documents/394ce5f5-cf93-4cc6-b0fc-92ec1fb6b45a/'
concentration_document = profile_6_concentration.create_if_needed(encode, concentration_uuid)

In [82]:
# Take the '@id' of the first '@graph' entry of the returned record; the bare
# expression on the last line displays it as the cell output.
document_id = concentration_document['@graph'][0]['@id']
document_id

'/documents/394ce5f5-cf93-4cc6-b0fc-92ec1fb6b45a/'

In [84]:
# Spike-in dataset record; it references file ENCFF001RTP and the
# concentration document fetched above.
ERCC = {
    # Fixed: the description used to end with a stray double quote. The record
    # already on the server (see the saved output below) still carries it.
    'description': 'ERCC-only spike-ins that are 1000x single cell spike-ins',
    'dataset_type': 'spike-ins',
    'related_files': ['/files/ENCFF001RTP'],
    'award': award,
    'lab': lab,
    'documents': [document_id],
}
encode.validate(ERCC, '/datasets/')
# Path of the dataset that was already posted. Because it is set, the post
# below is skipped; set it to None to post the dataset again.
ERCC_accession = '/datasets/ENCSR884LPM/'
if ERCC_accession is None:
    print(encode.post_json('/datasets/', ERCC))

{'@type': ['result'], 'status': 'success', '@graph': [{'assembly': [], 'submitted_by': '/users/bc5b62f7-ce28-4a1e-b6b3-81c9c5a86d7a/', 'dbxrefs': [], '@type': ['dataset', 'item'], 'lab': '/labs/barbara-wold/', 'revoked_files': [], 'aliases': [], 'related_files': ['/files/ENCFF001RTP/'], 'status': 'proposed', 'dataset_type': 'spike-ins', 'files': ['/files/ENCFF001RTP/'], 'documents': ['/documents/394ce5f5-cf93-4cc6-b0fc-92ec1fb6b45a/'], 'date_created': '2015-06-26T00:08:45.243031+00:00', 'alternate_accessions': [], 'contributing_files': [], 'accession': 'ENCSR884LPM', 'schema_version': '6', 'references': [], '@id': '/datasets/ENCSR884LPM/', 'award': '/awards/U54HG006998/', 'original_files': [], 'uuid': '19bf00b0-f0d2-48b3-a4c7-a7d2108cbc0e', 'description': 'ERCC-only spike-ins that are 1000x single cell spike-ins"'}]}


In [106]:
def make_manifest(spreadsheet_name, submission_root, map_name):
    """Print Turtle-like manifest records for the fastq files of a submission.

    Reads the 'Libraries' and 'Replicates' sheets of the spreadsheet, then
    connects to pongo.cacr.caltech.edu over SFTP, changes into
    ``submission_root`` and reads ``map_name``. Every non-blank, non-comment
    line of that file must hold a library id and a directory name separated
    by whitespace. For each file in that directory whose name ends with
    ``fastq.gz`` two records are printed: one keyed by the directory
    (dataset, replicate, lab, award) and one keyed by the file (output type,
    read length, file format).

    Parameters
    ----------
    spreadsheet_name : str
        Spreadsheet name passed to ``gcat.get_file``.
    submission_root : str
        Remote directory containing ``map_name`` and the library directories.
    map_name : str
        Name of the map file inside ``submission_root``.

    Returns
    -------
    None
        The records and a final record count are written to stdout.

    Raises
    ------
    IndexError
        If a library that has fastq files has no matching row in the
        'Replicates' or 'Libraries' sheet.
    ValueError
        If a map line does not split into exactly two fields.
    """
    y3q3_book = gcat.get_file(spreadsheet_name, fmt='pandas_excel')
    libraries = y3q3_book.parse('Libraries')
    replicates = y3q3_book.parse('Replicates')

    def first_value(rows, column, alias):
        """Return the first ``column`` value of ``rows``; IndexError if empty."""
        values = list(rows[column])
        if not values:
            raise IndexError(
                'no {} value found for {}'.format(column, alias))
        return values[0]

    pongo = paramiko.SSHClient()
    # NOTE(review): WarningPolicy accepts unknown host keys with only a warning.
    pongo.set_missing_host_key_policy(paramiko.WarningPolicy())
    files = 0
    # closing() guarantees the SSH connection, the SFTP session and the remote
    # map file are released even when a lookup or listing fails part-way.
    with contextlib.closing(pongo):
        pongo.connect('pongo.cacr.caltech.edu', username='diane')
        with contextlib.closing(pongo.open_sftp()) as sftp:
            sftp.chdir(submission_root)
            with contextlib.closing(sftp.open(map_name)) as map_file:
                for line in map_file:
                    # skip blank lines and comments
                    line = line.strip()
                    if len(line) == 0 or line.startswith('#'):
                        continue

                    library_id, dirname = line.split()
                    alias = 'barbara-wold:{}'.format(library_id)
                    library_row = libraries[libraries['aliases:array'] == alias]
                    replicate_row = replicates[replicates['library'] == alias]

                    fastq_names = [name for name in sftp.listdir(dirname)
                                   if name.endswith('fastq.gz')]
                    if not fastq_names:
                        # nothing to report, so the sheet rows are not needed
                        continue

                    # These values depend only on the library, so look them
                    # up once per directory instead of once per file.
                    dataset = first_value(replicate_row, 'experiment', alias)
                    replicate = first_value(replicate_row, 'uuid', alias)
                    lab = first_value(library_row, 'lab', alias)
                    award = first_value(library_row, 'award', alias)

                    for filename in fastq_names:
                        # SFTP paths always use '/', whatever the local OS is.
                        relative_path = posixpath.join(dirname, filename)
                        # NOTE(review): 'encode-y3q3' and the read length of
                        # 100 are fixed here regardless of submission_root.
                        print('<http://jumpgate.caltech.edu/wiki/SubmissionsLog/encode-y3q3#'+dirname+'>')
                        print('  encode3:dataset "{}" ;'.format(dataset))
                        print('  encode3:replicate "{}" ;'.format(replicate))
                        print('  encode3:lab "{}" ;'.format(lab))
                        print('  encode3:award "{}" .'.format(award))

                        print('<file:///woldlab/castor/home/diane/proj/submission/encode-y3q3/'+relative_path+'>')
                        print('  encode3:output_type "reads" ;')
                        print('  encode3:read_length 100 ; ')
                        print('  encode3:file_format "fastq" .')
                        files += 1

    print('generated {} records'.format(files))


In [107]:
# Print the manifest for the libraries listed in mice.txt.
make_manifest(
    spreadsheet_name=spreadsheet_name,
    submission_root='proj/submission/encode-y3q3',
    map_name='mice.txt',
)

<http://jumpgate.caltech.edu/wiki/SubmissionsLog/encode-y3q3#15272-LC_758_C57Bl6_layer_V_pyramidal_neuron_single_cell>
  encode3:dataset "ENCSR746LUP" ;
  encode3:replicate "0e68587e-327b-4129-850a-2645b80ea93f" ;
  encode3:lab "barbara-wold" ;
  encode3:award "U54HG006998" .
<file:///woldlab/castor/home/diane/proj/submission/encode-y3q3/15272-LC_758_C57Bl6_layer_V_pyramidal_neuron_single_cell/15272_HBE4EADXX_c116_l2.fastq.gz>
  encode3:output_type "reads" ;
  encode3:read_length 100 ; 
  encode3:file_format "fastq" .
<http://jumpgate.caltech.edu/wiki/SubmissionsLog/encode-y3q3#15272-LC_758_C57Bl6_layer_V_pyramidal_neuron_single_cell>
  encode3:dataset "ENCSR746LUP" ;
  encode3:replicate "0e68587e-327b-4129-850a-2645b80ea93f" ;
  encode3:lab "barbara-wold" ;
  encode3:award "U54HG006998" .
<file:///woldlab/castor/home/diane/proj/submission/encode-y3q3/15272-LC_758_C57Bl6_layer_V_pyramidal_neuron_single_cell/15272_HBE4EADXX_c116_l1.fastq.gz>
  encode3:output_type "reads" ;
  encode3:rea

  (key.get_name(), hostname, hexlify(key.get_fingerprint())))


In [108]:
# Print the manifest for the libraries listed in human-liver.txt.
make_manifest(
    spreadsheet_name=spreadsheet_name,
    submission_root='proj/submission/encode-y3q3',
    map_name='human-liver.txt',
)

<http://jumpgate.caltech.edu/wiki/SubmissionsLog/encode-y3q3#15096-paired-end-human_liver_STL010_10ngs>
  encode3:dataset "ENCSR388ZNJ" ;
  encode3:replicate "a37a8cff-5bf7-47fc-b9a0-fa2aafebb9c3" ;
  encode3:lab "barbara-wold" ;
  encode3:award "U54HG006998" .
<file:///woldlab/castor/home/diane/proj/submission/encode-y3q3/15096-paired-end-human_liver_STL010_10ngs/15096_H00EWBCXX_c116_l2.fastq.gz>
  encode3:output_type "reads" ;
  encode3:read_length 100 ; 
  encode3:file_format "fastq" .
<http://jumpgate.caltech.edu/wiki/SubmissionsLog/encode-y3q3#15096-paired-end-human_liver_STL010_10ngs>
  encode3:dataset "ENCSR388ZNJ" ;
  encode3:replicate "a37a8cff-5bf7-47fc-b9a0-fa2aafebb9c3" ;
  encode3:lab "barbara-wold" ;
  encode3:award "U54HG006998" .
<file:///woldlab/castor/home/diane/proj/submission/encode-y3q3/15096-paired-end-human_liver_STL010_10ngs/15096_H00EWBCXX_c116_l1.fastq.gz>
  encode3:output_type "reads" ;
  encode3:read_length 100 ; 
  encode3:file_format "fastq" .
<http://jump

  (key.get_name(), hostname, hexlify(key.get_fingerprint())))
