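Submitting the remaining ENCODE metadata (biosamples, libraries, experiments, replicates) and building the file manifest for the end of the grant.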

In [1]:
import os
import sys
import requests
import pandas
import paramiko
import json
from IPython import display

In [2]:
from curation_common import *
from htsworkflow.submission.encoded import DCCValidator

In [3]:
# Make the local pandasodf checkout importable.
PANDAS_ODF = os.path.expanduser('~/src/pandasodf')
if PANDAS_ODF not in sys.path:
    sys.path.append(PANDAS_ODF)

# Import unconditionally: when the import lived inside the ``if`` above it was
# skipped whenever the path was already on sys.path (e.g. set via PYTHONPATH),
# leaving ODFReader undefined for the cells below.
from pandasodf import ODFReader

In [4]:
from htsworkflow.submission.encoded import Document
from htsworkflow.submission.aws_submission import run_aws_cp

In [5]:
# live server & control file
# `server` and `spreadsheet_name` are read by every registration cell below.
server = ENCODED('www.encodeproject.org')
spreadsheet_name = os.path.expanduser('~diane/woldlab/ENCODE/C1-mouse-forlimb-submission-201804.ods')

# test server & datafile
# To target the test server, uncomment this pair and comment out the pair above.
#server = ENCODED('test.encodedcc.org')
#spreadsheet_name = os.path.expanduser('~diane/woldlab/ENCODE/C1-mouse-forlimb-submission-201804-testserver.ods')

# NOTE(review): load_netrc presumably pulls credentials for the host from ~/.netrc -- confirm.
server.load_netrc()
# The validator is handed to each post_sheet call in the registration cells.
validator = DCCValidator(server)

In [6]:
# NOTE(review): presumably the grant award identifier for this submission;
# it is not referenced by any other cell visible in this notebook -- confirm it is still needed.
award = 'UM1HG009443'

# Submit Documents

Example Document submission

In [7]:
#atac_uuid = '0fc44318-b802-474e-8199-f3b6d708eb6f'
#atac = Document(os.path.expanduser('~/proj/encode3-curation/Wold_Lab_ATAC_Seq_protocol_December_2016.pdf'),
#                'general protocol',
#                'ATAC-Seq experiment protocol for Wold lab',
#                )
#body = atac.create_if_needed(server, atac_uuid)
#print(body['@id'])

# Submit Annotations

In [8]:
#sheet = gcat.get_file(spreadsheet_name, fmt='pandas_excel')
#annotations = sheet.parse('Annotations', header=0)
#created = server.post_sheet('/annotations/', annotations, verbose=True, dry_run=True)
#print(len(created))

In [9]:
#if created:
#    annotations.to_excel('/tmp/annotations.xlsx', index=False)

# Register Biosamples

In [10]:
# Read the 'Biosample' sheet and pass it to the server.
# dry_run=True is passed here; presumably nothing is created on the
# server in that mode -- confirm against post_sheet before relying on it.
book = ODFReader(spreadsheet_name)
biosample = book.parse('Biosample', header=0)
created = server.post_sheet(
    '/biosamples/',
    biosample,
    verbose=True,
    dry_run=True,
    validator=validator,
)
print(len(created))

0


In [11]:
# Write the Biosample sheet to an .xlsx file under /dev/shm, but only when
# the preceding post_sheet call returned a non-empty result.
if created:
    biosample.to_excel('/dev/shm/biosamples.xlsx', index=False)

# Register Libraries

In [12]:
# Show which spreadsheet is in use, then read the 'Library' sheet and pass
# it to the server.
# dry_run=True is passed here; presumably nothing is created on the
# server in that mode -- confirm against post_sheet before relying on it.
print(spreadsheet_name)
book = ODFReader(spreadsheet_name)
libraries = book.parse('Library', header=0)
created = server.post_sheet(
    '/libraries/',
    libraries,
    verbose=True,
    dry_run=True,
    validator=validator,
)
print(len(created))

/home/diane/woldlab/ENCODE/C1-mouse-forlimb-submission-201804.ods
0


In [13]:
# Write the Library sheet to an .xlsx file under /dev/shm, but only when
# the preceding post_sheet call returned a non-empty result.
if created:
    libraries.to_excel('/dev/shm/libraries.xlsx', index=False)

In [14]:
# Display how many entries the most recent post_sheet call returned.
len(created)

0

# Register Experiments

In [15]:
# Read the 'Experiment' sheet and pass it to the server.
# dry_run=True is passed here; presumably nothing is created on the
# server in that mode -- confirm against post_sheet before relying on it.
book = ODFReader(spreadsheet_name)
experiments = book.parse('Experiment', header=0)
created = server.post_sheet(
    '/experiments/',
    experiments,
    verbose=True,
    dry_run=True,
    validator=validator,
)
print(len(created))

0


In [16]:
# Write the Experiment sheet to an .xlsx file under /dev/shm, but only when
# the preceding post_sheet call returned a non-empty result.
if created:
    experiments.to_excel('/dev/shm/experiments.xlsx', index=False)

# Register Replicates

In [17]:
# Read the 'Replicate' sheet and pass it to the server.
# dry_run=True is passed here; presumably nothing is created on the
# server in that mode -- confirm against post_sheet before relying on it.
book = ODFReader(spreadsheet_name)
replicates = book.parse('Replicate', header=0)
created = server.post_sheet(
    '/replicates/',
    replicates,
    verbose=True,
    dry_run=True,
    validator=validator,
)
print(len(created))

0


In [18]:
# Write the Replicate sheet to an .xlsx file under /dev/shm, but only when
# the preceding post_sheet call returned a non-empty result.
if created:
    replicates.to_excel('/dev/shm/replicates.xlsx', index=False)

# Make manifest

In [21]:
def make_manifest(spreadsheet_name, submission_root, map_name):
    """Print manifest records for every ``fastq.gz`` file in a submission.

    Reads the 'Library' and 'Replicate' sheets from ``spreadsheet_name``,
    connects over SSH/SFTP to pongo.caltech.edu, and reads ``map_name`` from
    inside ``submission_root``.  Each non-blank, non-comment line of the map
    file must hold exactly two whitespace-separated fields: a library id and
    a directory name.  For every file in that directory whose name ends with
    ``fastq.gz`` two records are printed to stdout: one describing the
    dataset/replicate/lab/award and one describing the file itself.  A final
    line reports how many files were emitted.

    Parameters
    ----------
    spreadsheet_name : str
        Path to the ODF spreadsheet holding the 'Library' and 'Replicate'
        sheets.
    submission_root : str
        Remote directory (passed to ``sftp.chdir``) containing the map file
        and the per-library directories.
    map_name : str
        Name of the remote map file, relative to ``submission_root``.

    Returns
    -------
    None
        All output goes to stdout.

    Raises
    ------
    ValueError
        If a map line does not split into exactly two fields.
    IndexError
        If a directory contains a ``fastq.gz`` file but no spreadsheet row
        matched its library alias.
    """
    submission_name = 'C1_mouse_limb_submission_201804'
    book = ODFReader(spreadsheet_name)
    libraries = book.parse('Library')
    replicates = book.parse('Replicate')
    # The alias column does not change while looping, so read it once.
    library_aliases = list(libraries['aliases:array'])

    pongo = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy accepts unknown host keys without
    # verification; consider loading known host keys instead.
    pongo.set_missing_host_key_policy(paramiko.AutoAddPolicy())

    sftp = None
    map_file = None
    files = 0
    try:
        pongo.connect('pongo.caltech.edu', username='diane')
        sftp = pongo.open_sftp()
        sftp.chdir(submission_root)
        map_file = sftp.open(map_name)
        for line in map_file:
            # skip comments
            line = line.strip()
            if len(line) == 0 or line.startswith('#'):
                continue

            library_id, dirname = line.split()
            alias = 'barbara-wold:{}'.format(library_id)
            # NOTE(review): ``in`` is a substring test when the cell is a
            # string, so id 123 would also match an alias ending in 1234.
            matches = [alias in row for row in library_aliases]
            library_row = libraries[matches]
            # NOTE(review): the replicate rows are selected with a mask built
            # from the *Library* sheet, which only works if both sheets have
            # the same rows in the same order -- confirm this is intended.
            replicate_row = replicates[matches]
            for filename in sftp.listdir(dirname):
                if filename.endswith('fastq.gz'):
                    relative_path = os.path.join(dirname, filename)
                    metadata = {
                        'dataset': list(replicate_row['experiment'])[0],
                        'submitted_file_name': relative_path,
                        'replicate': list(replicate_row['uuid'])[0],
                        'lab': list(library_row['lab'])[0],
                        'award': list(library_row['award'])[0],
                    }
                    #display.display_pretty(metadata)
                    print('<http://jumpgate.caltech.edu/wiki/SubmissionsLog/{}#{}>'.format(submission_name, dirname))
                    print('  encode3:dataset "{}" ;'.format(metadata['dataset']))
                    print('  encode3:replicate "{}" ;'.format(metadata['replicate']))
                    print('  encode3:lab "{}" ;'.format(metadata['lab']))
                    print('  encode3:award "{}" .'.format(metadata['award']))

                    print('<file:///woldlab/loxcyc/home/diane/{}/{}>'.format(submission_root, relative_path))
                    print('  encode3:output_type "reads" ;')
                    print('  encode3:read_length 100 ; ')
                    print('  encode3:file_format "fastq" .')
                    files += 1
    finally:
        # Release remote resources in reverse order of acquisition, even when
        # the loop above raised; previously none of them were ever closed.
        if map_file is not None:
            map_file.close()
        if sftp is not None:
            sftp.close()
        pongo.close()

    print('generated {} records'.format(files))


In [23]:
#make_manifest(spreadsheet_name, 'proj/C1_mouse_limb_combined', 'libs.txt')