# Introduction

In [1]:
import pandas
import os
import gcat
import sys
import hashlib
import jsonschema
import requests
import paramiko
import logging
import importlib
import gzip

# Reload the logging module before configuring it.
# NOTE(review): presumably this discards logging state set up by the
# notebook kernel so that basicConfig() below takes effect -- confirm.
importlib.reload(logging)
# Show INFO-level messages and above (the paramiko connection messages in
# the make_manifest cell output are logged at INFO level).
logging.basicConfig(level=logging.INFO)

In [2]:
# Pull every public name from the project's curation_common module into the
# notebook namespace.
# NOTE(review): names used later without a visible definition (e.g. ENCODED)
# presumably come from this star import -- confirm, and prefer importing
# them explicitly so their origin is obvious.
from curation_common import *

In [3]:
# Directory added to the module search path; the membership check keeps
# re-running this cell from appending duplicate entries.
ROOT = os.path.expanduser('~/proj/htsworkflow')
if sys.path.count(ROOT) == 0:
    sys.path.append(ROOT)

In [4]:
# Create a client object for the www.encodeproject.org host.
# NOTE(review): ENCODED is not defined in the visible cells; presumably it is
# provided by the curation_common star import -- confirm.
server = ENCODED('www.encodeproject.org')
# NOTE(review): presumably loads the credentials for that host from the
# user's netrc file -- confirm against the ENCODED implementation.
server.load_netrc()

In [5]:
def _first_value(frame, column, alias):
    """Return the first value in ``frame[column]``.

    Raises:
        IndexError: if the column is empty, i.e. no spreadsheet row matched
            ``alias``. The exception type is the same one an empty lookup
            raised before; only the message is new.
    """
    values = list(frame[column])
    if not values:
        raise IndexError(
            'no {!r} value in spreadsheet for {}'.format(column, alias))
    return values[0]


def _print_file_record(submission_name, dirname, relative_path, metadata):
    """Print the submission-log record and the file record for one fastq file."""
    print('<http://jumpgate.caltech.edu/wiki/SubmissionsLog/{}#{}>'.format(submission_name, dirname))
    print('  encode3:dataset "{}" ;'.format(metadata['dataset']))
    print('  encode3:replicate "{}" ;'.format(metadata['replicate']))
    print('  encode3:lab "{}" ;'.format(metadata['lab']))
    print('  encode3:award "{}" .'.format(metadata['award']))

    # NOTE(review): this URL is built from submission_name, while the files
    # are listed under submission_root on the remote host -- confirm that the
    # two are meant to differ.
    print('<file:///woldlab/castor/home/diane/proj/submission/{}/{}>'.format(submission_name, relative_path))
    print('  encode3:output_type "reads" ;')
    print('  encode3:file_format "fastq" .')


def make_manifest(spreadsheet_name, submission_name, submission_root, map_name,
                  host='pongo.cacr.caltech.edu', username='diane'):
    """Print manifest records for each fastq.gz file of a submission.

    Reads the 'Libraries' and 'Replicates' sheets of the spreadsheet, connects
    to ``host`` over SSH/SFTP, and reads ``map_name`` inside
    ``submission_root``. Every non-blank, non-comment line of that file must
    hold a library id and a directory name separated by whitespace. For every
    file in that directory whose name ends with 'fastq.gz', two records are
    printed to stdout, followed by a final count of generated records.

    Args:
        spreadsheet_name: local path of the Excel workbook.
        submission_name: name used in the printed record URLs.
        submission_root: remote directory containing ``map_name`` and the
            per-library directories.
        map_name: remote file mapping library ids to directory names.
        host: SSH host to connect to (defaults to the previously hard-coded
            host).
        username: SSH user name (defaults to the previously hard-coded user).

    Raises:
        IndexError: if a library that has fastq files has no matching row in
            the 'Libraries' or 'Replicates' sheet.
        ValueError: if a map line does not split into exactly two fields.
    """
    # Remote SFTP paths always use '/', whatever the local OS separator is.
    import posixpath

    # Both sheets are loaded up front so the workbook can be closed before
    # any network work starts.
    with pandas.ExcelFile(spreadsheet_name) as book:
        libraries = book.parse('Libraries')
        replicates = book.parse('Replicates')

    files = 0
    # The context managers close the map file, SFTP session and SSH
    # connection even when a lookup fails part-way through.
    with paramiko.SSHClient() as pongo:
        pongo.set_missing_host_key_policy(paramiko.WarningPolicy())
        pongo.connect(host, username=username)
        with pongo.open_sftp() as sftp:
            sftp.chdir(submission_root)
            with sftp.open(map_name) as map_file:
                for line in map_file:
                    # skip blank lines and comments
                    line = line.strip()
                    if len(line) == 0 or line.startswith('#'):
                        continue

                    library_id, dirname = line.split()
                    alias = 'barbara-wold:{}'.format(library_id)
                    library_row = libraries[libraries['aliases:array'] == alias]
                    replicate_row = replicates[replicates['library'] == alias]

                    fastq_names = [name for name in sftp.listdir(dirname)
                                   if name.endswith('fastq.gz')]
                    if not fastq_names:
                        # No files means no spreadsheet lookup is needed, so
                        # an unmatched alias is not an error here.
                        continue

                    # These values depend only on the library, so they are
                    # looked up once rather than once per file.
                    metadata = {
                        'dataset': _first_value(replicate_row, 'experiment', alias),
                        'replicate': _first_value(replicate_row, 'uuid', alias),
                        'lab': _first_value(library_row, 'lab', alias),
                        'award': _first_value(library_row, 'award', alias),
                    }
                    for filename in fastq_names:
                        relative_path = posixpath.join(dirname, filename)
                        record = dict(metadata, submitted_file_name=relative_path)
                        _print_file_record(submission_name, dirname, relative_path, record)
                        files += 1

    print('generated {} records'.format(files))


In [6]:
# Print the manifest records for the Y3Q2 submission. Arguments, in order:
# local spreadsheet path, submission name, remote submission directory, and
# the remote file that maps library ids to directories.
# NOTE(review): the absolute spreadsheet path is specific to one machine and
# is repeated in a later cell -- consider a shared constant.
make_manifest('/home/diane/dl/ENCODE Submission for Y3Q2.xlsx', 'encode-y3q2', 'proj/submission/encode-201605', 'libs.txt')

INFO:paramiko.transport:Connected (version 2.0, client OpenSSH_7.2p2)
  (key.get_name(), hostname, hexlify(key.get_fingerprint())))
INFO:paramiko.transport:Authentication (publickey) successful!
INFO:paramiko.transport.sftp:[chan 0] Opened sftp connection (server version 3)


<http://jumpgate.caltech.edu/wiki/SubmissionsLog/encode-y3q2#15475>
  encode3:dataset "ENCSR362AIZ" ;
  encode3:replicate "04cdf437-1e09-4603-b99a-dea90e86e556" ;
  encode3:lab "barbara-wold" ;
  encode3:award "U54HG006998" .
<file:///woldlab/castor/home/diane/proj/submission/encode-y3q2/15475/15475_H22Y2BCXX_c116_l1.fastq.gz>
  encode3:output_type "reads" ;
  encode3:file_format "fastq" .
generated 1 records


In [7]:
# Open the submission spreadsheet so the following cells can inspect it.
xlsx = pandas.ExcelFile('/home/diane/dl/ENCODE Submission for Y3Q2.xlsx')

In [8]:
# Show the underlying workbook object held by the ExcelFile.
xlsx.book

<xlrd.book.Book at 0x7f4426421c88>

In [9]:
# List the names of the sheets available in the workbook.
xlsx.sheet_names

['Experiments', 'Libraries', 'Replicates', 'Biosamples']

In [10]:
# Load the 'Experiments' sheet and display it as the cell output.
xlsx.parse('Experiments')

Unnamed: 0,experiment_accession,description,assay_term_name,assay_term_id,species:skip,biosample_type,lab,award
0,ENCSR362AIZ,Total RNA-Seq on postnatal 0 day mouse forebrain,RNA-Seq,OBI:0001271,human,tissue,barbara-wold,U54HG006998
1,ENCSR719NAJ,Total RNA-Seq on postnatal 0 day mouse midbrain,RNA-Seq,OBI:0001271,human,tissue,barbara-wold,U54HG006998
2,ENCSR017JEG,Total RNA-Seq on postnatal 0 day mouse hindbrain,RNA-Seq,OBI:0001271,human,tissue,barbara-wold,U54HG006998
3,ENCSR667TOX,Total RNA-Seq on postnatal 0 day mouse neural ...,RNA-Seq,OBI:0001271,human,tissue,barbara-wold,U54HG006998
4,ENCSR438XCG,Total RNA-Seq on postnatal 0 day mouse thymus,RNA-Seq,OBI:0001271,human,tissue,barbara-wold,U54HG006998
5,ENCSR526SEX,Total RNA-Seq on postnatal 0 day mouse heart,RNA-Seq,OBI:0001271,human,tissue,barbara-wold,U54HG006998
6,ENCSR982MRY,Total RNA-Seq on postnatal 0 day mouse lungs,RNA-Seq,OBI:0001271,human,tissue,barbara-wold,U54HG006998
7,ENCSR096STK,Total RNA-Seq on postnatal 0 day mouse liver,RNA-Seq,OBI:0001271,human,tissue,barbara-wold,U54HG006998
8,ENCSR946HWC,Total RNA-Seq on postnatal 0 day mouse skeleta...,RNA-Seq,OBI:0001271,human,tissue,barbara-wold,U54HG006998
9,ENCSR579FCW,Total RNA-Seq on postnatal 0 day mouse spleen,RNA-Seq,OBI:0001271,human,tissue,barbara-wold,U54HG006998
