Preparing to submit Wold lab stranded samples.


In [1]:
import os
import sys
import requests
import pandas
import paramiko
import json
from IPython import display
from pathlib import Path
import configparser

In [2]:
# Make a local checkout of encoded_client importable before importing from it.
encoded_client_checkout = Path('~/proj/encoded_client').expanduser()
EC = str(encoded_client_checkout)
if sys.path.count(EC) == 0:
    sys.path.append(EC)

from encoded_client.encoded import (
    ENCODED,
    Document,
    typed_column_parser,
    DCCValidator,
)

In [3]:
# Make a local checkout of htsworkflow importable; the path is appended
# only once, so re-running this cell does not grow sys.path.
htsworkflow_checkout = Path('~/proj/htsworkflow').expanduser()
HTSW = str(htsworkflow_checkout)
if sys.path.count(HTSW) == 0:
    sys.path.append(HTSW)

In [4]:
from htsworkflow.submission.aws_submission import run_aws_cp
from htsworkflow.util.api import (
    add_auth_options,
    make_auth_from_opts,
    HtswApi,
)

In [5]:
# Load the htsworkflow API credentials. Both the per-user file and the
# system-wide file are passed to config.read(); files that do not exist
# are silently ignored by configparser.
config = configparser.ConfigParser()
config.read([os.path.expanduser('~/.htsworkflow.ini'),
             '/etc/htsworkflow.ini'
             ])

SECTION = 'sequence_archive'
if not config.has_section(SECTION):
    # Without this check the cell used to die further down with a
    # NameError on `apiid`, which hid the real problem.
    raise RuntimeError(
        "No [{}] section found in ~/.htsworkflow.ini or /etc/htsworkflow.ini; "
        "cannot authenticate to the htsworkflow API".format(SECTION))

apiid = config.get(SECTION, 'apiid')
apikey = config.get(SECTION, 'apikey')
apihost = config.get(SECTION, 'host')

# NOTE(review): the configured host is read above but then replaced by this
# hard-coded URL. Kept as in the original -- confirm whether it is still needed.
apihost = 'http://jumpgate.caltech.edu'
auth = {'apiid': apiid, 'apikey': apikey}
htsw = HtswApi(apihost, auth)

In [6]:
# --- Submission target -------------------------------------------------
# Live server and its control spreadsheet.
spreadsheet_name = Path('~/woldlab/ENCODE/stranded-23877-23896-5xfad.xlsx').expanduser()
server = ENCODED('www.encodeproject.org')

# Reader engine handed to pandas.read_excel in the cells below; None leaves
# the choice to pandas. Uncomment the 'odf' line for .ods spreadsheets.
engine = None
#engine='odf'

# Test server and data file (swap in for a trial run).
#server = ENCODED('test.encodedcc.org')
#spreadsheet_name = os.path.expanduser('~diane/woldlab/ENCODE/C1-encode3-limb-2017-testserver.ods')

# Credentials are loaded via the client's netrc support before the
# validator is built on top of the server connection.
server.load_netrc()
validator = DCCValidator(server)

In [7]:
# Award identifier for this submission; the same value appears in the `award`
# column of the Biosample sheet output further down.
# NOTE(review): not referenced by any cell visible in this notebook.
award = 'UM1HG009443'

# Retrieve insert size

In [8]:
# Compare the fragment size recorded in the Library sheet with the insert
# size that the htsworkflow server reports for the same library.
ALIAS_PREFIX = 'barbara-wold:'

print(spreadsheet_name)
libraries = pandas.read_excel(spreadsheet_name, sheet_name='Library', header=0, engine=engine)
for i, row in libraries.iterrows():
    alias = row['aliases:array']
    # Blank cells are read as NaN (a float). Slicing an alias that lacks the
    # lab prefix would silently yield a wrong library id, so skip both cases.
    if not isinstance(alias, str) or not alias.startswith(ALIAS_PREFIX):
        print('row {}: skipping unexpected alias {!r}'.format(i, alias))
        continue

    library_id = alias[len(ALIAS_PREFIX):]
    fragment = row['average_fragment_size:integer']
    jumpgate_info = htsw.get_library(library_id)
    print(library_id, fragment, jumpgate_info['insert_size'])
    # Disabled: push the htsworkflow value to the portal when it differs.
    # `dcc_info` is not defined in this cell and must be fetched first.
    #if jumpgate_info['insert_size'] != dcc_info.get('average_fragment_size'):
    #    server.patch_json(dcc_info['@id'], {'average_fragment_size': int(jumpgate_info['insert_size'])})

/home/diane/woldlab/ENCODE/stranded-23877-23896-5xfad.xlsx
23877 221 221
23878 224 224
23879 214 214
23880 240 240
23881 232 232
23882 236 236
23883 234 234
23884 237 237
23885 234 234
23886 236 236
23887 246 246
23888 226 226
23889 219 219
23890 224 224
23891 221 221
23892 234 234
23893 229 229
23894 231 231
23895 226 226
23896 233 233


# Lookup biosample ontologies

Look up the biosample ontology (plus source, uuid and donor) for any biosamples that already have an accession.

In [9]:
biosample = pandas.read_excel(spreadsheet_name, sheet_name='Biosample', header=0, engine=engine)

# For every row that already carries an accession starting with 'E', copy
# selected fields of the portal object back into the sheet.
for i, row in biosample.iterrows():
    accession = row.accession
    # Blank cells are read as NaN (a float), so require a real string
    # before calling startswith().
    if not (isinstance(accession, str) and accession.startswith('E')):
        continue

    obj = server.get_json(accession)

    biosample_ontology = obj['biosample_ontology']
    if isinstance(biosample_ontology, dict):
        biosample.loc[i, 'biosample_ontology'] = biosample_ontology['@id']
        biosample.loc[i, 'biosample_term_name:skip'] = biosample_ontology['term_name']

    # Treat `source` like the other linked fields: it may arrive as an
    # embedded object or as a plain link string, so do not assume a dict.
    source = obj['source']
    biosample.loc[i, 'source'] = source['@id'] if isinstance(source, dict) else source

    biosample.loc[i, 'uuid'] = obj['uuid']

    donor = obj['donor']
    if isinstance(donor, dict):
        biosample.loc[i, 'donor'] = donor['@id']

biosample

Unnamed: 0,uuid,accession,aliases:array,biosample_ontology,biosample_term_name:skip,model_organism_age,model_organism_age_units,model_organism_sex,description,library_id:skip,cDNA_sample:skip,organism,source,donor,lab,award
0,2b0e3e21-619e-4481-a158-7b8692041696,ENCBS375TMX,barbara-wold:ENC4_cDNA629,/biosample-types/tissue_UBERON_0002369/,adrenal gland,8-10,month,female,5xFAD X Cast adrenal gland_8-10 months female ...,23877,ENC4_cDNA629,/organisms/mouse/,/sources/jackson-labs/,/mouse-donors/ENCDO996MUL/,barbara-wold,UM1HG009443
1,0537b034-a481-413a-bd37-35d360339d28,ENCBS206LPG,barbara-wold:ENC4_cDNA630,/biosample-types/tissue_UBERON_0002369/,adrenal gland,8-10,month,female,5xFAD X Cast adrenal gland_8-10 months female ...,23878,ENC4_cDNA630,/organisms/mouse/,/sources/jackson-labs/,/mouse-donors/ENCDO996MUL/,barbara-wold,UM1HG009443
2,b5eb585f-b4c2-405a-aea9-b323fa7c173e,ENCBS943GNI,barbara-wold:ENC4_cDNA631,/biosample-types/tissue_UBERON_0002369/,adrenal gland,8-10,month,male,5xFAD X Cast adrenal gland_8-10 months male an...,23879,ENC4_cDNA631,/organisms/mouse/,/sources/jackson-labs/,/mouse-donors/ENCDO996MUL/,barbara-wold,UM1HG009443
3,1d47e9aa-a7c0-4562-90ce-3241358a6cf6,ENCBS515VBP,barbara-wold:ENC4_cDNA632,/biosample-types/tissue_UBERON_0002369/,adrenal gland,8-10,month,male,5xFAD X Cast adrenal gland_8-10 months male an...,23880,ENC4_cDNA632,/organisms/mouse/,/sources/jackson-labs/,/mouse-donors/ENCDO996MUL/,barbara-wold,UM1HG009443
4,d3338969-e2bf-4bec-8fae-51d0257dc914,ENCBS900NRB,barbara-wold:ENC4_cDNA633,/biosample-types/tissue_UBERON_0002305/,layer of hippocampus,8-10,month,female,5xFAD X Cast hippocampus_8-10 months female an...,23881,ENC4_cDNA633,/organisms/mouse/,/sources/jackson-labs/,/mouse-donors/ENCDO996MUL/,barbara-wold,UM1HG009443
5,ab2c9c1d-533a-4a07-a2e7-686d7d05af65,ENCBS125BDZ,barbara-wold:ENC4_cDNA634,/biosample-types/tissue_UBERON_0002305/,layer of hippocampus,8-10,month,female,5xFAD X Cast hippocampus_8-10 months female an...,23882,ENC4_cDNA634,/organisms/mouse/,/sources/jackson-labs/,/mouse-donors/ENCDO996MUL/,barbara-wold,UM1HG009443
6,03ae04ef-b7a3-471e-be98-04b690431956,ENCBS830OUL,barbara-wold:ENC4_cDNA635,/biosample-types/tissue_UBERON_0002305/,layer of hippocampus,8-10,month,male,5xFAD X Cast hippocampus_8-10 months male anim...,23883,ENC4_cDNA635,/organisms/mouse/,/sources/jackson-labs/,/mouse-donors/ENCDO996MUL/,barbara-wold,UM1HG009443
7,cf8452f1-8951-4bc8-9f00-133155e15f7c,ENCBS116ERU,barbara-wold:ENC4_cDNA636,/biosample-types/tissue_UBERON_0002305/,layer of hippocampus,8-10,month,male,5xFAD X Cast hippocampus_8-10 months male anim...,23884,ENC4_cDNA636,/organisms/mouse/,/sources/jackson-labs/,/mouse-donors/ENCDO996MUL/,barbara-wold,UM1HG009443
8,bdc0ee2d-200f-4a8d-8bce-37687ad32095,ENCBS505BIQ,barbara-wold:ENC4_cDNA637,/biosample-types/tissue_NTR_0000646/,left cerebral cortex,8-10,month,female,5xFAD X Cast L. cortex (brain)_8-10 months fem...,23885,ENC4_cDNA637,/organisms/mouse/,/sources/jackson-labs/,/mouse-donors/ENCDO996MUL/,barbara-wold,UM1HG009443
9,bdedad93-2910-46d2-a46c-26e63f08c8d5,ENCBS869WPR,barbara-wold:ENC4_cDNA638,/biosample-types/tissue_NTR_0000646/,left cerebral cortex,8-10,month,female,5xFAD X Cast L. cortex (brain)_8-10 months fem...,23886,ENC4_cDNA638,/organisms/mouse/,/sources/jackson-labs/,/mouse-donors/ENCDO996MUL/,barbara-wold,UM1HG009443


In [10]:
#biosample.to_excel('/dev/shm/biosamples.xlsx', index=False)

# Update biosample aliases on portal

In [11]:
# Dry run: report which biosample alias lists on the portal are missing
# aliases that are present in the spreadsheet. The actual patch call is
# left commented out at the bottom.
biosamples = pandas.read_excel(spreadsheet_name, sheet_name='Biosample', header=0, engine=engine)
for i, row in biosamples.iterrows():
    # Rows without an accession cannot be looked up on the portal;
    # formatting NaN into the URL would request '/biosamples/nan/'.
    if pandas.isnull(row.accession):
        continue

    obj = server.get_json('/biosamples/{}/'.format(row.accession))
    current_aliases = obj.get('aliases', [])
    new_aliases = list(current_aliases)
    sheet_aliases = typed_column_parser('aliases:array', row['aliases:array'])[1]
    for alias in sheet_aliases:
        # Test against the growing list so an alias repeated in the sheet
        # is only added once.
        if alias not in new_aliases:
            new_aliases.append(alias)

    if len(new_aliases) != len(current_aliases):
        print("Would update: {} {} -> {}".format(obj["@id"], current_aliases, new_aliases))
        #server.patch_json(obj["@id"], {"aliases": new_aliases})

# Register Biosamples

In [12]:
# Validate the Biosample sheet against the portal. dry_run=True means
# nothing is posted; the printed count is the length of what post_sheet returns.
biosamples = pandas.read_excel(
    spreadsheet_name,
    sheet_name='Biosample',
    header=0,
    engine=engine,
)
created = server.post_sheet(
    '/biosamples/',
    biosamples,
    verbose=True,
    dry_run=True,
    validator=validator,
)
print(len(created))

0


In [13]:
# Save the sheet only when the previous cell actually created something.
# Write `biosamples` (the frame handed to post_sheet), matching the
# libraries/experiments/replicates cells; the original wrote `biosample`,
# the separate frame built by the ontology lookup cell.
# NOTE(review): presumably post_sheet fills the new accession/uuid values
# into the frame it was given -- confirm against encoded_client.
if created:
    biosamples.to_excel('/dev/shm/biosamples.xlsx', index=False)

# Register Libraries

In [23]:
# Validate the Library sheet against the portal (dry run, nothing is posted).
print(spreadsheet_name)
libraries = pandas.read_excel(
    spreadsheet_name,
    sheet_name='Library',
    header=0,
    engine=engine,
)
created = server.post_sheet(
    '/libraries/',
    libraries,
    verbose=True,
    dry_run=True,
    validator=validator,
)
print(len(created))

/home/diane/woldlab/ENCODE/stranded-23877-23896-5xfad.xlsx
0


In [22]:
# Write the Library frame to a scratch spreadsheet (without the index
# column), but only when the preceding post_sheet call returned something.
if created:
    libraries.to_excel('/dev/shm/libraries.xlsx', index=False)

# Register Experiments

In [26]:
# Validate the Experiment sheet against the portal (dry run, nothing is posted).
print(server.server)
experiments = pandas.read_excel(
    spreadsheet_name,
    sheet_name='Experiment',
    header=0,
    engine=engine,
)
# Leave out rows whose accession cell still holds the approval placeholder.
approved = experiments['accession'].ne('barbara approval needed')
experiments = experiments[approved]
created = server.post_sheet(
    '/experiments/',
    experiments,
    verbose=True,
    dry_run=True,
    validator=validator,
)
print(len(created))

www.encodeproject.org
0


In [25]:
# Write the (filtered) Experiment frame to a scratch spreadsheet without the
# index column, but only when the preceding post_sheet call returned something.
if created:
    experiments.to_excel('/dev/shm/experiments.xlsx', index=False)

# Register Replicates

In [28]:
# Post the Replicate sheet to the portal.
# NOTE(review): unlike the other registration cells this one runs with
# dry_run=False, so re-running it really submits to `server` -- confirm
# this is intended before a Restart & Run All.
print(server.server)
print(spreadsheet_name)
replicates = pandas.read_excel(
    spreadsheet_name,
    sheet_name='Replicate',
    header=0,
    engine=engine,
)
# Leave out rows whose uuid cell still holds the approval placeholder.
approved = replicates['uuid'].ne('barbara approval needed')
replicates = replicates[approved]
created = server.post_sheet(
    '/replicates/',
    replicates,
    verbose=True,
    dry_run=False,
    validator=validator,
)
print(len(created))

www.encodeproject.org
/home/diane/woldlab/ENCODE/stranded-23877-23896-5xfad.xlsx
Reponse {'status': 'success', '@type': ['result'], '@graph': [{'date_created': '2022-01-28T23:43:56.988540+00:00', 'submitted_by': '/users/bc5b62f7-ce28-4a1e-b6b3-81c9c5a86d7a/', 'aliases': ['barbara-wold:23877_b1_t1'], 'schema_version': '9', 'biological_replicate_number': 1, 'technical_replicate_number': 1, 'experiment': '/experiments/ENCSR511LAZ/', 'library': '/libraries/ENCLB939AXX/', 'status': 'in progress', '@id': '/replicates/e5d17704-f581-4063-9256-2aae88b277af/', '@type': ['Replicate', 'Item'], 'uuid': 'e5d17704-f581-4063-9256-2aae88b277af'}]}
row 0 created: e5d17704-f581-4063-9256-2aae88b277af
Reponse {'status': 'success', '@type': ['result'], '@graph': [{'date_created': '2022-01-28T23:43:57.271420+00:00', 'submitted_by': '/users/bc5b62f7-ce28-4a1e-b6b3-81c9c5a86d7a/', 'aliases': ['barbara-wold:23878_b2_t1'], 'schema_version': '9', 'biological_replicate_number': 2, 'technical_replicate_number': 1,

Reponse {'status': 'success', '@type': ['result'], '@graph': [{'date_created': '2022-01-28T23:44:00.866720+00:00', 'submitted_by': '/users/bc5b62f7-ce28-4a1e-b6b3-81c9c5a86d7a/', 'aliases': ['barbara-wold:23891_b1_t1'], 'schema_version': '9', 'biological_replicate_number': 1, 'technical_replicate_number': 1, 'experiment': '/experiments/ENCSR584OSM/', 'library': '/libraries/ENCLB214DCX/', 'status': 'in progress', '@id': '/replicates/02a279d5-66be-4e38-96ec-f29771472321/', '@type': ['Replicate', 'Item'], 'uuid': '02a279d5-66be-4e38-96ec-f29771472321'}]}
row 14 created: 02a279d5-66be-4e38-96ec-f29771472321
Reponse {'status': 'success', '@type': ['result'], '@graph': [{'date_created': '2022-01-28T23:44:01.140443+00:00', 'submitted_by': '/users/bc5b62f7-ce28-4a1e-b6b3-81c9c5a86d7a/', 'aliases': ['barbara-wold:23892_b2_t1'], 'schema_version': '9', 'biological_replicate_number': 2, 'technical_replicate_number': 1, 'experiment': '/experiments/ENCSR584OSM/', 'library': '/libraries/ENCLB453OZU/'

In [29]:
# Write the (filtered) Replicate frame to a scratch spreadsheet without the
# index column, but only when the preceding post_sheet call returned something.
if created:
    replicates.to_excel('/dev/shm/replicates.xlsx', index=False)

# Make library TSVs

1. Upload the spreadsheet file to ~diane/proj/flowcells.
2. Create library.tsv files for the new flowcells:
   make_stub_flowcell_dirs.py -f spreadsheet -v
   (consider using -n the first time)
3. Download the fastqs.
   (Unfortunately woldrnaseq.downloader broke with the new file layout.)
   Instead, the files were copied from Sean with <pre>
   for a in  ~sau/bifgrc-rnaseq/flowcells/AAANMW7M5/*.fastq.gz ; 
      do d=$(basename $a) ;
      echo $d; read -d _ -ra FRAG <<< $d ;
      mkdir ${FRAG} ;  
      cp $a ${FRAG} ; 
   done</pre>
4. Generate the file sheet:
   python3 prepare-submission-sheet.py -f spreadsheet
5. Validate the file metadata in the "Check Files" section below.
6. Copy the spreadsheet, with the File tab filled in, back to ~diane/proj/flowcells.
   

# Check Files

In [20]:
# Validate the File sheet against the portal (dry run, nothing is posted).
files = pandas.read_excel(
    spreadsheet_name,
    sheet_name='File',
    header=0,
    engine=engine,
)
created = server.post_sheet(
    '/files/',
    files,
    verbose=True,
    dry_run=True,
    validator=validator,
)
print(len(created))

20
