Preparing to submit Wold lab stranded samples.


In [1]:
import os
import sys
import requests
import pandas
import paramiko
import re
import json
from IPython import display
from pathlib import Path
import configparser

In [2]:
from curation_common import *
from encoded_client.encoded import DCCValidator

In [3]:
from encoded_client.encoded import Document
from encoded_client.submission import run_aws_cp
from htsworkflow.util.api import (
    add_auth_options,
    make_auth_from_opts,
    HtswApi,
)

In [4]:
# Read the htsworkflow API credentials from the user's ini file, falling back
# to the system-wide one, and build the HtswApi client used by later cells.
config = configparser.ConfigParser()
config_files = [os.path.expanduser('~/.htsworkflow.ini'),
                '/etc/htsworkflow.ini'
                ]
config.read(config_files)

SECTION = 'sequence_archive'
# Without this section apiid/apikey/apihost would never be bound and the cell
# would die with an unrelated NameError, so fail with an explicit message.
if not config.has_section(SECTION):
    raise configparser.Error(
        "No [{}] section found in any of: {}".format(SECTION, ", ".join(config_files)))

apiid = config.get(SECTION, 'apiid')
apikey = config.get(SECTION, 'apikey')
apihost = config.get(SECTION, 'host')

auth = {'apiid': apiid, 'apikey': apikey }
htsw = HtswApi(apihost, auth)

In [5]:
# live server & control file
# Select the DCC server and the control spreadsheet for this submission.
server = ENCODED('www.encodeproject.org')
spreadsheet_name = Path('~/woldlab/ENCODE/stranded-25011-25032-snyder-tissue.xlsx').expanduser()
# engine is passed to every pandas.read_excel call below; None lets pandas
# choose the reader from the file type, 'odf' is for .ods files.
engine=None
#engine='odf'

# test server & datafile
# (kept as a Path so the existence check below also works for this alternative)
#server = ENCODED('test.encodedcc.org')
#spreadsheet_name = Path('~diane/woldlab/ENCODE/C1-encode3-limb-2017-testserver.ods').expanduser()

server.load_netrc()
validator = DCCValidator(server)

# Stop early, naming the path, if the control spreadsheet is missing.
assert spreadsheet_name.exists(), "spreadsheet not found: {}".format(spreadsheet_name)

In [6]:
# NOTE(review): `award` is not referenced by any other cell visible in this
# notebook, and the Biosample sheet output lists /awards/UM1HG009444/ --
# confirm which award is intended.
award = 'UM1HG009443'

# Look up biosample ontologies

Look up the biosample ontology of every biosample that already has an accession.

In [7]:
# Fill in ontology and provenance columns of the Biosample sheet from the
# records of biosamples that already have an accession.
biosample_sheet = pandas.read_excel(spreadsheet_name, sheet_name='Biosample', header=0, engine=engine)

for i, row in biosample_sheet.iterrows():
    # Rows without an accession starting with 'E' have nothing to look up.
    if pandas.isnull(row.accession) or not row.accession.startswith('E'):
        continue
    biosample = server.get_json(row.accession)
    biosample_ontology = biosample['biosample_ontology']
    # Only an embedded ontology object carries the '@id' and 'term_name' we need.
    if not isinstance(biosample_ontology, dict):
        continue
    biosample_sheet.loc[i, 'biosample_ontology'] = biosample_ontology['@id']
    biosample_sheet.loc[i, 'biosample_term_name:skip'] = biosample_ontology['term_name']
    for field in ("organism", "source", "donor", "lab", "award"):
        # Values already present in the spreadsheet win over the server record.
        if pandas.isnull(biosample_sheet.loc[i, field]):
            linked = biosample[field]
            # A linked object may be embedded (dict with '@id') or given as a
            # plain '@id' string; accept both instead of failing on the string.
            biosample_sheet.loc[i, field] = linked['@id'] if isinstance(linked, dict) else linked

biosample_sheet

Unnamed: 0,uuid,accession,library_id:skip,cDNA_sample:skip,description,biosample_ontology,biosample_term_name:skip,aliases:array,nih_institutional_certification,model_organism_age,model_organism_age_units,mouse_life_stage,model_organism_sex,organism,source,donor,lab,award
0,,ENCBS028WAL,25011,ENC4_cDNA_846,W64 pancreas,/biosample-types/tissue_UBERON_0001264/,pancreas,barbara-wold:ENC4_cDNA_846,NIC00005,,,,,/organisms/human/,/sources/yiing-lin/,michael-snyder:donor_W64,barbara-wold,/awards/UM1HG009444/
1,,ENCBS046OBK,25012,ENC4_cDNA_847,UW040 heart right ventricle,/biosample-types/tissue_UBERON_0002080/,heart right ventricle,barbara-wold:ENC4_cDNA_847,NIC00005,,,,,/organisms/human/,/sources/shin-lin/,michael-snyder:donor_UW040,barbara-wold,/awards/UM1HG009444/
2,,ENCBS189LQW,25013,ENC4_cDNA_848,UW068 heart right ventricle,/biosample-types/tissue_UBERON_0002080/,heart right ventricle,barbara-wold:ENC4_cDNA_848,NIC00005,,,,,/organisms/human/,/sources/shin-lin/,michael-snyder:donor_UW068,barbara-wold,/awards/UM1HG009444/
3,,ENCBS199BOZ,25014,ENC4_cDNA_849,UW040 heart left ventricle,/biosample-types/tissue_UBERON_0002084/,heart left ventricle,barbara-wold:ENC4_cDNA_849,NIC00005,,,,,/organisms/human/,/sources/shin-lin/,michael-snyder:donor_UW040,barbara-wold,/awards/UM1HG009444/
4,,ENCBS230VLO,25015,ENC4_cDNA_850,UW076 heart left ventricle,/biosample-types/tissue_UBERON_0002084/,heart left ventricle,barbara-wold:ENC4_cDNA_850,NIC00005,,,,,/organisms/human/,/sources/shin-lin/,michael-snyder:donor_UW076,barbara-wold,/awards/UM1HG009444/
5,,ENCBS397EAK,25016,ENC4_cDNA_851,UW038 heart right ventricle,/biosample-types/tissue_UBERON_0002080/,heart right ventricle,barbara-wold:ENC4_cDNA_851,NIC00005,,,,,/organisms/human/,/sources/shin-lin/,michael-snyder:donor_UW038,barbara-wold,/awards/UM1HG009444/
6,,ENCBS488WUX,25017,ENC4_cDNA_852,UW076 heart right ventricle,/biosample-types/tissue_UBERON_0002080/,heart right ventricle,barbara-wold:ENC4_cDNA_852,NIC00005,,,,,/organisms/human/,/sources/shin-lin/,michael-snyder:donor_UW076,barbara-wold,/awards/UM1HG009444/
7,,ENCBS531TYS,25018,ENC4_cDNA_853,UW068 heart left ventricle,/biosample-types/tissue_UBERON_0002084/,heart left ventricle,barbara-wold:ENC4_cDNA_853,NIC00005,,,,,/organisms/human/,/sources/shin-lin/,michael-snyder:donor_UW068,barbara-wold,/awards/UM1HG009444/
8,,ENCBS605FHL,25019,ENC4_cDNA_854,UW036 heart right ventricle,/biosample-types/tissue_UBERON_0002080/,heart right ventricle,barbara-wold:ENC4_cDNA_854,NIC00005,,,,,/organisms/human/,/sources/shin-lin/,michael-snyder:donor_UW036,barbara-wold,/awards/UM1HG009444/
9,,ENCBS655VDJ,25020,ENC4_cDNA_855,UW067 heart right ventricle,/biosample-types/tissue_UBERON_0002080/,heart right ventricle,barbara-wold:ENC4_cDNA_855,NIC00005,,,,,/organisms/human/,/sources/shin-lin/,michael-snyder:donor_UW067,barbara-wold,/awards/UM1HG009444/


In [None]:
# Write the Biosample sheet, including the columns filled in by the lookup
# loop, to a workbook under /dev/shm; index=False leaves out the row index.
biosample_sheet.to_excel('/dev/shm/biosamples.xlsx', index=False)

# Register Biosamples

In [8]:
# Re-read the Biosample sheet and validate it against /biosamples/.
# dry_run=True is passed, and the recorded run printed 0 created objects.
biosample = pandas.read_excel(
    spreadsheet_name,
    sheet_name='Biosample',
    header=0,
    engine=engine,
)
created = server.post_sheet(
    '/biosamples/',
    biosample,
    verbose=True,
    dry_run=True,
    validator=validator,
)
print(len(created))

0


In [None]:
# Save the Biosample frame only when the preceding post_sheet call returned
# something. This writes to the same path as the ontology-lookup export above.
# NOTE(review): presumably post_sheet records new identifiers in the frame,
# which is why it is saved here -- confirm.
if created:
    biosample.to_excel('/dev/shm/biosamples.xlsx', index=False)

# Check library average fragment size

In [9]:
# Compare each library's insert_size as reported by htsw with the
# average_fragment_size:integer column of the Library sheet.
print(spreadsheet_name)
libraries = pandas.read_excel(spreadsheet_name, sheet_name='Library', header=0, engine=engine)

fragment_size = []
for i, row in libraries.iterrows():
    library_id = row["library_id:skip"]
    library_info = htsw.get_library(library_id)
    expected = row["average_fragment_size:integer"]
    measured = library_info["insert_size"]
    if not pandas.isnull(expected):
        # The spreadsheet already has a value: it must agree with htsw.
        assert measured == expected, "{} {} {}!={}".format(i, library_id, measured, expected)
        fragment_size.append("{} pass".format(expected))
    else:
        # Nothing recorded yet: report the htsw value so it can be filled in.
        fragment_size.append(measured)

print("\n".join(map(str, fragment_size)))

/home/diane/woldlab/ENCODE/stranded-25011-25032-snyder-tissue.xlsx
240 pass
261 pass
255 pass
256 pass
259 pass
255 pass
266 pass
209 pass
248 pass
260 pass
250 pass
269 pass
240 pass
248 pass
251 pass
257 pass
246 pass
250 pass
250 pass
242 pass
235 pass
249 pass


# Register Libraries

In [18]:
# Re-read the Library sheet and validate it against /libraries/.
# dry_run=True is passed, and the recorded run printed 0 created objects.
print(spreadsheet_name)
libraries = pandas.read_excel(
    spreadsheet_name,
    sheet_name='Library',
    header=0,
    engine=engine,
)
created = server.post_sheet(
    '/libraries/',
    libraries,
    verbose=True,
    dry_run=True,
    validator=validator,
)
print(len(created))

/home/diane/woldlab/ENCODE/stranded-25011-25032-snyder-tissue.xlsx
0


In [17]:
# Save the Library frame only when the preceding post_sheet call returned
# something.
# NOTE(review): presumably post_sheet records new identifiers in the frame,
# which is why it is saved here -- confirm.
if created:
    libraries.to_excel('/dev/shm/libraries.xlsx', index=False)

# Register Experiments

In [21]:
# Validate the Experiment sheet against /experiments/.
# dry_run=True is passed, and the recorded run printed 0 created objects.
print(server.server)
experiments = pandas.read_excel(spreadsheet_name, sheet_name='Experiment', header=0, engine=engine)
# Leave out rows whose accession cell holds the 'barbara approval needed'
# placeholder. .copy() makes the filtered frame independent of the frame that
# was read, so writing into it cannot trigger pandas' SettingWithCopyWarning.
# NOTE(review): presumably post_sheet writes identifiers back into the frame
# (the next cell saves it when something was created) -- confirm.
experiments = experiments[experiments['accession'] != 'barbara approval needed'].copy()
created = server.post_sheet('/experiments/',
                            experiments,
                            verbose=True,
                            dry_run=True,
                            validator=validator)
print(len(created))

www.encodeproject.org
0


In [20]:
# Save the filtered Experiment frame only when the preceding post_sheet call
# returned something.
# NOTE(review): presumably post_sheet records new identifiers in the frame,
# which is why it is saved here -- confirm.
if created:
    experiments.to_excel('/dev/shm/experiments.xlsx', index=False)

# Register Replicates

In [24]:
# Validate the Replicate sheet against /replicates/.
# dry_run=True is passed, and the recorded run printed 0 created objects.
print(server.server)
print(spreadsheet_name)
replicates = pandas.read_excel(spreadsheet_name, sheet_name='Replicate', header=0, engine=engine)
# Leave out rows whose uuid cell holds the 'barbara approval needed'
# placeholder. .copy() makes the filtered frame independent of the frame that
# was read, so writing into it cannot trigger pandas' SettingWithCopyWarning.
# NOTE(review): presumably post_sheet writes identifiers back into the frame
# (the next cell saves it when something was created) -- confirm.
replicates = replicates[replicates['uuid'] != 'barbara approval needed'].copy()
created = server.post_sheet('/replicates/',
                            replicates,
                            verbose=True,
                            dry_run=True,
                            validator=validator)
print(len(created))

www.encodeproject.org
/home/diane/woldlab/ENCODE/stranded-25011-25032-snyder-tissue.xlsx
0


In [23]:
# Save the filtered Replicate frame only when the preceding post_sheet call
# returned something.
# NOTE(review): presumably post_sheet records new identifiers in the frame,
# which is why it is saved here -- confirm.
if created:
    replicates.to_excel('/dev/shm/replicates.xlsx', index=False)

# Check Files

In [25]:
# Validate the File sheet against /files/ with dry_run=True; the recorded run
# printed 22 for len(created).
files = pandas.read_excel(
    spreadsheet_name,
    sheet_name='File',
    header=0,
    engine=engine,
)
created = server.post_sheet(
    '/files/',
    files,
    verbose=True,
    dry_run=True,
    validator=validator,
)
print(len(created))

22


# Check NIH Institutional Certifications

In [None]:
# For every biosample that already has an accession, print the accession,
# NIH institutional certification and ontology term name of the biosample it
# is part of.
biosample_sheet = pandas.read_excel(spreadsheet_name, sheet_name='Biosample', header=0, engine=engine)

for i, row in biosample_sheet.iterrows():
    # Rows without an accession starting with 'E' have nothing to look up.
    if pandas.isnull(row.accession) or not row.accession.startswith('E'):
        continue
    biosample = server.get_json(row.accession)
    parent = biosample.get("part_of")
    # A linked object may come back as a plain '@id' string instead of an
    # embedded dict; fetch the full record in that case.
    if isinstance(parent, str):
        parent = server.get_json(parent)
    # Report biosamples without a parent instead of raising KeyError on them.
    if not parent:
        print(biosample["accession"], "no part_of biosample")
        continue
    parent_ontology = parent.get("biosample_ontology", {})
    if isinstance(parent_ontology, dict):
        parent_term = parent_ontology.get("term_name")
    else:
        # Un-embedded ontology link: show the '@id' rather than failing.
        parent_term = parent_ontology
    print(biosample["accession"], parent.get("accession"), parent.get("nih_institutional_certification"), parent_term)