Preparing to submit Wold lab stranded samples...


In [1]:
import os
import sys
import requests
import pandas
import paramiko
import re
import json
from IPython import display
from pathlib import Path
import configparser

In [2]:
from curation_common import *
from encoded_client.encoded import DCCValidator

In [3]:
from encoded_client.encoded import Document
from encoded_client.submission import run_aws_cp
from htsworkflow.util.api import (
    add_auth_options,
    make_auth_from_opts,
    HtswApi,
)

In [4]:
# Read the HTSWorkflow API credentials from the per-user ini file and/or the
# system-wide one; ConfigParser.read silently skips files that do not exist.
config = configparser.ConfigParser()
config.read([os.path.expanduser('~/.htsworkflow.ini'),
             '/etc/htsworkflow.ini'
             ])

SECTION = 'sequence_archive'
# Without this section the credentials below would never be assigned and the
# cell would fail later with an unhelpful NameError, so fail here explicitly.
if not config.has_section(SECTION):
    raise RuntimeError(
        "No [{}] section found in ~/.htsworkflow.ini or /etc/htsworkflow.ini".format(SECTION)
    )

apiid = config.get(SECTION, 'apiid')
apikey = config.get(SECTION, 'apikey')
apihost = config.get(SECTION, 'host')

# Client used further down to look up library information by library id.
auth = {'apiid': apiid, 'apikey': apikey }
htsw = HtswApi(apihost, auth)

In [5]:
# live server & control file
# NOTE(review): ENCODED is not imported by name in the visible cells; presumably
# it is provided by the wildcard `from curation_common import *` -- confirm.
server = ENCODED('www.encodeproject.org')
spreadsheet_name = Path('~/woldlab/ENCODE/stranded-24610-24626-jax-bucket2.xlsx').expanduser()
# `engine` is forwarded to every pandas.read_excel call in this notebook;
# None lets pandas choose the reader, 'odf' is the alternative kept for .ods files.
engine=None
#engine='odf'

# test server & datafile
# NOTE(review): the commented-out path below is built with os.path.expanduser and
# is therefore a str; the .exists() assertion at the end of this cell needs a Path.
#server = ENCODED('test.encodedcc.org')
#spreadsheet_name = os.path.expanduser('~diane/woldlab/ENCODE/C1-encode3-limb-2017-testserver.ods')

# Presumably loads the credentials for `server` from the user's netrc file
# (going by the method name) -- confirm against encoded_client.
server.load_netrc()
validator = DCCValidator(server)

# Stop here if the control spreadsheet is missing, before any cell tries to read it.
assert spreadsheet_name.exists()

In [6]:
# Award identifier; no cell visible in this notebook reads this variable.
award = 'UM1HG009443'

# Lookup biosample ontologies

Look up the biosample ontology of every biosample that is already registered (has an accession) and fill in the sheet columns that are still empty.

In [7]:
biosample_sheet = pandas.read_excel(spreadsheet_name, sheet_name='Biosample', header=0, engine=engine)

# Columns that are filled from the registered biosample's "@id" values,
# but only where the spreadsheet cell is still empty.
linked_fields = ("organism", "source", "donor", "lab", "award")

for i, row in biosample_sheet.iterrows():
    # Only rows that already carry an accession starting with 'E' can be looked up.
    if pandas.isnull(row.accession) or not row.accession.startswith('E'):
        continue

    biosample = server.get_json(row.accession)
    biosample_ontology = biosample['biosample_ontology']
    # Skip records whose ontology is not an embedded object.
    if not isinstance(biosample_ontology, dict):
        continue

    biosample_sheet.loc[i, 'biosample_ontology'] = biosample_ontology['@id']
    biosample_sheet.loc[i, 'biosample_term_name:skip'] = biosample_ontology['term_name']
    for field in linked_fields:
        if pandas.isnull(biosample_sheet.loc[i, field]):
            biosample_sheet.loc[i, field] = biosample[field]["@id"]

biosample_sheet

Unnamed: 0,uuid,accession,library_id:skip,cDNA_sample:skip,description,biosample_ontology,biosample_term_name:skip,aliases:array,nih_institutional_certification,organism,source,donor,lab,award
0,,ENCBS917FHP,24610,ENC4_cDNA_809,CD4 N (RA+ MACS sort)Th0 (no diff Cytokines)(d...,/biosample-types/primary_cell_CL_0000895/,"naive thymus-derived CD4-positive, alpha-beta ...",barbara-wold:ENC4_cDNA_809,,/organisms/human/,/sources/allcells/,/human-donors/ENCDO374BBL/,/labs/yijun-ruan/,/awards/UM1HG009444/
1,,ENCBS737JKH,24611,ENC4_cDNA_810,CD4 N (RA+ MACS sort)Th0 (no diff Cytokines)(d...,/biosample-types/primary_cell_CL_0000895/,"naive thymus-derived CD4-positive, alpha-beta ...",barbara-wold:ENC4_cDNA_810,,/organisms/human/,/sources/allcells/,/human-donors/ENCDO374BBL/,/labs/yijun-ruan/,/awards/UM1HG009444/
2,,ENCBS238ZBD,24612,ENC4_cDNA_811,CD4 N (RA+ MACS sort)Th0 (no diff Cytokines)(d...,/biosample-types/primary_cell_CL_0000896/,"activated CD4-positive, alpha-beta T cell",barbara-wold:ENC4_cDNA_811,,/organisms/human/,/sources/allcells/,/human-donors/ENCDO374BBL/,/labs/yijun-ruan/,/awards/UM1HG009444/
3,,ENCBS242UJR,24613,ENC4_cDNA_812,CD4 N (RA+ MACS sort)Th0 (no diff Cytokines)(d...,/biosample-types/primary_cell_CL_0000896/,"activated CD4-positive, alpha-beta T cell",barbara-wold:ENC4_cDNA_812,,/organisms/human/,/sources/allcells/,/human-donors/ENCDO374BBL/,/labs/yijun-ruan/,/awards/UM1HG009444/
4,,ENCBS995NLO,24614,ENC4_cDNA_813,CD4 N (RA+ MACS sort)Th1 (aIL-4 1ug/ml+IL-12 3...,/biosample-types/primary_cell_CL_0000545/,T-helper 1 cell,barbara-wold:ENC4_cDNA_813,,/organisms/human/,/sources/allcells/,/human-donors/ENCDO374BBL/,/labs/yijun-ruan/,/awards/UM1HG009444/
5,,ENCBS941XOI,24615,ENC4_cDNA_814,CD4 N (RA+ MACS sort)Th1 (aIL-4 1ug/ml+IL-12 3...,/biosample-types/primary_cell_CL_0000545/,T-helper 1 cell,barbara-wold:ENC4_cDNA_814,,/organisms/human/,/sources/allcells/,/human-donors/ENCDO374BBL/,/labs/yijun-ruan/,/awards/UM1HG009444/
6,,ENCBS942GGC,24616,ENC4_cDNA_815,CD4 N (RA+ MACS sort)Th1 (aIL-4 1ug/ml+IL-12 3...,/biosample-types/primary_cell_NTR_0000633/,activated T-helper 1 cell,barbara-wold:ENC4_cDNA_815,,/organisms/human/,/sources/allcells/,/human-donors/ENCDO374BBL/,/labs/yijun-ruan/,/awards/UM1HG009444/
7,,ENCBS035SRN,24617,ENC4_cDNA_816,CD4 N (RA+ MACS sort)Th1 (aIL-4 1ug/ml+IL-12 3...,/biosample-types/primary_cell_NTR_0000633/,activated T-helper 1 cell,barbara-wold:ENC4_cDNA_816,,/organisms/human/,/sources/allcells/,/human-donors/ENCDO374BBL/,/labs/yijun-ruan/,/awards/UM1HG009444/
8,,ENCBS983OVZ,24618,ENC4_cDNA_817,CD4 N (RA+ MACS sort)Th2 (aIFNg 5ug/ml+IL-4 10...,/biosample-types/primary_cell_CL_0000546/,T-helper 2 cell,barbara-wold:ENC4_cDNA_817,,/organisms/human/,/sources/allcells/,/human-donors/ENCDO374BBL/,/labs/yijun-ruan/,/awards/UM1HG009444/
9,,ENCBS727FRT,24619,ENC4_cDNA_818,CD4 N (RA+ MACS sort)Th2 (aIFNg 5ug/ml+IL-4 10...,/biosample-types/primary_cell_CL_0000546/,T-helper 2 cell,barbara-wold:ENC4_cDNA_818,,/organisms/human/,/sources/allcells/,/human-donors/ENCDO374BBL/,/labs/yijun-ruan/,/awards/UM1HG009444/


In [8]:
# Write the biosample sheet to /dev/shm; index=False leaves the DataFrame index out of the file.
biosample_sheet.to_excel('/dev/shm/biosamples.xlsx', index=False)

# Generate experiment descriptions

In [9]:
# Re-read the Biosample sheet from the control spreadsheet. This replaces the
# in-memory copy that the ontology lookup cell modified, so the
# "biosample_ontology" and "biosample_term_name:skip" columns used below must
# already be filled in within the spreadsheet file itself.
biosample_sheet = pandas.read_excel(spreadsheet_name, sheet_name='Biosample', header=0, engine=engine)

def uniquify_pandas(rows):
    """Collapse a lookup result to the single value it stands for.

    A plain string is returned unchanged. A pandas.Series must hold exactly
    one distinct value, and that value is returned.

    Raises:
        AssertionError: the Series does not contain exactly one distinct value.
        RuntimeError: ``rows`` is neither a str nor a pandas.Series.
    """
    if isinstance(rows, pandas.Series):
        distinct = set(rows)
        assert len(distinct) == 1, "Too many values {}".format(distinct)
        return distinct.pop()

    if isinstance(rows, str):
        return rows

    raise RuntimeError("Unexpected type")

# Map each experiment description to the biosample ontology shared by its replicates.
experiments = {}

# Index by description once; rebuilding this index for every row is loop-invariant work.
biosamples_by_description = biosample_sheet.set_index("description")

for i, row in biosample_sheet.iterrows():
    # Strip the replicate suffix ("_rep_<n>", optionally followed by
    # "_ENC4_cDNA_<n>") so all replicates collapse onto one experiment description.
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    desired = re.sub(r"_rep_[\d]+(_ENC4_cDNA_[\d]+)?", "", row.description)

    # .loc returns a scalar for a unique description and a Series when several
    # rows share it; uniquify_pandas reduces either form to a single value.
    experiment_ontology = uniquify_pandas(
        biosamples_by_description.loc[row.description, "biosample_ontology"])
    experiment_ontology_term = uniquify_pandas(
        biosamples_by_description.loc[row.description, "biosample_term_name:skip"])
    experiments[desired] = {
        "biosample_ontology": experiment_ontology,
        "biosample_ontology_term_name:skip": experiment_ontology_term
    }

# One row per experiment description, one column per collected field.
experiment_metadata = pandas.DataFrame(experiments).T

experiment_metadata.to_excel("/dev/shm/experiment_metadata.xlsx")
experiment_metadata


Unnamed: 0,biosample_ontology,biosample_ontology_term_name:skip
CD4 N (RA+ MACS sort)Th0 (no diff Cytokines)(day 14),/biosample-types/primary_cell_CL_0000895/,"naive thymus-derived CD4-positive, alpha-beta ..."
CD4 N (RA+ MACS sort)Th0 (no diff Cytokines)(day 13)+beads (1:3) day 1,/biosample-types/primary_cell_CL_0000896/,"activated CD4-positive, alpha-beta T cell"
CD4 N (RA+ MACS sort)Th1 (aIL-4 1ug/ml+IL-12 30ng/ml)(day 14),/biosample-types/primary_cell_CL_0000545/,T-helper 1 cell
CD4 N (RA+ MACS sort)Th1 (aIL-4 1ug/ml+IL-12 30ng/ml)(day 13)+beads (1:3) day 1,/biosample-types/primary_cell_NTR_0000633/,activated T-helper 1 cell
CD4 N (RA+ MACS sort)Th2 (aIFNg 5ug/ml+IL-4 100ng/ml)(day 14),/biosample-types/primary_cell_CL_0000546/,T-helper 2 cell
CD4 N (RA+ MACS sort)Th2 (aIFNg 5ug/ml+IL-4 100ng/ml)(day 13)+beads (1:3) day 1,/biosample-types/primary_cell_NTR_0000634/,activated T-helper 2 cell
PBMC (NK MACS sort)NK,/biosample-types/primary_cell_CL_0000623/,natural killer cell
PBMCCD19+ (B cells) resting,/biosample-types/primary_cell_CL_0000236/,B cell
PBMCCD19+ (B cells) activatedCPG ODN2006 24h,/biosample-types/primary_cell_NTR_0000506/,activated B cell


# Register Biosamples

In [10]:
# Run the Biosample sheet through post_sheet for the /biosamples/ collection.
# dry_run=True here; presumably nothing is actually posted in that mode --
# confirm against encoded_client's post_sheet.
biosample = pandas.read_excel(spreadsheet_name, sheet_name='Biosample', header=0, engine=engine)
created = server.post_sheet('/biosamples/', biosample, 
                            verbose=True, 
                            dry_run=True,
                            validator=validator)
# Report how many entries post_sheet returned.
print(len(created))

/labs/yijun-ruan/ is not a lab in user Diane Trout can submits_for
/labs/yijun-ruan/ is not a lab in user Diane Trout can submits_for
/labs/yijun-ruan/ is not a lab in user Diane Trout can submits_for
/labs/yijun-ruan/ is not a lab in user Diane Trout can submits_for
/labs/yijun-ruan/ is not a lab in user Diane Trout can submits_for
/labs/yijun-ruan/ is not a lab in user Diane Trout can submits_for
/labs/yijun-ruan/ is not a lab in user Diane Trout can submits_for
/labs/yijun-ruan/ is not a lab in user Diane Trout can submits_for
/labs/yijun-ruan/ is not a lab in user Diane Trout can submits_for
/labs/yijun-ruan/ is not a lab in user Diane Trout can submits_for
/labs/yijun-ruan/ is not a lab in user Diane Trout can submits_for
/labs/yijun-ruan/ is not a lab in user Diane Trout can submits_for
/labs/yijun-ruan/ is not a lab in user Diane Trout can submits_for
/labs/yijun-ruan/ is not a lab in user Diane Trout can submits_for
/labs/yijun-ruan/ is not a lab in user Diane Trout can submits

0


In [11]:
# Only write the sheet out when post_sheet returned something.
# Presumably post_sheet records the new identifiers in the DataFrame -- TODO confirm.
if created:
    biosample.to_excel('/dev/shm/biosamples.xlsx', index=False)

# Check library average fragment size

In [12]:
print(spreadsheet_name)
libraries = pandas.read_excel(spreadsheet_name, sheet_name='Library', header=0, engine=engine)

# One report entry per library: the insert size known to htsw when the sheet
# has no value yet, otherwise "<size> pass" once both sources agree.
fragment_size = []
for i, row in libraries.iterrows():
    library_id = row["library_id:skip"]
    library_info = htsw.get_library(library_id)
    sheet_size = row["average_fragment_size:integer"]
    if not pandas.isnull(sheet_size):
        # The spreadsheet already has a value; it must match the htsw record.
        assert library_info["insert_size"] == sheet_size, "{} {} {}!={}".format(i, library_id, library_info["insert_size"], sheet_size)
        fragment_size.append("{} pass".format(sheet_size))
    else:
        fragment_size.append(library_info["insert_size"])

print("\n".join(map(str, fragment_size)))

/home/diane/woldlab/ENCODE/stranded-24610-24626-jax-bucket2.xlsx
286 pass
286 pass
274 pass
281 pass
290 pass
231 pass
279 pass
235 pass
224 pass
277 pass
279 pass
278 pass
281 pass
276 pass
281 pass
280 pass
283 pass


# Register Libraries

In [13]:
# Run the Library sheet through post_sheet for the /libraries/ collection.
# dry_run=True here; presumably nothing is actually posted in that mode -- confirm.
print(spreadsheet_name)
libraries = pandas.read_excel(spreadsheet_name, sheet_name='Library', header=0, engine=engine)
created = server.post_sheet('/libraries/', 
                            libraries,
                            verbose=True, 
                            dry_run=True, 
                            validator=validator)
# Report how many entries post_sheet returned.
print(len(created))

/home/diane/woldlab/ENCODE/stranded-24610-24626-jax-bucket2.xlsx
0


In [14]:
# Only write the Library sheet out when post_sheet returned something.
if created:
    libraries.to_excel('/dev/shm/libraries.xlsx', index=False)

# Register Experiments

In [15]:
# Run the Experiment sheet through post_sheet for the /experiments/ collection.
print(server.server)
experiments = pandas.read_excel(spreadsheet_name, sheet_name='Experiment', header=0, engine=engine)
# Leave out rows whose accession column still holds the approval placeholder text.
experiments = experiments[experiments['accession'] != 'barbara approval needed']
# dry_run=True here; presumably nothing is actually posted in that mode -- confirm.
created = server.post_sheet('/experiments/', 
                            experiments, 
                            verbose=True, 
                            dry_run=True, 
                            validator=validator)
# Report how many entries post_sheet returned.
print(len(created))

www.encodeproject.org
0


In [16]:
# Only write the Experiment sheet out when post_sheet returned something.
if created:
    experiments.to_excel('/dev/shm/experiments.xlsx', index=False)

# Verify replicate experiment ids

In [17]:
# Cross-check that each replicate row points at the experiment it claims to
# belong to, by comparing the description copied onto the Replicate sheet
# with the description of the referenced experiment.
replicates = pandas.read_excel(spreadsheet_name, sheet_name='Replicate', header=0, engine=engine)
experiments = pandas.read_excel(spreadsheet_name, sheet_name='Experiment', header=0, engine=engine)

# Index by accession once; rebuilding it for every replicate is loop-invariant work.
experiments_by_accession = experiments.set_index("accession")

for i, replicate in replicates.iterrows():
    description = experiments_by_accession.loc[replicate.experiment, "description"]
    if replicate["description:skip"] == description:
        print("{} matches".format(replicate.experiment))
    else:
        print("{} differs: '{}' '{}'".format(replicate.experiment, replicate["description:skip"], description))


ENCSR317HKT matches
ENCSR317HKT matches
ENCSR052SDT matches
ENCSR052SDT matches
ENCSR743TJZ matches
ENCSR743TJZ matches
ENCSR475KPG matches
ENCSR475KPG matches
ENCSR588TIV matches
ENCSR588TIV matches
ENCSR341VFG matches
ENCSR341VFG matches
ENCSR927KSI matches
ENCSR896YYL matches
ENCSR896YYL matches
ENCSR398REC matches
ENCSR398REC matches


# Register Replicates

In [18]:
# Run the Replicate sheet through post_sheet for the /replicates/ collection.
print(server.server)
print(spreadsheet_name)
replicates = pandas.read_excel(spreadsheet_name, sheet_name='Replicate', header=0, engine=engine)
# Leave out rows whose uuid column still holds the approval placeholder text.
replicates = replicates[replicates['uuid'] != 'barbara approval needed']
# NOTE(review): this is the only post_sheet call in the notebook with
# dry_run=False, and `server` is configured for www.encodeproject.org.
# Presumably re-running this cell performs a real submission -- confirm that
# is intended before using Run All.
created = server.post_sheet('/replicates/',
                            replicates, 
                            verbose=True, 
                            dry_run=False, 
                            validator=validator)
# Report how many entries post_sheet returned.
print(len(created))

www.encodeproject.org
/home/diane/woldlab/ENCODE/stranded-24610-24626-jax-bucket2.xlsx
0


In [19]:
# Only write the Replicate sheet out when post_sheet returned something.
if created:
    replicates.to_excel('/dev/shm/replicates.xlsx', index=False)

# Check Files

In [20]:
# Run the File sheet through post_sheet for the /files/ collection.
# dry_run=True here; presumably this only checks the rows without posting -- confirm.
files = pandas.read_excel(spreadsheet_name, sheet_name='File', header=0, engine=engine)
created = server.post_sheet('/files/', files, verbose=True, dry_run=True, validator=validator)
# Report how many entries post_sheet returned.
print(len(created))

0


# Check NIH Institutional Certifications

In [34]:
# For every registered biosample, print its accession next to the accession,
# NIH institutional certification and ontology term name of its parent
# ("part_of") biosample.
biosample_sheet = pandas.read_excel(spreadsheet_name, sheet_name='Biosample', header=0, engine=engine)

for i, row in biosample_sheet.iterrows():
    if not pandas.isnull(row.accession) and row.accession.startswith('E'):
        biosample = server.get_json(row.accession)
        # A biosample may have no parent, and a parent may lack any of these
        # fields; use .get throughout so such rows print None instead of
        # aborting the whole report with a KeyError.
        parent = biosample.get("part_of", {})
        parent_ontology = parent.get("biosample_ontology", {})
        print(
            biosample["accession"],
            parent.get("accession"),
            parent.get("nih_institutional_certification"),
            parent_ontology.get("term_name"),
        )

ENCBS917FHP ENCBS346ALM NIC00004 naive thymus-derived CD4-positive, alpha-beta T cell
ENCBS737JKH ENCBS346ALM NIC00004 naive thymus-derived CD4-positive, alpha-beta T cell
ENCBS238ZBD ENCBS862RAU NIC00004 activated CD4-positive, alpha-beta T cell
ENCBS242UJR ENCBS862RAU NIC00004 activated CD4-positive, alpha-beta T cell
ENCBS995NLO ENCBS171QXJ NIC00004 T-helper 1 cell
ENCBS941XOI ENCBS171QXJ NIC00004 T-helper 1 cell
ENCBS942GGC ENCBS992HUL NIC00004 activated T-helper 1 cell
ENCBS035SRN ENCBS992HUL NIC00004 activated T-helper 1 cell
ENCBS983OVZ ENCBS069KZY NIC00004 T-helper 2 cell
ENCBS727FRT ENCBS069KZY NIC00004 T-helper 2 cell
ENCBS309SJI ENCBS148CKG NIC00004 activated T-helper 2 cell
ENCBS696MEH ENCBS148CKG NIC00004 activated T-helper 2 cell
ENCBS190MRQ ENCBS208JXH NIC00004 natural killer cell
ENCBS531BSW ENCBS570COZ NIC00004 B cell
ENCBS533MTK ENCBS570COZ NIC00004 B cell
ENCBS323XPY ENCBS099RDO NIC00004 activated B cell
ENCBS430TYQ ENCBS099RDO NIC00004 activated B cell


In [28]:
# List the top-level keys of `biosample`.
# NOTE(review): `biosample` is not assigned in this cell; it is whatever an
# earlier-run cell left in the kernel (the last record fetched by the
# certification loop, or the DataFrame from the Register Biosamples cell), and
# this cell's execution count is lower than the preceding cell's. Confirm it
# still behaves as intended after Restart & Run All.
sorted(biosample.keys())

['@context',
 '@id',
 '@type',
 'accession',
 'age',
 'age_display',
 'age_units',
 'aliases',
 'alternate_accessions',
 'applied_modifications',
 'audit',
 'award',
 'biosample_ontology',
 'characterizations',
 'date_created',
 'dbxrefs',
 'documents',
 'donor',
 'genetic_modifications',
 'internal_tags',
 'lab',
 'life_stage',
 'organism',
 'origin_batch',
 'parent_of',
 'part_of',
 'perturbed',
 'references',
 'schema_version',
 'sex',
 'simple_summary',
 'source',
 'starting_amount',
 'starting_amount_units',
 'status',
 'submitted_by',
 'summary',
 'treatments',
 'uuid']