Preparing to submit Wold lab stranded samples.

This got a little messy because a couple of libraries needed a top-up.

In [1]:
import os
import sys
import requests
import pandas
import paramiko
import re
import json
from IPython import display
from pathlib import Path
import configparser

In [2]:
from curation_common import *
from encoded_client.encoded import DCCValidator

In [3]:
from encoded_client.encoded import Document
from encoded_client.submission import run_aws_cp
from htsworkflow.util.api import (
    add_auth_options,
    make_auth_from_opts,
    HtswApi,
)

In [4]:
ls /etc/ssl/certs/ca-certificates.crt

/etc/ssl/certs/ca-certificates.crt


In [5]:
# Read the HTSWorkflow API credentials from the per-user ini file first,
# then from the system-wide one.
config = configparser.ConfigParser()
config.read([os.path.expanduser('~/.htsworkflow.ini'),
             '/etc/htsworkflow.ini'
             ])

SECTION = 'sequence_archive'
# Stop here with a readable message: without this guard a missing section
# leaves apiid/apikey/apihost unbound and the cell fails with a NameError below.
if not config.has_section(SECTION):
    raise RuntimeError(
        "No [{}] section found in ~/.htsworkflow.ini or /etc/htsworkflow.ini".format(SECTION)
    )

apiid = config.get(SECTION, 'apiid')
apikey = config.get(SECTION, 'apikey')
apihost = config.get(SECTION, 'host')

# Client used by later cells to look up library records (htsw.get_library).
auth = {'apiid': apiid, 'apikey': apikey }
htsw = HtswApi(apihost, auth)

In [6]:
# live server & control file
# `server` is the client for the ENCODE portal; `spreadsheet_name` is the
# workbook every later cell reads its sheets from.
server = ENCODED('www.encodeproject.org')
spreadsheet_name = Path('~/woldlab/ENCODE/stranded-24196-24219-rush-resubmit.xlsx').expanduser()
# `engine` is forwarded to every pandas.read_excel call below; None leaves the
# choice of reader to pandas. Switch to 'odf' when the control file is an .ods.
engine=None
#engine='odf'

# test server & datafile
#server = ENCODED('test.encodedcc.org')
#spreadsheet_name = os.path.expanduser('~diane/woldlab/ENCODE/C1-encode3-limb-2017-testserver.ods')

# presumably loads the portal credentials from ~/.netrc — TODO confirm
server.load_netrc()
# Validator handed to every server.post_sheet call further down.
validator = DCCValidator(server)

In [7]:
# Award identifier; the same value appears in the `award` column of the Biosample sheet.
award = 'UM1HG009443'

# Confirm biosample donor is right

In [8]:
# Cross-check each Biosample row: the Rush ID embedded in the description must
# match the donor alias, and the ENC4_cDNA id (when present) must match the
# cDNA_sample column.
print(spreadsheet_name)
biosample = pandas.read_excel(spreadsheet_name, sheet_name='Biosample', header=0, engine=engine)

for i, row in biosample.iterrows():
    library_id = row["library_id:skip"]
    cdna = row["cDNA_sample:skip"]
    description = row["description"]
    donor = row["donor"]
    match = re.search("Rush ID_E(?P<rushid>[0-9]+)", description)
    # Report a description without a Rush ID as a failed check instead of
    # crashing with AttributeError on match.group().
    assert match is not None, "{} description {!r} lacks a Rush ID".format(library_id, description)
    # The donor alias is the lab prefix followed by the numeric Rush ID.
    assert match.group("rushid") == donor[len("john-stamatoyannopoulos:E"):], "{} {} {}".format(library_id, match.group("rushid"), donor)
    match = re.search("(?P<encid>ENC4_cDNA([0-9]+))", description)
    if match is not None:
        assert match.group("encid") == cdna, "{} {} {}".format(library_id, match.group("encid"), cdna)
    else:
        # Not fatal: some descriptions carry no ENC4_cDNA id, so just list them.
        print("{} {} lacks an encind".format(i, description))
    

/home/diane/woldlab/ENCODE/stranded-24196-24219-rush-resubmit.xlsx
24 human_brain_Rush ID_E3348003_BS71 lacks an encind
25 human_brain_Rush ID_E7461192_BS73 lacks an encind


# Confirm experiment is right

In [9]:
# Every Experiment row's description must equal the library_name that
# HTSWorkflow has on record for the same library id.
print(spreadsheet_name)
experiment = pandas.read_excel(
    spreadsheet_name,
    sheet_name='Experiment',
    header=0,
    engine=engine,
)

for i, row in experiment.iterrows():
    library_id = row["library_id:skip"]
    library_info = htsw.get_library(library_id)
    recorded_name = library_info["library_name"]
    sheet_description = row["description"]
    assert recorded_name == sheet_description, "{} {} != {}".format(library_id, recorded_name, sheet_description)

/home/diane/woldlab/ENCODE/stranded-24196-24219-rush-resubmit.xlsx


In [10]:
# Show the full HTSWorkflow record for one library; the checks in this notebook
# read its `library_name` and `insert_size` fields.
htsw.get_library("24196")

{'antibody_id': None,
 'cell_line_id': None,
 'cell_line': None,
 'experiment_type': 'RNA-seq',
 'experiment_type_id': 4,
 'gel_cut_size': 357,
 'hidden': False,
 'id': '24196',
 'insert_size': 257,
 'lane_set': [{'flowcell': 'AAAWHMGM5',
   'lane_number': 1,
   'lane_id': 13803,
   'paired_end': False,
   'read_length': 100,
   'status_code': None,
   'status': ''}],
 'library_id': '24196',
 'library_name': 'human_brain_Rush ID_E3253149_BS96_ENC4_cDNA673',
 'library_species': 'Homo sapiens',
 'library_species_id': 8,
 'library_type_id': 11,
 'made_for': '',
 'made_by': 'Brian',
 'notes': 'gel cut size from BioAnalyzer',
 'replicate': None,
 'stopping_point': 'Done',
 'successful_pM': None,
 'undiluted_concentration': '7.10',
 'library_type': 'NEBNext Multiplexed'}

# Look up biosample ontologies

For biosamples that already have an accession, look up the biosample ontology and source recorded on the server.

In [11]:
# For rows that already carry an accession, copy the ontology id, term name
# and source id from the server object into the in-memory sheet.
biosample = pandas.read_excel(
    spreadsheet_name,
    sheet_name='Biosample',
    header=0,
    engine=engine,
)

for i, row in biosample.iterrows():
    # Skip rows with no accession or one that does not start with 'E'.
    if pandas.isnull(row.accession) or not row.accession.startswith('E'):
        continue
    obj = server.get_json(row.accession)
    biosample_ontology = obj['biosample_ontology']
    # The ontology fields are only copied when the server returns it as a dict.
    if isinstance(biosample_ontology, dict):
        biosample.loc[i, 'biosample_ontology'] = biosample_ontology['@id']
        biosample.loc[i, 'biosample_term_name:skip'] = biosample_ontology['term_name']
    biosample.loc[i, 'source'] = obj['source']['@id']

biosample

Unnamed: 0,uuid,accession,rush_id:skip,age:skip,age_units:skip,sex:skip,clinical_status:skip,library_id:skip,cDNA_sample:skip,description,biosample_ontology,biosample_term_name:skip,aliases:array,nih_institutional_certification,organism,source,donor,lab,award
0,3fa91232-1624-432f-8fe0-17fc63e67ee5,ENCBS190LXS,E3253149,90 or above,year,male,MCI + no CI,24196.0,ENC4_cDNA673,human_brain_Rush ID_E3253149_BS96_ENC4_cDNA673,/biosample-types/tissue_UBERON_0006483/,middle frontal area 46,barbara-wold:ENC4_cDNA673,NIC00058,/organisms/human/,/sources/rush-university/,john-stamatoyannopoulos:E3253149,barbara-wold,UM1HG009443
1,c18074ac-84c6-4ff3-9399-04a3bfaccc26,ENCBS943VMJ,E3332236,82,year,female,NCI,24197.0,ENC4_cDNA674,human_brain_Rush ID_E3332236_BS97_ENC4_cDNA674,/biosample-types/tissue_UBERON_0006483/,middle frontal area 46,barbara-wold:ENC4_cDNA674,NIC00058,/organisms/human/,/sources/rush-university/,john-stamatoyannopoulos:E3332236,barbara-wold,UM1HG009443
2,9dcc1b67-e362-431a-ba14-0670da3ff189,ENCBS129NMG,E4638217,90 or above,year,female,MCI + no CI,24198.0,ENC4_cDNA675,human_brain_Rush ID_E4638217_BS98_ENC4_cDNA675,/biosample-types/tissue_UBERON_0006483/,middle frontal area 46,barbara-wold:ENC4_cDNA675,NIC00058,/organisms/human/,/sources/rush-university/,john-stamatoyannopoulos:E4638217,barbara-wold,UM1HG009443
3,dc8f6768-e25a-4603-8072-3dae6615f4b3,ENCBS997UQN,E4646042,90 or above,year,female,MCI + no CI,24199.0,ENC4_cDNA676,human_brain_Rush ID_E4646042_BS99_ENC4_cDNA676,/biosample-types/tissue_UBERON_0006483/,middle frontal area 46,barbara-wold:ENC4_cDNA676,NIC00058,/organisms/human/,/sources/rush-university/,john-stamatoyannopoulos:E4646042,barbara-wold,UM1HG009443
4,27920e09-61e6-4cf0-852e-725fb9f2277f,ENCBS754RPU,E5448057,81,year,female,MCI + CI,24200.0,ENC4_cDNA677,human_brain_Rush ID_E5448057_BS100_ENC4_cDNA677,/biosample-types/tissue_UBERON_0006483/,middle frontal area 46,barbara-wold:ENC4_cDNA677,NIC00058,/organisms/human/,/sources/rush-university/,john-stamatoyannopoulos:E5448057,barbara-wold,UM1HG009443
5,56f77e8e-24af-4c3f-a278-cccd9eb7a3c5,ENCBS439FJF,E6090948,87,year,female,NCI,24201.0,ENC4_cDNA678,human_brain_Rush ID_E6090948_BS101_ENC4_cDNA678,/biosample-types/tissue_UBERON_0006483/,middle frontal area 46,barbara-wold:ENC4_cDNA678,NIC00058,/organisms/human/,/sources/rush-university/,john-stamatoyannopoulos:E6090948,barbara-wold,UM1HG009443
6,01a4646c-0878-4008-972d-62fe3cb97438,ENCBS885BFB,E1005516,90 or above,year,female,NINCDS PROB AD,24202.0,ENC4_cDNA679,human_brain_Rush ID_E1005516_BS117_ENC4_cDNA679,/biosample-types/tissue_UBERON_0006483/,middle frontal area 46,barbara-wold:ENC4_cDNA679,NIC00058,/organisms/human/,/sources/rush-university/,john-stamatoyannopoulos:E1005516,barbara-wold,UM1HG009443
7,fbc372f6-bc9b-4ed3-b19f-6b6fe90b62b3,ENCBS372KDQ,E1097236,85,year,female,NINCDS PROB AD,24203.0,ENC4_cDNA680,human_brain_Rush ID_E1097236_BS118_ENC4_cDNA680,/biosample-types/tissue_UBERON_0006483/,middle frontal area 46,barbara-wold:ENC4_cDNA680,NIC00058,/organisms/human/,/sources/rush-university/,john-stamatoyannopoulos:E1097236,barbara-wold,UM1HG009443
8,3b250c0b-76c9-4017-9156-734b44a3e872,ENCBS875BBD,E1121653,86,year,female,NCI,24204.0,ENC4_cDNA681,human_brain_Rush ID_E1121653_BS119_ENC4_cDNA681,/biosample-types/tissue_UBERON_0006483/,middle frontal area 46,barbara-wold:ENC4_cDNA681,NIC00058,/organisms/human/,/sources/rush-university/,john-stamatoyannopoulos:E1121653,barbara-wold,UM1HG009443
9,f2e19a85-d314-4241-8e85-c1d50f7fc7de,ENCBS137SYN,E1182310,83,year,male,NCI,24205.0,ENC4_cDNA682,human_brain_Rush ID_E1182310_BS120_ENC4_cDNA682,/biosample-types/tissue_UBERON_0006483/,middle frontal area 46,barbara-wold:ENC4_cDNA682,NIC00058,/organisms/human/,/sources/rush-university/,john-stamatoyannopoulos:E1182310,barbara-wold,UM1HG009443


In [12]:
#biosample.to_excel('/dev/shm/biosamples.xlsx', index=False)

# Lookup clinical data

In [13]:
# Re-read the Biosample sheet so the clinical columns are filled in on a fresh copy
# of the spreadsheet contents.
print(spreadsheet_name)
biosample = pandas.read_excel(spreadsheet_name, sheet_name='Biosample', header=0, engine=engine)

def add_if_needed(sheet, name, values, overwrite=True):
    """Store ``values`` in column ``name`` of ``sheet`` and return the sheet.

    Despite the name, the default is to assign unconditionally, replacing any
    existing column; that is how the notebook has always used it. Pass
    ``overwrite=False`` to leave an already-present column untouched.

    Parameters:
        sheet: the DataFrame to modify in place.
        name: column label to set.
        values: the column contents (anything accepted by ``sheet[name] = ...``).
        overwrite: when False, only add the column if it is missing.

    Returns:
        The same ``sheet`` object that was passed in.
    """
    if overwrite or name not in sheet:
        sheet[name] = values
    return sheet

# Collect one value per Biosample row from the donor object on the server.
rush_ids = []
ages = []
age_units = []
sexes = []
clinical = []
for i, row in biosample.iterrows():
    donor = server.get_json(row["donor"])
    # The donor alias is "<lab>:<rush id>"; keep the part after the colon.
    rush_ids.append(row["donor"].split(":")[1])
    # The stray trailing commas after these appends were dropped; they only
    # wrapped each None result in a throwaway tuple.
    ages.append(donor["age"])
    age_units.append(donor["age_units"])
    sexes.append(donor["sex"])
    clinical.append(donor['submitter_comment'])

add_if_needed(biosample, "rush_id:skip", rush_ids)
add_if_needed(biosample, "age:skip", ages)
add_if_needed(biosample, "age_units:skip", age_units)
add_if_needed(biosample, "sex:skip", sexes)
add_if_needed(biosample, "clinical_status:skip", clinical)

# index=False matches every other export in this notebook and keeps the
# DataFrame index out of the written sheet.
biosample.to_excel("/dev/shm/biosample.xlsx", index=False)

/home/diane/woldlab/ENCODE/stranded-24196-24219-rush-resubmit.xlsx


In [14]:
# Display the Biosample frame after the clinical columns were filled in.
biosample

Unnamed: 0,uuid,accession,rush_id:skip,age:skip,age_units:skip,sex:skip,clinical_status:skip,library_id:skip,cDNA_sample:skip,description,biosample_ontology,biosample_term_name:skip,aliases:array,nih_institutional_certification,organism,source,donor,lab,award
0,3fa91232-1624-432f-8fe0-17fc63e67ee5,ENCBS190LXS,E3253149,90 or above,year,male,MCI + no CI,24196.0,ENC4_cDNA673,human_brain_Rush ID_E3253149_BS96_ENC4_cDNA673,/biosample-types/tissue_UBERON_0006483/,mid frontal cortex,barbara-wold:ENC4_cDNA673,NIC00058,/organisms/human/,/sources/rush-university/,john-stamatoyannopoulos:E3253149,barbara-wold,UM1HG009443
1,c18074ac-84c6-4ff3-9399-04a3bfaccc26,ENCBS943VMJ,E3332236,82,year,female,NCI,24197.0,ENC4_cDNA674,human_brain_Rush ID_E3332236_BS97_ENC4_cDNA674,/biosample-types/tissue_UBERON_0006483/,mid frontal cortex,barbara-wold:ENC4_cDNA674,NIC00058,/organisms/human/,/sources/rush-university/,john-stamatoyannopoulos:E3332236,barbara-wold,UM1HG009443
2,9dcc1b67-e362-431a-ba14-0670da3ff189,ENCBS129NMG,E4638217,90 or above,year,female,MCI + no CI,24198.0,ENC4_cDNA675,human_brain_Rush ID_E4638217_BS98_ENC4_cDNA675,/biosample-types/tissue_UBERON_0006483/,mid frontal cortex,barbara-wold:ENC4_cDNA675,NIC00058,/organisms/human/,/sources/rush-university/,john-stamatoyannopoulos:E4638217,barbara-wold,UM1HG009443
3,dc8f6768-e25a-4603-8072-3dae6615f4b3,ENCBS997UQN,E4646042,90 or above,year,female,MCI + no CI,24199.0,ENC4_cDNA676,human_brain_Rush ID_E4646042_BS99_ENC4_cDNA676,/biosample-types/tissue_UBERON_0006483/,mid frontal cortex,barbara-wold:ENC4_cDNA676,NIC00058,/organisms/human/,/sources/rush-university/,john-stamatoyannopoulos:E4646042,barbara-wold,UM1HG009443
4,27920e09-61e6-4cf0-852e-725fb9f2277f,ENCBS754RPU,E5448057,81,year,female,MCI + CI,24200.0,ENC4_cDNA677,human_brain_Rush ID_E5448057_BS100_ENC4_cDNA677,/biosample-types/tissue_UBERON_0006483/,mid frontal cortex,barbara-wold:ENC4_cDNA677,NIC00058,/organisms/human/,/sources/rush-university/,john-stamatoyannopoulos:E5448057,barbara-wold,UM1HG009443
5,56f77e8e-24af-4c3f-a278-cccd9eb7a3c5,ENCBS439FJF,E6090948,87,year,female,NCI,24201.0,ENC4_cDNA678,human_brain_Rush ID_E6090948_BS101_ENC4_cDNA678,/biosample-types/tissue_UBERON_0006483/,mid frontal cortex,barbara-wold:ENC4_cDNA678,NIC00058,/organisms/human/,/sources/rush-university/,john-stamatoyannopoulos:E6090948,barbara-wold,UM1HG009443
6,01a4646c-0878-4008-972d-62fe3cb97438,ENCBS885BFB,E1005516,90 or above,year,female,NINCDS PROB AD,24202.0,ENC4_cDNA679,human_brain_Rush ID_E1005516_BS117_ENC4_cDNA679,/biosample-types/tissue_UBERON_0006483/,mid frontal cortex,barbara-wold:ENC4_cDNA679,NIC00058,/organisms/human/,/sources/rush-university/,john-stamatoyannopoulos:E1005516,barbara-wold,UM1HG009443
7,fbc372f6-bc9b-4ed3-b19f-6b6fe90b62b3,ENCBS372KDQ,E1097236,85,year,female,NINCDS PROB AD,24203.0,ENC4_cDNA680,human_brain_Rush ID_E1097236_BS118_ENC4_cDNA680,/biosample-types/tissue_UBERON_0006483/,mid frontal cortex,barbara-wold:ENC4_cDNA680,NIC00058,/organisms/human/,/sources/rush-university/,john-stamatoyannopoulos:E1097236,barbara-wold,UM1HG009443
8,3b250c0b-76c9-4017-9156-734b44a3e872,ENCBS875BBD,E1121653,86,year,female,NCI,24204.0,ENC4_cDNA681,human_brain_Rush ID_E1121653_BS119_ENC4_cDNA681,/biosample-types/tissue_UBERON_0006483/,mid frontal cortex,barbara-wold:ENC4_cDNA681,NIC00058,/organisms/human/,/sources/rush-university/,john-stamatoyannopoulos:E1121653,barbara-wold,UM1HG009443
9,f2e19a85-d314-4241-8e85-c1d50f7fc7de,ENCBS137SYN,E1182310,83,year,male,NCI,24205.0,ENC4_cDNA682,human_brain_Rush ID_E1182310_BS120_ENC4_cDNA682,/biosample-types/tissue_UBERON_0006483/,mid frontal cortex,barbara-wold:ENC4_cDNA682,NIC00058,/organisms/human/,/sources/rush-university/,john-stamatoyannopoulos:E1182310,barbara-wold,UM1HG009443


# Register Biosamples

In [15]:
# Run the Biosample sheet through server.post_sheet against /biosamples/.
# dry_run=True is passed here; the recorded run reports 0 created objects.
biosample = pandas.read_excel(
    spreadsheet_name,
    sheet_name='Biosample',
    header=0,
    engine=engine,
)
created = server.post_sheet(
    '/biosamples/',
    biosample,
    verbose=True,
    dry_run=True,
    validator=validator,
)
print(len(created))

0


In [16]:
# Write a copy of the sheet only when post_sheet returned something in `created`.
# presumably post_sheet fills the new identifiers into the frame — TODO confirm
if created:
    biosample.to_excel('/dev/shm/biosamples.xlsx', index=False)

# Retrieve library fragment size

In [17]:
# Compare each Library row's average fragment size with the insert_size that
# HTSWorkflow reports; rows with no size in the sheet take the HTSWorkflow value.
print(spreadsheet_name)
libraries = pandas.read_excel(
    spreadsheet_name,
    sheet_name='Library',
    header=0,
    engine=engine,
)

fragment_size = []
for i, row in libraries.iterrows():
    library_id = row["library_id:skip"]
    library_info = htsw.get_library(library_id)
    sheet_size = row["average_fragment_size:integer"]
    recorded_size = library_info["insert_size"]
    if pandas.isnull(sheet_size):
        # Nothing in the sheet: fall back to what HTSWorkflow has.
        fragment_size.append(recorded_size)
        continue
    assert recorded_size == sheet_size, "{} {} {}!={}".format(i, library_id, recorded_size, sheet_size)
    fragment_size.append(sheet_size)

#fragment_size

/home/diane/woldlab/ENCODE/stranded-24196-24219-rush-resubmit.xlsx


# Register Libraries

In [18]:
# Run the Library sheet through server.post_sheet against /libraries/.
# dry_run=True is passed here; the recorded run reports 0 created objects.
print(spreadsheet_name)
libraries = pandas.read_excel(
    spreadsheet_name,
    sheet_name='Library',
    header=0,
    engine=engine,
)
created = server.post_sheet(
    '/libraries/',
    libraries,
    verbose=True,
    dry_run=True,
    validator=validator,
)
print(len(created))

/home/diane/woldlab/ENCODE/stranded-24196-24219-rush-resubmit.xlsx
0


In [19]:
# Write a copy of the sheet only when post_sheet returned something in `created`.
# presumably post_sheet fills the new identifiers into the frame — TODO confirm
if created:
    libraries.to_excel('/dev/shm/libraries.xlsx', index=False)

# Register Experiments

In [20]:
# Run the Experiment sheet through server.post_sheet against /experiments/.
# dry_run=True is passed here; the recorded run reports 0 created objects.
print(server.server)
experiments = pandas.read_excel(
    spreadsheet_name,
    sheet_name='Experiment',
    header=0,
    engine=engine,
)
# Leave out rows whose accession cell still holds the approval placeholder text.
approved = experiments['accession'] != 'barbara approval needed'
experiments = experiments[approved]
created = server.post_sheet(
    '/experiments/',
    experiments,
    verbose=True,
    dry_run=True,
    validator=validator,
)
print(len(created))

www.encodeproject.org
0


In [21]:
# Write a copy of the sheet only when post_sheet returned something in `created`.
# presumably post_sheet fills the new identifiers into the frame — TODO confirm
if created:
    experiments.to_excel('/dev/shm/experiments.xlsx', index=False)

# Register Replicates

In [22]:
# Run the Replicate sheet through server.post_sheet against /replicates/.
# dry_run=True is passed here; the recorded run reports 0 created objects.
print(server.server)
print(spreadsheet_name)
replicates = pandas.read_excel(
    spreadsheet_name,
    sheet_name='Replicate',
    header=0,
    engine=engine,
)
# Leave out rows whose uuid cell still holds the approval placeholder text.
ready = replicates['uuid'] != 'barbara approval needed'
replicates = replicates[ready]
created = server.post_sheet(
    '/replicates/',
    replicates,
    verbose=True,
    dry_run=True,
    validator=validator,
)
print(len(created))

www.encodeproject.org
/home/diane/woldlab/ENCODE/stranded-24196-24219-rush-resubmit.xlsx
0


In [23]:
# Write a copy of the sheet only when post_sheet returned something in `created`.
# presumably post_sheet fills the new identifiers into the frame — TODO confirm
if created:
    replicates.to_excel('/dev/shm/replicates.xlsx', index=False)

# Check Files

In [24]:
# Run the File sheet through server.post_sheet against /files/.
# dry_run=True is passed here; the recorded run reports 0 created objects.
files = pandas.read_excel(
    spreadsheet_name,
    sheet_name='File',
    header=0,
    engine=engine,
)
created = server.post_sheet(
    '/files/',
    files,
    verbose=True,
    dry_run=True,
    validator=validator,
)
print(len(created))

0


# Fix Biosample NICs

In [33]:
# Report every accessioned biosample whose NIH institutional certification on
# the server differs from the value in the spreadsheet. The patch call itself
# stays commented out, so this cell only prints what would change.
biosample = pandas.read_excel(spreadsheet_name, sheet_name='Biosample', header=0, engine=engine)

for i, row in biosample.iterrows():
    # Only rows that already have an accession starting with 'E' can be compared.
    if pandas.isnull(row.accession) or not row.accession.startswith('E'):
        continue
    sheet_nic = row["nih_institutional_certification"]
    # A blank cell reads back as NaN, which never compares equal and would
    # produce a NaN payload; there is nothing to push for such rows.
    if pandas.isnull(sheet_nic):
        continue
    obj = server.get_json(row.accession)
    nic = obj.get("nih_institutional_certification")
    if nic != sheet_nic:
        payload = {"nih_institutional_certification": sheet_nic}
        print("Need to update {}. {} to {}".format(obj["@id"], nic, payload))
        #print(server.patch_json(obj["@id"], payload))
            
            


Need to update /biosamples/ENCBS190LXS/. None to {'nih_institutional_certification': 'NIC00058'}
Need to update /biosamples/ENCBS943VMJ/. None to {'nih_institutional_certification': 'NIC00058'}
Need to update /biosamples/ENCBS129NMG/. None to {'nih_institutional_certification': 'NIC00058'}
Need to update /biosamples/ENCBS997UQN/. None to {'nih_institutional_certification': 'NIC00058'}
Need to update /biosamples/ENCBS754RPU/. None to {'nih_institutional_certification': 'NIC00058'}
Need to update /biosamples/ENCBS439FJF/. None to {'nih_institutional_certification': 'NIC00058'}
Need to update /biosamples/ENCBS885BFB/. None to {'nih_institutional_certification': 'NIC00058'}
Need to update /biosamples/ENCBS372KDQ/. None to {'nih_institutional_certification': 'NIC00058'}
Need to update /biosamples/ENCBS875BBD/. None to {'nih_institutional_certification': 'NIC00058'}
Need to update /biosamples/ENCBS137SYN/. None to {'nih_institutional_certification': 'NIC00058'}
Need to update /biosamples/ENC