Preparing to submit Wold stranded samples.

This got a little messy because a couple of libraries needed a top-up.

In [None]:
import os
import sys
import requests
import pandas
import paramiko
import re
import json
from IPython import display
from pathlib import Path
import configparser

In [None]:
from curation_common import *
from encoded_client.encoded import DCCValidator

In [None]:
from encoded_client.encoded import Document
from encoded_client.submission import run_aws_cp
from htsworkflow.util.api import (
    add_auth_options,
    make_auth_from_opts,
    HtswApi,
)

In [None]:
ls /etc/ssl/certs/ca-certificates.crt

In [None]:
# Load the htsworkflow API credentials.  ConfigParser.read() silently skips
# files that do not exist and reads the rest in order, so values found in
# /etc/htsworkflow.ini override those from the per-user file.
config = configparser.ConfigParser()
config.read([os.path.expanduser('~/.htsworkflow.ini'),
             '/etc/htsworkflow.ini'
             ])

SECTION = 'sequence_archive'
if not config.has_section(SECTION):
    # Without this check the credential names below would simply never be
    # bound and the cell would die later with an unhelpful NameError.
    raise RuntimeError(
        "No [{}] section found in ~/.htsworkflow.ini or /etc/htsworkflow.ini".format(SECTION)
    )

apiid = config.get(SECTION, 'apiid')
apikey = config.get(SECTION, 'apikey')
apihost = config.get(SECTION, 'host')

# Client used by later cells to look up library metadata.
auth = {'apiid': apiid, 'apikey': apikey}
htsw = HtswApi(apihost, auth)

In [None]:
# --- Submission target -------------------------------------------------
# Live server.  To rehearse against the test server, swap in:
#   server = ENCODED('test.encodedcc.org')
server = ENCODED('www.encodeproject.org')
server.load_netrc()
validator = DCCValidator(server)

# --- Control spreadsheet -----------------------------------------------
# Test-server data file, kept for reference:
#   spreadsheet_name = os.path.expanduser('~diane/woldlab/ENCODE/C1-encode3-limb-2017-testserver.ods')
spreadsheet_name = Path(
    '~/woldlab/ENCODE/stranded-24388-24412-rush-resubmit.xlsx'
).expanduser()

# Passed straight through to pandas.read_excel; None lets pandas choose.
# Set to 'odf' when reading an .ods control file.
engine = None

In [None]:
# NOTE(review): presumably the award identifier this submission belongs to;
# nothing in the visible cells reads it — confirm whether it is still needed.
award = 'UM1HG009443'

# Confirm biosample donor is right

In [None]:
# Cross-check each Biosample row: the identifiers embedded in the free-text
# description must agree with the structured donor and cDNA columns.
print(spreadsheet_name)
biosample = pandas.read_excel(spreadsheet_name, sheet_name='Biosample', header=0, engine=engine)

for i, row in biosample.iterrows():
    library_id = row["library_id:skip"]
    cdna = row["cDNA_sample:skip"]
    description = row["description"]
    donor = row["donor"]

    # The Rush ID is mandatory.  Fail with a message that names the row
    # rather than an AttributeError on None when the pattern is absent.
    match = re.search(r"Rush ID_E(?P<rushid>[0-9]+)", description)
    assert match is not None, "{} description lacks a Rush ID: {}".format(library_id, description)
    # Compare against the donor alias with its fixed-length prefix cut off.
    assert match.group("rushid") == donor[len("john-stamatoyannopoulos:E"):], "{} {} {}".format(library_id, match.group("rushid"), donor)

    # The cDNA identifier is optional; a missing one is reported, not fatal.
    match = re.search(r"(?P<encid>ENC4_cDNA([0-9]+))", description)
    if match is not None:
        assert match.group("encid") == cdna, "{} {} {}".format(library_id, match.group("encid"), cdna)
    else:
        print("{} {} lacks an encind".format(i, row.description))
    

# Confirm experiment is right

In [None]:
# Verify that every Experiment row's description equals the library name
# that htsworkflow reports for the same library id.
print(spreadsheet_name)
experiment = pandas.read_excel(
    spreadsheet_name,
    sheet_name='Experiment',
    header=0,
    engine=engine,
)

for i, row in experiment.iterrows():
    library_id = row["library_id:skip"]
    library_info = htsw.get_library(library_id)
    names_agree = library_info["library_name"] == row["description"]
    assert names_agree, "{} {} != {}".format(
        library_id, library_info["library_name"], row["description"]
    )

# Lookup clinical data

In [None]:
# Start from a fresh copy of the Biosample sheet.
print(spreadsheet_name)
biosample = pandas.read_excel(
    spreadsheet_name,
    sheet_name='Biosample',
    header=0,
    engine=engine,
)

def add_if_needed(sheet, name, values):
    """Store *values* in *sheet* under the key *name* and return *sheet*.

    The sheet is modified in place; the return value is the same object.

    NOTE(review): despite the name, the assignment is unconditional, so an
    existing column called *name* is overwritten — confirm that is intended.
    """
    sheet[name] = values
    return sheet

# Fetch each row's donor record from the server and gather the fields that
# are attached to the sheet as extra columns below.
rush_ids = []
ages = []
age_units = []
sexes = []
clinical = []
for i, row in biosample.iterrows():
    donor = server.get_json(row["donor"])
    # Keep the piece of the donor alias that follows the first ':'.
    rush_ids.append(row["donor"].split(":")[1])
    ages.append(donor["age"])
    age_units.append(donor["age_units"])
    sexes.append(donor["sex"])
    clinical.append(donor['submitter_comment'])

add_if_needed(biosample, "rush_id:skip", rush_ids)
add_if_needed(biosample, "age:skip", ages)
add_if_needed(biosample, "age_units:skip", age_units)
add_if_needed(biosample, "sex:skip", sexes)
add_if_needed(biosample, "clinical_status:skip", clinical)

# Save the annotated sheet, then show the first rows as the cell output.
biosample.to_excel("/dev/shm/biosample.xlsx")
biosample.head()

# Lookup biosample ontologies

Look up any biosample ontologies that are already present.

In [None]:
# For rows that already carry an accession beginning with 'E', copy the
# ontology and source recorded on the server back into the sheet.
biosample = pandas.read_excel(
    spreadsheet_name,
    sheet_name='Biosample',
    header=0,
    engine=engine,
)

for i, row in biosample.iterrows():
    # Skip rows with no accession, or one that does not start with 'E'.
    if pandas.isnull(row.accession) or not row.accession.startswith('E'):
        continue

    obj = server.get_json(row.accession)
    biosample_ontology = obj['biosample_ontology']
    # Only an embedded (dict) ontology exposes '@id' and 'term_name'.
    if isinstance(biosample_ontology, dict):
        biosample.loc[i, 'biosample_ontology'] = biosample_ontology['@id']
        biosample.loc[i, 'biosample_term_name:skip'] = biosample_ontology['term_name']
    biosample.loc[i, 'source'] = obj['source']['@id']

biosample

In [None]:
#biosample.to_excel('/dev/shm/biosamples.xlsx', index=False)

# Register Biosamples

In [None]:
# Submit the Biosample sheet to /biosamples/ and report how many entries
# came back in `created`.
# NOTE(review): dry_run=True presumably validates without posting — flip it
# once the output looks right; confirm against encoded_client.
biosample = pandas.read_excel(
    spreadsheet_name,
    sheet_name='Biosample',
    header=0,
    engine=engine,
)
created = server.post_sheet(
    '/biosamples/',
    biosample,
    verbose=True,
    dry_run=True,
    validator=validator,
)
print(len(created))

In [None]:
# Snapshot the Biosample sheet (without the index column) only when the
# preceding post left something in `created`.
# NOTE(review): presumably post_sheet writes new identifiers back into the
# sheet, which is why it is saved here — confirm against encoded_client.
if created:
    biosample.to_excel('/dev/shm/biosamples.xlsx', index=False)

# Retrieve library starting amount

In [None]:
# Reconcile the Library sheet's average fragment size with the insert size
# htsworkflow has on record: blanks take the htsworkflow value, filled-in
# cells must match it exactly.
print(spreadsheet_name)
libraries = pandas.read_excel(
    spreadsheet_name,
    sheet_name='Library',
    header=0,
    engine=engine,
)

fragment_size = []
for i, row in libraries.iterrows():
    library_id = row["library_id:skip"]
    library_info = htsw.get_library(library_id)
    if not pandas.isnull(row["average_fragment_size:integer"]):
        assert library_info["insert_size"] == row["average_fragment_size:integer"], "{} {} {}!={}".format(
            i, library_id, library_info["insert_size"], row["average_fragment_size:integer"]
        )
        fragment_size.append(row["average_fragment_size:integer"])
    else:
        fragment_size.append(library_info["insert_size"])
    
#fragment_size

# Register Libraries

In [None]:
# Submit the Library sheet to /libraries/ and report how many entries came
# back in `created`.
# NOTE(review): dry_run=True presumably validates without posting — confirm
# against encoded_client.
print(spreadsheet_name)
libraries = pandas.read_excel(
    spreadsheet_name,
    sheet_name='Library',
    header=0,
    engine=engine,
)
created = server.post_sheet(
    '/libraries/',
    libraries,
    verbose=True,
    dry_run=True,
    validator=validator,
)
print(len(created))

In [None]:
# Snapshot the Library sheet (without the index column) only when the
# preceding post left something in `created`.
if created:
    libraries.to_excel('/dev/shm/libraries.xlsx', index=False)

# Register Experiments

In [None]:
# Submit the Experiment sheet to /experiments/, leaving out rows whose
# accession cell still holds the approval placeholder.
print(server.server)
experiments = pandas.read_excel(
    spreadsheet_name,
    sheet_name='Experiment',
    header=0,
    engine=engine,
)
experiments = experiments.loc[experiments['accession'].ne('barbara approval needed')]
created = server.post_sheet(
    '/experiments/',
    experiments,
    verbose=True,
    dry_run=True,
    validator=validator,
)
print(len(created))

In [None]:
# Snapshot the filtered Experiment sheet (without the index column) only
# when the preceding post left something in `created`.
if created:
    experiments.to_excel('/dev/shm/experiments.xlsx', index=False)

# Register Replicates

In [None]:
# Submit the Replicate sheet to /replicates/, leaving out rows whose uuid
# cell still holds the approval placeholder.
print(server.server)
print(spreadsheet_name)
replicates = pandas.read_excel(
    spreadsheet_name,
    sheet_name='Replicate',
    header=0,
    engine=engine,
)
replicates = replicates.loc[replicates['uuid'].ne('barbara approval needed')]
created = server.post_sheet(
    '/replicates/',
    replicates,
    verbose=True,
    dry_run=True,
    validator=validator,
)
print(len(created))

In [None]:
# Snapshot the filtered Replicate sheet (without the index column) only
# when the preceding post left something in `created`.
if created:
    replicates.to_excel('/dev/shm/replicates.xlsx', index=False)

# Check Files

In [None]:
# Run the File sheet through /files/ and report how many entries came back
# in `created`.
# NOTE(review): dry_run=True presumably validates without posting — confirm
# against encoded_client.
files = pandas.read_excel(
    spreadsheet_name,
    sheet_name='File',
    header=0,
    engine=engine,
)
created = server.post_sheet(
    '/files/',
    files,
    verbose=True,
    dry_run=True,
    validator=validator,
)
print(len(created))