Preparing to submit Wold lab stranded samples.


In [1]:
import os
import sys
import requests
import pandas
import paramiko
import re
import json
from IPython import display
from pathlib import Path
import configparser

In [2]:
from curation_common import *
from encoded_client.encoded import DCCValidator

In [3]:
from encoded_client.encoded import Document
from encoded_client.submission import run_aws_cp
from htsworkflow.util.api import (
    add_auth_options,
    make_auth_from_opts,
    HtswApi,
)

In [4]:
# Read the htsworkflow API credentials from the per-user ini file, falling back
# to the system-wide one. Files that do not exist are silently skipped by
# ConfigParser.read().
config = configparser.ConfigParser()
config.read([os.path.expanduser('~/.htsworkflow.ini'),
             '/etc/htsworkflow.ini'
             ])

SECTION = 'sequence_archive'
# Fail here with an explicit message; otherwise the missing section only shows
# up as a NameError on apiid/apikey/apihost a few lines further down.
if not config.has_section(SECTION):
    raise RuntimeError(
        "No [{}] section found in ~/.htsworkflow.ini or /etc/htsworkflow.ini".format(SECTION)
    )

apiid = config.get(SECTION, 'apiid')
apikey = config.get(SECTION, 'apikey')
apihost = config.get(SECTION, 'host')

# Client used by later cells to look up library records (htsw.get_library).
auth = {'apiid': apiid, 'apikey': apikey }
htsw = HtswApi(apihost, auth)

In [5]:
# live server & control file
# ENCODED is not imported by name in this notebook; presumably it is provided
# by the star import of curation_common — confirm.
server = ENCODED('www.encodeproject.org')
spreadsheet_name = Path('~/woldlab/ENCODE/stranded-25067-25098-erez-degron.xlsx').expanduser()
# Passed as the `engine` argument to every pandas.read_excel call below.
engine=None
#engine='odf'

# test server & datafile
#server = ENCODED('test.encodedcc.org')
#spreadsheet_name = os.path.expanduser('~diane/woldlab/ENCODE/C1-encode3-limb-2017-testserver.ods')

# load_netrc() presumably reads the credentials for the selected server from
# the user's netrc file — confirm. The validator is handed to every
# post_sheet call below.
server.load_netrc()
validator = DCCValidator(server)

# Stop early if the control spreadsheet is missing.
assert spreadsheet_name.exists()

In [6]:
# NOTE(review): `award` is not referenced by any later cell visible here, and
# the Biosample sheet output shows /awards/UM1HG009444/ rather than
# UM1HG009443 — confirm which award is intended.
award = 'UM1HG009443'

# Lookup biosample ontologies

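Look up the biosample ontology for every biosample that already has an accession, and fill in any empty organism, source, donor, lab, and award cells from the server record.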

In [7]:
# Re-read the Biosample sheet and back-fill ontology details from the server
# for every row that already carries an accession.
biosample_sheet = pandas.read_excel(
    spreadsheet_name,
    sheet_name='Biosample',
    header=0,
    engine=engine,
)

for i, row in biosample_sheet.iterrows():
    # Rows without an accession starting with 'E' have nothing to look up yet.
    if pandas.isnull(row.accession) or not row.accession.startswith('E'):
        continue

    biosample = server.get_json(row.accession)
    biosample_ontology = biosample['biosample_ontology']
    # Only an embedded (dict) ontology carries the fields copied below.
    if not isinstance(biosample_ontology, dict):
        continue

    biosample_sheet.loc[i, 'biosample_ontology'] = biosample_ontology['@id']
    biosample_sheet.loc[i, 'biosample_term_name:skip'] = biosample_ontology['term_name']

    # Copy the server's value only into cells the sheet left empty.
    for column, key in [("organism","@id"), ("source","@id"), ("donor","@id"), ("lab","@id"), ("award", "@id")]:
        if pandas.isnull(biosample_sheet.loc[i, column]):
            biosample_sheet.loc[i, column] = biosample[column][key]

biosample_sheet

Unnamed: 0,uuid,accession,library_id:skip,cDNA_sample:skip,description,biosample_ontology,biosample_term_name:skip,aliases:array,nih_institutional_certification,model_organism_age,model_organism_age_units,mouse_life_stage,model_organism_sex,organism,source,donor,lab,award
0,,ENCBS116FGF,25067,ENC4_cDNA_869,HCT116-BRD4-clone1-treated-new_bulk_1-1F,/biosample-types/cell_line_EFO_0002824/,HCT116,barbara-wold:ENC4_cDNA_869,,,,,,/organisms/human/,/sources/masato-kanemaki/,ENCDO000ABE,barbara-wold,/awards/UM1HG009444/
1,,ENCBS569QUD,25068,ENC4_cDNA_870,HCT116-BRD4-clone1-treated-new_bulk_2-1F,/biosample-types/cell_line_EFO_0002824/,HCT116,barbara-wold:ENC4_cDNA_870,,,,,,/organisms/human/,/sources/masato-kanemaki/,ENCDO000ABE,barbara-wold,/awards/UM1HG009444/
2,,ENCBS909MRS,25069,ENC4_cDNA_871,HCT116-BRD4-clone1-untreated-new_bulk_1-1F,/biosample-types/cell_line_EFO_0002824/,HCT116,barbara-wold:ENC4_cDNA_871,,,,,,/organisms/human/,/sources/masato-kanemaki/,ENCDO000ABE,barbara-wold,/awards/UM1HG009444/
3,,ENCBS913PLQ,25070,ENC4_cDNA_872,HCT116-BRD4-clone1-untreated-new_bulk_2-1F,/biosample-types/cell_line_EFO_0002824/,HCT116,barbara-wold:ENC4_cDNA_872,,,,,,/organisms/human/,/sources/masato-kanemaki/,ENCDO000ABE,barbara-wold,/awards/UM1HG009444/
4,,ENCBS806WQS,25071,ENC4_cDNA_873,HCT116-CDK7-treated-new_bulk_1-A4,/biosample-types/cell_line_EFO_0002824/,HCT116,barbara-wold:ENC4_cDNA_873,,,,,,/organisms/human/,/sources/masato-kanemaki/,ENCDO000ABE,barbara-wold,/awards/UM1HG009444/
5,,ENCBS040KDM,25072,ENC4_cDNA_874,HCT116-CDK7-treated-new_bulk_2-A4,/biosample-types/cell_line_EFO_0002824/,HCT116,barbara-wold:ENC4_cDNA_874,,,,,,/organisms/human/,/sources/masato-kanemaki/,ENCDO000ABE,barbara-wold,/awards/UM1HG009444/
6,,ENCBS921NBP,25073,ENC4_cDNA_875,HCT116-CDK7-untreated-new_bulk_1-A4,/biosample-types/cell_line_EFO_0002824/,HCT116,barbara-wold:ENC4_cDNA_875,,,,,,/organisms/human/,/sources/masato-kanemaki/,ENCDO000ABE,barbara-wold,/awards/UM1HG009444/
7,,ENCBS298VZU,25074,ENC4_cDNA_876,HCT116-CDK7-untreated-new_bulk_2-A4,/biosample-types/cell_line_EFO_0002824/,HCT116,barbara-wold:ENC4_cDNA_876,,,,,,/organisms/human/,/sources/masato-kanemaki/,ENCDO000ABE,barbara-wold,/awards/UM1HG009444/
8,,ENCBS895NSJ,25075,ENC4_cDNA_877,HCT116-CTCF-treated-new_bulk_1-D12,/biosample-types/cell_line_EFO_0002824/,HCT116,barbara-wold:ENC4_cDNA_877,,,,,,/organisms/human/,/sources/masato-kanemaki/,ENCDO000ABE,barbara-wold,/awards/UM1HG009444/
9,,ENCBS753USK,25076,ENC4_cDNA_878,HCT116-CTCF-treated-new_bulk_2-D12,/biosample-types/cell_line_EFO_0002824/,HCT116,barbara-wold:ENC4_cDNA_878,,,,,,/organisms/human/,/sources/masato-kanemaki/,ENCDO000ABE,barbara-wold,/awards/UM1HG009444/


In [8]:
# Write the back-filled Biosample sheet to a scratch file under /dev/shm,
# leaving out the DataFrame index column.
biosample_sheet.to_excel('/dev/shm/biosamples.xlsx', index=False)

# Check library average fragment size

In [None]:
# Compare the average fragment size entered in the Library sheet with the
# insert size recorded for each library by the htsworkflow API.
print(spreadsheet_name)
libraries = pandas.read_excel(
    spreadsheet_name,
    sheet_name='Library',
    header=0,
    engine=engine,
)

fragment_size = []
for i, row in libraries.iterrows():
    library_id = row["library_id:skip"]
    library_info = htsw.get_library(library_id)
    sheet_size = row["average_fragment_size:integer"]
    if not pandas.isnull(sheet_size):
        # The sheet already has a value: it must agree with the API record.
        assert library_info["insert_size"] == sheet_size, "{} {} {}!={}".format(i, library_id, library_info["insert_size"], sheet_size)
        fragment_size.append("{} pass".format(sheet_size))
    else:
        # Nothing entered yet: report the API value so it can be filled in.
        fragment_size.append(library_info["insert_size"])

print("\n".join(map(str, fragment_size)))

# Check Library Names

In [52]:
# Compare each Biosample row's description with the library name held in the
# htsworkflow record and print one line per row, comparison result first.
print(spreadsheet_name)
biosample_sheet = pandas.read_excel(
    spreadsheet_name,
    sheet_name='Biosample',
    header=0,
    engine=engine,
)

for i, row in biosample_sheet.iterrows():
    library_id = row["library_id:skip"]
    library_info = htsw.get_library(library_id)
    sheet_name = row["description"]
    # Only the text before the first space of the recorded name is compared.
    library_name = library_info["library_name"].partition(" ")[0]
    names_match = sheet_name == library_name
    print(names_match, library_id, sheet_name, library_name)

/home/diane/woldlab/ENCODE/stranded-25067-25098-erez-degron.xlsx
True 25067 HCT116-BRD4-clone1-treated-new_bulk_1-1F HCT116-BRD4-clone1-treated-new_bulk_1-1F
True 25068 HCT116-BRD4-clone1-treated-new_bulk_2-1F HCT116-BRD4-clone1-treated-new_bulk_2-1F
True 25069 HCT116-BRD4-clone1-untreated-new_bulk_1-1F HCT116-BRD4-clone1-untreated-new_bulk_1-1F
True 25070 HCT116-BRD4-clone1-untreated-new_bulk_2-1F HCT116-BRD4-clone1-untreated-new_bulk_2-1F
True 25071 HCT116-CDK7-treated-new_bulk_1-A4 HCT116-CDK7-treated-new_bulk_1-A4
True 25072 HCT116-CDK7-treated-new_bulk_2-A4 HCT116-CDK7-treated-new_bulk_2-A4
True 25073 HCT116-CDK7-untreated-new_bulk_1-A4 HCT116-CDK7-untreated-new_bulk_1-A4
True 25074 HCT116-CDK7-untreated-new_bulk_2-A4 HCT116-CDK7-untreated-new_bulk_2-A4
True 25075 HCT116-CTCF-treated-new_bulk_1-D12 HCT116-CTCF-treated-new_bulk_1-D12
True 25076 HCT116-CTCF-treated-new_bulk_2-D12 HCT116-CTCF-treated-new_bulk_2-D12
True 25077 HCT116-CTCF-untreated-new_bulk_1-D12 HCT116-CTCF-untreated

In [48]:
# List the fields of a library record. `library_info` is whatever the most
# recently run loop calling htsw.get_library() left behind, so this cell
# depends on one of the loops above having been executed first.
library_info.keys()

dict_keys(['antibody_id', 'cell_line_id', 'cell_line', 'experiment_type', 'experiment_type_id', 'gel_cut_size', 'hidden', 'id', 'insert_size', 'lane_set', 'library_id', 'library_name', 'library_species', 'library_species_id', 'library_type_id', 'made_for', 'made_by', 'multiplex_index', 'notes', 'replicate', 'stopping_point', 'successful_pM', 'undiluted_concentration', 'library_type'])

# Register Biosamples

In [11]:
# Re-read the Biosample sheet and hand it to post_sheet for the /biosamples/
# collection. dry_run=True is set, so presumably nothing is created on the
# server — confirm against encoded_client before relying on it.
biosample = pandas.read_excel(spreadsheet_name, sheet_name='Biosample', header=0, engine=engine)
created = server.post_sheet('/biosamples/', biosample, 
                            verbose=True, 
                            dry_run=True,
                            validator=validator)
# Number of entries post_sheet returned.
print(len(created))

0


In [12]:
# Write the Biosample frame to a scratch file only when post_sheet returned
# something. `created` is rebound by every registration cell, so run this
# immediately after the Biosample registration cell.
if created:
    biosample.to_excel('/dev/shm/biosamples.xlsx', index=False)

# Register Libraries

In [57]:
# Read the Library sheet and hand it to post_sheet for the /libraries/
# collection. dry_run=True is set, so presumably nothing is created on the
# server — confirm against encoded_client before relying on it.
print(spreadsheet_name)
libraries = pandas.read_excel(spreadsheet_name, sheet_name='Library', header=0, engine=engine)
created = server.post_sheet('/libraries/', 
                            libraries,
                            verbose=True,
                            dry_run=True,
                            validator=validator)
# Number of entries post_sheet returned.
print(len(created))

/home/diane/woldlab/ENCODE/stranded-25067-25098-erez-degron.xlsx
0


In [56]:
# Write the Library frame to a scratch file only when post_sheet returned
# something. `created` is rebound by every registration cell, so run this
# immediately after the Library registration cell.
if created:
    libraries.to_excel('/dev/shm/libraries.xlsx', index=False)

# Register Experiments

In [61]:
# Show which server is targeted, then hand the Experiment sheet to post_sheet
# for the /experiments/ collection. dry_run=True is set, so presumably nothing
# is created on the server — confirm against encoded_client.
print(server.server)
experiments = pandas.read_excel(spreadsheet_name, sheet_name='Experiment', header=0, engine=engine)
# Drop rows whose accession cell holds the placeholder text
# 'barbara approval needed'; those rows are also absent from any file written
# from `experiments` afterwards.
experiments = experiments[experiments['accession'] != 'barbara approval needed']
created = server.post_sheet('/experiments/', 
                            experiments, 
                            verbose=True, 
                            dry_run=True, 
                            validator=validator)
# Number of entries post_sheet returned.
print(len(created))

www.encodeproject.org
0


In [60]:
# Write the (filtered) Experiment frame to a scratch file only when post_sheet
# returned something. `created` is rebound by every registration cell, so run
# this immediately after the Experiment registration cell.
if created:
    experiments.to_excel('/dev/shm/experiments.xlsx', index=False)

# Register Replicates

In [65]:
# Show the target server and control file, then hand the Replicate sheet to
# post_sheet for the /replicates/ collection. dry_run=True is set, so
# presumably nothing is created on the server — confirm against encoded_client.
print(server.server)
print(spreadsheet_name)
replicates = pandas.read_excel(spreadsheet_name, sheet_name='Replicate', header=0, engine=engine)
# Drop rows whose uuid cell holds the placeholder text
# 'barbara approval needed'; those rows are also absent from any file written
# from `replicates` afterwards.
replicates = replicates[replicates['uuid'] != 'barbara approval needed']
created = server.post_sheet('/replicates/',
                            replicates, 
                            verbose=True, 
                            dry_run=True, 
                            validator=validator)
# Number of entries post_sheet returned.
print(len(created))

www.encodeproject.org
/home/diane/woldlab/ENCODE/stranded-25067-25098-erez-degron.xlsx
0


In [64]:
# Write the (filtered) Replicate frame to a scratch file only when post_sheet
# returned something. `created` is rebound by every registration cell, so run
# this immediately after the Replicate registration cell.
if created:
    replicates.to_excel('/dev/shm/replicates.xlsx', index=False)

# Check Files

In [66]:
# Read the File sheet and hand it to post_sheet for the /files/ collection.
# dry_run=True is set, so presumably this only checks the rows and uploads
# nothing — confirm against encoded_client.
files = pandas.read_excel(spreadsheet_name, sheet_name='File', header=0, engine=engine)
created = server.post_sheet('/files/', files, verbose=True, dry_run=True, validator=validator)
# Number of entries post_sheet returned.
print(len(created))

32


# Check NIH Institutional Certifications

In [10]:
# For every accessioned biosample in the sheet, print its accession next to
# the accession, NIH institutional certification and ontology term name of the
# parent biosample it is part of.
biosample_sheet = pandas.read_excel(spreadsheet_name, sheet_name='Biosample', header=0, engine=engine)

for i, row in biosample_sheet.iterrows():
    if not pandas.isnull(row.accession) and row.accession.startswith('E'):
        biosample = server.get_json(row.accession)
        # A biosample without a parent yields empty dicts here, so the
        # lookups below print None instead of raising KeyError and aborting
        # the report part-way through the sheet.
        parent = biosample.get("part_of") or {}
        parent_ontology = parent.get("biosample_ontology") or {}
        print(
            biosample["accession"],
            parent.get("accession"),
            parent.get("nih_institutional_certification"),
            parent_ontology.get("term_name"),
        )

ENCBS116FGF ENCBS228RKP NIC00016 HCT116
ENCBS569QUD ENCBS228RKP NIC00016 HCT116
ENCBS909MRS ENCBS368OIX NIC00016 HCT116
ENCBS913PLQ ENCBS368OIX NIC00016 HCT116
ENCBS806WQS ENCBS395CVW NIC00016 HCT116
ENCBS040KDM ENCBS395CVW NIC00016 HCT116
ENCBS921NBP ENCBS676ONK NIC00016 HCT116
ENCBS298VZU ENCBS676ONK NIC00016 HCT116
ENCBS895NSJ ENCBS358FCU NIC00016 HCT116
ENCBS753USK ENCBS358FCU NIC00016 HCT116
ENCBS529HYV ENCBS335ZNA NIC00016 HCT116
ENCBS242YXD ENCBS335ZNA NIC00016 HCT116
ENCBS066LQR ENCBS793DSB NIC00016 HCT116
ENCBS896ORZ ENCBS793DSB NIC00016 HCT116
ENCBS265FNK ENCBS842OLG NIC00016 HCT116
ENCBS003KVP ENCBS842OLG NIC00016 HCT116
ENCBS141PCL ENCBS768VHM NIC00016 HCT116
ENCBS606HUI ENCBS768VHM NIC00016 HCT116
ENCBS976NRF ENCBS204AVI NIC00016 HCT116
ENCBS038BAW ENCBS204AVI NIC00016 HCT116
ENCBS590FGJ ENCBS386LBR NIC00016 HCT116
ENCBS068GZX ENCBS386LBR NIC00016 HCT116
ENCBS470QIL ENCBS682ITO NIC00016 HCT116
ENCBS767IFU ENCBS682ITO NIC00016 HCT116
ENCBS641JLV ENCBS036FEF NIC00016 HCT116


In [24]:
def get_experiment_biosample(experiment_accession, biosample_accession, client=None):
    """Find the replicate numbers under which a biosample appears in an experiment.

    Parameters
    ----------
    experiment_accession : str
        Accession of the experiment record to fetch.
    biosample_accession : str
        Accession of the biosample to look for among the experiment's
        replicate libraries.
    client : object, optional
        Object providing ``get_json(accession)``. Defaults to the notebook's
        module-level ``server`` so existing two-argument calls keep working.

    Returns
    -------
    dict or None
        ``{biosample_accession: (biological_replicate_number,
        technical_replicate_number)}`` for the first matching replicate, or
        ``None`` when no replicate's library refers to the biosample.
    """
    if client is None:
        # Fall back to the notebook-wide connection.
        client = server

    experiment = client.get_json(experiment_accession)
    for replicate in experiment.get("replicates", []):
        # Replicates without an embedded library/biosample record cannot
        # match, so skip them instead of failing with KeyError/TypeError.
        library = replicate.get("library")
        biosample = library.get("biosample") if isinstance(library, dict) else None
        if not isinstance(biosample, dict):
            continue
        if biosample.get("accession") == biosample_accession:
            formatted_replicate = (
                replicate["biological_replicate_number"],
                replicate["technical_replicate_number"],
            )
            return {biosample_accession: formatted_replicate}
    return None
        
# Spot check: look up a single experiment/biosample pair on the server.
get_experiment_biosample("ENCSR816PPJ", "ENCBS047AWE")

{'ENCBS047AWE': (2, 1)}

In [33]:
def get_replicates(parent_accession):
    """Collect replicate numbers for the biosamples derived from a parent.

    Fetches the parent record, and for every entry in its ``parent_of`` list
    searches the server for that child's accession. Each search hit whose
    ``@type`` contains "Experiment" is passed to get_experiment_biosample();
    a message is printed for hits where that lookup returns None.

    Returns a dict ``{parent_accession: {child_accession: replicate_tuple}}``,
    or an empty dict when no child could be matched.
    """
    found = {}
    parent_record = server.get_json(parent_accession)

    for child in parent_record.get("parent_of", []):
        search = server.search_jsonld(searchTerm=child["accession"])
        for hit in search["@graph"]:
            # Search results may include other record types; only
            # experiments are of interest.
            if "Experiment" not in hit["@type"]:
                continue

            match = get_experiment_biosample(hit["accession"], child["accession"])
            if match is None:
                print("Unable to find {} {}".format(hit["accession"], child["accession"]))
                continue

            found.setdefault(parent_accession, {}).update(match)

    return found

# Spot check: collect the replicate assignments for a single parent biosample.
get_replicates("ENCBS228RKP")

{'ENCBS228RKP': {'ENCBS047AWE': (2, 1),
  'ENCBS512GOU': (1, 1),
  'ENCBS016NYI': (1, 1),
  'ENCBS524SGS': (2, 1),
  'ENCBS189PYR': (1, 1),
  'ENCBS347WDW': (2, 1),
  'ENCBS671YQM': (1, 1),
  'ENCBS632ZLW': (2, 1),
  'ENCBS627EDW': (1, 1),
  'ENCBS191ARH': (1, 1),
  'ENCBS745VSI': (2, 1),
  'ENCBS488HJZ': (1, 1),
  'ENCBS156BEM': (2, 1),
  'ENCBS087RNX': (1, 2),
  'ENCBS026RHM': (1, 2)}}

In [None]:
# Accumulator for get_replicates() results, keyed by parent accession.
# Re-running this cell discards everything collected so far.
assigned_replicates = {}

In [34]:
biosample_sheet = pandas.read_excel(spreadsheet_name, sheet_name='Biosample', header=0, engine=engine)

# Collecting everything is really slow, so it is switched off by default.
# Set the flag to True to (re)populate assigned_replicates from the server;
# while it is False the table below only shows what is already collected.
COLLECT_ASSIGNED_REPLICATES = False

if COLLECT_ASSIGNED_REPLICATES:
    for i, row in biosample_sheet.iterrows():
        if not pandas.isnull(row.accession) and row.accession.startswith('E'):
            biosample = server.get_json(row.accession)
            parent = biosample.get("part_of", None)
            # Several sheet rows share a parent; query each parent only once.
            if parent is not None and parent["accession"] not in assigned_replicates:
                assigned_replicates.update(get_replicates(parent["accession"]))

# One column per parent biosample, one row per child biosample.
pandas.DataFrame(assigned_replicates)

Unable to find ENCSR101DVP ENCBS084QBE
Unable to find ENCSR240RLZ ENCBS084QBE
Unable to find ENCSR290MUH ENCBS084QBE
Unable to find ENCSR864MFX ENCBS777WED
Unable to find ENCSR290FDP ENCBS777WED
Unable to find ENCSR809VWT ENCBS777WED
Unable to find ENCSR099BYN ENCBS777WED
Unable to find ENCSR420LNH ENCBS777WED
Unable to find ENCSR032PQO ENCBS777WED
Unable to find ENCSR648UHG ENCBS777WED
Unable to find ENCSR108NNI ENCBS777WED
Unable to find ENCSR904LMD ENCBS777WED
Unable to find ENCSR773NQB ENCBS815HME


Unnamed: 0,ENCBS228RKP,ENCBS368OIX,ENCBS395CVW,ENCBS676ONK,ENCBS358FCU,ENCBS335ZNA,ENCBS793DSB,ENCBS842OLG,ENCBS768VHM,ENCBS204AVI,ENCBS386LBR,ENCBS682ITO,ENCBS036FEF,ENCBS177ZJV,ENCBS257YYW,ENCBS178RWJ
ENCBS047AWE,"(2, 1)",,,,,,,,,,,,,,,
ENCBS512GOU,"(1, 1)",,,,,,,,,,,,,,,
ENCBS016NYI,"(1, 1)",,,,,,,,,,,,,,,
ENCBS524SGS,"(2, 1)",,,,,,,,,,,,,,,
ENCBS189PYR,"(1, 1)",,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENCBS984MQX,,,,,,,,,,,,,,,,"(1, 1)"
ENCBS603YYY,,,,,,,,,,,,,,,,"(2, 1)"
ENCBS692ZUT,,,,,,,,,,,,,,,,"(1, 1)"
ENCBS456GWO,,,,,,,,,,,,,,,,"(1, 1)"


In [35]:
# Show the raw nested dict: parent accession -> child accession -> replicate tuple.
assigned_replicates

{'ENCBS228RKP': {'ENCBS047AWE': (2, 1),
  'ENCBS512GOU': (1, 1),
  'ENCBS016NYI': (1, 1),
  'ENCBS524SGS': (2, 1),
  'ENCBS189PYR': (1, 1),
  'ENCBS347WDW': (2, 1),
  'ENCBS671YQM': (1, 1),
  'ENCBS632ZLW': (2, 1),
  'ENCBS627EDW': (1, 1),
  'ENCBS191ARH': (1, 1),
  'ENCBS745VSI': (2, 1),
  'ENCBS488HJZ': (1, 1),
  'ENCBS156BEM': (2, 1),
  'ENCBS087RNX': (1, 2),
  'ENCBS026RHM': (1, 2)},
 'ENCBS368OIX': {'ENCBS852MNL': (2, 1),
  'ENCBS956YAX': (1, 1),
  'ENCBS140VNG': (2, 1),
  'ENCBS131GLR': (1, 1),
  'ENCBS287MEN': (2, 1),
  'ENCBS121KTD': (1, 1),
  'ENCBS978VFN': (2, 1),
  'ENCBS328EFM': (1, 1),
  'ENCBS241GOH': (1, 1),
  'ENCBS930EMD': (2, 1),
  'ENCBS762SPI': (2, 1),
  'ENCBS214JQO': (1, 1),
  'ENCBS702UDW': (1, 1),
  'ENCBS816ZRZ': (1, 2),
  'ENCBS981HND': (1, 1)},
 'ENCBS395CVW': {'ENCBS361VOK': (2, 1),
  'ENCBS885QKO': (1, 1),
  'ENCBS474PRW': (1, 1),
  'ENCBS540IRN': (1, 1),
  'ENCBS354DNN': (2, 1),
  'ENCBS387RXX': (1, 1),
  'ENCBS219YEV': (2, 1),
  'ENCBS515TQU': (1, 2),
  '

In [38]:
# Flatten the nested parent -> child -> replicate tuple mapping into one
# (parent, child, first number, second number) tuple per child biosample.
results = [
    (parent_accession, child_accession, numbers[0], numbers[1])
    for parent_accession, children in assigned_replicates.items()
    for child_accession, numbers in children.items()
]

In [41]:
# Turn the flat list of tuples into a table. Note that this rebinds `results`
# from a list to a DataFrame, so the cell building the list must be run first.
results = pandas.DataFrame(results, columns=["parent", "biosample", "bio_rep", "tech_rep"])

In [46]:
# Space-separated accessions of every biosample whose technical replicate
# number is not 1.
" ".join(results[results["tech_rep"] != 1]["biosample"])

'ENCBS087RNX ENCBS026RHM ENCBS816ZRZ ENCBS515TQU ENCBS086NCD ENCBS657VWI ENCBS982XND ENCBS385ZCY ENCBS033MSP ENCBS567HKI ENCBS814XCI ENCBS446YDY ENCBS828BDT ENCBS169OGM ENCBS338OSL ENCBS123WSE ENCBS720OOT ENCBS154UAS ENCBS253RUX ENCBS403YHB ENCBS628SFE ENCBS719AUN ENCBS599GGB ENCBS889ZME ENCBS456IZA ENCBS802HCI'