# Introduction

Some of the submissions are flagged as having an inconsistent biosample ontology ID: the biosamples are annotated as tissues, while the experiments (whose libraries are single cell) are annotated as single cell.

In [1]:
import hashlib
import pandas
import sys
import os


In [2]:
# Make a local htsworkflow checkout importable before importing from it.
# expanduser resolves '~diane' to that user's home directory.
HTSW=os.path.expanduser('~diane/proj/htsworkflow')
# Only append when absent so re-running this cell does not add duplicate entries.
if HTSW not in sys.path:
    sys.path.append(HTSW)
# Must come after the sys.path update above.
from htsworkflow.submission import encoded

In [3]:
# Alternate host, left disabled; swap the two lines below to target it instead.
#server = encoded.ENCODED('www.encodeproject.org')
# NOTE(review): the active host is the test server, while the workbook loaded
# in this notebook is named '...-uploaded-production.xlsx' -- confirm the
# accessions in that sheet resolve on this host.
server = encoded.ENCODED('test.encodedcc.org')
# NOTE(review): presumably loads API credentials from the user's netrc file --
# confirm against htsworkflow.
server.load_netrc()
# Validator bound to the same server object; not used in the cells shown here.
validator = encoded.DCCValidator(server)

In [4]:
# Open the submission workbook once; individual sheets are parsed from this
# handle afterwards. The filename is relative, so it is resolved against the
# current working directory.
uploaded_raw_sheet_filename = 'C1-mouse-forelimb-submission-201907-uploaded-production.xlsx'
uploaded = pandas.ExcelFile(uploaded_raw_sheet_filename)

In [5]:
# Load the four worksheets of the submission workbook, one DataFrame per
# sheet, parsing them in the order the sheet names are listed.
(
    submitted_experiment,
    submitted_libraries,
    submitted_replicates,
    submitted_files,
) = (
    uploaded.parse(sheet_name)
    for sheet_name in ('Experiment', 'Library', 'Replicate', 'File')
)


In [15]:
# Fetch a single experiment as JSON and list its top-level keys to see which
# fields are available on an experiment object.
obj = server.get_json('/experiments/ENCSR869HLE/')
obj.keys()

dict_keys(['actions', 'audit', 'replicates', 'biosample_summary', 'internal_tags', 'accession', 'contributing_files', 'uuid', 'alternate_accessions', 'date_created', 'biosample_ontology', 'revoked_files', 'submitted_by', 'references', 'related_series', 'assay_title', '@type', 'category_slims', 'aliases', '@id', 'type_slims', 'lab', 'schema_version', 'status', 'description', 'documents', 'supersedes', 'assembly', 'superseded_by', 'award', 'original_files', '@context', 'assay_term_name', 'assay_slims', 'files', 'possible_controls', 'internal_status', 'objective_slims', 'replication_type', 'assay_term_id', 'dbxrefs', 'related_files'])

In [24]:
# Biosample ontology term recorded on the experiment object itself.
obj['biosample_ontology']['@id']

'/biosample-types/single_cell_UBERON_0002102/'

In [23]:
# Biosample ontology term recorded on the biosample behind the first
# replicate's library, for comparison with the experiment-level term.
obj['replicates'][0]['library']['biosample']['biosample_ontology']['@id']

'/biosample-types/tissue_UBERON_0002102/'

In [31]:
# Compare every submitted experiment's biosample ontology term with the term
# on the biosample of each of its replicates' libraries, and record each
# mismatch found.
needs_fixing = []
for i, row in submitted_experiment.iterrows():
    experiment = server.get_json(row['accession'])
    experiment_ontology = experiment['biosample_ontology']['@id']
    for replicate in experiment['replicates']:
        library = replicate['library']
        biosample = library['biosample']
        biosample_ontology = biosample['biosample_ontology']['@id']
        if experiment_ontology == biosample_ontology:
            # Terms agree; nothing to record for this replicate.
            continue
        needs_fixing.append(dict(
            experiment=row['accession'],
            biosample=biosample['accession'],
            experiment_ontology=experiment_ontology,
            biosample_ontology=biosample_ontology,
        ))

# One row per mismatch; the column order is fixed explicitly here rather than
# taken from the record dictionaries.
needs_fixing = pandas.DataFrame(
    needs_fixing,
    columns=['experiment', 'biosample', 'biosample_ontology', 'experiment_ontology'],
)
needs_fixing

Unnamed: 0,experiment,biosample,biosample_ontology,experiment_ontology
0,ENCSR541RSL,ENCBS087UQX,/biosample-types/tissue_UBERON_0002102/,/biosample-types/single_cell_UBERON_0002102/
1,ENCSR736TCS,ENCBS623WDD,/biosample-types/tissue_UBERON_0002102/,/biosample-types/single_cell_UBERON_0002102/
2,ENCSR689LAI,ENCBS674DTZ,/biosample-types/tissue_UBERON_0002102/,/biosample-types/single_cell_UBERON_0002102/
3,ENCSR544DDX,ENCBS465TNA,/biosample-types/tissue_UBERON_0002102/,/biosample-types/single_cell_UBERON_0002102/
4,ENCSR093TNN,ENCBS930JPJ,/biosample-types/tissue_UBERON_0002102/,/biosample-types/single_cell_UBERON_0002102/
5,ENCSR349UAG,ENCBS643JBK,/biosample-types/tissue_UBERON_0002102/,/biosample-types/single_cell_UBERON_0002102/
6,ENCSR136NWW,ENCBS614UHT,/biosample-types/tissue_UBERON_0002102/,/biosample-types/single_cell_UBERON_0002102/
7,ENCSR615EFO,ENCBS243TMO,/biosample-types/tissue_UBERON_0002102/,/biosample-types/single_cell_UBERON_0002102/
8,ENCSR028XBU,ENCBS791QWH,/biosample-types/tissue_UBERON_0002102/,/biosample-types/single_cell_UBERON_0002102/
9,ENCSR664GEE,ENCBS698LOZ,/biosample-types/tissue_UBERON_0002102/,/biosample-types/single_cell_UBERON_0002102/
