# Introduction

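The DCC has pointed out that a number of our biosamples are missing their NIH institutional certification (NIC). This notebook finds the affected biosamples via the experiment audit errors and patches in the NIC where we are able to.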

In [1]:
import pandas
import sys
import os
import base64
from io import BytesIO
from matplotlib import pyplot
from pathlib import Path
from IPython.display import HTML

In [2]:
# Make the local encoded_client checkout (~/proj/encoded_client) importable.
EC = str(Path.home() / "proj" / "encoded_client")
if sys.path.count(EC) == 0:
    sys.path.append(EC)

In [3]:
from encoded_client.encoded import ENCODED, get_object_type

In [4]:
# Client for the production ENCODE portal; every later get_json/patch_json
# call in this notebook goes through this object.
server = ENCODED("www.encodeproject.org")
# NOTE(review): presumably reads credentials from the user's netrc file --
# confirm against encoded_client.
server.load_netrc()

In [5]:
# Experiments from the Wold, Mortazavi and Spitale labs that carry the
# "missing nih_institutional_certification" audit error, with
# microRNA-seq filtered out. One query parameter per line.
search = (
    "https://www.encodeproject.org/search/"
    "?type=Experiment"
    "&lab.title=Barbara+Wold%2C+Caltech"
    "&lab.title=Ali+Mortazavi%2C+UCI"
    "&lab.title=Rob+Spitale%2C+UCI"
    "&audit.ERROR.category=missing+nih_institutional_certification"
    "&assay_title!=microRNA-seq"
)

In [6]:
# Run the search; the matching experiments are read from graph["@graph"] below.
graph = server.get_json(search)

In [7]:
# Number of experiments returned by the search.
print(len(graph["@graph"]))

133


In [8]:
# Look at the first search hit to see which fields the search result exposes.
graph["@graph"][0]

{'@id': '/experiments/ENCSR207WIB/',
 '@type': ['Experiment', 'Dataset', 'Item'],
 'accession': 'ENCSR207WIB',
 'assay_term_name': 'RNA-seq',
 'assay_title': 'total RNA-seq',
 'audit': {'ERROR': [{'path': '/experiments/ENCSR207WIB/',
    'level_name': 'ERROR',
    'level': 60,
    'name': 'audit_experiment',
    'detail': 'Experiment {ENCSR207WIB|/experiments/ENCSR207WIB/} uses biosample {ENCBS083NLF|/biosamples/ENCBS083NLF/} missing NIH institutional certification required for human data',
    'category': 'missing nih_institutional_certification'}],
    'level': 40,
    'name': 'audit_experiment',
    'detail': 'Experiment {ENCSR207WIB|/experiments/ENCSR207WIB/} only has raw data and does not contain any processed data.',
    'category': 'lacking processed data'}]},
 'award': {'project': 'ENCODE'},
 'biosample_ontology': {'term_name': 'middle frontal area 46'},
 'biosample_summary': "middle frontal area 46 tissue female adult (90 or above years) with Alzheimer's disease",
 'dbxrefs': 

In [9]:
# Table mapping a biosample ontology @id (the index) to the NIC that should be
# applied; the patch loop below looks up the 'nic' column by ontology @id.
biosample_nic = pandas.read_csv("biosample_nic.csv", index_col="ontology")
biosample_nic

Unnamed: 0_level_0,term_name,nic
ontology,Unnamed: 1_level_1,Unnamed: 2_level_1
/biosample-types/tissue_UBERON_0008971/,left colon,NIC00005
/biosample-types/tissue_UBERON_0001115/,left lobe of liver,NIC00005
/biosample-types/tissue_UBERON_0001072/,posterior vena cava,NIC00005
/biosample-types/tissue_UBERON_0002084/,heart left ventricle,NIC00005
/biosample-types/tissue_UBERON_0002113/,kidney,NIC00005
/biosample-types/tissue_NTR_0000493/,left ventricle myocardium inferior,NIC00005
/biosample-types/tissue_UBERON_0008450/,psoas muscle,NIC00005
/biosample-types/tissue_UBERON_0000947/,aorta,NIC00005
/biosample-types/tissue_UBERON_0015143/,mesenteric fat pad,NIC00005
/biosample-types/tissue_NTR_0000494/,left ventricle myocardium superior,NIC00005


In [10]:
# Biosample type that this pass skips outright.
# NOTE(review): the reason for skipping it is not recorded here -- document it.
SKIPPED_ONTOLOGY_ID = '/biosample-types/tissue_UBERON_0006483/'
# Only biosamples submitted by this user, and still "in progress", are patched.
MY_USER_ID = '/users/bc5b62f7-ce28-4a1e-b6b3-81c9c5a86d7a/'

lookup = set()           # (ontology @id, term name) pairs with no row in biosample_nic
patches = []             # return values of server.patch_json for biosamples we updated
not_by_me = []           # biosamples that need a NIC but were not patched by this loop
seen_biosamples = set()  # accessions already handled in this run

for row in graph["@graph"]:
    for replicate in row["replicates"]:
        biosample_accession = replicate["library"]["biosample"]["accession"]
        # The same biosample may appear under several replicates/experiments;
        # handle it once so it is not re-fetched, re-printed, or recorded twice
        # in not_by_me.
        if biosample_accession in seen_biosamples:
            continue
        seen_biosamples.add(biosample_accession)

        # The search hit only carries the accession here; fetch the full object.
        biosample = server.get_json(biosample_accession)
        ontology_id = biosample['biosample_ontology']["@id"]
        ontology_term = biosample['biosample_ontology']["term_name"]
        if ontology_id == SKIPPED_ONTOLOGY_ID:
            continue

        try:
            nic = biosample_nic.loc[ontology_id, 'nic']
        except KeyError:
            # No NIC known for this biosample type; remember it for follow-up.
            lookup.add((ontology_id, ontology_term))
            nic = None

        current_nic = biosample.get("nih_institutional_certification")
        print(biosample_accession, ontology_id, ontology_term, nic, current_nic)

        # Nothing to do if a certification is already set or we have none to apply.
        if current_nic is not None or nic is None:
            continue

        submitted_by_me = biosample['submitted_by']['@id'] == MY_USER_ID
        if submitted_by_me and biosample['status'] == 'in progress':
            patches.append(server.patch_json(biosample["@id"], {"nih_institutional_certification": nic}))
        else:
            # Someone else's record, or no longer "in progress": record it
            # instead of patching.
            not_by_me.append((biosample_accession, biosample['submitted_by']['title'], ontology_id, biosample['status'], nic))

ENCBS883EPS /biosample-types/primary_cell_NTR_0000495/ activated CD4-positive, alpha-beta T cell NIC00027 None
ENCBS520WQB /biosample-types/primary_cell_NTR_0000500/ activated CD8-positive, alpha-beta T cell NIC00027 None
ENCBS984FMU /biosample-types/primary_cell_CL_0000624/ CD4-positive, alpha-beta T cell NIC00027 None
ENCBS547LLS /biosample-types/primary_cell_CL_0000625/ CD8-positive, alpha-beta T cell NIC00027 None
ENCBS809ILB /biosample-types/primary_cell_NTR_0000500/ activated CD8-positive, alpha-beta T cell NIC00027 None
ENCBS222RFX /biosample-types/primary_cell_CL_0000625/ CD8-positive, alpha-beta T cell NIC00027 None
ENCBS223BQO /biosample-types/primary_cell_NTR_0000500/ activated CD8-positive, alpha-beta T cell NIC00027 None
ENCBS081TPG /biosample-types/primary_cell_NTR_0000495/ activated CD4-positive, alpha-beta T cell NIC00027 None
ENCBS075SKQ /biosample-types/primary_cell_CL_0000624/ CD4-positive, alpha-beta T cell NIC00027 None
ENCBS272IFK /biosample-types/primary_cell_NTR

In [11]:
# Biosample types that had no entry in biosample_nic (empty means all were covered).
sorted(lookup)

[]

In [12]:
# NOTE(review): debugging peek -- `biosample` is whatever object the patch loop
# fetched last, so this only shows the status of that one record.
biosample['status']

'released'

In [13]:
# Tabulate the biosamples collected in not_by_me, indexed by biosample
# accession, so they can be reviewed and handed off.
couldnt_update = (
    pandas.DataFrame(
        not_by_me,
        columns=['biosample', 'submitter', 'ontology_id', 'status', 'nic'],
    )
    .set_index('biosample')
)
couldnt_update

Unnamed: 0_level_0,submitter,ontology_id,status,nic
biosample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENCBS883EPS,Khine Lin,/biosample-types/primary_cell_NTR_0000495/,in progress,NIC00027
ENCBS520WQB,Khine Lin,/biosample-types/primary_cell_NTR_0000500/,in progress,NIC00027
ENCBS984FMU,Khine Lin,/biosample-types/primary_cell_CL_0000624/,in progress,NIC00027
ENCBS547LLS,Khine Lin,/biosample-types/primary_cell_CL_0000625/,in progress,NIC00027
ENCBS809ILB,Khine Lin,/biosample-types/primary_cell_NTR_0000500/,in progress,NIC00027
ENCBS222RFX,Khine Lin,/biosample-types/primary_cell_CL_0000625/,in progress,NIC00027
ENCBS223BQO,Khine Lin,/biosample-types/primary_cell_NTR_0000500/,in progress,NIC00027
ENCBS081TPG,Khine Lin,/biosample-types/primary_cell_NTR_0000495/,in progress,NIC00027
ENCBS075SKQ,Khine Lin,/biosample-types/primary_cell_CL_0000624/,in progress,NIC00027
ENCBS272IFK,Khine Lin,/biosample-types/primary_cell_NTR_0000500/,in progress,NIC00027


In [14]:
# Write the not-patched list out as CSV.
# NOTE(review): the destination is a hard-coded path under /dev/shm, which looks
# like scratch space -- confirm the file is copied somewhere durable if needed.
couldnt_update.to_csv('/dev/shm/couldnt_update.csv')