In [1]:
from collections import Counter
import pandas as pd
import zipfile
import json
import os

In [2]:
# Base directory that the data paths in this notebook are joined onto
root = "."
# Pathogen code; used as the name of the output sub-directory to read from
pathogen_code = "mtuberculosis"

In [3]:
# Read the cleaned assays table for the selected pathogen
ASSAYS_CLEANED = pd.read_csv(
    os.path.join(root, "..", "output", pathogen_code, "assays_cleaned.csv")
)

# Read the assay parameters table from the same output directory
PARAMETERS = pd.read_csv(
    os.path.join(root, "..", "output", pathogen_code, "assays_parameters.csv")
)

# Inner-join both tables on the (assay_id, activity_type, unit) key.
# validate="1:1" makes pandas raise if that key is duplicated on either side;
# rows present in only one of the two tables are dropped by the inner join.
OUT = pd.merge(
    ASSAYS_CLEANED,
    PARAMETERS,
    how="inner",
    on=["assay_id", "activity_type", "unit"],
    validate="1:1",
)

In [4]:
# Number of rows in each of the two input tables
ASSAYS_CLEANED.shape[0], PARAMETERS.shape[0]

(10532, 10267)

In [5]:
# Display the merged table (ASSAYS_CLEANED joined with PARAMETERS)
OUT

Unnamed: 0,assay_id,assay_type,assay_organism,doc_chembl_id,target_type,target_chembl_id,target_organism,bao_label,source_label,activity_type,...,direction,organism_curated,target_type_curated,target_name_curated,target_chembl_id_curated,strain,atcc_id,mutations,known_drug_resistances,media
0,CHEMBL4649948,F,Mycobacterium tuberculosis,CHEMBL3988442,UNCHECKED,CHEMBL612545,,organism-based format,GATES_LIBRARY,PERCENTEFFECT,...,1.0,Mycobacterium tuberculosis,ORGANISM,Mycobacterium tuberculosis,,,,,,DPPC; cholesterol; tyloxapol based media
1,CHEMBL4649949,F,Mycobacterium tuberculosis,CHEMBL3988442,UNCHECKED,CHEMBL612545,,organism-based format,GATES_LIBRARY,PERCENTEFFECT,...,1.0,Mycobacterium tuberculosis,ORGANISM,,,,,,,7H9; glucose tyloxapol based media
2,CHEMBL4649971,F,Mycobacterium tuberculosis,CHEMBL3988442,ORGANISM,CHEMBL360,Mycobacterium tuberculosis,organism-based format,GATES_LIBRARY,PERCENTEFFECT,...,1.0,Mycobacterium tuberculosis,ORGANISM,Mycobacterium tuberculosis,,,,,,
3,CHEMBL4649972,F,Mycobacterium tuberculosis,CHEMBL3988442,PROTEIN COMPLEX,CHEMBL4662931,Mycobacterium tuberculosis (strain ATCC 25618 ...,assay format,GATES_LIBRARY,PERCENTEFFECT,...,1.0,Mycobacterium tuberculosis,PROTEIN COMPLEX,ClpP1P2,,H37Rv,ATCC 25618,,,
4,CHEMBL4649941,F,Mycobacterium tuberculosis,CHEMBL3988442,SINGLE PROTEIN,CHEMBL4662928,Mycobacterium tuberculosis (strain ATCC 25618 ...,assay format,GATES_LIBRARY,PERCENTEFFECT,...,1.0,Mycobacterium tuberculosis,SINGLE PROTEIN,MtCoaBC,,H37Rv,ATCC 25618,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10262,CHEMBL5139720,F,Mycobacterium tuberculosis,CHEMBL5137025,ORGANISM,CHEMBL360,Mycobacterium tuberculosis,organism-based format,LITERATURE,MIC90,...,-1.0,Mycobacterium tuberculosis,ORGANISM,Mycobacterium tuberculosis,,H37Rv,,,,
10263,CHEMBL4201141,B,Mycobacterium tuberculosis H37Rv,CHEMBL4196176,SINGLE PROTEIN,CHEMBL4295521,Mycobacterium tuberculosis,cell-based format,LITERATURE,INHIBITION,...,1.0,Mycobacterium tuberculosis,SINGLE PROTEIN,MptpB,,H37Rv,ATCC 35837,,,
10264,CHEMBL4201140,F,Mycobacterium tuberculosis variant bovis BCG,CHEMBL4196176,ORGANISM,CHEMBL615052,Mycobacterium tuberculosis variant bovis BCG,organism-based format,LITERATURE,ACTIVITY,...,0.0,Mycobacterium tuberculosis,ORGANISM,Mycobacterium tuberculosis variant bovis BCG,,Pasteur,,,,
10265,CHEMBL4201139,B,Mycobacterium tuberculosis variant bovis BCG,CHEMBL4196176,SINGLE PROTEIN,CHEMBL4295523,Mycobacterium bovis (strain BCG / Pasteur 1173P2),cell-based format,LITERATURE,INHIBITION,...,1.0,Mycobacterium tuberculosis,SINGLE PROTEIN,Mycobacterium tuberculosis protein-tyrosine-ph...,,Pasteur,,,,


In [6]:
# Tally how often each target_type value occurs in the cleaned assays table
Counter(ASSAYS_CLEANED["target_type"].tolist())

Counter({'ORGANISM': 8663,
         'SINGLE PROTEIN': 954,
         'UNCHECKED': 797,
         'PROTEIN COMPLEX': 84,
         'SUBCELLULAR': 17,
         'NON-MOLECULAR': 13,
         'ADMET': 2,
         'NO TARGET': 1,
         'PROTEIN FAMILY': 1})

In [7]:
# Number of merged rows whose target_type is UNCHECKED
int(OUT["target_type"].eq("UNCHECKED").sum())

777

In [8]:
# Number of UNCHECKED rows whose target_type_curated is DISCARDED
int(
    (
        OUT["target_type"].eq("UNCHECKED")
        & OUT["target_type_curated"].eq("DISCARDED")
    ).sum()
)

10

In [9]:
# Number of UNCHECKED rows whose target_type_curated is ORGANISM
int(
    (
        OUT["target_type"].eq("UNCHECKED")
        & OUT["target_type_curated"].eq("ORGANISM")
    ).sum()
)

53

In [10]:
# Number of UNCHECKED rows whose target_type_curated is SINGLE PROTEIN
int(
    (
        OUT["target_type"].eq("UNCHECKED")
        & OUT["target_type_curated"].eq("SINGLE PROTEIN")
    ).sum()
)

714

In [11]:
# UNCHECKED rows whose target_type_curated is neither ORGANISM nor SINGLE PROTEIN.
# `~` negates the boolean mask returned by isin(); this replaces the non-idiomatic
# `== False` comparison and selects exactly the same rows.
OUT[(OUT['target_type'] == 'UNCHECKED') & ~OUT['target_type_curated'].isin(['ORGANISM', 'SINGLE PROTEIN'])]

Unnamed: 0,assay_id,assay_type,assay_organism,doc_chembl_id,target_type,target_chembl_id,target_organism,bao_label,source_label,activity_type,...,direction,organism_curated,target_type_curated,target_name_curated,target_chembl_id_curated,strain,atcc_id,mutations,known_drug_resistances,media
25,CHEMBL1738421,F,,CHEMBL1201862,UNCHECKED,CHEMBL612545,,cell-based format,PUBCHEM_BIOASSAY,CC50,...,-1.0,Mycobacterium tuberculosis,DISCARDED,,,,,,,
812,CHEMBL3855806,B,Mycobacterium tuberculosis H37Rv,CHEMBL3853352,UNCHECKED,CHEMBL612545,,assay format,LITERATURE,INHIBITION,...,1.0,Mycobacterium tuberculosis,DISCARDED,,,,,,,
3772,CHEMBL4685499,B,Mycobacterium tuberculosis,CHEMBL4680209,UNCHECKED,CHEMBL612545,,assay format,LITERATURE,MIC,...,-1.0,Mycobacterium tuberculosis,DISCARDED,,,PT2,,T313A,,
5980,CHEMBL4680626,B,Mycobacterium tuberculosis str. Erdman,CHEMBL4680071,UNCHECKED,CHEMBL612545,,assay format,LITERATURE,IC50,...,-1.0,Mycobacterium tuberculosis,DISCARDED,,,Erdman,,,,
6831,CHEMBL4312113,B,Mycobacterium tuberculosis H37Rv,CHEMBL4311933,UNCHECKED,CHEMBL612545,,assay format,LITERATURE,ACTIVITY,...,1.0,Mycobacterium tuberculosis,DISCARDED,,,H37Rv,ATCC 25618,,,
7196,CHEMBL3992575,B,Mycobacterium tuberculosis H37Rv,CHEMBL3992481,UNCHECKED,CHEMBL612545,,assay format,LITERATURE,GI,...,1.0,Mycobacterium tuberculosis,DISCARDED,,,H37Rv,,,,
7216,CHEMBL3992571,B,Mycobacterium tuberculosis H37Rv,CHEMBL3992481,UNCHECKED,CHEMBL612545,,assay format,LITERATURE,MIC,...,-1.0,Mycobacterium tuberculosis,DISCARDED,,,,,,,
9870,CHEMBL5346697,B,Mycobacterium tuberculosis variant bovis,CHEMBL5344478,UNCHECKED,CHEMBL612545,,assay format,LITERATURE,IC50,...,-1.0,Mycobacterium tuberculosis,DISCARDED,,,BCG,,,,
10092,CHEMBL4187826,B,Mycobacterium tuberculosis variant bovis BCG,CHEMBL4184228,UNCHECKED,CHEMBL612545,,assay format,LITERATURE,ACTIVITY,...,1.0,Mycobacterium tuberculosis,DISCARDED,,,H37Rv; R5401; X_61,,,isoniazid,
10161,CHEMBL4187825,B,Mycobacterium tuberculosis variant bovis BCG,CHEMBL4184228,UNCHECKED,CHEMBL612545,,assay format,LITERATURE,ACTIVITY,...,1.0,Mycobacterium tuberculosis,DISCARDED,,,H37Rv; R5401; X_61,,,isoniazid,


In [None]:
# Display the merged table again
OUT