In [1]:
import pandas as pd
import zipfile
import json
import os
from tqdm import tqdm
import numpy as np
from collections import Counter
from IPython.display import display, HTML

# pandas display settings applied for the whole notebook: no cap on the number
# of columns or on cell text width, no wrapping of wide frames across lines,
# and a 2000-character console width.
PANDAS_DISPLAY_OPTIONS = {
    "display.max_columns": None,
    "display.expand_frame_repr": False,
    "display.width": 2000,
    "display.max_colwidth": None,
}
for option_name, option_value in PANDAS_DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

# CSS injected into the notebook page so that DataFrame cells and headers are
# rendered on a single line instead of wrapping.
NOWRAP_TABLE_CSS = """
<style>
.dataframe td, .dataframe th {
    white-space: nowrap !important;
}
</style>
"""
display(HTML(NOWRAP_TABLE_CSS))


In [2]:
# Base directory that relative paths are built from; "." is the current
# working directory of the kernel.
root = "."
# Pathogen identifier; the loading cell uses it as the sub-directory name
# under ../output when locating assays_master.csv.
pathogen_code = "mtuberculosis"

In [3]:
# Read the master table for the selected pathogen from
# <root>/../output/<pathogen_code>/assays_master.csv.
master_csv_path = os.path.join(root, "..", "output", pathogen_code, "assays_master.csv")
master = pd.read_csv(master_csv_path)

In [4]:
# Count how many rows of the master table carry each dataset_type value.
Counter(master['dataset_type'])

Counter({'quantitative': 8753, 'qualitative': 1479, 'none': 244, 'mixed': 45})

In [20]:
# List every column of the master table, one name per line.
print(*master.columns.to_list(), sep="\n")

assay_id
assay_type
assay_organism
organism_curated
doc_chembl_id
target_type
target_type_curated
target_chembl_id
target_organism
strain
atcc_id
mutations
known_drug_resistances
media
activity_type
unit
canonical_unit
activities
nan_values
cpds
direction
activity_comment_counts
standard_text_count
equal
higher
lower
dataset_type
expert_cutoff
pos_qt
ratio_qt
cpds_qt
min_
p1
p25
p50
p75
p99
max_
pos_ql
ratio_ql
cpds_ql
clusters_0.3
clusters_0.6
clusters_0.85


In [None]:
# Subset of master-table columns shown in the overview below.
# NOTE(review): the _qt / _ql suffixes presumably stand for the quantitative /
# qualitative summaries — confirm against the pipeline that writes the table.
COLS = [
    "assay_id",
    "assay_organism",
    "target_type",
    "target_type_curated",
    "activity_type",
    "unit",
    "activities",
    "nan_values",
    "cpds",
    "dataset_type",
    "expert_cutoff",
    "pos_qt",
    "ratio_qt",
    "cpds_qt",
    "pos_ql",
    "ratio_ql",
    "cpds_ql",
]

# Alternative view restricted to qualitative datasets (kept for reference):
# master[master['dataset_type'] == 'qualitative'][COLS][:50]

# First 50 rows of the table, limited to the columns above.
master[COLS].head(50)

Unnamed: 0,assay_id,assay_organism,target_type,target_type_curated,activity_type,unit,activities,nan_values,cpds,dataset_type,expert_cutoff,pos_qt,ratio_qt,cpds_qt,pos_ql,ratio_ql,cpds_ql
0,CHEMBL4649948,Mycobacterium tuberculosis,UNCHECKED,ORGANISM,PERCENTEFFECT,%,93555,0,86589,quantitative,50.0,1268.0,0.015,86589.0,,,
1,CHEMBL4649949,Mycobacterium tuberculosis,UNCHECKED,ORGANISM,PERCENTEFFECT,%,101515,0,86575,quantitative,50.0,2181.0,0.025,86575.0,,,
2,CHEMBL4649971,Mycobacterium tuberculosis,ORGANISM,ORGANISM,PERCENTEFFECT,%,68619,0,68613,quantitative,50.0,934.0,0.014,68613.0,,,
3,CHEMBL4649972,Mycobacterium tuberculosis,PROTEIN COMPLEX,PROTEIN COMPLEX,PERCENTEFFECT,%,68616,0,68610,none,,,,,,,
4,CHEMBL4649941,Mycobacterium tuberculosis,SINGLE PROTEIN,SINGLE PROTEIN,PERCENTEFFECT,%,67381,0,66941,quantitative,50.0,40.0,0.001,66941.0,,,
5,CHEMBL4649965,Mycobacterium tuberculosis,SINGLE PROTEIN,SINGLE PROTEIN,PERCENTEFFECT,%,66597,0,66591,quantitative,50.0,303.0,0.005,66591.0,,,
6,CHEMBL4649957,Mycobacterium tuberculosis,SINGLE PROTEIN,SINGLE PROTEIN,PERCENTEFFECT,%,65033,0,65027,quantitative,50.0,64.0,0.001,65027.0,,,
7,CHEMBL4649961,Mycobacterium tuberculosis,ORGANISM,ORGANISM,PERCENTEFFECT,%,53170,0,53165,quantitative,50.0,898.0,0.017,53165.0,,,
8,CHEMBL4649947,Mycobacterium tuberculosis,SINGLE PROTEIN,SINGLE PROTEIN,PERCENTEFFECT,%,8841,0,8841,quantitative,50.0,12.0,0.001,8841.0,,,
9,CHEMBL4649949,Mycobacterium tuberculosis,UNCHECKED,ORGANISM,IC50,umol.L-1,2468,0,2468,quantitative,10.0,90.0,0.036,2468.0,,,


In [58]:
# Same overview columns as before, plus the clusters_0.6 column.
COLS = [
    "assay_id",
    "assay_organism",
    "target_type",
    "target_type_curated",
    "activity_type",
    "unit",
    "activities",
    "nan_values",
    "cpds",
    "dataset_type",
    "expert_cutoff",
    "pos_qt",
    "ratio_qt",
    "cpds_qt",
    "pos_ql",
    "ratio_ql",
    "cpds_ql",
    "clusters_0.6",
]

# Keep rows whose ratio_qt or ratio_ql is above 0.5 (rows where both are NaN
# compare as False and are dropped), then show the first 50 with a fresh
# 0-based index.
ratio_above_half = (master["ratio_qt"] > 0.5) | (master["ratio_ql"] > 0.5)
master.loc[ratio_above_half, COLS].head(50).reset_index(drop=True)

Unnamed: 0,assay_id,assay_organism,target_type,target_type_curated,activity_type,unit,activities,nan_values,cpds,dataset_type,expert_cutoff,pos_qt,ratio_qt,cpds_qt,pos_ql,ratio_ql,cpds_ql,clusters_0.6
0,CHEMBL1794349,,SINGLE PROTEIN,SINGLE PROTEIN,AC50,umol.L-1,2126,0,2121,mixed,1.0,37.0,0.017,2121.0,1380.0,0.654,2111.0,1648
1,CHEMBL1794426,,SINGLE PROTEIN,SINGLE PROTEIN,EC50,umol.L-1,2121,0,2116,mixed,1.0,316.0,0.149,2116.0,1873.0,0.92,2036.0,1649
2,CHEMBL1794324,,SINGLE PROTEIN,SINGLE PROTEIN,AC50,umol.L-1,2069,0,2064,mixed,1.0,118.0,0.057,2064.0,1690.0,0.838,2016.0,1650
3,CHEMBL2098495,Mycobacterium tuberculosis variant bovis BCG,ORGANISM,ORGANISM,MIC90,umol.L-1,776,0,776,quantitative,10.0,776.0,1.0,776.0,,,,581
4,CHEMBL2114816,Mycobacterium tuberculosis H37Rv,SINGLE PROTEIN,SINGLE PROTEIN,AC50,umol.L-1,369,0,369,mixed,1.0,4.0,0.011,369.0,229.0,0.621,369.0,316
5,CHEMBL2114860,Mycobacterium tuberculosis UT205,SINGLE PROTEIN,SINGLE PROTEIN,AC50,umol.L-1,298,0,298,mixed,1.0,55.0,0.185,298.0,298.0,1.0,298.0,202
6,CHEMBL2098496,Mycobacterium tuberculosis H37Rv,ORGANISM,ORGANISM,MIC90,umol.L-1,177,0,177,quantitative,10.0,177.0,1.0,177.0,,,,137
7,CHEMBL2354305,,SINGLE PROTEIN,SINGLE PROTEIN,IC50,umol.L-1,174,0,174,mixed,1.0,28.0,0.161,174.0,149.0,0.856,174.0,172
8,CHEMBL4333704,Mycobacterium tuberculosis,ORGANISM,ORGANISM,MIC,umol.L-1,164,0,164,quantitative,10.0,106.0,0.646,164.0,,,,48
9,CHEMBL1614471,Mycobacterium tuberculosis H37Rv,SINGLE PROTEIN,SINGLE PROTEIN,IC50,umol.L-1,125,0,125,mixed,1.0,1.0,0.008,125.0,125.0,1.0,125.0,108
