In [1]:
import matplotlib.pyplot as plt
from collections import Counter
from tqdm import tqdm
import pandas as pd
import numpy as np
import sys
import os

# Define root directory
root = "."
sys.path.append(os.path.join(root, "..", "src"))
from default import CONFIGPATH

# Read the preprocessed ChEMBL activities table and drop rows without a 'value'
print("Loading ChEMBL preprocessed data...")
activities_path = os.path.join(root, "..", "config", "chembl_processed", "activities_preprocessed.csv")
ChEMBL = pd.read_csv(activities_path, low_memory=False)
print(f"Original size: {len(ChEMBL)}")
print("Filtering out nan values...")
ChEMBL = ChEMBL.dropna(subset=['value']).reset_index(drop=True)
print(f"Size after filtering nan values: {len(ChEMBL)}")

# Read the manually curated table and keep only the rows that have a 'manual_curation' entry.
# `direction` maps each (activity_type, unit) pair to that entry cast to int.
curation_path = os.path.join(CONFIGPATH, "manual_curation", "activity_std_units_curated_manual_curation.csv")
curated = pd.read_csv(curation_path)
curated = curated[curated['manual_curation'].notna()]
direction = {
    (curated_type, curated_unit): int(curated_dir)
    for curated_type, curated_unit, curated_dir
    in zip(curated['activity_type'], curated['unit'], curated['manual_curation'])
}

# Pathogens handled by this notebook
pathogens = ["Mycobacterium tuberculosis"]

Loading ChEMBL preprocessed data...
Original size: 24267312
Filtering out nan values...
Size after filtering nan values: 20911360


In [2]:
def get_assay_data(ChEMBL_, assay_id, activity_type, unit):
    """Select the activities of one (assay, activity type, unit) combination.

    Parameters
    ----------
    ChEMBL_ : pandas.DataFrame
        Activities table; must contain the columns 'assay_chembl_id',
        'activity_type', 'unit' and 'canonical_smiles'.
    assay_id : str
        Value to match in 'assay_chembl_id'.
    activity_type : str
        Value to match in 'activity_type'.
    unit : str or missing value
        Value to match in 'unit'. When it is missing (``pd.isna(unit)``),
        rows whose 'unit' is missing are selected instead.

    Returns
    -------
    pandas.DataFrame
        The matching rows that also have a non-missing 'canonical_smiles',
        with their original index.
    """
    # A missing unit cannot be matched with ==, so it needs its own condition
    if pd.isna(unit):
        unit_matches = ChEMBL_['unit'].isna()
    else:
        unit_matches = ChEMBL_['unit'] == unit

    selected = (
        (ChEMBL_['assay_chembl_id'] == assay_id)
        & (ChEMBL_['activity_type'] == activity_type)
        & unit_matches
        & ChEMBL_['canonical_smiles'].notna()
    )
    return ChEMBL_[selected]


def get_direction(activity_type, unit, direction):
    """Look up the curated direction of an (activity type, unit) pair.

    Parameters
    ----------
    activity_type : str
        Activity type of the assay.
    unit : str or missing value
        Unit of the assay; may be missing (NaN / None).
    direction : dict
        Mapping from (activity_type, unit) tuples to the curated direction.

    Returns
    -------
    The value stored in `direction` for the pair.

    Raises
    ------
    ValueError
        If the pair has no entry in `direction`.
    """
    key = (activity_type, unit)
    if key in direction:
        return direction[key]

    # NaN never compares equal to NaN, so a missing unit only hits the dict above
    # when it is the very same object as the one stored in the key. Fall back to
    # an explicit missing-value comparison so unit-less pairs are still found.
    if pd.isna(unit):
        for (known_type, known_unit), value in direction.items():
            if known_type == activity_type and pd.isna(known_unit):
                return value

    raise ValueError(f"Activity type - unit not in direction dict: {activity_type} - {unit}. Curate manually.")


In [3]:
# For each pathogen: select its ChEMBL activities and load its assay summary table.
# NOTE: ChEMBL_ and ASSAYS_INFO are rebound on every iteration, so after the loop
# they hold the data of the last entry in `pathogens`.
for pathogen in pathogens:

    # Short code used for the output folder: first letter of the first word
    # followed by the second word, lowercased (e.g. "mtuberculosis")
    name_parts = pathogen.split()
    pathogen_code = (name_parts[0][0] + name_parts[1]).lower()
    print(f"\n\nFiltering data from pathogen: {pathogen_code}...")

    # Keep activities whose target or assay organism mentions the pathogen name.
    # regex=False makes this a literal, case-insensitive substring match, so a name
    # containing regex metacharacters (".", "(", "+", ...) is not misinterpreted.
    in_target = ChEMBL['target_organism'].str.contains(pathogen, case=False, na=False, regex=False)
    in_assay = ChEMBL['assay_organism'].str.contains(pathogen, case=False, na=False, regex=False)
    ChEMBL_ = ChEMBL[in_target | in_assay].reset_index(drop=True)

    # Load the assay summary table of this pathogen and keep the columns used below
    print(f"Number of activities for {pathogen}: {len(ChEMBL_)}")
    ASSAYS_INFO = pd.read_csv(os.path.join(root, "..", "output", pathogen_code, 'assays.csv'))
    ASSAYS_INFO = ASSAYS_INFO[['assay_id', 'activity_type', 'unit', 'activities', 'cpds']].copy()
    print(f"Original number of assays: {len(ASSAYS_INFO)}")



Filtering data from pathogen: mtuberculosis...
Number of activities for Mycobacterium tuberculosis: 706206
Original number of assays: 11346


In [9]:
# Display the assay summary table (one row per assay_id / activity_type / unit combination)
ASSAYS_INFO

Unnamed: 0,assay_id,activity_type,unit,activities,cpds
0,CHEMBL4649948,PERCENTEFFECT,%,93556,86590
1,CHEMBL4649949,PERCENTEFFECT,%,101516,86576
2,CHEMBL4649971,PERCENTEFFECT,%,68620,68614
3,CHEMBL4649972,PERCENTEFFECT,%,68617,68611
4,CHEMBL4649941,PERCENTEFFECT,%,67382,66942
...,...,...,...,...,...
11341,CHEMBL4434512,MIC,umol.L-1,1,1
11342,CHEMBL4434511,IC90,umol.L-1,1,1
11343,CHEMBL1675908,LOG10CFU,,1,1
11344,CHEMBL1676664,MIC,umol.L-1,1,1


In [10]:
# Count how many rows of ChEMBL_ carry each distinct 'activity_comment' value
Counter(ChEMBL_['activity_comment'])

Counter({0: 699095, 1: 5720, -1: 1391})

In [7]:
# Columns kept for the per-assay inspection
kept_columns = ['assay_chembl_id', 'target_chembl_id', 'compound_chembl_id', 'canonical_smiles', 'MW', 'pchembl',
                'activity_comment', 'standard_text', 'value', 'unit', 'activity_type', 'relation', 'pchembl_calculated']

# Walk over every (assay, activity type, unit) combination listed in ASSAYS_INFO
assay_keys = ASSAYS_INFO[['assay_id', 'activity_type', 'unit']].values
for assay_id, activity_type, unit in tqdm(assay_keys):

    # Activities of this combination, restricted to the columns of interest
    assay_data = get_assay_data(ChEMBL_, assay_id, activity_type, unit)[kept_columns]

    # Draft, not enabled yet: adjust values according to the curated direction.
    # dir_ = get_direction(activity_type, unit, direction)
    #
    # values = assay_data[assay_data['relation'] == "="]['value'].tolist()
    # minimum, maximum = np.min(values), np.max(values)
    #
    # if dir_ == 1:
    #     # If dir is 1, <'s are 0's
    #     assay_data["value"] = [i if j != '<' else 0 for i,j in zip(assay_data['value'], assay_data['relation'])]
    # elif dir_ == -1:
    #     # If dir is -1, >'s are 0's
    #     assay_data["value"] = [i if j != '>' else 0 for i,j in zip(assay_data['value'], assay_data['relation'])]

    # Report how many activities of this combination have 'standard_text' equal to 1
    flagged = assay_data[assay_data['standard_text'] == 1]
    print(assay_id, " -- ", len(flagged))

  0%|          | 2/11346 [00:00<20:35,  9.18it/s]

CHEMBL4649948  --  0
CHEMBL4649949  --  0


  0%|          | 4/11346 [00:00<20:05,  9.41it/s]

CHEMBL4649971  --  0
CHEMBL4649972  --  0


  0%|          | 6/11346 [00:00<20:57,  9.02it/s]

CHEMBL4649941  --  0
CHEMBL4649965  --  0


  0%|          | 8/11346 [00:00<20:20,  9.29it/s]

CHEMBL4649957  --  0
CHEMBL4649961  --  0
CHEMBL4649947  --  0


  0%|          | 12/11346 [00:01<18:57,  9.96it/s]

CHEMBL4649949  --  0
CHEMBL4649948  --  0
CHEMBL1794349  --  0


  0%|          | 15/11346 [00:01<18:46, 10.06it/s]

CHEMBL1794426  --  0
CHEMBL1794324  --  0
CHEMBL2098495  --  0


  0%|          | 17/11346 [00:01<18:41, 10.10it/s]

CHEMBL4649972  --  0
CHEMBL4649957  --  0
CHEMBL4649941  --  0


  0%|          | 21/11346 [00:02<18:20, 10.29it/s]

CHEMBL2094262  --  0
CHEMBL2094261  --  0
CHEMBL2114816  --  0


  0%|          | 23/11346 [00:02<18:22, 10.27it/s]

CHEMBL4649965  --  0
CHEMBL2114860  --  0
CHEMBL4649971  --  0


  0%|          | 27/11346 [00:02<18:15, 10.33it/s]

CHEMBL2098496  --  0
CHEMBL2354305  --  0
CHEMBL5345967  --  0


  0%|          | 29/11346 [00:02<18:23, 10.25it/s]

CHEMBL5345966  --  0
CHEMBL4333704  --  0
CHEMBL4388634  --  0


  0%|          | 33/11346 [00:03<18:16, 10.32it/s]

CHEMBL4388638  --  0
CHEMBL2032580  --  0
CHEMBL3857706  --  0


  0%|          | 35/11346 [00:03<18:14, 10.34it/s]

CHEMBL1614471  --  0
CHEMBL3887425  --  0
CHEMBL1634496  --  0


  0%|          | 39/11346 [00:03<18:08, 10.39it/s]

CHEMBL1634497  --  0
CHEMBL747348  --  0
CHEMBL1049618  --  0


  0%|          | 41/11346 [00:04<18:09, 10.38it/s]

CHEMBL3832763  --  0
CHEMBL3832764  --  0
CHEMBL3832777  --  0


  0%|          | 45/11346 [00:04<18:08, 10.38it/s]

CHEMBL3832776  --  0
CHEMBL3832775  --  0
CHEMBL3832765  --  0


  0%|          | 47/11346 [00:04<18:09, 10.37it/s]

CHEMBL1049617  --  0
CHEMBL3832766  --  0
CHEMBL3832772  --  0


  0%|          | 51/11346 [00:05<18:09, 10.37it/s]

CHEMBL3832767  --  0
CHEMBL3832770  --  0
CHEMBL3832773  --  0


  0%|          | 53/11346 [00:05<18:13, 10.33it/s]

CHEMBL3832774  --  0
CHEMBL3832771  --  0
CHEMBL3832769  --  0


  0%|          | 55/11346 [00:05<18:28, 10.18it/s]

CHEMBL3832768  --  0
CHEMBL5246903  --  0


  1%|          | 59/11346 [00:05<18:36, 10.11it/s]

CHEMBL1839642  --  0
CHEMBL1839643  --  0
CHEMBL4879906  --  0


  1%|          | 61/11346 [00:06<18:38, 10.09it/s]

CHEMBL4384699  --  0
CHEMBL4011046  --  0
CHEMBL4011042  --  0


  1%|          | 65/11346 [00:06<18:21, 10.24it/s]

CHEMBL1115629  --  0
CHEMBL4011010  --  0
CHEMBL4011010  --  0


  1%|          | 67/11346 [00:06<18:14, 10.30it/s]

CHEMBL1115630  --  0
CHEMBL4384698  --  0
CHEMBL1948972  --  0


  1%|          | 69/11346 [00:06<18:02, 10.41it/s]

CHEMBL1948971  --  0
CHEMBL4150258  --  0


  1%|          | 73/11346 [00:07<18:23, 10.21it/s]

CHEMBL4150258  --  0
CHEMBL3832762  --  0
CHEMBL3832806  --  0


  1%|          | 75/11346 [00:07<18:27, 10.18it/s]

CHEMBL2423609  --  0
CHEMBL3270179  --  0
CHEMBL940872  --  0


  1%|          | 79/11346 [00:07<18:11, 10.32it/s]

CHEMBL3270180  --  0
CHEMBL3267866  --  0
CHEMBL2318567  --  0


  1%|          | 81/11346 [00:07<18:05, 10.38it/s]

CHEMBL5156632  --  0
CHEMBL3387309  --  0
CHEMBL4841891  --  0


  1%|          | 85/11346 [00:08<18:02, 10.40it/s]

CHEMBL3382519  --  0
CHEMBL5156633  --  0
CHEMBL893310  --  0


  1%|          | 87/11346 [00:08<18:03, 10.39it/s]

CHEMBL2329977  --  0
CHEMBL4425486  --  0
CHEMBL3096848  --  0


  1%|          | 91/11346 [00:08<17:56, 10.46it/s]

CHEMBL3096848  --  0
CHEMBL1212146  --  0
CHEMBL746402  --  0


  1%|          | 93/11346 [00:09<18:00, 10.41it/s]

CHEMBL3129228  --  0
CHEMBL1794344  --  0
CHEMBL1947387  --  0


  1%|          | 96/11346 [00:09<19:08,  9.79it/s]

CHEMBL1947388  --  0
CHEMBL4155946  --  0
CHEMBL5099097  --  0


  1%|          | 99/11346 [00:09<18:50,  9.95it/s]

CHEMBL2421470  --  0
CHEMBL2447071  --  0
CHEMBL2421471  --  0


  1%|          | 103/11346 [00:10<18:28, 10.14it/s]

CHEMBL2447069  --  0
CHEMBL5627291  --  0
CHEMBL3875318  --  0


  1%|          | 105/11346 [00:10<18:23, 10.18it/s]

CHEMBL1737995  --  0
CHEMBL858888  --  0
CHEMBL1018758  --  0


  1%|          | 109/11346 [00:10<18:20, 10.21it/s]

CHEMBL4052220  --  0
CHEMBL4052219  --  0
CHEMBL4135530  --  0


  1%|          | 111/11346 [00:10<18:11, 10.29it/s]

CHEMBL1918837  --  0
CHEMBL4194982  --  0
CHEMBL906555  --  0


  1%|          | 115/11346 [00:11<18:05, 10.34it/s]

CHEMBL1908450  --  0
CHEMBL3619020  --  0
CHEMBL2184408  --  0


  1%|          | 117/11346 [00:11<18:06, 10.33it/s]

CHEMBL4050695  --  0
CHEMBL4406692  --  0
CHEMBL4406693  --  0


  1%|          | 121/11346 [00:11<17:59, 10.40it/s]

CHEMBL4425484  --  0
CHEMBL1019550  --  0
CHEMBL750947  --  0


  1%|          | 123/11346 [00:12<18:00, 10.38it/s]

CHEMBL3375146  --  0
CHEMBL3375147  --  0
CHEMBL5329444  --  0


  1%|          | 127/11346 [00:12<18:02, 10.36it/s]

CHEMBL903345  --  0
CHEMBL2157635  --  0
CHEMBL3579627  --  0


  1%|          | 129/11346 [00:12<18:07, 10.32it/s]

CHEMBL3130168  --  0
CHEMBL1825403  --  0
CHEMBL2157637  --  0


  1%|          | 133/11346 [00:13<18:07, 10.31it/s]

CHEMBL3789231  --  0
CHEMBL2157636  --  0
CHEMBL2447070  --  0


  1%|          | 135/11346 [00:13<18:08, 10.30it/s]

CHEMBL3789232  --  0
CHEMBL3748592  --  0
CHEMBL2209271  --  0


  1%|          | 139/11346 [00:13<18:16, 10.22it/s]

CHEMBL5246869  --  0
CHEMBL994234  --  0
CHEMBL750948  --  0


  1%|          | 141/11346 [00:13<18:15, 10.23it/s]

CHEMBL3396596  --  0
CHEMBL907779  --  0
CHEMBL4221360  --  0


  1%|▏         | 145/11346 [00:14<18:29, 10.10it/s]

CHEMBL4307526  --  0
CHEMBL5253350  --  0
CHEMBL1104165  --  0


  1%|▏         | 147/11346 [00:14<19:27,  9.60it/s]

CHEMBL4221359  --  0
CHEMBL4384233  --  0
CHEMBL4384231  --  0


  1%|▏         | 151/11346 [00:14<18:39, 10.00it/s]

CHEMBL4481755  --  0
CHEMBL5129876  --  0
CHEMBL4481695  --  0


  1%|▏         | 154/11346 [00:15<19:11,  9.72it/s]

CHEMBL4055434  --  0
CHEMBL4395785  --  0
CHEMBL4481754  --  0


  1%|▏         | 156/11346 [00:15<19:32,  9.54it/s]

CHEMBL3630154  --  0
CHEMBL1664814  --  0


  1%|▏         | 159/11346 [00:15<19:12,  9.71it/s]

CHEMBL4375569  --  0
CHEMBL3096846  --  0
CHEMBL4375570  --  0


  1%|▏         | 162/11346 [00:15<19:22,  9.62it/s]

CHEMBL3386984  --  0
CHEMBL5322743  --  0
CHEMBL3096846  --  0


  1%|▏         | 165/11346 [00:16<19:02,  9.78it/s]

CHEMBL4824440  --  0
CHEMBL1045353  --  0
CHEMBL1045352  --  0


  1%|▏         | 167/11346 [00:16<18:44,  9.94it/s]

CHEMBL4725867  --  0
CHEMBL3624795  --  0
CHEMBL3624797  --  0


  1%|▏         | 170/11346 [00:16<18:33, 10.04it/s]

CHEMBL4481696  --  0
CHEMBL5262775  --  0
CHEMBL5381961  --  0


  2%|▏         | 174/11346 [00:17<18:12, 10.22it/s]

CHEMBL4844114  --  0
CHEMBL1776260  --  0
CHEMBL1054943  --  0


  2%|▏         | 176/11346 [00:17<18:29, 10.07it/s]

CHEMBL5227581  --  0
CHEMBL4423961  --  0
CHEMBL2209268  --  0


  2%|▏         | 180/11346 [00:17<18:18, 10.17it/s]

CHEMBL4673720  --  0
CHEMBL3132000  --  0
CHEMBL4404083  --  0


  2%|▏         | 182/11346 [00:17<18:41,  9.95it/s]

CHEMBL4425485  --  0
CHEMBL2094264  --  0
CHEMBL3056447  --  0


  2%|▏         | 186/11346 [00:18<18:24, 10.11it/s]

CHEMBL5034468  --  0
CHEMBL5623472  --  0
CHEMBL5335922  --  0


  2%|▏         | 188/11346 [00:18<18:17, 10.16it/s]

CHEMBL1291245  --  0
CHEMBL5623473  --  0
CHEMBL2094263  --  0


  2%|▏         | 190/11346 [00:18<18:20, 10.14it/s]

CHEMBL1291244  --  0
CHEMBL1104164  --  0


  2%|▏         | 192/11346 [00:18<18:30, 10.05it/s]

CHEMBL3102132  --  0
CHEMBL4253139  --  0


  2%|▏         | 195/11346 [00:19<18:56,  9.81it/s]

CHEMBL4712810  --  0
CHEMBL3887424  --  0
CHEMBL1061238  --  0


  2%|▏         | 199/11346 [00:19<18:25, 10.08it/s]

CHEMBL1061239  --  0
CHEMBL3073160  --  0
CHEMBL901696  --  0


  2%|▏         | 201/11346 [00:19<18:16, 10.17it/s]

CHEMBL1937176  --  0
CHEMBL3131200  --  0
CHEMBL4119731  --  0


  2%|▏         | 205/11346 [00:20<17:46, 10.45it/s]

CHEMBL825364  --  0
CHEMBL4306878  --  0
CHEMBL3102132  --  0


  2%|▏         | 207/11346 [00:20<17:51, 10.40it/s]

CHEMBL2350353  --  0
CHEMBL5524044  --  0
CHEMBL3405738  --  0


  2%|▏         | 211/11346 [00:20<18:04, 10.26it/s]

CHEMBL4119730  --  0
CHEMBL4119732  --  0
CHEMBL4043386  --  0


  2%|▏         | 213/11346 [00:21<18:05, 10.26it/s]

CHEMBL1030449  --  0
CHEMBL4261187  --  0
CHEMBL5375438  --  0


  2%|▏         | 217/11346 [00:21<17:56, 10.34it/s]

CHEMBL3380428  --  0
CHEMBL3806455  --  0
CHEMBL5047846  --  0


  2%|▏         | 219/11346 [00:21<17:55, 10.35it/s]

CHEMBL5588127  --  0
CHEMBL4308568  --  0
CHEMBL5047847  --  0


  2%|▏         | 223/11346 [00:21<17:41, 10.48it/s]

CHEMBL4365276  --  0
CHEMBL3056340  --  0
CHEMBL3056340  --  0


  2%|▏         | 225/11346 [00:22<17:53, 10.36it/s]

CHEMBL3131996  --  0
CHEMBL1936053  --  0
CHEMBL3135495  --  0


  2%|▏         | 229/11346 [00:22<18:39,  9.93it/s]

CHEMBL4388639  --  0
CHEMBL1936053  --  0
CHEMBL2443668  --  0


  2%|▏         | 231/11346 [00:22<18:22, 10.08it/s]

CHEMBL1936053  --  0
CHEMBL748927  --  0
CHEMBL3637831  --  0


  2%|▏         | 235/11346 [00:23<18:08, 10.20it/s]

CHEMBL4222624  --  0
CHEMBL1252095  --  0
CHEMBL2149219  --  0


  2%|▏         | 237/11346 [00:23<18:06, 10.22it/s]

CHEMBL3737394  --  0
CHEMBL3630438  --  0
CHEMBL1687502  --  0


  2%|▏         | 241/11346 [00:23<18:00, 10.28it/s]

CHEMBL1010779  --  0
CHEMBL3637832  --  0
CHEMBL2091603  --  0


  2%|▏         | 243/11346 [00:23<17:58, 10.29it/s]

CHEMBL4831379  --  0
CHEMBL2091604  --  0
CHEMBL3405737  --  0


  2%|▏         | 247/11346 [00:24<18:01, 10.26it/s]

CHEMBL4703225  --  0
CHEMBL4395371  --  0
CHEMBL3880091  --  0


  2%|▏         | 249/11346 [00:24<17:54, 10.32it/s]

CHEMBL5107643  --  0
CHEMBL4395372  --  0
CHEMBL4189875  --  0


  2%|▏         | 253/11346 [00:24<18:16, 10.12it/s]

CHEMBL4395370  --  0
CHEMBL3380429  --  0
CHEMBL4181648  --  0


  2%|▏         | 255/11346 [00:25<18:19, 10.09it/s]

CHEMBL4359366  --  0
CHEMBL3801596  --  0
CHEMBL3801592  --  0


  2%|▏         | 259/11346 [00:25<18:01, 10.25it/s]

CHEMBL4265148  --  0
CHEMBL4011977  --  0
CHEMBL4479191  --  0


  2%|▏         | 261/11346 [00:25<17:53, 10.33it/s]

CHEMBL1030446  --  0
CHEMBL3376392  --  0
CHEMBL3630322  --  0


  2%|▏         | 263/11346 [00:25<17:52, 10.34it/s]

CHEMBL3077826  --  0
CHEMBL746400  --  0
CHEMBL3866224  --  0


  2%|▏         | 267/11346 [00:26<18:06, 10.20it/s]

CHEMBL5146973  --  0
CHEMBL5389958  --  0
CHEMBL4032591  --  0


  2%|▏         | 271/11346 [00:26<18:04, 10.21it/s]

CHEMBL746399  --  0
CHEMBL3077826  --  0
CHEMBL746401  --  0


  2%|▏         | 273/11346 [00:26<18:02, 10.23it/s]

CHEMBL4011978  --  0
CHEMBL4771264  --  0
CHEMBL2394224  --  0


  2%|▏         | 277/11346 [00:27<17:59, 10.25it/s]

CHEMBL3870912  --  0
CHEMBL5625849  --  0
CHEMBL4059067  --  0


  2%|▏         | 279/11346 [00:27<17:44, 10.40it/s]

CHEMBL5625847  --  0
CHEMBL3073160  --  0
CHEMBL4255034  --  0


  2%|▏         | 283/11346 [00:27<17:27, 10.56it/s]

CHEMBL3380429  --  0
CHEMBL5246900  --  0
CHEMBL3637833  --  0


  3%|▎         | 285/11346 [00:28<17:34, 10.49it/s]

CHEMBL5144856  --  0
CHEMBL4771267  --  0
CHEMBL4771268  --  0


  3%|▎         | 289/11346 [00:28<17:50, 10.33it/s]

CHEMBL5246901  --  0
CHEMBL5246902  --  0
CHEMBL3135672  --  0


  3%|▎         | 292/11346 [00:28<18:37,  9.89it/s]

CHEMBL5374588  --  0
CHEMBL5374588  --  0
CHEMBL5262814  --  0


  3%|▎         | 294/11346 [00:28<18:09, 10.15it/s]

CHEMBL4703226  --  0
CHEMBL4831679  --  0





KeyboardInterrupt: 

In [None]:
# Per assay, the number of activities whose 'activity_comment' is different from 0
annotated_assay_ids = ChEMBL_.loc[ChEMBL_['activity_comment'] != 0, 'assay_chembl_id']
assays_with_annotations = dict(Counter(annotated_assay_ids))

In [24]:
# Inspect only the first annotated assay (in sorted ID order) and split its
# activities by 'activity_comment': 1 -> actives, 0 -> undefined, -1 -> inactives
for assay in sorted(assays_with_annotations)[:1]:

    assay_data = ChEMBL_.loc[ChEMBL_['assay_chembl_id'] == assay]
    comment_flag = assay_data['activity_comment']
    actives = assay_data[comment_flag == 1]
    undefined = assay_data[comment_flag == 0]
    inactives = assay_data[comment_flag == -1]

In [27]:
# Display the rows of the inspected assay whose 'activity_comment' equals 1
actives

Unnamed: 0,activity_id,assay_id,assay_chembl_id,assay_type,assay_confidence_score,assay_organism,tid,target_type,target_organism,target_chembl_id,...,canonical_smiles,MW,pchembl,activity_comment,standard_text,value,unit,activity_type,relation,pchembl_calculated
22496,6540023,752386,CHEMBL1794324,F,8,,103988,SINGLE PROTEIN,Mycobacterium tuberculosis,CHEMBL1741192,...,Cn1c(=O)c([N+](=O)[O-])c(-n2ccnc2)c2ccccc21,270.248,5.18,1,0,6.673,umol.L-1,AC50,=,5.175679
22497,6540024,752386,CHEMBL1794324,F,8,,103988,SINGLE PROTEIN,Mycobacterium tuberculosis,CHEMBL1741192,...,CCCCC(O)(CCCC)C(=O)NN(C(=O)CCl)c1ccccc1,354.878,4.82,1,0,15.040,umol.L-1,AC50,=,4.822752
22498,6540025,752386,CHEMBL1794324,F,8,,103988,SINGLE PROTEIN,Mycobacterium tuberculosis,CHEMBL1741192,...,COc1ccc(Nc2ccc(OC)cc2)cc1,229.279,,1,0,121.500,umol.L-1,AC50,=,3.915424
22499,6540026,752386,CHEMBL1794324,F,8,,103988,SINGLE PROTEIN,Mycobacterium tuberculosis,CHEMBL1741192,...,CCn1c(SCC(=O)Nc2nc(-c3ccc(C)cc3)cs2)nnc1-c1ccc...,451.577,5.46,1,0,3.460,umol.L-1,AC50,=,5.460924
22500,6540027,752386,CHEMBL1794324,F,8,,103988,SINGLE PROTEIN,Mycobacterium tuberculosis,CHEMBL1741192,...,O=C(Oc1ccc2ccccc2c1)c1ccco1,238.242,4.74,1,0,18.310,umol.L-1,AC50,=,4.737312
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24642,6596119,752386,CHEMBL1794324,F,8,,103988,SINGLE PROTEIN,Mycobacterium tuberculosis,CHEMBL1741192,...,O=S(=O)(CCSc1nc2ccccc2s1)c1nc2ccccc2s1,392.552,5.64,1,0,2.291,umol.L-1,AC50,=,5.639975
24645,6596122,752386,CHEMBL1794324,F,8,,103988,SINGLE PROTEIN,Mycobacterium tuberculosis,CHEMBL1741192,...,c1coc(-c2nnc(-c3ccco3)nn2)c1,214.184,5.24,1,0,5.786,umol.L-1,AC50,=,5.237622
24646,6596123,752386,CHEMBL1794324,F,8,,103988,SINGLE PROTEIN,Mycobacterium tuberculosis,CHEMBL1741192,...,COc1ccc(CC2=CC(=O)c3ccccc3C2=O)cc1,278.307,5.13,1,0,7.490,umol.L-1,AC50,=,5.125518
24647,6596124,752386,CHEMBL1794324,F,8,,103988,SINGLE PROTEIN,Mycobacterium tuberculosis,CHEMBL1741192,...,COc1ccc(N=C(S)NC(NC(=O)CCl)C(Cl)(Cl)Cl)c([N+](...,450.131,4.54,1,0,28.530,umol.L-1,AC50,=,4.544698


In [None]:
# Histogram of the 'value' column of the inspected assay, with bin edges every 5 units from -200 to 95
bin_edges = list(range(-200, 100, 5))
plt.hist(assay_data['value'].tolist(), bins=bin_edges)
plt.show()

In [None]:
# Binarization

# 1. Get direction
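# if dir_ == +1: values with relation '>' are treated as '=', values with relation '<' are set to the minimum
# if dir_ == -1: values with relation '<' are treated as '=', values with relation '>' are set to the maximum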

# 2. 