Repeat the unbalanced data modeling experiment using a "standard" approach: the `BalancedRandomForestClassifier` from [sklearn imbalanced](https://imbalanced-learn.readthedocs.io/en/stable/). This uses the approach suggested by Breiman et al.: Chen, Chao, Andy Liaw, and Leo Breiman. “Using random forest to learn imbalanced data.” University of California, Berkeley 110 (2004): 1-12

In [1]:
from rdkit import Chem, DataStructs
from rdkit.Chem import Draw
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem.Draw import IPythonConsole
from collections import defaultdict
import pandas as pd
import numpy as np
import gzip
import pickle
import matplotlib.pyplot as plt
%pylab inline

Populating the interactive namespace from numpy and matplotlib


Here's the function to build and evaluate a random forest for an assay. This won't build the best possible model for the assay - we aren't doing any parameter optimization and are always using the same fingerprints - but that's beside the point for this analysis

In [8]:
import sklearn
import random
from imblearn.ensemble import BalancedRandomForestClassifier
#from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

def run_it2(acts,inacts):
    # generate fingerprints for the actives and inactives and convert them to numpy arrays so that
    # we can learn from them. This uses the generalized fingerprinter added in the RDKit 2018.09 release
    act_data = []
    for i,fp in enumerate(rdFingerprintGenerator.GetFPs([Chem.MolFromSmiles(x) for x in acts['canonical_smiles']],fpType=rdFingerprintGenerator.MorganFP)):
        bv = np.zeros((len(fp),),np.int16)
        DataStructs.ConvertToNumpyArray(fp,bv)
        act_data.append((acts['compound_chembl_id'].iloc[i],bv,1))
    inact_data = []
    for i,fp in enumerate(rdFingerprintGenerator.GetFPs([Chem.MolFromSmiles(x) for x in inacts['canonical_smiles']],fpType=rdFingerprintGenerator.MorganFP)):
        bv = np.zeros((len(fp),),np.int16)
        DataStructs.ConvertToNumpyArray(fp,bv)
        inact_data.append((inacts['compound_chembl_id'].iloc[i],bv,0))

    # do a random, stratified 80/20 training/holdout split:
    random.seed(0xf00d)
    nAct = len(act_data)
    actis = list(range(nAct))
    nInact = len(inact_data)
    inactis = list(range(nInact))
    random.shuffle(actis)
    random.shuffle(inactis)

    train_pct = 0.80
    train = [act_data[x] for x in actis[:int(train_pct*nAct)]] + [inact_data[x] for x in inactis[:int(train_pct*nInact)]]
    test = [act_data[x] for x in actis[int(train_pct*nAct):]] + [inact_data[x] for x in inactis[int(train_pct*nInact):]]

    random.shuffle(train)
    random.shuffle(test)

    # build the random forest:
    cls = BalancedRandomForestClassifier(n_estimators=500,max_depth=15,min_samples_leaf=2,n_jobs=4,oob_score=True)
    cls.fit([y for x,y,z in train],[z for x,y,z in train])
    oob_auroc = metrics.roc_auc_score([z for x,y,z in train],[x[1] for x in cls.oob_decision_function_])

    # generate and show some evaluation stats for the model on the holdout data:
    scores = cls.predict([y for x,y,z in test])
    probs = cls.predict_proba([y for x,y,z in test])[:,1]
    auroc = metrics.roc_auc_score([z for x,y,z in test], probs)
    print('ratio: %.3f kappa: %.3f, AUC: %.3f, OOB_AUC: %.3f'%(nAct/nInact,metrics.cohen_kappa_score([z for x,y,z in test],scores),auroc,oob_auroc))
    print(metrics.confusion_matrix([z for x,y,z in test],scores))
    print(metrics.classification_report([z for x,y,z in test],scores))
    
    # to allow further analysis we return a tuple with:
    #  - predicted probabilities of being active (class 1) for the holdout set
    #  - actual values for the holdout set
    #  - (actual values for the training set, OOB predicted probabilities for the training set)
    return probs,[z for x,y,z in test],([z for x,y,z in train],cls.oob_decision_function_)

def run_it_(assay_id,acts,inacts,archive):
    print("--------- Default -----------")
    ps,tgt,oob_tpl = run_it2(acts,inacts)
    oob_tgt,oob_probs = oob_tpl
    oob_probs = [x[1] for x in oob_probs]
    auroc = metrics.roc_auc_score(oob_tgt, oob_probs)
    scores = [1 if x>=0.5 else 0 for x in ps]
    kappa = metrics.cohen_kappa_score(tgt,scores)
    confusion = metrics.confusion_matrix(tgt,scores)
    archive[assay_id].append((0.5,kappa,confusion,auroc))


# The serotonin Ki datasets
This is a set of data about binding to serotonin receptors exported from ChEMBL.

In [4]:
import pandas
with open('../data/serotonin_data.pkl','rb') as inf:
    serotonin_d,assay_lookup = pickle.load(inf)
for k,v in assay_lookup.items():
    print(k,v)
serotonin_d.head()

CHEMBL3371 ('Homo sapiens', 'Serotonin 6 (5-HT6) receptor')
CHEMBL224 ('Homo sapiens', 'Serotonin 2a (5-HT2a) receptor')
CHEMBL214 ('Homo sapiens', 'Serotonin 1a (5-HT1a) receptor')
CHEMBL3155 ('Homo sapiens', 'Serotonin 7 (5-HT7) receptor')
CHEMBL225 ('Homo sapiens', 'Serotonin 2c (5-HT2c) receptor')
CHEMBL1833 ('Homo sapiens', 'Serotonin 2b (5-HT2b) receptor')
CHEMBL1898 ('Homo sapiens', 'Serotonin 1b (5-HT1b) receptor')
CHEMBL1983 ('Homo sapiens', 'Serotonin 1d (5-HT1d) receptor')
CHEMBL1899 ('Homo sapiens', 'Serotonin 3a (5-HT3a) receptor')
CHEMBL1875 ('Homo sapiens', 'Serotonin 4 (5-HT4) receptor')
CHEMBL3426 ('Homo sapiens', 'Serotonin 5a (5-HT5a) receptor')
CHEMBL2182 ('Homo sapiens', 'Serotonin 1e (5-HT1e) receptor')
CHEMBL1805 ('Homo sapiens', 'Serotonin 1f (5-HT1f) receptor')


Unnamed: 0,standard_relation,standard_value,standard_type,pchembl_value,assay_chembl_id,target_chembl_id,canonical_smiles,compound_chembl_id
0,>,10000.0,Ki,,CHEMBL615807,CHEMBL214,CCN(C)C1CCCc2ccncc12.OC(=O)\C=C/C(=O)O.OC(=O)\...,CHEMBL1794855
1,=,168.0,Ki,6.78,CHEMBL615460,CHEMBL214,CCCN(CCC)[C@@H]1CCc2c(OC)cccc2[C@@H]1C,CHEMBL278751
2,=,181.0,Ki,6.74,CHEMBL615809,CHEMBL214,C(N1CCN(CC1)c2ncccn2)c3c[nH]c(n3)c4ccccc4,CHEMBL103772
3,=,3.9,Ki,8.41,CHEMBL615460,CHEMBL214,CCCN1CC[C@@H]2[C@H]1CCc3ccc4ccoc4c23,CHEMBL328107
4,=,1.6,Ki,8.8,CHEMBL615756,CHEMBL214,COc1cccc2OC[C@H](CN3[C@@H]4CC[C@H]3C[C@@](O)(C...,CHEMBL148860


In [10]:
def run_serotonin_assay(assay_id,d,assay_lookup,thresholds=None,archive=None):
    if archive is None:
        archive=defaultdict(list)
    # these need to be converted into a classification problem.
    # start with a pretty strict cutoff for active/inactive:
    assay = d.loc[d['target_chembl_id']==assay_id]
    acts = assay.loc[assay['pchembl_value']>9.0]
    if len(acts)>=50:
        inacts = assay.loc[assay['pchembl_value']<8.5]
    else:
        # but relax that if we don't end up with a reasonable number of actives:
        acts = assay.loc[assay['pchembl_value']>8.0]
        inacts = assay.loc[assay['pchembl_value']<7.5]
    print('*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*')
    print(f'assay_id {assay_id}, organism: {assay_lookup[assay_id][0]}, target: {assay_lookup[assay_id][1]}')
    run_it_(assay_id,acts,inacts,archive)
    print('*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*\n\n')


In [11]:
serotonin_archive = defaultdict(list)
tpls = sorted([(len(v),k) for k,v in serotonin_d.groupby('target_chembl_id').groups.items()],reverse=True)

for v,k in tpls:
    if v>900:
        run_serotonin_assay(k,serotonin_d,assay_lookup,archive=serotonin_archive)

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
assay_id CHEMBL3371, organism: Homo sapiens, target: Serotonin 6 (5-HT6) receptor
--------- Default -----------
ratio: 0.189 kappa: 0.684, AUC: 0.957, OOB_AUC: 0.981
[[384  44]
 [  7  74]]
              precision    recall  f1-score   support

           0       0.98      0.90      0.94       428
           1       0.63      0.91      0.74        81

   micro avg       0.90      0.90      0.90       509
   macro avg       0.80      0.91      0.84       509
weighted avg       0.93      0.90      0.91       509

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*


*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
assay_id CHEMBL224, organism: Homo sapiens, target: Serotonin 2a (5-HT2a) receptor
--------- Default -----------
ratio: 0.085 kappa: 0.550, AUC: 0.985, OOB_AUC: 0.988
[[441  55]
 [  1  42]]
              precision    recall  f1-score   support

           0       1.00      0.89      0.94       496
           1       0.

In [25]:
len(serotonin_archive)

6

# The "Dataset 1" datasets

Now we'll work with the "Dataset 1" datasets from the benchmarking set. These are collections of diverse actives from various ChEMBL targets together with assumed inactives pulled from an older version of ZINC.

In [12]:
import glob
datasets = glob.glob('../../Code/benchmarking_platform/compounds/ChEMBL/cmp_list*_actives.dat.gz')
print(len(datasets))
random.seed(0xf00d)
random.shuffle(datasets)
keep = datasets[:20]
keep[:5]

80


['../../Code/benchmarking_platform/compounds/ChEMBL/cmp_list_ChEMBL_10198_actives.dat.gz',
 '../../Code/benchmarking_platform/compounds/ChEMBL/cmp_list_ChEMBL_10980_actives.dat.gz',
 '../../Code/benchmarking_platform/compounds/ChEMBL/cmp_list_ChEMBL_11279_actives.dat.gz',
 '../../Code/benchmarking_platform/compounds/ChEMBL/cmp_list_ChEMBL_87_actives.dat.gz',
 '../../Code/benchmarking_platform/compounds/ChEMBL/cmp_list_ChEMBL_12252_actives.dat.gz']

In [13]:
import re
with gzip.open('../../Code/benchmarking_platform/compounds/ChEMBL/cmp_list_ChEMBL_zinc_decoys.dat.gz') as inf:
    inactive_df = pd.read_csv(inf,sep='\t')

chembl_active_sets = {}
for fn in datasets:
    nm = re.search(r'cmp_list_(ChEMBL_[0-9]*)_actives',fn).groups()[0]
    with gzip.open(fn) as inf:
        chembl_active_sets[nm] = pd.read_csv(inf,sep='\t')

In [14]:
def run_dataset1_assay(assay_id,chembl_active_sets,inactive_df,factor=20,thresholds=None,archive=None):
    if archive is None:
        archive=defaultdict(list)
    assay = chembl_active_sets[assay_id]
    acts = assay.rename(index=str,columns={'SMILES':'canonical_smiles','# _Name':'compound_chembl_id'})
    inacts = inactive_df.sample(n=factor*len(acts),random_state=hash(assay_id)%0xf00d).rename(index=str,columns={'SMILES':'canonical_smiles','# _Name':'compound_chembl_id'})

    print('*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*')
    print(f'assay_id {assay_id}')

    run_it_(assay_id,acts,inacts,archive)    
    print('*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*\n\n')


In [15]:
ds1_archive = defaultdict(list)
for k in chembl_active_sets:
    run_dataset1_assay(k,chembl_active_sets,inactive_df,archive=ds1_archive)

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
assay_id ChEMBL_10198
--------- Default -----------
ratio: 0.050 kappa: 0.884, AUC: 0.996, OOB_AUC: 0.999
[[400   0]
 [  4  16]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       400
           1       1.00      0.80      0.89        20

   micro avg       0.99      0.99      0.99       420
   macro avg       1.00      0.90      0.94       420
weighted avg       0.99      0.99      0.99       420

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*


*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
assay_id ChEMBL_10980
--------- Default -----------
ratio: 0.050 kappa: 0.495, AUC: 0.979, OOB_AUC: 0.998
[[370  30]
 [  2  18]]
              precision    recall  f1-score   support

           0       0.99      0.93      0.96       400
           1       0.38      0.90      0.53        20

   micro avg       0.92      0.92      0.92       420
   macro avg       0.68      0.91

ratio: 0.050 kappa: 0.729, AUC: 0.987, OOB_AUC: 1.000
[[388  12]
 [  1  19]]
              precision    recall  f1-score   support

           0       1.00      0.97      0.98       400
           1       0.61      0.95      0.75        20

   micro avg       0.97      0.97      0.97       420
   macro avg       0.81      0.96      0.86       420
weighted avg       0.98      0.97      0.97       420

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*


*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
assay_id ChEMBL_114
--------- Default -----------
ratio: 0.050 kappa: 0.492, AUC: 0.960, OOB_AUC: 0.995
[[375  25]
 [  4  16]]
              precision    recall  f1-score   support

           0       0.99      0.94      0.96       400
           1       0.39      0.80      0.52        20

   micro avg       0.93      0.93      0.93       420
   macro avg       0.69      0.87      0.74       420
weighted avg       0.96      0.93      0.94       420

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-

ratio: 0.050 kappa: 0.915, AUC: 0.999, OOB_AUC: 1.000
[[400   0]
 [  3  17]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       400
           1       1.00      0.85      0.92        20

   micro avg       0.99      0.99      0.99       420
   macro avg       1.00      0.93      0.96       420
weighted avg       0.99      0.99      0.99       420

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*


*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
assay_id ChEMBL_19905
--------- Default -----------
ratio: 0.050 kappa: 0.582, AUC: 0.972, OOB_AUC: 1.000
[[381  19]
 [  3  17]]
              precision    recall  f1-score   support

           0       0.99      0.95      0.97       400
           1       0.47      0.85      0.61        20

   micro avg       0.95      0.95      0.95       420
   macro avg       0.73      0.90      0.79       420
weighted avg       0.97      0.95      0.95       420

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-

ratio: 0.050 kappa: 0.973, AUC: 0.999, OOB_AUC: 1.000
[[400   0]
 [  1  19]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       400
           1       1.00      0.95      0.97        20

   micro avg       1.00      1.00      1.00       420
   macro avg       1.00      0.97      0.99       420
weighted avg       1.00      1.00      1.00       420

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*


*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
assay_id ChEMBL_11489
--------- Default -----------
ratio: 0.050 kappa: 0.332, AUC: 0.910, OOB_AUC: 0.993
[[360  40]
 [  6  14]]
              precision    recall  f1-score   support

           0       0.98      0.90      0.94       400
           1       0.26      0.70      0.38        20

   micro avg       0.89      0.89      0.89       420
   macro avg       0.62      0.80      0.66       420
weighted avg       0.95      0.89      0.91       420

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-

ratio: 0.050 kappa: 0.904, AUC: 1.000, OOB_AUC: 1.000
[[396   4]
 [  0  20]]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       400
           1       0.83      1.00      0.91        20

   micro avg       0.99      0.99      0.99       420
   macro avg       0.92      0.99      0.95       420
weighted avg       0.99      0.99      0.99       420

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*


*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
assay_id ChEMBL_12968
--------- Default -----------
ratio: 0.050 kappa: 1.000, AUC: 1.000, OOB_AUC: 1.000
[[400   0]
 [  0  20]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       400
           1       1.00      1.00      1.00        20

   micro avg       1.00      1.00      1.00       420
   macro avg       1.00      1.00      1.00       420
weighted avg       1.00      1.00      1.00       420

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-

ratio: 0.050 kappa: 0.632, AUC: 0.988, OOB_AUC: 0.999
[[381  19]
 [  1  19]]
              precision    recall  f1-score   support

           0       1.00      0.95      0.97       400
           1       0.50      0.95      0.66        20

   micro avg       0.95      0.95      0.95       420
   macro avg       0.75      0.95      0.81       420
weighted avg       0.97      0.95      0.96       420

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*


*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
assay_id ChEMBL_165
--------- Default -----------
ratio: 0.050 kappa: 0.323, AUC: 0.899, OOB_AUC: 0.995
[[346  54]
 [  3  17]]
              precision    recall  f1-score   support

           0       0.99      0.86      0.92       400
           1       0.24      0.85      0.37        20

   micro avg       0.86      0.86      0.86       420
   macro avg       0.62      0.86      0.65       420
weighted avg       0.96      0.86      0.90       420

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-

In [26]:
len(ds1_archive)

80

# PubChem HTS Validation assays found in ChEMBL

In [16]:
with open('../data/pubchem_data.pkl','rb') as inf:
    pubchem_d,pubchem_assay_lookup = pickle.load(inf)
tpls = sorted([(len(v),k) for k,v in pubchem_d.groupby('assay_chembl_id').groups.items()],reverse=True)
for k,v in pubchem_assay_lookup.items():
    print(k,v)
pubchem_d.head()

CHEMBL1794375 PUBCHEM_BIOASSAY: qHTS for inhibitors of binding or entry into cells for Marburg Virus. (Class of assay: confirmatory) [Related pubchem assays (depositor defined):AID463114, AID540249, AID540278]
CHEMBL1614421 PUBCHEM_BIOASSAY: qHTS for Inhibitors of Tau Fibril Formation, Thioflavin T Binding. (Class of assay: confirmatory) [Related pubchem assays: 596 ]
CHEMBL1614249 PUBCHEM_BIOASSAY: qHTS Assay for Identification of Novel General Anesthetics. In this assay, a GABAergic mimetic model system, apoferritin and a profluorescent 1-aminoanthracene ligand (1-AMA), was used to construct a competitive binding assay for identification of novel general anesthetics (Class of assay: confirmatory) [Related pubchem assays: 2385 (Probe Development Summary for Identification of Novel General Anesthetics), 2323 (Validation apoferritin assay run on SigmaAldrich LOPAC1280 collection)]
CHEMBL1614166 PubChem BioAssay. qHTS Assay for Inhibitors of MBNL1-poly(CUG) RNA binding.   (Class of assay

Unnamed: 0,canonical_smiles,compound_chembl_id,assay_chembl_id,standard_relation,Mean(standard_value),activity_comment
0,Br.Br.C(c1ccncc1)c2cnc[nH]2,CHEMBL1316355,CHEMBL1614421,=,44668.4,Inconclusive
1,Br.Br.NCCSC(=N)N,CHEMBL1256182,CHEMBL1614249,=,31622.8,Not Active
2,Br.Br.NCCSC(=N)N,CHEMBL1256182,CHEMBL1614364,=,446.7,Not Active
3,Br.Br.NCCSC(=N)N,CHEMBL1256182,CHEMBL1614421,=,17782.8,Inconclusive
4,Br.Br.NCCSC(=N)N,CHEMBL1256182,CHEMBL1794375,,35481.3,active


In [18]:
def run_pubchem_assay(assay_id,d,assay_lookup,thresholds=None,archive=None):
    if archive is None:
        archive=defaultdict(list)
    assay = d.loc[d['assay_chembl_id']==assay_id]
    acts = pd.concat((assay.loc[assay['activity_comment'] == 'Active'], 
                      assay.loc[assay['activity_comment'] == 'active']))
    inacts = pd.concat((assay.loc[assay['activity_comment'] == 'inactive'],
                        assay.loc[assay['activity_comment'] == 'inconclusive'], 
                        assay.loc[assay['activity_comment'] == 'Inconclusive'], 
                        assay.loc[assay['activity_comment'] == 'Not Active']))
    print('*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*')
    print(f'assay_id {assay_id}, description: {assay_lookup[assay_id]}')
    run_it_(assay_id,acts,inacts,archive)       
    print('*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*\n\n')


In [19]:
pubchem_archive = defaultdict(list)
for k in pubchem_assay_lookup:
    run_pubchem_assay(k,pubchem_d,pubchem_assay_lookup,archive=pubchem_archive)

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
assay_id CHEMBL1794375, description: PUBCHEM_BIOASSAY: qHTS for inhibitors of binding or entry into cells for Marburg Virus. (Class of assay: confirmatory) [Related pubchem assays (depositor defined):AID463114, AID540249, AID540278]
--------- Default -----------
ratio: 0.057 kappa: 0.086, AUC: 0.701, OOB_AUC: 0.872
[[10813  4925]
 [  356   542]]
              precision    recall  f1-score   support

           0       0.97      0.69      0.80     15738
           1       0.10      0.60      0.17       898

   micro avg       0.68      0.68      0.68     16636
   macro avg       0.53      0.65      0.49     16636
weighted avg       0.92      0.68      0.77     16636

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*


*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
assay_id CHEMBL1614421, description: PUBCHEM_BIOASSAY: qHTS for Inhibitors of Tau Fibril Formation, Thioflavin T Binding. (Class of assay: confirmatory) [Related

In [24]:
pubchem_archive.items()

dict_items([('CHEMBL1794375', [(0.5, 0.08550788613166005, array([[10813,  4925],
       [  356,   542]]), 0.8718738971125722)]), ('CHEMBL1614421', [(0.5, 0.2823561168797003, array([[6380, 2305],
       [ 256,  867]]), 0.9157041564063693)]), ('CHEMBL1614249', [(0.5, 0.014754218000767594, array([[5946, 2410],
       [  18,   33]]), 0.9627648625198217)]), ('CHEMBL1614166', [(0.5, 0.016042154888835203, array([[5202, 1619],
       [   2,   18]]), 0.9952373030776374)]), ('CHEMBL1614364', [(0.5, 0.1673907131531398, array([[1647,  537],
       [  73,  112]]), 0.9333826956465223)]), ('CHEMBL1613933', [(0.5, 1.0, array([[1430,    0],
       [   0,    1]]), 1.0)]), ('CHEMBL3214913', [(0.5, 0.3877967832524891, array([[532, 168],
       [ 50, 130]]), 0.9455255418604375)]), ('CHEMBL3215169', [(0.5, 0.2815568228584866, array([[385, 150],
       [ 38,  77]]), 0.9354770813844716)])])

In [28]:
len(pubchem_archive)

8

# ChEMBL DrugMatrix assays

In [20]:
with open('../data/drugmatrix_data.pkl','rb') as inf:
    drugmatrix_d,drugmatrix_assay_lookup = pickle.load(inf)
for k,v in drugmatrix_assay_lookup.items():
    print(k,v)
drugmatrix_d.head()

CHEMBL1909215 DRUGMATRIX: Adenosine A3 radioligand binding (ligand: AB-MECA)
CHEMBL1909211 DRUGMATRIX: Serotonin (5-Hydroxytryptamine) 5-HT2A radioligand binding (ligand: [3H] Ketanserin)
CHEMBL1909210 DRUGMATRIX: Serotonin (5-Hydroxytryptamine) 5-HT1B radioligand binding (ligand: [125I] Cyanopindolol)
CHEMBL1909209 DRUGMATRIX: Serotonin (5-Hydroxytryptamine) 5-HT1A radioligand binding (ligand: [3H] 8-OH-DPAT)
CHEMBL1909204 DRUGMATRIX: Protein Tyrosine Kinase, Fyn enzyme inhibition (substrate: Poly(Glu:Tyr))
CHEMBL1909203 DRUGMATRIX: Protein Tyrosine Kinase, EGF Receptor enzyme inhibition (substrate: Poly(Glu:Tyr))
CHEMBL1909191 DRUGMATRIX: Progesterone radioligand binding (ligand: [3H] R-5020)
CHEMBL1909174 DRUGMATRIX: Muscarinic M5 radioligand binding (ligand: [3H] N-Methylscopolamine)
CHEMBL1909173 DRUGMATRIX: Muscarinic M4 radioligand binding (ligand: [3H] N-Methylscopolamine)
CHEMBL1909172 DRUGMATRIX: Muscarinic M3 radioligand binding (ligand: [3H] N-Methylscopolamine)
CHEMBL19091

Unnamed: 0,canonical_smiles,compound_chembl_id,assay_chembl_id,standard_relation,Mean(standard_value),activity_comment
0,Br.CN1[C@@H]2CC[C@H]1C[C@H](C2)OC(=O)C(O)c3ccccc3,CHEMBL1319362,CHEMBL1909097,,,Not Active (inhibition < 50% @ 10 uM and thus ...
1,Br.CN1[C@@H]2CC[C@H]1C[C@H](C2)OC(=O)C(O)c3ccccc3,CHEMBL1319362,CHEMBL1909212,,,Not Active (inhibition < 50% @ 10 uM and thus ...
2,Br.CN1[C@@H]2CC[C@H]1C[C@H](C2)OC(=O)C(O)c3ccccc3,CHEMBL1319362,CHEMBL1909213,,,Not Active (inhibition < 50% @ 10 uM and thus ...
3,Br.CN1[C@@H]2CC[C@H]1C[C@H](C2)OC(=O)C(O)c3ccccc3,CHEMBL1319362,CHEMBL1909214,,,Not Active (inhibition < 50% @ 10 uM and thus ...
4,Br.CN1[C@@H]2CC[C@H]1C[C@H](C2)OC(=O)C(O)c3ccccc3,CHEMBL1319362,CHEMBL1909215,,,Not Active (inhibition < 50% @ 10 uM and thus ...


In [21]:
from collections import defaultdict
def run_drugmatrix_assay(assay_id,d,assay_lookup,thresholds=None,archive=None):
    if archive is None:
        archive=defaultdict(list)
    assay = d.loc[d['assay_chembl_id']==assay_id]
    inact_indices = [x for x,y in enumerate(assay['activity_comment']) if y.find('Not Active')==0]
    act_indices = [x for x,y in enumerate(assay['activity_comment']) if y.find('Active')==0]
    acts = assay.iloc[act_indices]
    inacts = assay.iloc[inact_indices]
    print('*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*')

    if(len(act_indices)<40):
        print(f'>>>>> SKIPPING {assay_id} DUE TO INSUFFICIENT ACTIVES <<<<<< ')
        print('*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*\n\n')
        return

    print(f'assay_id {assay_id}, description: {assay_lookup[assay_id]}')
    run_it_(assay_id,acts,inacts,archive)       

    print('*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*\n\n')


In [22]:
drugmatrix_archive = defaultdict(list)
for k in drugmatrix_assay_lookup:
    run_drugmatrix_assay(k,drugmatrix_d,drugmatrix_assay_lookup,archive=drugmatrix_archive)

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
assay_id CHEMBL1909215, description: DRUGMATRIX: Adenosine A3 radioligand binding (ligand: AB-MECA)
--------- Default -----------
ratio: 0.073 kappa: 0.249, AUC: 0.840, OOB_AUC: 0.990
[[133  24]
 [  5   7]]
              precision    recall  f1-score   support

           0       0.96      0.85      0.90       157
           1       0.23      0.58      0.33        12

   micro avg       0.83      0.83      0.83       169
   macro avg       0.59      0.72      0.61       169
weighted avg       0.91      0.83      0.86       169

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*


*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
assay_id CHEMBL1909211, description: DRUGMATRIX: Serotonin (5-Hydroxytryptamine) 5-HT2A radioligand binding (ligand: [3H] Ketanserin)
--------- Default -----------
ratio: 0.127 kappa: 0.551, AUC: 0.915, OOB_AUC: 0.990
[[133  17]
 [  3  16]]
              precision    recall  f1-score   support

      

ratio: 0.078 kappa: 0.343, AUC: 0.869, OOB_AUC: 0.987
[[132  25]
 [  3  10]]
              precision    recall  f1-score   support

           0       0.98      0.84      0.90       157
           1       0.29      0.77      0.42        13

   micro avg       0.84      0.84      0.84       170
   macro avg       0.63      0.80      0.66       170
weighted avg       0.92      0.84      0.87       170

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*


*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
assay_id CHEMBL1909157, description: DRUGMATRIX: Histamine H2 radioligand binding (ligand: [125I] Aminopotentidine)
--------- Default -----------
ratio: 0.054 kappa: 0.370, AUC: 0.916, OOB_AUC: 0.972
[[139  21]
 [  1   8]]
              precision    recall  f1-score   support

           0       0.99      0.87      0.93       160
           1       0.28      0.89      0.42         9

   micro avg       0.87      0.87      0.87       169
   macro avg       0.63      0.88      0.67  

ratio: 0.105 kappa: 0.392, AUC: 0.870, OOB_AUC: 0.989
[[130  23]
 [  4  12]]
              precision    recall  f1-score   support

           0       0.97      0.85      0.91       153
           1       0.34      0.75      0.47        16

   micro avg       0.84      0.84      0.84       169
   macro avg       0.66      0.80      0.69       169
weighted avg       0.91      0.84      0.86       169

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*


*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
assay_id CHEMBL1909115, description: DRUGMATRIX: Androgen (Testosterone) AR radioligand binding (ligand: [3H] Mibolerone)
--------- Default -----------
ratio: 0.067 kappa: 0.180, AUC: 0.646, OOB_AUC: 0.977
[[147  11]
 [  8   3]]
              precision    recall  f1-score   support

           0       0.95      0.93      0.94       158
           1       0.21      0.27      0.24        11

   micro avg       0.89      0.89      0.89       169
   macro avg       0.58      0.60      

ratio: 0.141 kappa: 0.496, AUC: 0.887, OOB_AUC: 0.982
[[127  21]
 [  4  17]]
              precision    recall  f1-score   support

           0       0.97      0.86      0.91       148
           1       0.45      0.81      0.58        21

   micro avg       0.85      0.85      0.85       169
   macro avg       0.71      0.83      0.74       169
weighted avg       0.90      0.85      0.87       169

*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*


*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
assay_id CHEMBL1909088, description: DRUGMATRIX: Alpha-2A adrenergic receptor radioligand binding (ligand: MK-912)
--------- Default -----------
ratio: 0.150 kappa: 0.487, AUC: 0.877, OOB_AUC: 0.989
[[122  25]
 [  3  19]]
              precision    recall  f1-score   support

           0       0.98      0.83      0.90       147
           1       0.43      0.86      0.58        22

   micro avg       0.83      0.83      0.83       169
   macro avg       0.70      0.85      0.74   

In [27]:
len(drugmatrix_archive)

44

In [23]:
pickle.dump((drugmatrix_archive,pubchem_archive,ds1_archive,serotonin_archive),
            open('../data/unbalanced_data_results.sklearn.pkl','wb+'))