In [3]:
# NOTE(review): the wildcard import is what brings `Indigo` into scope here;
# `from indigo import Indigo` would make that dependency explicit — confirm no
# other cell relies on additional names pulled in by the star import.
from indigo import *

# One Indigo session object for the notebook; later cells call its methods
# (version, setOption, loadMolecule).
indigo = Indigo()

In [4]:
# Report which Indigo build the session object is backed by.
print(f"Indigo version {indigo.version()}")

Indigo version 1.5.0-g1aef03cac-x86_64-win-msvc-1929


In [5]:
# pip install pandas

In [16]:
import pandas as pd
import numpy as np
from pathlib import Path
# Disables pandas' chained-assignment (SettingWithCopy) warning for the whole
# session. Later cells add an "ECFP" column to frames that were sliced out of
# `dataset`, which is the situation this warning is about.
pd.options.mode.chained_assignment = None

In [7]:
# Load the full source table; the relative path is resolved against the
# current working directory.
dataset = pd.read_csv(filepath_or_buffer="Adrenergic_dataset.csv")

In [8]:
# Columns needed for the logP model. The explicit .copy() makes ds_logp an
# independent frame, so the column added to it in a later cell is written to
# this frame unambiguously instead of relying on the chained-assignment
# warning being switched off.
ds_logp = dataset[["Structure", "ID", "logP"]].copy()
ds_logp.head()

Unnamed: 0,Structure,ID,logP
0,CC\C(=C(\CC)/c1ccc(O)cc1)\c2ccc(O)cc2,CHEMBL411,4.871
1,CSc1ccc2Sc3ccccc3N(CCC4CCCCN4C)c2c1,CHEMBL479,5.9
2,COc1cccc(CCN2C3C4C5CC6C7C5C3C7C2(O)C46)c1,CHEMBL2205811,3.04
3,COc1cc(CN[C@H]2C3C4CC5C6C4CC3C6C25)cc(OC)c1OC,CHEMBL2432051,3.38
4,COc1cccc(CCN2C3C4C5CC6C7C5C3C7C2(O)C46)c1OC,CHEMBL2205813,2.899


In [9]:
# Rows that have an AdrA1A_PCHEMBL_VALUE, restricted to the columns needed for
# the activity model. Row filter and column selection are done in a single
# .loc call (no chained indexing), and the index is renumbered from 0 without
# in-place mutation; the cross-validation cell later addresses rows of this
# frame by fold position, so a gap-free 0..n-1 index matters.
ds_adra = dataset.loc[
    dataset["AdrA1A_PCHEMBL_VALUE"].notna(),
    ["Structure", "ID", "AdrA1A_PCHEMBL_VALUE"],
].reset_index(drop=True)
ds_adra.head()

Unnamed: 0,Structure,ID,AdrA1A_PCHEMBL_VALUE
0,CNC(=O)C(CCN1CCC(O)(CC1)c2ccc(Cl)cc2)(c3ccccc3...,CHEMBL1627,8.0
1,Cc1ccc2c(cccc2n1)N3CCN(CCc4cccc5c4OCc6c(ncn56)...,CHEMBL1241913,8.56
2,COc1ccccc1OCCNCC2CSC(S2)(c3ccccc3)c4ccccc4,CHEMBL1086156,7.45
3,O[C@H]1[C@H](CC[C@@H]1Oc2ccccc2)NC[C@H]3COc4cc...,CHEMBL135974,6.41
4,COc1cccc(OC)c1OCCNC[C@H]2COc3ccccc3O2,CHEMBL1182155,9.39


In [10]:
def indigo_fingerprint_short(structure: str) -> np.ndarray:
    """Return the Indigo "full" fingerprint of a structure as a byte vector.

    The structure is loaded through the notebook-level ``indigo`` session,
    aromatized, and fingerprinted with ECFP6 as the similarity type and the
    per-section sizes configured below.

    Note:
        Every option is set on the shared ``indigo`` session, so the settings
        remain in effect for any later use of that session.

    Args:
        structure: Molecule text accepted by ``indigo.loadMolecule``; the
            notebook passes the values of the "Structure" column.

    Returns:
        A 1-D ``uint8`` array over the bytes of the fingerprint buffer.
    """
    # Be lenient while parsing so questionable stereo/valence input does not
    # abort the whole featurization run.
    indigo.setOption("ignore-stereochemistry-errors", True)
    indigo.setOption("ignore-bad-valence", True)
    m = indigo.loadMolecule(structure)
    m.aromatize()

    # Fingerprint configuration: ECFP6 similarity part plus explicit sizes
    # (in qwords) for the individual fingerprint sections.
    indigo.setOption("similarity-type", "ecfp6")
    indigo.setOption("fp-sim-qwords", 24)
    indigo.setOption("fp-ord-qwords", 6)
    indigo.setOption("fp-any-qwords", 6)
    indigo.setOption("fp-tau-qwords", 3)
    indigo.setOption("fp-ext-enabled", True)

    # The notebook imports numpy under the alias `np`; the bare name `numpy`
    # is never bound, so referring to it here raised NameError on a fresh run.
    fp_bytes = np.frombuffer(m.fingerprint("full").toBuffer(), dtype=np.uint8)
    return fp_bytes

In [17]:
# Fingerprint every structure of both modelling frames and keep the resulting
# byte vectors in a new "ECFP" column of each frame.
ds_logp["ECFP"] = list(map(indigo_fingerprint_short, ds_logp["Structure"]))
ds_adra["ECFP"] = list(map(indigo_fingerprint_short, ds_adra["Structure"]))
ds_logp.head()

Unnamed: 0,Structure,ID,logP,ECFP
0,CC\C(=C(\CC)/c1ccc(O)cc1)\c2ccc(O)cc2,CHEMBL411,4.871,"[3, 0, 0, 77, 70, 218, 234, 87, 218, 228, 213,..."
1,CSc1ccc2Sc3ccccc3N(CCC4CCCCN4C)c2c1,CHEMBL479,5.9,"[15, 3, 0, 255, 254, 254, 254, 253, 63, 251, 2..."
2,COc1cccc(CCN2C3C4C5CC6C7C5C3C7C2(O)C46)c1,CHEMBL2205811,3.04,"[7, 0, 0, 255, 95, 191, 254, 253, 122, 255, 25..."
3,COc1cc(CN[C@H]2C3C4CC5C6C4CC3C6C25)cc(OC)c1OC,CHEMBL2432051,3.38,"[7, 0, 0, 252, 223, 214, 239, 221, 94, 254, 28..."
4,COc1cccc(CCN2C3C4C5CC6C7C5C3C7C2(O)C46)c1OC,CHEMBL2205813,2.899,"[7, 0, 0, 255, 95, 255, 254, 253, 123, 255, 25..."


In [12]:
# Preview the first five rows of the activity frame.
ds_adra.head(n=5)

Unnamed: 0,Structure,ID,AdrA1A_PCHEMBL_VALUE,ECFP
0,CNC(=O)C(CCN1CCC(O)(CC1)c2ccc(Cl)cc2)(c3ccccc3...,CHEMBL1627,8.0,"[15, 0, 0, 231, 126, 191, 255, 223, 62, 254, 2..."
1,Cc1ccc2c(cccc2n1)N3CCN(CCc4cccc5c4OCc6c(ncn56)...,CHEMBL1241913,8.56,"[31, 0, 0, 247, 255, 235, 255, 253, 255, 191, ..."
2,COc1ccccc1OCCNCC2CSC(S2)(c3ccccc3)c4ccccc4,CHEMBL1086156,7.45,"[7, 3, 0, 255, 239, 251, 227, 221, 251, 223, 1..."
3,O[C@H]1[C@H](CC[C@@H]1Oc2ccccc2)NC[C@H]3COc4cc...,CHEMBL135974,6.41,"[39, 0, 0, 84, 102, 219, 43, 221, 253, 255, 76..."
4,COc1cccc(OC)c1OCCNC[C@H]2COc3ccccc3O2,CHEMBL1182155,9.39,"[99, 0, 0, 124, 230, 211, 105, 221, 213, 91, 7..."


In [13]:
# pip install -U scikit-learn

In [14]:
# Print the installed scikit-learn release used for the models below.
import sklearn

print(f"{sklearn.__version__}")

1.0.1


In [27]:
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

# 5-fold cross-validation of a random forest that predicts logP from the
# fingerprint bytes; prints the R^2 of every fold and their mean.
scores = []
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# Build the feature matrix once: one row per molecule, one column per
# fingerprint byte. Plain arrays are then sliced by the positional indices
# KFold yields, so the result does not depend on the frame's index labels.
X = np.stack(ds_logp["ECFP"].tolist())
y = ds_logp["logP"].to_numpy()

for train_index, test_index in kf.split(X):
    # Seed the forest as well as the splitter so a re-run reproduces the scores.
    model_logP = RandomForestRegressor(
        n_estimators=100, max_features=None, random_state=42
    ).fit(X[train_index], y[train_index])
    score = r2_score(y[test_index], model_logP.predict(X[test_index]))
    print(score, "\n")
    scores.append(score)

# `np` is the alias the notebook imports numpy under; the bare name `numpy`
# is not bound and raised NameError on a fresh kernel.
print("MEAN:", np.mean(scores))

0.7060103589187897 

0.7141735366955138 

0.7016053047427632 

0.7115975562008005 

0.7008605507177861 

MEAN: 0.7068494614551306


In [24]:
# 5-fold cross-validation of a random forest that predicts the AdrA1A pChEMBL
# value from the fingerprint bytes; prints the R^2 of every fold and their mean.
# KFold, r2_score and RandomForestRegressor come from the imports in the
# preceding cross-validation cell.
scores = []

kf = KFold(n_splits=5, random_state=42, shuffle=True)

# Build the feature matrix once: one row per molecule, one column per
# fingerprint byte. Plain arrays are then sliced by the positional indices
# KFold yields, so the result does not depend on the frame's index labels.
X = np.stack(ds_adra["ECFP"].tolist())
y = ds_adra["AdrA1A_PCHEMBL_VALUE"].to_numpy()

for train_index, test_index in kf.split(X):
    # Seed the forest as well as the splitter so a re-run reproduces the scores.
    model_Adr = RandomForestRegressor(
        n_estimators=100, max_features=None, random_state=42
    ).fit(X[train_index], y[train_index])
    score = r2_score(y[test_index], model_Adr.predict(X[test_index]))
    print(score, "\n")
    scores.append(score)

# `np` is the alias the notebook imports numpy under; the bare name `numpy`
# is not bound and raised NameError on a fresh kernel.
print("MEAN:", np.mean(scores))

0.5217329452018178 

0.5219955199242897 

0.6126633549067497 

0.5036185313364065 

0.5266338334538623 

MEAN: 0.5373288369646253
