In [1]:
import pandas as pd
import numpy as np
from indigo import *

from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, Legend, ColumnDataSource
from bokeh.palettes import Category10, Category20, Plasma

from sklearn.cluster import KMeans, SpectralClustering

In [2]:
import os
import sys

# Put the directory three levels above the current working directory on
# sys.path (presumably so the project-local `api` package can be imported —
# confirm against the repository layout).
# The path is assembled from os.pardir components instead of a hand-written
# '..\..\..' literal: that literal only resolves where "\" is the path
# separator, and "\." is an invalid escape sequence in a regular string.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
from api.python.indigo.ml.clustering import clustering
from api.python.indigo.ml.manifold import reduce_dim

In [4]:
# Lookup table from a method name (as stored in CLUSTERING_METHOD*) to the
# scikit-learn estimator class that is handed to `clustering`.
methods = dict(
    SpectralClustering=SpectralClustering,
    KMeans=KMeans,
)

In [5]:
# Names of the dataset columns used throughout the notebook.
SMILES = "Structure"  # column holding the molecule structure strings
ASSAY_1 = "logP"  # value column for the first experiment
ASSAY_2 = "AdrA1A_PCHEMBL_VALUE"  # value column for the second experiment

# CSV file read into `df`; the path is relative to the working directory.
DS_PATH = "Adrenergic_dataset.csv"

In [6]:
indigo = Indigo()

# Configure the shared Indigo instance once. These options used to be re-set
# inside indigo_fingerprint_short on every call; they are applied to the same
# `indigo` object and stay in effect for later calls, so setting them per
# molecule was redundant work.
# Loading: tolerate stereochemistry and valence problems in the input.
indigo.setOption("ignore-stereochemistry-errors", True)
indigo.setOption("ignore-bad-valence", True)
# Fingerprinting: ECFP6 similarity type and the size of each fingerprint part.
indigo.setOption("similarity-type", "ecfp6")
indigo.setOption("fp-sim-qwords", 24)
indigo.setOption("fp-ord-qwords", 6)
indigo.setOption("fp-any-qwords", 6)
indigo.setOption("fp-tau-qwords", 3)
indigo.setOption("fp-ext-enabled", True)


def indigo_fingerprint_short(structure: str) -> np.ndarray:
    """Compute the Indigo "full" fingerprint of one molecule.

    The molecule is loaded with the module-level ``indigo`` instance (configured
    above), aromatized, and fingerprinted.

    Args:
        structure: Molecule description accepted by ``Indigo.loadMolecule``
            (the notebook passes the strings from the ``SMILES`` column).

    Returns:
        1-D ``uint8`` array viewing the raw bytes of the fingerprint buffer
        (one element per byte, not per bit).

    Raises:
        Whatever ``loadMolecule`` raises for input it cannot parse.
    """
    m = indigo.loadMolecule(structure)
    m.aromatize()
    return np.frombuffer(m.fingerprint("full").toBuffer(), dtype=np.uint8)

In [7]:
# Configure Bokeh so that show() renders plots inline in the notebook output.
output_notebook()

# Experiment for assay_1 values

In [8]:
# Load the full dataset once; the per-experiment frames are derived from df.
df = pd.read_csv(DS_PATH)

In [25]:
# Columns taken from the raw frame for the first experiment.
columns = [SMILES, "ID", ASSAY_1]

# Key into `methods`; set to "SpectralClustering" to try the other algorithm.
CLUSTERING_METHOD = "KMeans"
N_CLUSTERS = 8

# One colour per cluster label. The Bokeh palettes are looked up by palette
# size; two clusters are special-cased with an explicit red/blue pair.
if N_CLUSTERS == 2:
    colors = ["red", "blue"]
elif 2 < N_CLUSTERS <= 10:
    colors = Category10[N_CLUSTERS]
elif 10 < N_CLUSTERS <= 20:
    colors = Category20[N_CLUSTERS]
else:
    # Without this branch `colors` stayed undefined (or kept a stale value
    # from an earlier run) and the failure only surfaced at plotting time.
    raise ValueError(f"N_CLUSTERS must be between 2 and 20, got {N_CLUSTERS}")


In [13]:
# Keep only the columns of interest and drop rows with no ASSAY_1 value.
dataset = df[columns].dropna(subset=[ASSAY_1])
dataset.head()

Unnamed: 0,Structure,ID,logP
0,CC\C(=C(\CC)/c1ccc(O)cc1)\c2ccc(O)cc2,CHEMBL411,4.871
1,CSc1ccc2Sc3ccccc3N(CCC4CCCCN4C)c2c1,CHEMBL479,5.9
2,COc1cccc(CCN2C3C4C5CC6C7C5C3C7C2(O)C46)c1,CHEMBL2205811,3.04
3,COc1cc(CN[C@H]2C3C4CC5C6C4CC3C6C25)cc(OC)c1OC,CHEMBL2432051,3.38
4,COc1cccc(CCN2C3C4C5CC6C7C5C3C7C2(O)C46)c1OC,CHEMBL2205813,2.899


In [15]:
# Cluster on the assay value alone, passed as a single-column 2-D array.
assay_1_values = dataset[ASSAY_1].values.reshape(-1, 1)
clusters_1 = clustering(
    assay_1_values,
    method=methods[CLUSTERING_METHOD],
    n_clusters=N_CLUSTERS,
)

In [16]:
# One fingerprint per structure string, then reduce with a fixed random_state.
fingerprints = list(map(indigo_fingerprint_short, dataset[SMILES]))
coordinates = reduce_dim(fingerprints, random_state=0)

In [21]:
# Split the reduced coordinates into the two plotted axes.
x, y = [], []
for point in coordinates:
    x.append(point[0])
    y.append(point[1])

In [33]:
# Scatter of the reduced coordinates, coloured by the ASSAY_1 cluster label.
title_1 = f"Assay value: {ASSAY_1},   Clustering method: {CLUSTERING_METHOD},   Clusters: {N_CLUSTERS}"
point_colors_1 = [colors[label] for label in clusters_1]

p = figure(title=title_1)
p.circle(x, y, color=point_colors_1)
show(p)

# Experiment for assay_2 values

In [77]:
# Columns of interest for the second experiment.
cols = [SMILES, "ID", ASSAY_2]

# Key into `methods`; set to "KMeans" to try the other algorithm.
CLUSTERING_METHOD_2 = "SpectralClustering"
N_CLUSTERS_2 = 8

# One colour per cluster label. The Bokeh palettes are looked up by palette
# size; two clusters are special-cased with an explicit red/blue pair.
if N_CLUSTERS_2 == 2:
    colors_2 = ["red", "blue"]
elif 2 < N_CLUSTERS_2 <= 10:
    colors_2 = Category10[N_CLUSTERS_2]
elif 10 < N_CLUSTERS_2 <= 20:
    colors_2 = Category20[N_CLUSTERS_2]
else:
    # Without this branch `colors_2` stayed undefined (or kept a stale value
    # from an earlier run) and the failure only surfaced at plotting time.
    raise ValueError(f"N_CLUSTERS_2 must be between 2 and 20, got {N_CLUSTERS_2}")

In [69]:
# Keep only the columns of interest and drop rows with no ASSAY_2 value.
ds = df[[SMILES, "ID", ASSAY_2]].dropna(subset=[ASSAY_2])
ds.head()

Unnamed: 0,Structure,ID,AdrA1A_PCHEMBL_VALUE
7,CNC(=O)C(CCN1CCC(O)(CC1)c2ccc(Cl)cc2)(c3ccccc3...,CHEMBL1627,8.0
12,Cc1ccc2c(cccc2n1)N3CCN(CCc4cccc5c4OCc6c(ncn56)...,CHEMBL1241913,8.56
13,COc1ccccc1OCCNCC2CSC(S2)(c3ccccc3)c4ccccc4,CHEMBL1086156,7.45
16,O[C@H]1[C@H](CC[C@@H]1Oc2ccccc2)NC[C@H]3COc4cc...,CHEMBL135974,6.41
19,COc1cccc(OC)c1OCCNC[C@H]2COc3ccccc3O2,CHEMBL1182155,9.39


In [78]:
# Cluster on the assay value alone, passed as a single-column 2-D array.
assay_2_values = ds[ASSAY_2].values.reshape(-1, 1)
clusters_2 = clustering(
    assay_2_values,
    method=methods[CLUSTERING_METHOD_2],
    n_clusters=N_CLUSTERS_2,
)

In [75]:
# One fingerprint per structure string, then reduce with a fixed random_state.
fp = list(map(indigo_fingerprint_short, ds[SMILES]))
coords = reduce_dim(fp, random_state=0)

# Split the reduced coordinates into the two plotted axes.
x2, y2 = [], []
for point in coords:
    x2.append(point[0])
    y2.append(point[1])

In [79]:
# Scatter of the reduced coordinates, coloured by the ASSAY_2 cluster label.
title_2 = f"Assay value: {ASSAY_2},   Clustering method: {CLUSTERING_METHOD_2},   Clusters: {N_CLUSTERS_2}"
point_colors_2 = [colors_2[label] for label in clusters_2]

p = figure(title=title_2)
p.circle(x2, y2, color=point_colors_2)
show(p)