In [2]:
import pandas as pd, numpy as np

In [3]:
# Load the dataset; later cells read its `SMILES` and `label` columns.
data = pd.read_csv("data/curedData.csv")

## Scaffolding

In [7]:
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold

In [8]:
# Derive the Murcko scaffold SMILES of every molecule and store it next to the
# original SMILES.
data["scaff"] = data["SMILES"].map(
    lambda smi: MurckoScaffold.MurckoScaffoldSmiles(smiles=smi)
)

In [9]:
# Show the frame, now including the `scaff` column.
data

Unnamed: 0,SMILES,label,scaff
0,BrCCBr,1,
1,Brc1c(Br)c(Br)c(Oc2c(Br)c(Br)c(Br)c(Br)c2Br)c(...,0,c1ccc(Oc2ccccc2)cc1
2,Brc1cc(-c2ccc(Br)c(Br)c2Br)cc(Br)c1Br,1,c1ccc(-c2ccccc2)cc1
3,Brc1cc(Br)c(-c2cc(Br)c(Br)cc2Br)cc1Br,1,c1ccc(-c2ccccc2)cc1
4,Brc1cc(Oc2cc(Br)c(Br)c(Br)c2Br)c(Br)c(Br)c1Br,1,c1ccc(Oc2ccccc2)cc1
...,...,...,...
728,c1ccc2[nH]c(-c3cscn3)nc2c1,1,c1ccc2[nH]c(-c3cscn3)nc2c1
729,c1ccc2c(c1)cc1ccc3cccc4ccc2c1c34,1,c1ccc2c(c1)cc1ccc3cccc4ccc2c1c34
730,c1cncc(C2=NCCCC2)c1,1,c1cncc(C2=NCCCC2)c1
731,c1cncc(C2CCCCN2)c1,1,c1cncc(C2CCCCN2)c1


In [10]:
# Distinct scaffold SMILES, in order of first appearance.
scaffolds = data["scaff"].unique()

In [14]:
from rdkit.Chem.rdmolops import PatternFingerprint
from rdkit.DataStructs import TanimotoSimilarity
from itertools import combinations

In [12]:
# One RDKit pattern fingerprint per unique scaffold.
# The Series is indexed by the scaffold SMILES so fingerprints can be looked up
# by scaffold (e.g. `fpPattern[[a, b]]`); with the default integer index such a
# label lookup raises KeyError.
fpPattern = pd.Series(
    [PatternFingerprint(Chem.MolFromSmiles(s)) for s in scaffolds],
    index=scaffolds,
)

In [15]:
## This calculation takes a while: one Tanimoto evaluation per scaffold pair ##

# Work positionally: `fpPattern` was built in the same order as `scaffolds`,
# so position i in one corresponds to position i in the other. This avoids
# looking fingerprints up by SMILES label and avoids slow per-cell DataFrame
# writes inside the O(n^2) loop.
fingerprints = list(fpPattern)
nScaffolds = len(scaffolds)

# Symmetric similarity (adjacency) matrix; each scaffold has similarity 1 with itself.
similarityMatrix = np.identity(nScaffolds)

# computing similarity values for every pair of scaffolds
for i, j in combinations(range(nScaffolds), 2):
    similarityValue = TanimotoSimilarity(fingerprints[i], fingerprints[j])
    similarityMatrix[i, j] = similarityValue
    similarityMatrix[j, i] = similarityValue

mutualSimilarities = pd.DataFrame(similarityMatrix, index=scaffolds, columns=scaffolds)

## Clustering

In [20]:
import networkx as nx
from networkx.algorithms.community.louvain import louvain_communities

In [21]:
# Interpret the similarity matrix as a weighted adjacency matrix: scaffolds
# become nodes and pairwise similarities become edge weights.
# NOTE(review): the matrix diagonal is 1.0, which presumably adds a self-loop
# to every node — confirm this is intended for the community detection below.
G = nx.from_pandas_adjacency(mutualSimilarities)

In [73]:
# The default resolution (1.0) returns one community with too few members, so
# a slightly lower value is used.
# Louvain is a randomized algorithm: pin the seed so that community membership
# (and therefore the cross-validation folds derived from it) is identical on
# every run.
comms = louvain_communities(G, resolution=.95, seed=0)

In [74]:
# Number of communities found.
len(comms)

3

In [37]:
# Invert the community list into a lookup table: scaffold -> community number.
communities_id = {
    scaffold: commNumber
    for commNumber, members in enumerate(comms)
    for scaffold in members
}

In [38]:
# Tag every molecule with the community of its scaffold.
data["commIdx"] = data["scaff"].map(communities_id.get)

In [39]:
# Number of molecules assigned to each community.
data.commIdx.value_counts()

2    399
1    196
0    138
Name: commIdx, dtype: int64

In [40]:
# Mapping: community id -> array of integer row positions of its molecules.
commGroup = data.groupby("commIdx").indices

In [42]:
# Leave-one-cluster-out assignment table: one column per community (fold).
# Every row starts as "train"; the rows belonging to a column's community are
# then flipped to "test".
# The frame is filled at construction time instead of creating an empty frame
# and calling DataFrame.applymap, which is deprecated in recent pandas.
CVclusterData = pd.DataFrame(
    "train",
    index=range(len(data)),
    columns=range(len(commGroup)),
)
for col, idxs in commGroup.items():
    # idxs are row positions; they coincide with the labels of the range index above.
    CVclusterData.loc[idxs, col] = "test"

In [45]:
# Show the train/test assignment of every row for each fold.
CVclusterData

Unnamed: 0,0,1,2
0,test,train,train
1,train,train,test
2,train,train,test
3,train,train,test
4,train,train,test
...,...,...,...
728,train,train,test
729,train,test,train
730,train,train,test
731,train,train,test


In [65]:
# Class balance of the test rows of fold 2.
# Uses data.label directly: `Y` is only assigned in a later cell, so referring
# to it here fails when the notebook is run top to bottom.
data.label.loc[CVclusterData[2]=="test"].value_counts()

1    341
0     58
Name: label, dtype: int64

In [72]:
# Class balance of the test rows of fold 1.
# Uses data.label directly: `Y` is only assigned in a later cell, so referring
# to it here fails when the notebook is run top to bottom.
data.label.loc[CVclusterData[1]=="test"].value_counts()

1    146
0     50
Name: label, dtype: int64

In [71]:
# Class balance of the test rows of fold 0.
# Uses data.label directly: `Y` is only assigned in a later cell, so referring
# to it here fails when the notebook is run top to bottom.
data.label.loc[CVclusterData[0]=="test"].value_counts()

1    122
0     16
Name: label, dtype: int64

## CV leave-one-cluster-out

In [47]:
from consensusModel import clsf

In [48]:
# Feature matrix used to fit the classifiers.
# NOTE(review): assumed to be row-aligned with `data` — confirm.
X = pd.read_csv("data/trainMatrix.csv")

In [49]:
# Target vector: the `label` column of the dataset.
Y = data.label

In [77]:
# Reload the train/test assignment table from disk.
# NOTE(review): this notebook does not show the cell that writes the file;
# presumably it is the table built above, saved with to_csv — confirm.
CVclusterData = pd.read_csv("data/scaffoldsCluster.csv", index_col=0)
# read_csv returns header labels as strings ("0", "1", ...), whereas the folds
# are looked up by integer (CVclusterData[fold]); restore integer labels so
# those lookups do not raise KeyError.
CVclusterData.columns = CVclusterData.columns.astype(int)

In [50]:
# Leave-one-cluster-out cross-validation: for every fold, fit each classifier
# on the "train" rows and store its predicted probability for the "test" rows.
folds = len(CVclusterData.columns)

# One column per (fold, classifier). Rows outside a fold's test set stay NaN.
# Allocated as float so later averaging does not operate on object columns.
allScores = pd.DataFrame(
    np.nan,
    index=range(len(Y)),
    columns=pd.MultiIndex.from_product((range(folds), clsf.keys())),
    dtype=float,
)

for fold in range(folds):
    # Pick the fold column by position so this works whether its label is an
    # int (table built in memory) or a string (table reloaded from CSV).
    assignment = CVclusterData.iloc[:, fold].to_numpy()
    train = np.flatnonzero(assignment == "train")
    val = np.flatnonzero(assignment == "test")

    # `train` / `val` hold row positions, so index positionally.
    Xtrain = X.iloc[train].values
    Ytrain = Y.iloc[train]
    Xval = X.iloc[val].values

    for cName, cObj in clsf.items():
        cObj.fit(Xtrain, Ytrain)
        # Second column of predict_proba: probability of the classifier's
        # second class (label 1 for a 0/1 target).
        prob = cObj.predict_proba(Xval)[:, 1]

        # allScores has a range index, so positions in `val` are valid labels.
        allScores.loc[val, (fold, cName)] = prob

In [51]:
# Per-classifier scores; each row is filled only in the fold where it was a test row.
allScores

Unnamed: 0_level_0,0,0,0,0,0,1,1,1,1,1,2,2,2,2,2
Unnamed: 0_level_1,RF,SVM,XGB,KNN,ADA,RF,SVM,XGB,KNN,ADA,RF,SVM,XGB,KNN,ADA
0,0.964167,0.912627,0.967284,1.0,0.513374,,,,,,,,,,
1,,,,,,,,,,,0.896667,0.924781,0.887849,0.8,0.609905
2,,,,,,,,,,,0.957333,0.886215,0.822383,1.0,0.506833
3,,,,,,,,,,,0.957333,0.886215,0.822383,1.0,0.506833
4,,,,,,,,,,,0.896667,0.924781,0.887849,0.8,0.609905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
728,,,,,,,,,,,0.790667,0.532786,0.822383,0.8,0.40628
729,,,,,,0.89,0.893714,0.936344,1.0,0.511162,,,,,
730,,,,,,,,,,,0.712833,0.529798,0.773416,0.6,0.414902
731,,,,,,,,,,,0.674167,0.55075,0.357571,0.6,0.414902


## Results

In [52]:
# Consensus score per fold: mean of the classifiers' probabilities for each row.
# `keys` labels the resulting columns with the fold number; the `names`
# argument only names index levels and does not label the columns.
consensusScores = pd.concat(
    [allScores[fold].mean(axis=1) for fold in range(folds)],
    axis=1,
    keys=list(range(folds)),
)

In [53]:
# Consensus score of every row, one column per fold.
consensusScores

Unnamed: 0,0,1,2
0,0.871491,,
1,,,0.823840
2,,,0.834553
3,,,0.834553
4,,,0.823840
...,...,...,...
728,,,0.670423
729,,0.846244,
730,,,0.606190
731,,,0.519478


In [59]:
from utilsMetric import metrics

In [57]:
# Probability cutoffs 0.3 ... 0.8. Rounded because np.arange accumulates float
# error, and the cutoffs are used as column labels that are later looked up by
# exact value (e.g. 0.5).
steps = np.round(np.arange(0.3, 0.81, 0.1), 2)

# Rows: metrics; columns: (fold, cutoff).
metricsClusterCV = pd.DataFrame(
    columns=pd.MultiIndex.from_product((range(folds), steps)),
    index=metrics.keys(),
)

testMasks = (CVclusterData == "test").values.T   # one boolean row mask per fold
foldScores = consensusScores.values.T            # one score vector per fold

for fold, (mask, scores) in enumerate(zip(testMasks, foldScores)):
    yTrue = Y[mask]
    yScore = scores[mask]
    testScores = pd.Series(scores)[mask]
    for cutoff in steps:
        # Threshold once per (fold, cutoff) and reuse it for every metric.
        yPred = (testScores >= cutoff).astype(int)
        # Each metric is called as metric(y_true, y_pred, y_score); the list
        # order follows `metrics`, which is also the row order of the frame.
        metricsClusterCV.loc[:, (fold, cutoff)] = [
            metrFun(yTrue, yPred, yScore) for metrFun in metrics.values()
        ]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [68]:
# All metrics at the 0.5 cutoff, for every fold.
metricsClusterCV.loc[:, pd.IndexSlice[:, 0.5]]

Unnamed: 0_level_0,0,1,2
Unnamed: 0_level_1,0.5,0.5,0.5
spec,0.0,0.0,0.051724
sens,0.983607,1.0,0.903226
balacc,0.491803,0.5,0.477475
mcc,-0.043916,0.0,-0.055422
ppv,0.882353,0.744898,0.848485
npv,0.0,0.0,0.083333
PRcurve,0.910793,0.812116,0.882377
ROCcurve,0.540215,0.618699,0.532334
