In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs


import random
import math

import pandas as pd
import numpy as np

import sklearn.metrics as sklm



In [2]:
# Define base directory here.
# NOTE: the value is used as a plain string prefix, with no path separator inserted:
# the library is read from basedir + '.smiles' and the result tables are written to
# basedir + 'primary_results.xlsx', basedir + 'comparisons.xlsx' and
# basedir + 'confusion_matrix.xlsx'.
basedir = ''

In [3]:
# RDKit object to provide the molecules one-by-one on request, plug in your molecular library in Smiles/SDF/etc.
# As configured here, the file basedir + '.smiles' is read as tab-delimited records without a title (header) line.
library = Chem.SmilesMolSupplier(basedir+'.smiles', delimiter='\t', titleLine=False)

def file_len(fname):
    """Return the number of lines in the text file *fname*.

    Parameters
    ----------
    fname : str or path-like
        Path of the file to count lines in.

    Returns
    -------
    int
        The number of lines; 0 for an empty file. A final line without a
        trailing newline is still counted.
    """
    with open(fname) as f:
        # Summing over the line iterator also handles the empty file, where an
        # enumerate-based loop would never bind its counter variable.
        return sum(1 for _ in f)

# Determine the library size, taken as the number of lines in the SMILES file
# (one molecule per line); please adjust this when using multi-line file formats
library_size = file_len(basedir+'.smiles')

In [None]:
# Seed the stdlib RNG so that the random draws below are repeatable
random.seed(0)

# Fingerprint settings
radius = 4
bits = 1024

# Plug in the number of repetitions here: the number of reference molecules,
# and the number of query molecules drawn for each reference
n_references = 10
n_queries = 100

# Upper limit on redraws when the supplier hands back an unparsable record
max_draw_attempts = 100


def draw_random_molecule():
    """Draw one molecule at random from *library*.

    Uses the module-level *library*, *library_size* and *max_draw_attempts*.

    Returns
    -------
    The first drawn molecule that is not None.

    Raises
    ------
    ValueError
        If no usable molecule turns up within *max_draw_attempts* draws, or
        (raised by ``random.randrange``) if *library_size* is 0.
    """
    for _ in range(max_draw_attempts):
        # randrange excludes its upper bound, so the index is always in
        # 0 .. library_size - 1. random.randint(0, library_size) could return
        # library_size itself, which is one past the last valid index.
        mol = library[random.randrange(library_size)]
        # RDKit suppliers return None for records they cannot parse; redraw
        # instead of failing on the attribute access further down.
        if mol is not None:
            return mol
    raise ValueError('no parsable molecule found in %d random draws' % max_draw_attempts)


results = []

for iteration in range(n_references):
    # Draw molecule randomly from *library*
    reference = draw_random_molecule()

    refname = reference.GetProp('_Name')

    reference_nheavy = reference.GetNumHeavyAtoms()

    # Generate Morgan (ECFP-like) fingerprints
    fpref = AllChem.GetMorganFingerprintAsBitVect(reference,radius,nBits=bits)

    # Size expressed by the norm is the number of On bits
    reference_size = fpref.GetNumOnBits()

    # Collect the reference On bits once; they do not change across the queries
    ref_onbits = set(fpref.GetOnBits())

    for j in range(n_queries):

        query = draw_random_molecule()

        queryname = query.GetProp('_Name')

        query_nheavy = query.GetNumHeavyAtoms()

        fpquery = AllChem.GetMorganFingerprintAsBitVect(query,radius,nBits=bits)

        query_size = fpquery.GetNumOnBits()

        # Number of bits set in both fingerprints
        commonOnbits = len(ref_onbits.intersection(fpquery.GetOnBits()))

        # For binary vectors the squared Euclidean distance is the number of
        # differing bits: |A| + |B| - 2*|A and B|
        euclidean = math.sqrt( reference_size + query_size - 2*commonOnbits )

        tanimoto = DataStructs.FingerprintSimilarity(fpref,fpquery)

        # Write out molecule names, sizes in nheavy and norm, Tanimoto and Euclidean
        results.append([refname,queryname,reference_nheavy,query_nheavy,
                        reference_size,query_size,tanimoto,euclidean])

# Convert to pandas DataFrame and write to file
df = pd.DataFrame(results, columns=['reference','query','reference_nheavy','query_nheavy',
                           'reference_size','query_size','tanimoto','euclidean'])
df.to_excel(basedir+'primary_results.xlsx')

In [6]:
# One record per unordered pair of query molecules that share a reference
results = []

# Treat the queries of each reference molecule as a separate sub-table
for ref_name in df['reference'].unique():
    per_ref = df[df['reference'] == ref_name]
    row_labels = per_ref.index

    # Visit every pair (first, second) once, with *second* located after *first*
    for position, first_label in enumerate(row_labels):
        first = per_ref.loc[first_label]

        for second_label in row_labels[position + 1:]:
            second = per_ref.loc[second_label]

            # Changes observed when moving from the first to the second molecule
            delta_size = second['query_size'] - first['query_size']
            deltaT = second['tanimoto'] - first['tanimoto']
            deltaE = second['euclidean'] - first['euclidean']

            # True unless size and Euclidean distance move in strictly opposite
            # directions, i.e. a bigger size comes with a bigger (or an equal)
            # distance. A zero sign product (no change in either) counts as True.
            size_distance_trend = np.sign(deltaE) * np.sign(delta_size)
            smaller_is_more_similar = bool(size_distance_trend != -1.0)

            # True unless Tanimoto similarity and Euclidean distance move in the
            # same direction, i.e. a higher similarity goes with a smaller (or an
            # equal) distance. A zero sign product counts as True here as well.
            measure_trend = np.sign(deltaE) * np.sign(deltaT)
            consistent = bool(measure_trend != 1.0)

            results.append([delta_size, deltaT, deltaE,
                            smaller_is_more_similar, consistent])

# Convert to pandas DataFrame and write to file
resultsdf = pd.DataFrame(
    results,
    columns=['delta_size', 'deltaT', 'deltaE', 'smaller_is_more_similar', 'consistent'],
)
resultsdf.to_excel(basedir+'comparisons.xlsx')

In [18]:
# Check confusion matrix and write to file.
# Rows follow the first argument ('smaller_is_more_similar'), columns the second
# ('consistent'). The labels are fixed to [False, True] so that the matrix is
# always 2x2, even when one of the two values never occurs in the data; otherwise
# the matrix would shrink and no longer match the 2x2 index/columns below.
conf = sklm.confusion_matrix(resultsdf['smaller_is_more_similar'],resultsdf['consistent'],
                             labels=[False, True])

index = pd.MultiIndex.from_tuples([('smaller_is_more_similar','False'),('smaller_is_more_similar','True')])
columns = pd.MultiIndex.from_tuples([('consistent','False'),('consistent','True')])
confmatrix = pd.DataFrame(conf, index=index, columns=columns)

confmatrix.to_excel(basedir+'confusion_matrix.xlsx')