In [1]:
#Import the necessary libraries; install them first if necessary.
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
import pandas as pd

#The progressbar module is provided by the progressbar2 package (pip install progressbar2); it is handy for tracking the progress of calculations on larger files.
import progressbar


In [2]:
#Function to generate 2D similarities between the ligands in rdkit MolSupplier

def generate_similarity_matrix(mols):
    '''
    Function to generate pairwise 2D similarities between the ligands in rdkit MolSupplier.

    Every molecule is fetched and fingerprinted exactly once (Morgan fingerprint,
    radius 2, 1024 bits). The Tanimoto similarity is then computed for every
    unordered pair of molecules, including each molecule paired with itself.

        Arguments:
            mols (SmilesMolSupplier): list of ligands in rdkit SmilesMolSupplier format.
                Any sequence that supports len() and integer indexing works.
                Entries that are None (rdkit suppliers use None for records they
                could not parse) are skipped and reported through a warning.
        Returns:
            similarity_matrix (data frame): dataframe with the columns "Molecule1",
                "Molecule2" and "Similarity" (rounded to 2 decimals), one row per
                unique pair of molecule names. Empty if there are no usable molecules.
        Notes:
            Pairs are de-duplicated by molecule name (the "_Name" property), so if
            two different molecules share a name only the first pair seen is kept.
    '''
    #Local import: the standard-library warnings module is only needed in this function.
    import warnings

    #Fetch each molecule once and cache its name and fingerprint. Indexing a supplier
    #repeatedly inside the pair loop would redo the same work for every pair.
    #Here Morgan fingerprints are used but feel free to use any other function.
    names = []
    fingerprints = []
    for idx in range(len(mols)):
        mol = mols[idx]
        if mol is None:
            warnings.warn("Skipping entry %d: molecule is None (could not be parsed)." % idx,
                          stacklevel=2)
            continue
        names.append(mol.GetProp("_Name"))
        fingerprints.append(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024))

    columns = ["Molecule1", "Molecule2", "Similarity"]
    n_mols = len(names)
    if n_mols == 0:
        #Nothing to compare: return an empty table without creating a progress bar.
        return pd.DataFrame([], columns=columns)

    #A set of frozensets gives constant-time "already calculated?" lookups.
    seen_pairs = set()
    similarity_matrix = []

    #This is only helper function to help to track progress of calculation.
    bar = progressbar.ProgressBar(max_value=n_mols)

    #Double loop through all the ligand pairs; j starts at i so that each unordered
    #pair of positions is visited once.
    for i in range(n_mols):
        for j in range(i, n_mols):
            pair = frozenset((names[i], names[j]))
            #Check if this pair similarity is already calculated.
            if pair in seen_pairs:
                continue
            seen_pairs.add(pair)
            #Calculate similarity. You can change the similarity type here, e.g dice
            similarity = DataStructs.TanimotoSimilarity(fingerprints[i], fingerprints[j])
            similarity_matrix.append([names[i], names[j], round(similarity, 2)])

        #Report the number of completed outer iterations so the bar can reach 100%.
        bar.update(i + 1)

    bar.finish()

    return pd.DataFrame(similarity_matrix, columns=columns)


In [6]:
#Load the input molecules from a SMILES file into an rdkit SmilesMolSupplier.
#The file is read as tab-delimited with a header row; change the keyword arguments
#if your file uses a different delimiter or has no header row. For more details,
#check the rdkit documentation.
mols = Chem.SmilesMolSupplier("test.smi", delimiter='\t', titleLine=True)

In [7]:
#Generate the pairwise similarity table for the molecules read from the input file.
#The call is the last expression in the cell, so the returned dataframe is shown
#as the cell output.
generate_similarity_matrix(mols)

 60% (3 of 5) |###############           | Elapsed Time: 0:00:00 ETA:  00:00:00

Unnamed: 0,Molecule1,Molecule2,Similarity
0,ethyl 2-{[(3-methoxyphenyl)carbamothioyl]amino...,ethyl 2-{[(3-methoxyphenyl)carbamothioyl]amino...,1.0
1,ethyl 2-{[(3-methoxyphenyl)carbamothioyl]amino...,methyl 2-({[2-(morpholin-4-yl)ethyl]carbamoyl}...,0.36
2,ethyl 2-{[(3-methoxyphenyl)carbamothioyl]amino...,"methyl 2-[(phenylcarbamothioyl)amino]-4,5,6,7-...",0.64
3,ethyl 2-{[(3-methoxyphenyl)carbamothioyl]amino...,"N-({1H,4H,5H,6H,7H,8H-cyclohepta[c]pyrazol-3-y...",0.15
4,ethyl 2-{[(3-methoxyphenyl)carbamothioyl]amino...,N-methyl-4-nitro-2-[4-(4-nitrophenyl)-1H-imida...,0.08
5,methyl 2-({[2-(morpholin-4-yl)ethyl]carbamoyl}...,methyl 2-({[2-(morpholin-4-yl)ethyl]carbamoyl}...,1.0
6,methyl 2-({[2-(morpholin-4-yl)ethyl]carbamoyl}...,"methyl 2-[(phenylcarbamothioyl)amino]-4,5,6,7-...",0.45
7,methyl 2-({[2-(morpholin-4-yl)ethyl]carbamoyl}...,"N-({1H,4H,5H,6H,7H,8H-cyclohepta[c]pyrazol-3-y...",0.19
8,methyl 2-({[2-(morpholin-4-yl)ethyl]carbamoyl}...,N-methyl-4-nitro-2-[4-(4-nitrophenyl)-1H-imida...,0.06
9,"methyl 2-[(phenylcarbamothioyl)amino]-4,5,6,7-...","methyl 2-[(phenylcarbamothioyl)amino]-4,5,6,7-...",1.0
