# Compute fingerprint similarity between a reference molecule and every molecule in a DataFrame

In [15]:
import pandas as pd
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem

In [12]:
def compute_fp_similarity(reference_smiles, df, smiles_column='smiles', similarity_column='fp_similarity',
                          radius=2, n_bits=2048):
    """
    Compute the Tanimoto similarity between a reference molecule and every molecule in a DataFrame.

    Each molecule is encoded as a Morgan fingerprint bit vector and compared with the
    fingerprint of the reference molecule.

    Note:
        ``df`` is modified in place: the similarity column is added to (or overwritten in)
        the DataFrame that was passed in, and that same object is returned. Pass
        ``df.copy()`` if the original frame must stay untouched.

    Args:
        reference_smiles (str): SMILES string of the reference molecule.
        df (pd.DataFrame): DataFrame holding the SMILES strings to compare against.
        smiles_column (str): Name of the column containing SMILES strings (default: 'smiles').
        similarity_column (str): Name of the column that receives the similarity scores
            (default: 'fp_similarity').
        radius (int): Morgan fingerprint radius (default: 2).
        n_bits (int): Length of the fingerprint bit vector (default: 2048).

    Returns:
        pd.DataFrame: The input DataFrame with the similarity column added. Rows whose
        SMILES entry is not a string or cannot be parsed by RDKit receive ``None``
        (a missing value) instead of a score.

    Raises:
        ValueError: If ``reference_smiles`` cannot be parsed into a molecule.
    """
    reference_mol = Chem.MolFromSmiles(reference_smiles)

    # Fail fast: without a valid reference there is nothing to compare against.
    if reference_mol is None:
        raise ValueError("Invalid reference molecule SMILES.")

    # The reference fingerprint is computed once and reused for every row.
    reference_fp = AllChem.GetMorganFingerprintAsBitVect(reference_mol, radius, nBits=n_bits)

    def compute_single_similarity(smiles):
        # Missing cells (NaN/None) and other non-string values are not SMILES; the RDKit
        # parser only accepts strings, so report them as missing rather than letting
        # one bad cell abort the whole column.
        if not isinstance(smiles, str):
            return None

        mol = Chem.MolFromSmiles(smiles)

        # Unparsable SMILES: RDKit signals this by returning None.
        if mol is None:
            return None

        # Same radius and bit length as the reference so the two vectors are comparable.
        mol_fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        return DataStructs.TanimotoSimilarity(reference_fp, mol_fp)

    # Compute the similarity row by row and store it on the caller's DataFrame.
    df[similarity_column] = df[smiles_column].apply(compute_single_similarity)
    return df

In [13]:
# Reference molecule for the comparison (ethanol).
my_reference = 'CCO'

# Small example set: ethanol, cyclopentane and dimethylamine.
smiles_df = pd.DataFrame({'smiles': ['CCO', 'C1CCCC1', 'CNC']})

# Score every row against the reference; the scores are stored in a new column.
result_df = compute_fp_similarity(reference_smiles=my_reference, df=smiles_df)

In [14]:
# Bare expression as the last statement of the cell, so the notebook renders the DataFrame.
result_df

Unnamed: 0,smiles,fp_similarity
0,CCO,1.0
1,C1CCCC1,0.0
2,CNC,0.111111
