# Fingerprint-based similarity scores

## Challenge
1. Many different fingerprints AND possible variations thereof (e.g. count/binary, log, weighted...) AND different metrics lead to **a lot** of possible variations!
2. No ground truth, so it is hard to measure what is good and what is better...

Data:

Compare multiple fingerprints on the "ms2 structures" dataset [Bushuiev 2024].

In [9]:
import os
from pathlib import Path
import pickle
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

#from matchms.importing import load_from_mgf

ROOT = Path(os.getcwd()).parents[0]

path_data = os.path.join(Path(ROOT), "data")

In [3]:
filename = "compounds_unique_inchikey14.pickle"
compounds = pd.read_pickle(os.path.join(path_data, filename))
compounds.head()

Unnamed: 0,inchikey,inchi,smiles,spectrum_id
0,AAAQFGUYHFJNHI,InChI=1S/C22H22ClN5O2/c1-4-24-20(29)12-18-22-2...,CCNC(=O)C[C@@H]1N=C(c2ccc(Cl)cc2)c2c(ccc(OC)c2...,
1,AABFWJDLCCDJJN,InChI=1S/C22H16N2O/c1-25-15-9-10-20-19(13-15)1...,COc1cc2c(cc1)[nH]c1c2ccnc1-c1cccc2ccccc12,
2,AABILZKQMVKFHP,InChI=1S/C21H33NO8/c1-7-13(2)18(23)30-16-9-11-...,C/C=C(/C)\C(=O)O[C@H]1CC[N+]2([C@@H]1C(=CC2)CO...,MSBNK-NaToxAq-NA003250
3,AABUHSBGEIUSRJ,InChI=1S/C17H16N2O2/c1-13(20)18-15-8-10-16(11-...,CC(O)=Nc1ccc(NC(=O)C=Cc2ccccc2)cc1,CCMSLIB00010107357
4,AABUKWVVUWBZCS,InChI=1S/C21H14O3/c22-16-11-12-17-18(13-16)24-...,O=c1oc2cc(O)ccc2c(-c2ccccc2)c1-c1ccccc1,CCMSLIB00010104862


In [4]:
from rdkit import Chem
from rdkit.Chem import Descriptors

def calculate_mass(smiles):
    # Convert SMILES string to a molecule object
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return "Invalid SMILES"
    # Calculate the molecular mass
    mass = Descriptors.MolWt(molecule)
    return mass

In [5]:
masses = []
for smiles_string in tqdm(compounds.smiles):
    masses.append(calculate_mass(smiles_string))

  0%|          | 0/37663 [00:00<?, ?it/s]

In [6]:
compounds["mass"] = masses
compounds.head()

Unnamed: 0,inchikey,inchi,smiles,spectrum_id,mass
0,AAAQFGUYHFJNHI,InChI=1S/C22H22ClN5O2/c1-4-24-20(29)12-18-22-2...,CCNC(=O)C[C@@H]1N=C(c2ccc(Cl)cc2)c2c(ccc(OC)c2...,,423.904
1,AABFWJDLCCDJJN,InChI=1S/C22H16N2O/c1-25-15-9-10-20-19(13-15)1...,COc1cc2c(cc1)[nH]c1c2ccnc1-c1cccc2ccccc12,,324.383
2,AABILZKQMVKFHP,InChI=1S/C21H33NO8/c1-7-13(2)18(23)30-16-9-11-...,C/C=C(/C)\C(=O)O[C@H]1CC[N+]2([C@@H]1C(=CC2)CO...,MSBNK-NaToxAq-NA003250,427.494
3,AABUHSBGEIUSRJ,InChI=1S/C17H16N2O2/c1-13(20)18-15-8-10-16(11-...,CC(O)=Nc1ccc(NC(=O)C=Cc2ccccc2)cc1,CCMSLIB00010107357,280.327
4,AABUKWVVUWBZCS,InChI=1S/C21H14O3/c22-16-11-12-17-18(13-16)24-...,O=c1oc2cc(O)ccc2c(-c2ccccc2)c1-c1ccccc1,CCMSLIB00010104862,314.34


In [14]:
from utils import FingerprintGenerator, fingerprint_from_smiles #, compute_all_fingerprints

ImportError: cannot import name 'fingerprint_from_smiles' from 'utils' (/home/daisy/Florian/molecular_fingerprint_comparisons/notebooks/utils.py)

In [11]:
import metrics


def compute_ruzicka_similarities(fingerprints):
    return metrics.ruzicka_similarity_matrix(fingerprints, fingerprints)

def compute_ruzicka_similarities_weighted(fingerprints, weights):
    return metrics.ruzicka_similarity_matrix_weighted(fingerprints, fingerprints, weights)

def compute_ruzicka_similarities_log(fingerprints):
    return metrics.ruzicka_similarity_matrix(np.log(1 + fingerprints), np.log(1 + fingerprints))

def compute_jaccard_similarities(fingerprints):
    return metrics.jaccard_similarity_matrix(fingerprints, fingerprints)

def compute_jaccard_similarities_weighted(fingerprints, weights):
    return metrics.jaccard_similarity_matrix_weighted(fingerprints, fingerprints, weights)

In [None]:
experiments = [
    ("morgan2_count", rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=4096), True, False),
    ("morgan3_count", rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=4096), True, False),
    ("morgan4_count", rdFingerprintGenerator.GetMorganGenerator(radius=4, fpSize=4096), True, False),
    ("morgan5_count", rdFingerprintGenerator.GetMorganGenerator(radius=5, fpSize=4096), True, False),
    ("morgan6_count", rdFingerprintGenerator.GetMorganGenerator(radius=6, fpSize=4096), True, False),
    ("morgan2_binary", rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=4096), False, False),
    ("morgan3_binary", rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=4096), False, False),
    ("morgan2_binary_1024", rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024), False, False),
    ("morgan3_binary_1024", rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=1024), False, False),
    ("morgan6_binary_1024", rdFingerprintGenerator.GetMorganGenerator(radius=6, fpSize=1024), False, False),
    ("daylight", rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=4096), False, False),
    ("daylight_1024", rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=1024), False, False),
]

In [None]:
fp_generator = FingerprintGenerator(fpgen)

    #fingerprints, valid_smiles, valid_compounds = compute_all_fingerprints(compounds, fpgen, count)
    valid_smiles = []
    valid_compounds = []
    fingerprints = []
    
    for inchikey, row in tqdm(compounds.iterrows(), total=len(compounds)):
        fp = fp_generator.fingerprint_from_smiles(row.smiles, count)

In [13]:
fingerprints = []
for i, row in tqdm(compounds.iterrows(), total=compounds.shape[0]):
    fingerprint = fingerprint_from_smiles(row["smiles"], fpgen)
    fingerprints.append(fingerprint)

fingerprints = np.array(fingerprints)

  0%|          | 0/37663 [00:00<?, ?it/s]

NameError: name 'fingerprint_from_smiles' is not defined

In [12]:
NUM_FINGERPRINTS = 10_000

rng = np.random.default_rng(seed=0)
indices_selected = rng.choice(np.arange(len(fingerprints)), NUM_FINGERPRINTS, replace=False)

fingerprints_selected = fingerprints[indices_selected]
fingerprints_count_selected = fingerprints_count[indices_selected]
fingerprints_tfidf_selected = fingerprints_tfidfs[indices_selected]
fingerprints_selected.shape

NameError: name 'fingerprints' is not defined