# Fingerprint-based similarity scores

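This notebook computes all-vs-all similarity matrices for a set of compounds using several molecular fingerprints (RDKit, Morgan, MAP4). Binary fingerprints are compared with the Jaccard (Tanimoto) similarity and count fingerprints with the Ruzicka similarity. Several of the resulting matrices are cached as `.npy` files.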

## Data

Here we use the **ms2structures** compound table (`compounds_ms2structures_251202.csv`), which contains 37,811 compounds.

In [1]:
import os
import sys
from pathlib import Path
import pickle
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdFingerprintGenerator
# Project root: first parent of the directory the notebook is run from.
ROOT = Path.cwd().parents[0]

# Location of the input tables, kept as a plain string path.
path_data = os.path.join(ROOT, "data")

## Data: Combination of MassSpecGym + MS2DeepScore

In [7]:
# Read the compound table from the project's data folder.
# NOTE(review): the loaded frame shows an "Unnamed: 0" column, so the CSV
# presumably carries a saved index; consider `index_col=0` when reading.
filename = "compounds_ms2structures_251202.csv"
compounds = pd.read_csv(Path(path_data) / filename)
compounds.head()

Unnamed: 0,inchikey,smiles,mass,cf_class,cf_subclass,cf_superclass,formula,npc_class_results,npc_pathway_results,npc_superclass_results
0,AAAQFGUYHFJNHI,CCNC(=O)C[C@H]1C2=NN=C(N2C3=C(C=C(C=C3)OC)C(=N...,423.146204,Benzodiazepines,"1,4-benzodiazepines",Organoheterocyclic compounds,C22H22ClN5O2,,Alkaloids,
1,AABFWJDLCCDJJN,COC1=CC2=C(C=C1)NC3=C2C=CN=C3C4=CC=CC5=CC=CC=C54,324.126264,Harmala alkaloids,,Alkaloids and derivatives,C22H16N2O,Carboline alkaloids,Alkaloids,Tryptophan alkaloids
2,AABILZKQMVKFHP,C/C=C(/C)\C(=O)O[C@H]1CC[N+]2([C@@H]1C(=CC2)CO...,427.220624,,,Alkaloids and derivatives,C21H33NO8,Pyrrolizidine alkaloids,Alkaloids,Ornithine alkaloids
3,AABUHSBGEIUSRJ,CC(=O)NC1=CC=C(C=C1)NC(=O)C=CC2=CC=CC=C2,280.120724,Cinnamic acids and derivatives,Cinnamic acid amides,Phenylpropanoids and polyketides,C17H16N2O2,Cinnamic acid amides,Shikimates and Phenylpropanoids,Phenylpropanoids (C6-C3)
4,AABUKWVVUWBZCS,C1=CC=C(C=C1)C2=C(C(=O)OC3=C2C=CC(=C3)O)C4=CC=...,314.094724,Neoflavonoids,Neoflavones,Phenylpropanoids and polyketides,C21H14O3,Neoflavonoids,Shikimates and Phenylpropanoids,Flavonoids


In [7]:
# Summary statistics of the numeric columns of the compound table.
compounds.describe()

Unnamed: 0,mass
count,37811.0
mean,424.170589
std,201.716816
min,31.058
25%,306.49
50%,383.172782
75%,483.217489
max,3401.902


In [2]:
# Put the project's `src` folder first on the module search path.
sys.path.insert(0, str(ROOT / "src"))

# Import functions/classes from the project source code.
from fingerprint_computation import FingerprintGenerator, compute_fingerprints_from_smiles

In [17]:
import metrics


def compute_ruzicka_similarities(fingerprints):
    """Ruzicka similarities of a fingerprint set against itself.

    Passes ``fingerprints`` as both arguments of
    ``metrics.ruzicka_similarity_matrix`` and returns that result unchanged
    (presumably an ``(n, n)`` matrix for ``n`` fingerprints; see ``metrics``).
    """
    similarity_matrix = metrics.ruzicka_similarity_matrix(fingerprints, fingerprints)
    return similarity_matrix

def compute_ruzicka_similarities_weighted(fingerprints, weights):
    """Weighted Ruzicka similarities of a fingerprint set against itself.

    Passes ``fingerprints`` twice, followed by ``weights``, to
    ``metrics.ruzicka_similarity_matrix_weighted`` and returns that result
    unchanged. The expected layout of ``weights`` is defined by ``metrics``
    (presumably one weight per fingerprint position; TODO confirm).
    """
    weighted_matrix = metrics.ruzicka_similarity_matrix_weighted(
        fingerprints, fingerprints, weights
    )
    return weighted_matrix

def compute_ruzicka_similarities_log(fingerprints):
    """Ruzicka similarities of log-scaled fingerprints against themselves.

    Every fingerprint value ``x`` is replaced by ``log(1 + x)`` before the
    transformed array is passed as both arguments of
    ``metrics.ruzicka_similarity_matrix``. The input array is not modified.

    Parameters
    ----------
    fingerprints : array-like
        Fingerprint array (presumably count fingerprints, one row per
        compound; TODO confirm against callers).

    Returns
    -------
    Whatever ``metrics.ruzicka_similarity_matrix`` returns for the
    log-scaled fingerprints.
    """
    # Transform once and reuse the result for both arguments; evaluating the
    # logarithm twice would double the compute time and the temporary memory.
    log_fingerprints = np.log(1 + fingerprints)
    return metrics.ruzicka_similarity_matrix(log_fingerprints, log_fingerprints)

def compute_jaccard_similarities(fingerprints):
    """Jaccard similarities of a fingerprint set against itself.

    Passes ``fingerprints`` as both arguments of
    ``metrics.jaccard_similarity_matrix`` and returns that result unchanged
    (presumably an ``(n, n)`` matrix for ``n`` fingerprints; see ``metrics``).
    """
    similarity_matrix = metrics.jaccard_similarity_matrix(fingerprints, fingerprints)
    return similarity_matrix

def compute_jaccard_similarities_weighted(fingerprints, weights):
    """Weighted Jaccard similarities of a fingerprint set against itself.

    Passes ``fingerprints`` twice, followed by ``weights``, to
    ``metrics.jaccard_similarity_matrix_weighted`` and returns that result
    unchanged. The expected layout of ``weights`` is defined by ``metrics``
    (presumably one weight per fingerprint position; TODO confirm).
    """
    weighted_matrix = metrics.jaccard_similarity_matrix_weighted(
        fingerprints, fingerprints, weights
    )
    return weighted_matrix

In [14]:
from rdkit.Chem import rdFingerprintGenerator


# RDKit fingerprint generator producing vectors of size 4096.
fpgen = rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=4096)

# One fingerprint per SMILES string; count=False / sparse=False presumably
# yields dense binary vectors (see `fingerprint_computation`).
fingerprints_rdkit = compute_fingerprints_from_smiles(
    compounds["smiles"],
    fpgen,
    count=False,
    sparse=False,
    progress_bar=True,
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 37811/37811 [00:35<00:00, 1060.62it/s]


In [22]:
# Sanity check: smallest row sum over all fingerprints (a value of 0 would
# mean that at least one fingerprint is entirely empty).
fingerprints_rdkit.sum(axis=1).min()

np.uint64(2)

In [23]:
# Number of rows (compounds) in the table.
len(compounds)

37811

In [24]:
%%time

# All-vs-all Jaccard (Tanimoto) similarities between the RDKit fingerprints.
similarities_rdkit = compute_jaccard_similarities(fingerprints_rdkit)

CPU times: user 3min 54s, sys: 1min 12s, total: 5min 6s
Wall time: 15 s


In [16]:
# Expect a square matrix with one row and one column per compound.
similarities_rdkit.shape

(37811, 37811)

In [25]:
# Peek at the top-left 5x5 corner instead of printing the full matrix.
similarities_rdkit[:5, :5]

array([[1.        , 0.16414265, 0.22535211, 0.0787172 , 0.1648299 ],
       [0.16414265, 1.        , 0.14056017, 0.06025267, 0.20114942],
       [0.22535211, 0.14056017, 1.        , 0.06589891, 0.14409369],
       [0.0787172 , 0.06025267, 0.06589891, 1.        , 0.06349207],
       [0.1648299 , 0.20114942, 0.14409369, 0.06349207, 1.        ]],
      dtype=float32)

In [25]:
# Optional (disabled): downcast the float32 matrix to float16 to halve its
# memory footprint, at the cost of precision.
# similarities_rdkit = similarities_rdkit.astype(np.float16)

In [26]:
# Cache the matrix on disk, in the current working directory.
# Caution: 37811 x 37811 float32 values make this a file of more than 5 GB.
np.save("similarities_tanimoto_rdkit_4096bits.npy", similarities_rdkit)

In [3]:
# Reload the cached matrix (read from the current working directory) instead
# of recomputing it.
similarities_rdkit = np.load("similarities_tanimoto_rdkit_4096bits.npy")

In [28]:
# Morgan fingerprint generator: radius 3, vectors of size 4096.
fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=4096)

# count=True requests count fingerprints; sparse=False presumably yields a
# dense array (see `fingerprint_computation`).
fingerprints_morgan3_count = compute_fingerprints_from_smiles(
    compounds["smiles"],
    fpgen,
    count=True,
    sparse=False,
    progress_bar=True,
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 37811/37811 [00:06<00:00, 6254.58it/s]


In [29]:
%%time

# All-vs-all Ruzicka similarities between the Morgan-3 count fingerprints.
similarities_morgan3_count = compute_ruzicka_similarities(fingerprints_morgan3_count)

CPU times: user 1h 12min 3s, sys: 8.9 s, total: 1h 12min 12s
Wall time: 2min 30s


In [15]:
# Cache the matrix on disk (current working directory), stored as float32.
# Caution: 37811 x 37811 float32 values make this a file of more than 5 GB.
np.save("similarities_ruzicka_morgan3_4096bits.npy", similarities_morgan3_count.astype(np.float32))

In [16]:
# Reload the cached Morgan-3 count similarities instead of recomputing them.
similarities_morgan3_count = np.load("similarities_ruzicka_morgan3_4096bits.npy")

In [26]:
# Optional (disabled): downcast the matrix to float16 to reduce its memory
# footprint, at the cost of precision.
# similarities_morgan3_count = similarities_morgan3_count.astype(np.float16)

In [31]:
# Expect a square matrix with one row and one column per compound.
similarities_morgan3_count.shape

(37811, 37811)

## Morgan-3 binary + Tanimoto

In [32]:
# Morgan fingerprint generator: radius 3, vectors of size 4096.
fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=3, fpSize=4096)

# Same generator settings as for the count variant, but with count=False
# (presumably binary, dense fingerprints; see `fingerprint_computation`).
fingerprints_morgan3 = compute_fingerprints_from_smiles(
    compounds["smiles"],
    fpgen,
    count=False,
    sparse=False,
    progress_bar=True,
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 37811/37811 [00:05<00:00, 6976.93it/s]


In [33]:
%%time

# All-vs-all Jaccard (Tanimoto) similarities between the binary Morgan-3
# fingerprints.
similarities_morgan3 = compute_jaccard_similarities(fingerprints_morgan3)

CPU times: user 3min 52s, sys: 1min 16s, total: 5min 8s
Wall time: 15.1 s


In [13]:
# Cache the matrix on disk (current working directory), stored as float32.
# Caution: 37811 x 37811 float32 values make this a file of more than 5 GB.
np.save("similarities_tanimoto_morgan3_4096bits.npy", similarities_morgan3.astype(np.float32))

In [14]:
# Reload the cached binary Morgan-3 similarities instead of recomputing them.
similarities_morgan3 = np.load("similarities_tanimoto_morgan3_4096bits.npy")

## Morgan-9 count

In [35]:
# Morgan fingerprint generator: radius 9, vectors of size 4096.
fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=9, fpSize=4096)

# count=True requests count fingerprints; sparse=False presumably yields a
# dense array (see `fingerprint_computation`).
fingerprints_morgan9_count = compute_fingerprints_from_smiles(
    compounds["smiles"],
    fpgen,
    count=True,
    sparse=False,
    progress_bar=True,
)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 37811/37811 [00:09<00:00, 4176.50it/s]


In [36]:
%%time

# All-vs-all Ruzicka similarities between the Morgan-9 count fingerprints.
similarities_morgan9_count = compute_ruzicka_similarities(fingerprints_morgan9_count)

CPU times: user 1h 12min 23s, sys: 8.86 s, total: 1h 12min 31s
Wall time: 2min 30s


In [11]:
# Cache the matrix on disk (current working directory), stored as float32.
# Caution: 37811 x 37811 float32 values make this a file of more than 5 GB.
np.save("similarities_ruzicka_morgan9_4096bits.npy", similarities_morgan9_count.astype(np.float32))

In [12]:
# Reload the cached Morgan-9 count similarities instead of recomputing them.
similarities_morgan9_count = np.load("similarities_ruzicka_morgan9_4096bits.npy")

## MAP4

In [8]:
from typing import List
from rdkit.Chem import Mol, MolFromSmiles # pylint: disable=import-error,no-name-in-module
from map4 import MAP4


# MAP4 configured to keep duplicated shingles.
map4 = MAP4(dimensions=4096, radius=2, include_duplicated_shingles=True)

# Single example compound taken from the table.
mol = MolFromSmiles(compounds["smiles"][4])

# NOTE: `_calculate` is a private MAP4 method, used here only to inspect the
# intermediate result for one molecule.
fp_map4 = map4._calculate(mol)
len(fp_map4)

552

In [39]:
# Same settings, but with duplicated shingles excluded.
map4 = MAP4(dimensions=4096, radius=2, include_duplicated_shingles=False)

# Same example compound as in the run that keeps duplicated shingles.
mol = MolFromSmiles(compounds["smiles"][4])

# NOTE: `_calculate` is a private MAP4 method, used here only to inspect the
# intermediate result for one molecule.
fp_map4 = map4._calculate(mol)
len(fp_map4)

272

In [71]:
# Inspect the individual MAP4 entries for one small, hand-written molecule.
mol = MolFromSmiles("C[C@H](N)C(=O)O")
fp_map4 = map4._calculate(mol)  # private MAP4 method, inspection only
fp_map4

{b'C(C)(C)N|1|C([CH])(=O)O',
 b'C(C)(N)C(=O)O|1|C(=O)(O)C(C)N',
 b'CC(C)N|1|C(C)(N)C(=O)O',
 b'CC(C)N|2|C(=O)(O)C(C)N',
 b'CC(C)N|2|NC(C)C',
 b'CC(C)N|3|O=C([CH])O',
 b'CC(C)N|3|OC([CH])=O',
 b'C[CH]|1|C(C)(C)N',
 b'C[CH]|2|C([CH])(=O)O',
 b'C[CH]|2|N[CH]',
 b'NC(C)C|1|C(C)(N)C(=O)O',
 b'NC(C)C|2|C(=O)(O)C(C)N',
 b'NC(C)C|3|O=C([CH])O',
 b'NC(C)C|3|OC([CH])=O',
 b'N[CH]|1|C(C)(C)N',
 b'N[CH]|2|C([CH])(=O)O',
 b'O=C([CH])O|1|C(=O)(O)C(C)N',
 b'O=C([CH])O|2|C(C)(N)C(=O)O',
 b'O=C([CH])O|2|OC([CH])=O',
 b'O=C|1|C([CH])(=O)O',
 b'O=C|2|C(C)(C)N',
 b'O=C|3|C[CH]',
 b'O=C|3|N[CH]',
 b'OC([CH])=O|1|C(=O)(O)C(C)N',
 b'OC([CH])=O|2|C(C)(N)C(=O)O',
 b'OC|1|C([CH])(=O)O',
 b'OC|2|C(C)(C)N',
 b'OC|2|O=C',
 b'OC|3|C[CH]',
 b'OC|3|N[CH]'}

In [9]:
# MAP4 settings for the full compound set (duplicated shingles excluded).
map4 = MAP4(dimensions=4096, radius=2, include_duplicated_shingles=False)

# Parse every SMILES string into an RDKit molecule.
# NOTE(review): RDKit's MolFromSmiles yields None for SMILES it cannot parse;
# confirm that `calculate_many` copes with that, or validate the input first.
molecules: List[Mol] = [
    MolFromSmiles(smiles) for smiles in tqdm(compounds["smiles"].values)
]

# NOTE(review): the thread count is hard-coded to 32; adjust to the machine.
fingerprints_map4: np.ndarray = map4.calculate_many(
    molecules, number_of_threads=32, verbose=True
)

  0%|          | 0/37811 [00:00<?, ?it/s]

Calculating fingerprints:   0%|                                                                               …

In [42]:
# Expect one row per compound and `dimensions` (4096) columns.
fingerprints_map4.shape

(37811, 4096)

In [10]:
# Check the precision of the Morgan-3 count similarity matrix held in memory.
similarities_morgan3_count.dtype

dtype('float64')

In [18]:
%%time

# All-vs-all Jaccard similarities between the MAP4 fingerprints, stored as
# float32.
# NOTE(review): confirm that the values in `fingerprints_map4` are of a kind
# for which `metrics.jaccard_similarity_matrix` is the appropriate measure.
similarities_map4 = compute_jaccard_similarities(fingerprints_map4).astype(np.float32)

CPU times: user 4min 3s, sys: 1min 11s, total: 5min 15s
Wall time: 16.1 s


In [55]:
# Same fingerprint class with radius 1 instead of 2 (named "MAP2" here).
map2 = MAP4(dimensions=4096, radius=1, include_duplicated_shingles=False)

# Parse every SMILES string into an RDKit molecule.
# NOTE(review): RDKit's MolFromSmiles yields None for SMILES it cannot parse;
# confirm that `calculate_many` copes with that, or validate the input first.
molecules: List[Mol] = [
    MolFromSmiles(smiles) for smiles in tqdm(compounds["smiles"].values)
]

# NOTE(review): the thread count is hard-coded to 32; adjust to the machine.
fingerprints_map2: np.ndarray = map2.calculate_many(
    molecules, number_of_threads=32, verbose=True
)

  0%|          | 0/37811 [00:00<?, ?it/s]

Calculating fingerprints:   0%|                                                                               …

In [56]:
%%time

# All-vs-all Jaccard similarities between the radius-1 ("MAP2") fingerprints.
similarities_map2 = compute_jaccard_similarities(fingerprints_map2)

CPU times: user 3min 57s, sys: 1min 4s, total: 5min 2s
Wall time: 14.9 s
