## Computing Tanimoto score with new filtering

This notebook computes Tanimoto similarity scores using the new filtering suggested for GC/MS data.
Here, `select_by_relative_intensity` is applied with relative intensities from 0.05 to 1.

In [1]:
import os
import sys

# Put the parent of the current working directory at the front of sys.path,
# presumably so that project-local packages can be imported by later cells.
working_dir = os.getcwd()
ROOT = os.path.dirname(working_dir)
sys.path.insert(0, ROOT)

## Obtaining the data from the MoNA file

In [None]:
from custom_functions.spectra_functions import get_data_folder_path

# Name of the MoNA GC-MS export expected inside the data folder.
MSP_FILENAME = "MoNA-export-GC-MS.msp"

# from_external=False to use the data folder within the project
path = get_data_folder_path(from_external=False)
msp_file = os.path.join(path, MSP_FILENAME)

## Loading the MoNA file as spectra

In [2]:
from matchms.importing import load_from_msp

# Collect everything the loader yields into a list so it can be counted and
# iterated again by the following cells.
spectrums = list(load_from_msp(msp_file))
print("Number of spectra: ", len(spectrums))

Number of spectra:  14847


## Filtering the Spectra with select_by_relative_intensity from 0.05

In [3]:
from matchms.filtering import normalize_intensities
from matchms.filtering import select_by_mz
from matchms.filtering import select_by_relative_intensity

def apply_my_filters(s, mz_from=0, mz_to=1000, intensity_from=0.05, intensity_to=1.0):
    """Normalize a spectrum, then keep only peaks inside the given windows.

    The filters run in a fixed order: ``normalize_intensities`` first, then
    ``select_by_mz``, then ``select_by_relative_intensity``.

    Parameters
    ----------
    s
        Spectrum to process; it is handed straight to the matchms filters.
    mz_from, mz_to
        m/z window forwarded to ``select_by_mz`` (defaults: 0 and 1000).
    intensity_from, intensity_to
        Relative-intensity window forwarded to
        ``select_by_relative_intensity`` (defaults: 0.05 and 1.0). Exposed as
        parameters so other thresholds can be tried without redefining the
        function.

    Returns
    -------
    The value returned by the last filter in the chain.
    """
    s = normalize_intensities(s)
    s = select_by_mz(s, mz_from=mz_from, mz_to=mz_to)
    s = select_by_relative_intensity(
        s, intensity_from=intensity_from, intensity_to=intensity_to
    )
    return s

# Run every spectrum through the filter chain and discard any result that is None.
filtered_spectrums = (apply_my_filters(s) for s in spectrums)
spectrums = [s for s in filtered_spectrums if s is not None]

print("Remaining spectra: ", len(spectrums))

Remaining spectra:  14847


In [4]:
# Keep only the spectra that still contain at least one peak intensity.
spectrums = [spec for spec in spectrums if len(spec.peaks.intensities) != 0]
print("Remaining number of spectra:", len(spectrums))

Remaining number of spectra: 14844


## Obtaining Canonical smiles to compute Tanimoto similarity score

The GC/MS data provides two SMILES entries per spectrum (`smiles` and `smiles_2`). The hypothesis is that fingerprints derived from a canonical SMILES will give a better similarity score than arbitrarily choosing the first or the second SMILES.

In [5]:
from rdkit.Chem import rdMolHash
from rdkit import Chem

def set_canonical_smiles(spectrum):
    """Use the spectrum's ``smiles_2`` entry as its ``smiles`` entry.

    The spectrum is modified in place and also returned.

    When ``smiles_2`` is missing or empty, the existing ``smiles`` value is
    kept rather than being overwritten with an empty value, so a usable SMILES
    is never discarded.

    NOTE(review): no canonicalization is performed here; the ``smiles_2``
    string is copied as-is.
    """
    smiles_2 = spectrum.get('smiles_2')
    if smiles_2:
        spectrum.set('smiles', smiles_2)
    return spectrum

# set_canonical_smiles returns the spectrum it was given, so this list holds
# the same objects as `spectrums`.
spectrums_canonical_smiles = list(map(set_canonical_smiles, spectrums))

## Adding fingerprints to compute the Similarity score

In [6]:
from matchms.filtering import add_fingerprint

# Shallow copy: a new list that references the same spectrum objects.
spectrums_clone = list(spectrums_canonical_smiles)

def change_smiles(spectrum):
    """Replace the spectrum's ``smiles`` entry with its ``smiles_2`` entry.

    The spectrum is modified in place and also returned. If ``smiles_2`` is
    missing or empty, the current ``smiles`` value is left unchanged instead of
    being overwritten with an empty value.
    """
    replacement = spectrum.get('smiles_2')
    if replacement:
        spectrum.set('smiles', replacement)
    return spectrum

spectrums_clone = list(map(change_smiles, spectrums_clone))

# Request a 2048-bit "daylight" fingerprint for every spectrum.
spectrums_fingerprint = [
    add_fingerprint(spec, fingerprint_type="daylight", nbits=2048)
    for spec in spectrums_clone
]



## Spectra without fingerprints

In [7]:
# Report spectra that have no fingerprint, or whose fingerprint sums to less than 1.
for idx, spectrum in enumerate(spectrums_fingerprint):
    fingerprint = spectrum.get("fingerprint")
    if fingerprint is None:
        print(idx, "no figerprint")
    elif fingerprint.sum() < 1:
        print(idx, "weird")

1414 no figerprint
9276 no figerprint
13426 no figerprint


## Computing Jaccard (Tanimoto) similarity score

In [8]:
from matchms.similarity import FingerprintSimilarityParallel

# Jaccard similarity on the fingerprints, all-vs-all: the same list of spectra
# is passed as both arguments.
similarity_measure = FingerprintSimilarityParallel(similarity_measure="jaccard")

scores_mol_similarity = similarity_measure(
    spectrums_fingerprint,
    spectrums_fingerprint,
)

In [9]:
import numpy

# Store the daylight/Jaccard scores as a .npy file inside the data folder.
filename = os.path.join(path, "gcms_similarities_filter05_daylight2048_jaccard.npy")
numpy.save(file=filename, arr=scores_mol_similarity)

In [10]:
from matchms.utils import mol_converter

# Cross-check the structure annotations: derive an InChIKey from the InChI and
# from the SMILES of each spectrum and print the entries whose first 14
# characters disagree.
for idx, spectrum in enumerate(spectrums_fingerprint):
    inchi = spectrum.get("inchi")
    smiles = spectrum.get("smiles")

    # Only convert the annotations that are actually present.
    inchikey_inchi = mol_converter(inchi, "inchi", "inchikey") if inchi else None
    inchikey_smiles = mol_converter(smiles, "smiles", "inchikey") if smiles else None

    # Nothing to compare unless both conversions produced a key.
    if not (inchikey_inchi and inchikey_smiles):
        continue

    key_from_inchi = inchikey_inchi[:14]
    key_from_smiles = inchikey_smiles[:14]
    if key_from_inchi != key_from_smiles:
        print(idx, "\n")
        print(10* "--", smiles)
        print(10* "--", inchi)
        print(5* "--", key_from_inchi)
        print(5* "--", key_from_smiles)
            



1792 

-------------------- [H]C([H])=C1C([H])([H])C([H])([H])C(C(=O)C(C([H])([H])[H])(C([H])([H])[H])C1([H])[H])(C([H])([H])[H])C([H])([H])[H].[H]C([H])([H])[H].[H]C([H])([H])[H].[H]C([H])([H])[H].[H][Sn]([H])([H])[H]
-------------------- InChI=1S/C12H20O.3CH4.Sn.4H/c1-9-6-7-11(2,3)10(13)12(4,5)8-9;;;;;;;;/h1,6-8H2,2-5H3;3*1H4;;;;;
---------- MXZAGCHAAXTYPS
---------- KPGCSSPWYGKSDS




1985 

-------------------- [H][Sn]([H])([H])[H].[H]C([H])([H])C([H])([H])[H].[H]C([H])([H])C([H])([H])[H].[H]C([H])([H])C([H])([H])[H].[H]C([H])([H])C([H])([H])[H]
-------------------- InChI=1S/4C2H6.Sn.4H/c4*1-2;;;;;/h4*1-2H3;;;;;
---------- GDHITDSZLDSFOV
---------- YWKYQLQFOKBAQU
1988 

-------------------- [H][Sn]([H])([H])[H].[H]C([H])([H])C([H])([H])C([H])([H])C([H])([H])[H].[H]C([H])([H])C([H])([H])C([H])([H])C([H])([H])[H].[H]C([H])([H])C([H])([H])C([H])([H])C([H])([H])[H].[H]C([H])([H])C([H])([H])C([H])([H])C([H])([H])[H]
-------------------- InChI=1S/4C4H10.Sn.4H/c4*1-3-4-2;;;;;/h4*3-4H2,1-2H3;;;;;
---------- STETXKIDVIWJIC
---------- OJXINEUEVXOYNS
1991 

-------------------- [H]C=1C([H])=C([H])C([H])=C([H])C1[H].[H]C=1C([H])=C([H])C([H])=C([H])C1[H].[H]C=1C([H])=C([H])C([H])=C([H])C1[H].[H]C=1C([H])=C([H])C([H])=C([H])C1[H].[H][Sn]([H])([H])[H]
-------------------- InChI=1S/4C6H6.Sn.4H/c4*1-2-4-6-5-3-1;;;;;/h4*1-6H;;;;;
---------- LIMGUWBQKIGOTP
---------- IBCZEPGDGKNMAW




2321 

-------------------- [H][Sn]([H])([H])[H].[H]C([H])([H])C([H])([H])C([H])([H])C([H])([H])[H].[H]C([H])([H])C([H])([H])C([H])([H])C([H])([H])[H].[H]C([H])([H])C([H])([H])C([H])([H])C([H])([H])[H].[H]C([H])([H])C([H])([H])C([H])([H])C([H])([H])[H]
-------------------- InChI=1S/4C4H10.Sn.4H/c4*1-3-4-2;;;;;/h4*3-4H2,1-2H3;;;;;
---------- STETXKIDVIWJIC
---------- OJXINEUEVXOYNS




4653 

-------------------- [H]C(=C([H])C([H])([H])C([H])([H])C([H])([H])[H])C([H])([H])C([H])([H])[H].[H]C([H])([H])[H].[H]C([H])([H])[H].[H]C([H])([H])[H].[H][Sn]([H])([H])[H]
-------------------- InChI=1S/C7H14.3CH4.Sn.4H/c1-3-5-7-6-4-2;;;;;;;;/h5,7H,3-4,6H2,1-2H3;3*1H4;;;;;/b7-5+;;;;;;;;
---------- UAKOCVPFSUQANI
---------- DAQUPUUNYZEHQC
4671 

-------------------- [H]C=1C([H])=C([H])C(SC([H])([H])C([H])([H])C([H])([H])[H])=C([H])C1[H].[H]C([H])([H])[H].[H]C([H])([H])[H].[H]C([H])([H])[H].[H][Sn]([H])([H])[H]
-------------------- InChI=1S/C9H12S.3CH4.Sn.4H/c1-2-8-10-9-6-4-3-5-7-9;;;;;;;;/h3-7H,2,8H2,1H3;3*1H4;;;;;
---------- VMQIEGDMFYMEHK
---------- UXVBLNHZZOJONZ




8986 

-------------------- [H][Zn+2][H].[H]C([H])([H])N(C(=S)[S-])C([H])([H])[H].[H]C([H])([H])N(C(=S)[S-])C([H])([H])[H]
-------------------- InChI=1S/2C3H7NS2.Zn.2H/c2*1-4(2)3(5)6;;;/h2*1-2H3,(H,5,6);;;/q;;+2;;/p-2
---------- VHHLBXGMWZJIPX
---------- DUBNHZYBDBBJHD




9276 

-------------------- [H]C([H])([H])[H].[H]C([H])([H])[H].[H]C([H])([H])[H].[H]C([H])([H])[H].[H][Sn]([H])([H])[H]
-------------------- InChI=1S/4CH4.Sn.4H/h4*1H4;;;;;
---------- JDMJRFMRAXUYFT
---------- QZTAMPRLPRXMFU




10732 

-------------------- [H][C+]([H])C(=C([H])[C-]([H])[H])C([H])([H])[H].[H]C([H])([H])[H].[H]C([H])([H])[H].[H]C([H])([H])[H].[H]C([H])([H])[H].[H]C([H])([H])[H].[H]C([H])([H])[H].[H][Sn]([H])([H])[H].[H][Sn]([H])([H])[H]
-------------------- InChI=1S/C5H8.6CH4.2Sn.8H/c1-4-5(2)3;;;;;;;;;;;;;;;;/h4H,1-2H2,3H3;6*1H4;;;;;;;;;;/b5-4-;;;;;;;;;;;;;;;;
---------- ODKHJQGKBJZVLD
---------- SGGXOFPOCGNAGX
10734 

-------------------- [H][C+]([H])C([H])=C([H])[C-]([H])[H].[H]C([H])([H])[H].[H]C([H])([H])[H].[H]C([H])([H])[H].[H]C([H])([H])[H].[H]C([H])([H])[H].[H]C([H])([H])[H].[H][Sn]([H])([H])[H].[H][Sn]([H])([H])[H]
-------------------- InChI=1S/C4H6.6CH4.2Sn.8H/c1-3-4-2;;;;;;;;;;;;;;;;/h3-4H,1-2H2;6*1H4;;;;;;;;;;/b4-3+;;;;;;;;;;;;;;;;
---------- RECVJKMOHRWGHD
---------- IFKQLEVLWQTRTE
10735 

-------------------- [H][C+]([H])C([H])=C([H])[C-]([H])[H].[H][Sn]([H])([H])[H].[H][Sn]([H])([H])[H].[H]C([H])([H])C([H])([H])C([H])([H])C([H])([H])[H].[H]C([H])([H])C([H])([H])C([H])([H])C([H])(



11161 

-------------------- [H]C(OC1([H])C([H])([H])C([H])([H])C2([H])OC3([H])C([H])([H])C([H])([H])C([H])([H])OC3([H])C([H])([H])C([H])([H])C2([H])OC1([H])C([H])([H])C([H])=O)=C([H])C([H])([H])[H].[H][Sn]([H])([H])[H].[H]C([H])([H])C([H])([H])C([H])([H])C([H])([H])[H].[H]C([H])([H])C([H])([H])C([H])([H])C([H])([H])[H].[H]C([H])([H])C([H])([H])C([H])([H])C([H])([H])[H]
-------------------- InChI=1S/C18H28O5.3C4H10.Sn.4H/c1-2-11-20-14-6-8-16-17(23-18(14)9-10-19)7-5-13-15(22-16)4-3-12-21-13;3*1-3-4-2;;;;;/h2,10-11,13-18H,3-9,12H2,1H3;3*3-4H2,1-2H3;;;;;/b11-2-;;;;;;;;/t13-,14+,15+,16-,17+,18-;;;;;;;;/m1......../s1
---------- FYFNCFYJJSSGFU
---------- UDFIFHMITQYMSZ


In [11]:
# Second pass: request 2048-bit "morgan3" fingerprints for the same spectra.
spectrums_2 = [
    add_fingerprint(spec, fingerprint_type="morgan3", nbits=2048)
    for spec in spectrums_fingerprint
]


In [12]:
# Jaccard similarity on the morgan3 fingerprints, all-vs-all: the same list of
# spectra is passed as both arguments.
similarity_measure = FingerprintSimilarityParallel(similarity_measure="jaccard")
scores_mol_similarity = similarity_measure(
    spectrums_2,
    spectrums_2,
)

## Saving Tanimoto Similarity Scores

In [13]:
# The scores saved here were computed with similarity_measure="jaccard", so the
# file is labelled "jaccard" (it was previously labelled "dice", which did not
# match the metric used).
# NOTE(review): anything that loads the old "*_dice.npy" name must be updated.
filename = os.path.join(path, "gcms_similarities_filter05_morgan3_2048_jaccard.npy")
numpy.save(filename, scores_mol_similarity)