In [73]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm
import numba as nb
import numpy as np

from ms_entropy.file_io.msp_file import read_one_spectrum
import pyteomics.mgf
from rdkit import Chem

# Get siamese network vectors

In [103]:
# Load the siamese vectors for the query spectra and index them by 'identifier'
siamese_query_df = pd.read_parquet(
    's3://enveda-data-user/chloe.engler/cosine_similarity/wout_GNPS_df_w_siamese_vecs.parquet'
).set_index('identifier')

In [4]:
# Load the siamese vectors for the NIST library spectra
siamese_library_df = pd.read_parquet(
    path='s3://enveda-data-user/chloe.engler/cosine_similarity/nist_df_w_siamese_vecs.parquet'
)

# Get NIST23 data

In [None]:
def get_identifier(x):
    """
    Build the identifier string for one NIST row.

    The identifier is ``"<index>_<cas#>"``, falling back to ``"<index>_<nist#>"``
    when the row's 'cas#' value is missing (as judged by ``pd.isna``).

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'index', 'cas#' and 'nist#'.

    Returns
    -------
    str
        The composed identifier.
    """
    suffix_key = 'nist#' if pd.isna(x['cas#']) else 'cas#'
    # Coerce the suffix to str so numeric 'nist#'/'cas#' values do not raise
    # TypeError during concatenation; string values are left unchanged.
    return str(x['index']) + '_' + str(x[suffix_key])

In [39]:
# Get NIST23 library: collect every spectrum yielded by the MSP reader into a list.
# tqdm wraps the reader directly (the enumerate index was never used).
spectra_list = list(tqdm(read_one_spectrum('../data/NIST23-HR-MSMS.MSP')))

1934658it [03:17, 9779.13it/s] 


In [40]:
# Build the NIST dataframe, keeping only rows whose precursor_type is '[M+H]+'
nist_df = pd.DataFrame(spectra_list).loc[
    lambda frame: frame['precursor_type'] == '[M+H]+'
]

In [66]:
# Give every NIST spectrum an identifier (see get_identifier) and index the frame by it.
# reset_index() exposes the previous index as the 'index' column that get_identifier reads.
nist_df = (
    nist_df
    .reset_index()
    .assign(identifier=lambda frame: frame.apply(get_identifier, axis=1))
    .set_index('identifier')
)

# Get Wout Data

In [69]:
# Profile spectra contain 0 intensity values.
@nb.njit
def is_centroid(intensity_array):
    """Return True when every value in ``intensity_array`` is strictly greater than 0."""
    positive_mask = intensity_array > 0
    return positive_mask.all()

In [71]:
# Read all spectra from the MGF.
# Download from https://zenodo.org/record/6829249/files/ALL_GNPS_NO_PROPOGATED.mgf?download=1
filename = "../data/ALL_GNPS_NO_PROPOGATED.mgf"

# Get wout spectra: every spectrum dict yielded by the MGF reader, in file order
spectra = []
with pyteomics.mgf.MGF(filename) as f_in:
    spectra.extend(tqdm(f_in))

495600it [02:20, 3539.49it/s] 


In [72]:
# Create wout dataframe: one row per spectrum, with the nested 'params' dict
# expanded into its own columns and placed to the right of the remaining columns.
wout_df = pd.DataFrame(spectra)
wout_df = pd.concat(
    [
        wout_df.drop(columns='params'),
        wout_df['params'].apply(pd.Series),
    ],
    axis=1,
)
wout_df.head(2)

Unnamed: 0,m/z array,intensity array,charge array,pepmass,charge,mslevel,source_instrument,filename,seq,ionmode,...,pi,datacollector,smiles,inchi,inchiaux,pubmed,submituser,libraryquality,spectrumid,scans
0,"[289.286377, 295.545288, 298.489624, 317.32495...","[8068.0, 22507.0, 3925.0, 18742.0, 8604.0, 804...","[--, --, --, --, --, --, --, --, --, --, --, -...","(981.54, None)",[0+],2,LC-ESI-qTof,130618_Ger_Jenia_WT-3-Des-MCLR_MH981.4-qb.1.1....,*..*,Positive,...,Gerwick,Jenia,CC(C)CC1NC(=O)C(C)NC(=O)C(=C)N(C)C(=O)CCC(NC(=...,,,,mwang87,1,CCMSLIB00000001547,1
1,"[278.049927, 278.957642, 281.258667, 291.99609...","[35793.0, 47593.0, 95495.0, 115278.0, 91752.0,...","[--, --, --, --, --, --, --, --, --, --, --, -...","(940.25, None)",[0+],2,LC-ESI-qTof,20111105_Anada_Ger_HoiamideB_MH940_qb.1.1..mgf,*..*,Positive,...,Gerwick,Amanda,CCC[C@@H](C)[C@@H]([C@H](C)[C@@H]1[C@H]([C@H](...,InChI=1S/C45H73N5O10S3/c1-14-17-24(6)34(52)26(...,,,mwang87,1,CCMSLIB00000001548,1


# Filter query spectra

In [74]:
def is_valid_smiles(sm):
    """
    Tell whether ``sm`` is a usable SMILES string.

    Returns True only when ``sm`` is not the empty string and RDKit parses it
    into a non-null Mol. Any exception raised along the way (for example when
    ``sm`` is not a string) is treated as "not valid" and yields False.
    """
    try:
        parsed = Chem.MolFromSmiles(sm)
        return not (sm == "") and parsed is not None
    except Exception:
        return False

def smile2inchi(smile):
    """
    Convert a SMILES string to the first block of its InChIKey.

    Returns the text before the first "-" of the InChIKey RDKit produces for
    ``smile``, or None when ``is_valid_smiles(smile)`` is False.
    """
    if not is_valid_smiles(smile):
        return None
    full_key = Chem.MolToInchiKey(Chem.MolFromSmiles(smile))
    return full_key.split("-")[0]

In [91]:
# Remove spectra that don't have siamese vectors (these were filtered using Wout's criteria).
# The query identifiers are in the index once siamese_query_df has been indexed by
# 'identifier', and in a column otherwise; handle both so this cell does not depend
# on the order in which earlier cells were executed.
if 'identifier' in siamese_query_df.columns:
    query_ids = siamese_query_df['identifier']
else:
    query_ids = siamese_query_df.index

# .copy() makes the subset an independent frame, so adding a column below does not
# write into a slice of wout_df (avoids SettingWithCopyWarning).
filtered_wout_df = wout_df.loc[wout_df.spectrumid.isin(query_ids)].copy()

# Get partial inchikeys for Wout data (None where the SMILES is not valid)
filtered_wout_df['partial_inchikey'] = filtered_wout_df.smiles.apply(smile2inchi)

[14:30:43] SMILES Parse Error: syntax error while parsing: InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,17,18)
[14:30:43] SMILES Parse Error: Failed parsing SMILES 'InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,17,18)' for input: 'InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,17,18)'
[14:30:43] Can't kekulize mol.  Unkekulized atoms: 10 11 12 14 16
[14:30:43] Can't kekulize mol.  Unkekulized atoms: 10 11 12 14 16
[14:30:43] SMILES Parse Error: syntax error while parsing: N/A
[14:30:43] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[14:30:43] SMILES Parse Error: syntax error while parsing: N/A
[14:30:43] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[14:30:43] SMILES Parse Error: syntax error while parsing: N/A
[14:30:43] SMILES Parse Error: Failed parsing SMILES 'N/A' for input: 'N/A'
[14

# Filter siamese query vectors

In [114]:
# Index the query metadata by spectrum id. Guarded so that re-running this cell
# after 'spectrumid' has already become the index does not raise a KeyError.
if 'spectrumid' in filtered_wout_df.columns:
    filtered_wout_df = filtered_wout_df.set_index('spectrumid')

# Filter siamese query spectra: keep only those that still have a row in filtered_wout_df
siamese_query_df = siamese_query_df.loc[siamese_query_df.index.isin(filtered_wout_df.index)]

# Filter nist df: keep only spectra that have a siamese library vector
nist_df = nist_df.loc[nist_df.index.isin(siamese_library_df['identifier'])]

In [130]:
#FIXME add hashes!, top n_scores, threshold

# Define ppm window
ppm_window = 10

# Library-side arrays do not depend on the query, so build them once instead of
# once per loop iteration.
library_ids = np.array(list(siamese_library_df.identifier))
library_vectors = np.array(list(siamese_library_df['siamese_vector']))

# Look the precursor m/z up by identifier so that position i of library_precursor
# describes the same spectrum as position i of library_ids / library_vectors.
# (Previously the mask was computed on nist_df row positions and applied to
# siamese_library_df positions, which is only correct if both frames happen to
# share the same length and row order.) Identifiers absent from nist_df become NaN
# and therefore never pass the ppm filter. Requires nist_df's index to be unique.
library_precursor = (
    nist_df['precursormz']
    .reindex(siamese_library_df['identifier'].to_numpy())
    .astype(float)
    .to_numpy()
)

# Query NIST23 using siamese vectors (currently limited to the first 5 queries)
results_by_query = {}
for id_ in tqdm(siamese_query_df.index.values[:5]):
    query_vector = siamese_query_df.loc[id_].siamese_vector
    query_precursor_mz = filtered_wout_df.loc[id_, 'pepmass'][0]

    # Relative precursor mass difference in parts per million
    ppm = 10**6 * np.abs((library_precursor - query_precursor_mz) / query_precursor_mz)
    indexes_to_keep = np.nonzero(ppm <= ppm_window)[0]

    # Calculate scores: map each library id inside the ppm window to its cosine similarity
    if len(indexes_to_keep) > 0:
        scores = cosine_similarity(
            query_vector.reshape(1, -1), library_vectors[indexes_to_keep]
        )[0]
        results_by_query[id_] = dict(zip(library_ids[indexes_to_keep], scores))
    else:
        results_by_query[id_] = {}

# Assemble the result frame in one go rather than growing it row by row
query_results = pd.DataFrame(
    {'query_results': list(results_by_query.values())},
    index=list(results_by_query.keys()),
)

100%|██████████| 5/5 [00:09<00:00,  1.84s/it]


In [131]:
# Preview the first five query results
query_results.head(n=5)

Unnamed: 0,query_results
CCMSLIB00000001621,{'1084030_26605-16-3; NIST#: 3837818': 0.9639...
CCMSLIB00000006832,{}
CCMSLIB00000006837,{}
CCMSLIB00000006843,{'8548_135158-54-2; NIST#: 1062816': 0.716510...
CCMSLIB00000006844,{'8548_135158-54-2; NIST#: 1062816': 0.777845...
