In [1]:
import pandas as pd
from tqdm.notebook import tqdm
from ms_entropy.file_io.msp_file import read_one_spectrum
import random
import numpy as np

from rdkit import RDLogger
import importlib

import sys
sys.path.append('../src/')
from ms_similarity_metrics.hash_utils import hash_spectrum
from ms_similarity_metrics.create_spectrum import create_spectra_NIST23, create_spectra_wout
from ms_similarity_metrics.query_pool import query, call_modified_cosine
from ms_similarity_metrics.frequency import get_weights
# Re-execute the local ms_similarity_metrics modules that were imported above,
# presumably so that edits to the files under ../src/ are picked up without
# restarting the kernel.
importlib.reload(sys.modules['ms_similarity_metrics.hash_utils'])
importlib.reload(sys.modules['ms_similarity_metrics.create_spectrum'])
importlib.reload(sys.modules['ms_similarity_metrics.query_pool'])
importlib.reload(sys.modules['ms_similarity_metrics.frequency'])
# Import the names again after the reload: names bound by the earlier
# `from ... import ...` statements still refer to the pre-reload objects.
from ms_similarity_metrics.hash_utils import hash_spectrum
from ms_similarity_metrics.create_spectrum import create_spectra_NIST23, create_spectra_wout
from ms_similarity_metrics.create_spectrum import weight_NIST23_spectra, weight_wout_spectra
from ms_similarity_metrics.query_pool import query, call_modified_cosine
from ms_similarity_metrics.frequency import get_weights


In [2]:
# Turn off RDKit log output for every logger matching 'rdApp.*'
RDLogger.DisableLog('rdApp.*')

In [3]:
%matplotlib inline

# Get NIST data

In [4]:
# Read every record of the NIST23 MSP library into an in-memory list.
# Get data from s3://enveda-data-user/chloe.engler/cosine_similarity/NIST_data/NIST23-HR-MSMS.MSP
spectra_list = list(tqdm(read_one_spectrum('../../data/NIST23-HR-MSMS.MSP')))

0it [00:00, ?it/s]

In [5]:
# Load the gzip-compressed, tab-separated NIST23 InChIKey -> SMILES table
smiles_table_uri = 's3://enveda-datascience/daniel_domingo/random/nist23_resolver_inchikeys_to_smiles.tsv.gz'
inchikey_nist23_to_smiles = pd.read_csv(smiles_table_uri, sep='\t', compression='gzip')

In [6]:
# Keep only the rows that actually carry a SMILES value
has_smiles = inchikey_nist23_to_smiles['smiles'].notna()
inchikey_nist23_to_smiles = inchikey_nist23_to_smiles[has_smiles]

# Build the inchikey -> smiles lookup (for a repeated inchikey the last row wins)
inchikey_to_smiles = {
    inchikey: smiles
    for inchikey, smiles in zip(
        inchikey_nist23_to_smiles['inchikey'],
        inchikey_nist23_to_smiles['smiles'],
    )
}

In [8]:
# Turn the raw NIST23 records into filtered spectrum objects.
# The filtering rules live in create_spectrum.py; spectra need at least
# `min_n_peaks` peaks to be kept.
nist_spectra, info_dict, nist_inchi_dict = create_spectra_NIST23(
    spectra_list,
    inchikey_to_smiles,
    min_n_peaks=6,
)

1934658it [01:08, 28384.32it/s] 


# Get Wout data

In [10]:
# Convert Wout spectra to the right format and filter
# (see create_spectrum.py for the filtering details).
# Get data from s3://enveda-data-user/chloe.engler/cosine_similarity/Wout_data/ALL_GNPS_NO_PROPOGATED.mgf
wout_spectra, wout_info_dict = create_spectra_wout(
    '../../data/ALL_GNPS_NO_PROPOGATED.mgf',
    min_n_peaks=6,
)

0it [00:00, ?it/s]

495600it [01:28, 5605.32it/s] 


In [11]:
# Download the GNPS libraries metadata table from Zenodo and index it by its 'id' column
metadata_url = 'https://zenodo.org/record/6829249/files/gnps_libraries_metadata.csv?download=1'
metadata = pd.read_csv(metadata_url).set_index('id')

# Get overlapping spectra

In [12]:
# Collect the partial InChIKey of every NIST spectrum into a set (used for membership tests)
nist_inchis = set()
for nist_spectrum in tqdm(nist_spectra):
    nist_inchis.add(nist_spectrum.partial_inchikey)

  0%|          | 0/402412 [00:00<?, ?it/s]

In [13]:
# Collect the partial InChIKey of every Wout spectrum whose structure also
# occurs in NIST23. One entry is appended per matching Wout spectrum, so the
# same key can appear several times.
matching_inchis = []
for wout_spectrum in tqdm(wout_spectra):
    partial_key = wout_spectrum.partial_inchikey
    # Identity check against None (PEP 8); the explicit None test also keeps a
    # missing key from matching if None ever ended up in nist_inchis.
    if partial_key is not None and partial_key in nist_inchis:
        matching_inchis.append(partial_key)

  0%|          | 0/59165 [00:00<?, ?it/s]

In [14]:
# Write the matching partial InChIKeys to a text file, one per line
with open('../../data/matching_inchis.txt', 'w') as out_file:
    out_file.writelines(inchi + '\n' for inchi in matching_inchis)

In [15]:
# One entry per matching Wout spectrum vs. the number of distinct partial InChIKeys
n_overlapping_spectra = len(matching_inchis)
n_overlapping_structures = len(set(matching_inchis))
print('Number of overlapping spectra: ', n_overlapping_spectra)
print('Number of overlapping structures: ', n_overlapping_structures)

Number of overlapping spectra:  30100
Number of overlapping structures:  3193


# Create NIST parquet file

In [22]:
# Create nist df with identifier, mz values, intensities and precursor_mz.
#
# The rows are collected first and the frame is built in a single call:
# appending with `df.loc[len(df)] = ...` re-allocates the frame on every row,
# which is quadratic in the number of spectra.
# The third column is spelled 'intensities' (previously 'intensites') so the
# NIST frame has the same schema as the Wout frame.
nist_columns = ['identifier', 'mz_values', 'intensities', 'precursor_mz']
nist_records = [
    (s.identifier, s.mz, s.intensity, s.precursor_mz)
    for s in tqdm(nist_spectra)
]
nist_df = pd.DataFrame(nist_records, columns=nist_columns)


  0%|          | 0/402412 [00:00<?, ?it/s]

In [24]:
# Persist the NIST dataframe as a parquet file.
# NOTE(review): every other data path in this notebook points at '../../data/',
# while this one writes to '../data/' — confirm that this location is intended.
nist_parquet_path = '../data/nist_df.parquet'
nist_df.to_parquet(nist_parquet_path)

# Create Wout parquet file

In [17]:
# Create wout df with identifier, mz values, intensities and precursor_mz.
#
# The rows are collected first and the frame is built in a single call:
# appending with `df.loc[len(df)] = ...` re-allocates the frame on every row,
# which is quadratic in the number of spectra.
wout_columns = ['identifier', 'mz_values', 'intensities', 'precursor_mz']
wout_records = [
    (s.identifier, s.mz, s.intensity, s.precursor_mz)
    for s in tqdm(wout_spectra)
]
wout_df = pd.DataFrame(wout_records, columns=wout_columns)

  0%|          | 0/59165 [00:00<?, ?it/s]

In [19]:
# Persist the Wout (GNPS) dataframe as a parquet file
wout_parquet_path = '../../data/wout_GNPS_df.parquet'
wout_df.to_parquet(wout_parquet_path)