In [2]:
import numpy as np
import pandas as pd
import sys
import importlib
from tqdm import tqdm

import functools
import numba as nb
from rdkit import Chem, DataStructs, RDLogger
import pyteomics.mgf
import spectrum_utils.spectrum as sus
from matchms.filtering import normalize_intensities

from matplotlib import pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
from matplotlib.colors import LogNorm
from matplotlib.gridspec import GridSpec

sys.path.append("../../src/ms_similarity_metrics")
import frequency
import similarity_weighted as similarity
importlib.reload(frequency)
importlib.reload(similarity)

<module 'similarity_weighted' from '/Users/chloe/Desktop/weighted-mod-cosine-sim/notebooks/network_method/../../src/ms_similarity_metrics/similarity_weighted.py'>

# Get spectra data

In [3]:
# Spectra and spectrum pairs to include with the following settings.
charges = 0, 1
min_n_peaks = 6
fragment_mz_tolerance = 0.1
min_mass_diff = 1    # Da
max_mass_diff = 200    # Da

In [4]:
# Profile spectra contain 0 intensity values.
@nb.njit
def is_centroid(intensity_array):
    return np.all(intensity_array > 0)
            

# Convert smiles to mol
@functools.lru_cache
def _smiles_to_mol(smiles):
    try:
        return Chem.MolFromSmiles(smiles)
    except:
        return None

# Assumes that the spectra are sorted by ascending precusor m/z.
@nb.njit
def generate_pairs(
    spectrum_indexes, masses, min_mass_diff, max_mass_diff
):
    for i in range(len(spectrum_indexes)):
        j = i + 1
        while (
            j < len(spectrum_indexes) and
            masses[j] - masses[i] < min_mass_diff
        ):
            j += 1
        while (
            j < len(spectrum_indexes) and
            masses[j] - masses[i] < max_mass_diff
        ):
            yield spectrum_indexes[i]
            yield spectrum_indexes[j]
            j += 1
    
# Function for calulating tanimoto scores
@functools.lru_cache
def tanimoto(smiles1, smiles2):
    mol1, mol2 = _smiles_to_mol(smiles1), _smiles_to_mol(smiles2)
    if mol1 is None or mol2 is None:
        return np.nan
    fp1, fp2 = Chem.RDKFingerprint(mol1), Chem.RDKFingerprint(mol2)
    return DataStructs.TanimotoSimilarity(fp1, fp2)

In [5]:
# Read all spectra from the MGF.
# ALL_GNPS_NO_PROPOGATED (retrieved on 2022-05-12) downloaded from
# https://gnps-external.ucsd.edu/gnpslibrary

# Spectrum quality filtering:
#   - Don't include propagated spectra (LIBRARYQUALITY==4).
#   - Don't include multiply charged molecules.
#   - Don't include spectra with invalid precursor m/z (0).
#   - Don't include spectra with too few peaks (minimum 6).
#   - Only include positive ion mode spectra.
#   - Only include spectra with [M+H]+ adducts.
#   - Only include centroid data (does not contain zero intensity values).
#   - Only include spectra with InChI and/or SMILES specified.

spectra = []
# Download from https://zenodo.org/record/6829249/files/ALL_GNPS_NO_PROPOGATED.mgf?download=1
filename = ("../../data/ALL_GNPS_NO_PROPOGATED.mgf")

with pyteomics.mgf.MGF(filename) as f_in:
    for spectrum_dict in tqdm(f_in):
        if (
            int(spectrum_dict["params"]["libraryquality"]) <= 3 and
            int(spectrum_dict["params"]["charge"][0]) in charges and
            float(spectrum_dict["params"]["pepmass"][0]) > 0 and
            len(spectrum_dict["m/z array"]) >= min_n_peaks and
            spectrum_dict["params"]["ionmode"] == "Positive" and
            spectrum_dict["params"]["name"].rstrip().endswith(" M+H") and
            is_centroid(spectrum_dict["intensity array"]) and
            (
                spectrum_dict["params"]["inchi"] != "N/A" or
                spectrum_dict["params"]["smiles"] != "N/A"
            )
        ):
            spec = sus.MsmsSpectrum(
                spectrum_dict["params"]["spectrumid"],
                float(spectrum_dict["params"]["pepmass"][0]),
                # Re-assign charge 0 to 1.
                max(int(spectrum_dict["params"]["charge"][0]), 1),
                spectrum_dict["m/z array"],
                spectrum_dict["intensity array"]/max(spectrum_dict["intensity array"]),
            )
            spec.library = spectrum_dict["params"]["organism"]
            spec.inchi = spectrum_dict["params"]["inchi"]
            spec.smiles = spectrum_dict["params"]["smiles"]
            spec.remove_precursor_peak(0.1, "Da")
            spec.filter_intensity(0.01, max_num_peaks=200)
            spectra.append(spec)


495600it [01:09, 7094.35it/s] 


In [6]:
# Round mz to 1 decimal
for s in spectra:
    s.mz.round(1, out=s.mz)

In [7]:
# Read in Wout metadata
metadata = pd.read_csv(
    'https://zenodo.org/record/6829249/files/gnps_libraries_metadata.csv?download=1'
)

# Get frequency data

In [8]:
# Get pair subset that Wout used
# Download file from s3://enveda-data-user/chloe.engler/cosine_similarity/Wout_data/pairs_subset.txt
with open('../../data/pairs_subset.txt', 'r') as f:
    pairs_subset = f.read().splitlines()
    pairs_subset = pairs_subset[1:]
    pairs_subset = [np.array(pair.split(' ')).astype(int) for pair in pairs_subset]
pairs_subset = np.array(pairs_subset)

In [9]:
# Get dataframe containing m/z frequency information
frequency_df = pd.read_csv('s3://enveda-data-user/chloe.engler/cosine_similarity/Wout_data/frequency_df.csv')

# Get number of spectra used to generate the frequency_df
# Get file from s3://enveda-data-user/chloe.engler/cosine_similarity/Wout_data/num_spectra.txt
with open('../../data/num_spectra.txt', 'r') as f:
    num_spectra = int(f.read())

In [10]:
# Get frequency counts
# Get file from s3://enveda-data-user/chloe.engler/cosine_similarity/Wout_data/frequency_count.txt
with open('../../data/frequency_count.txt', 'r') as f:
    frequency_count = f.read().splitlines()
    frequency_count = frequency_count[1:]
    frequency_count = [float(x) for x in frequency_count]

In [38]:
# Get sample of pairs
np.random.seed(1)
#small_sample = pairs_subset[np.random.choice(pairs_subset.shape[0], 1_000_000, replace=False)]
small_sample = pairs_subset.copy()

# Get weights

In [39]:
# Define weight functions
idf_dict = frequency.idf(frequency_df, num_spectra, frequency_col='frequency')
weight_func = lambda x: x**(1/2) 
intensity_weight_func = lambda x: x**(1/2)
# Get dataframe of weights for m/z frequency
weight_df = frequency.get_weights(frequency_df, weight_func, weight_col='prob')


# # Define weight functions without mz
# idf_dict = frequency.idf(frequency_df, num_spectra, frequency_col='frequency')
# weight_func = lambda x: 1
# intensity_weight_func = lambda x: x**(1/4)
# # Get dataframe of weights for m/z frequency
# weight_df = frequency.get_weights(frequency_df, weight_func, weight_col='prob')

# # Define binary weights
# idf_dict = frequency.idf(frequency_df, num_spectra, frequency_col='frequency')
# weight_func = lambda x: 1
# intensity_weight_func = lambda x: np.ones(len(x))
# # Get dataframe of weights for m/z frequency
# weight_df = frequency.get_weights(frequency_df, weight_func, weight_col='prob')

# #Define Stein weights
# mz_weight_func = lambda x: x**(2)
# intensity_weight_func = lambda x: x**(0.6)
# # Get dataframe of weights for m/z frequency
# weight_df = frequency.get_weights(frequency_df, mz_weight_func, weight_col='prob')
# # Create stein_weight_df
# stein_weight_df = pd.DataFrame(index=weight_df.index.values, columns=['mz', 'weight'])
# stein_weight_df['mz'] = weight_df['mz'].copy()
# stein_weight_df['weight'] = weight_df['mz'].apply(lambda x: mz_weight_func(x))
# weight_df = stein_weight_df.copy()

# Get similarities

In [None]:
# Compute similarities between spectrum pairs.
weight_scores = []
mod_scores = []
neutral_loss_scores = []
for i, j in tqdm(small_sample):

    # Get weighted and modified cosine similarity
    weight_cos = similarity.weighted_modified_cosine(spectra[i], spectra[j], fragment_mz_tolerance, 
                                                     weight_df, intensity_weight_func=intensity_weight_func)
    mod_cos = similarity.modified_cosine(spectra[i], spectra[j], fragment_mz_tolerance)

    # save scores
    tan = tanimoto(metadata.at[i, "smiles"], metadata.at[j, "smiles"])
    weight_scores.append((weight_cos[0], weight_cos[1], tan))
    mod_scores.append((mod_cos[0], mod_cos[1], tan))

# save score information
similarities = pd.DataFrame(
    {
        "pair1": small_sample[:, 0],
        "pair2": small_sample[:, 1],
        "id1": metadata.loc[small_sample[:, 0], "id"].values,
        "id2": metadata.loc[small_sample[:, 1], "id"].values,
        "smiles1": metadata.loc[small_sample[:, 0], "smiles"].values,
        "smiles2": metadata.loc[small_sample[:, 1], "smiles"].values,
        "charge1": metadata.loc[small_sample[:, 0], "charge"].values,
        "charge2": metadata.loc[small_sample[:, 1], "charge"].values,
        "mz1": metadata.loc[small_sample[:, 0], "mz"].values,
        "mz2": metadata.loc[small_sample[:, 1], "mz"].values,
    }
)

similarities[
    [
        "weighted_modified_cosine",
        "weighted_modified_cosine_explained",
        "tanimoto",
    ]
] = weight_scores

similarities[
    [
        "modified_cosine",
        "modified_cosine_explained",
        "tanimoto",
    ]
] = mod_scores

    

  0%|                                 | 373/10000000 [00:00<6:21:00, 437.42it/s][18:17:23] Explicit valence for atom # 19 N, 4, is greater than permitted
  0%|                                | 1459/10000000 [00:03<5:57:04, 466.69it/s][18:17:25] Explicit valence for atom # 22 N, 4, is greater than permitted
  0%|                                | 3449/10000000 [00:07<6:17:28, 441.38it/s][18:17:30] SMILES Parse Error: syntax error while parsing: N/ACCC1(C(=O)NCNC1=O)c2ccccc2
[18:17:30] SMILES Parse Error: Failed parsing SMILES 'N/ACCC1(C(=O)NCNC1=O)c2ccccc2' for input: 'N/ACCC1(C(=O)NCNC1=O)c2ccccc2'
  0%|                                | 4161/10000000 [00:09<6:21:12, 437.03it/s][18:17:32] SMILES Parse Error: syntax error while parsing: N/ACCC1(C(=O)NCNC1=O)c2ccccc2
[18:17:32] SMILES Parse Error: Failed parsing SMILES 'N/ACCC1(C(=O)NCNC1=O)c2ccccc2' for input: 'N/ACCC1(C(=O)NCNC1=O)c2ccccc2'
  0%|                                | 4651/10000000 [00:10<6:27:56, 429.42it/s][18:17:33] Explici

  1%|▏                              | 62234/10000000 [02:20<5:55:31, 465.86it/s][18:19:42] SMILES Parse Error: syntax error while parsing: InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,17,18)
[18:19:42] SMILES Parse Error: Failed parsing SMILES 'InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,17,18)' for input: 'InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,17,18)'
  1%|▏                              | 62569/10000000 [02:20<6:07:59, 450.08it/s][18:19:43] Explicit valence for atom # 19 N, 4, is greater than permitted
  1%|▏                              | 62907/10000000 [02:21<6:05:00, 453.74it/s][18:19:44] Explicit valence for atom # 22 O, 3, is greater than permitted
  1%|▏                              | 65061/10000000 [02:26<5:46:07, 478.39it/s][18:19:48] SMILES Parse Error: unclosed ring for input: 'C\CC[C@@]2([C@@H](O2)CC(/C=

  2%|▍                             | 159663/10000000 [05:55<5:49:09, 469.71it/s][18:23:17] SMILES Parse Error: unclosed ring for input: 'C\CC(/C=C/C(=O)/C(=C/CC1)/C)(C)C'
  2%|▍                             | 160759/10000000 [05:57<6:12:30, 440.22it/s][18:23:20] Explicit valence for atom # 6 O, 3, is greater than permitted
  2%|▍                             | 163732/10000000 [06:04<5:49:31, 469.02it/s][18:23:26] SMILES Parse Error: unclosed ring for input: 'C\CC(/C=C/C(=O)/C(=C/CC1)/C)(C)C'
  2%|▍                             | 163924/10000000 [06:04<5:59:31, 455.98it/s][18:23:27] Explicit valence for atom # 17 N, 4, is greater than permitted
  2%|▌                             | 167413/10000000 [06:12<5:52:17, 465.18it/s][18:23:34] SMILES Parse Error: syntax error while parsing: InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,17,18)
[18:23:34] SMILES Parse Error: Failed parsing SMILES 'InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)1

  2%|▋                             | 242699/10000000 [09:01<6:38:00, 408.59it/s][18:26:24] SMILES Parse Error: syntax error while parsing: N/ACCC1(C(=O)NCNC1=O)c2ccccc2
[18:26:24] SMILES Parse Error: Failed parsing SMILES 'N/ACCC1(C(=O)NCNC1=O)c2ccccc2' for input: 'N/ACCC1(C(=O)NCNC1=O)c2ccccc2'
  2%|▋                             | 243748/10000000 [09:04<6:45:20, 401.15it/s][18:26:27] Explicit valence for atom # 35 O, 3, is greater than permitted
  2%|▋                             | 245110/10000000 [09:07<6:41:36, 404.83it/s][18:26:30] SMILES Parse Error: syntax error while parsing: InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,17,18)
[18:26:30] SMILES Parse Error: Failed parsing SMILES 'InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,17,18)' for input: 'InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,17,18)'
  2%|▋               

  3%|▉                             | 311273/10000000 [11:44<6:42:22, 401.32it/s][18:29:06] Can't kekulize mol.  Unkekulized atoms: 10 11 12 14 16
  3%|▉                             | 311938/10000000 [11:45<6:26:47, 417.45it/s][18:29:08] SMILES Parse Error: extra open parentheses for input: 'COC(=O)C1=CCCC2C3(C)CC(OC(=O)C3CC(OC3OC(CO)C(O)C(O)C3O'
[18:29:08] SMILES Parse Error: syntax error while parsing: N/ACCC1(C(=O)NCNC1=O)c2ccccc2
[18:29:08] SMILES Parse Error: Failed parsing SMILES 'N/ACCC1(C(=O)NCNC1=O)c2ccccc2' for input: 'N/ACCC1(C(=O)NCNC1=O)c2ccccc2'
  3%|▉                             | 311982/10000000 [11:45<6:22:42, 421.90it/s][18:29:08] Explicit valence for atom # 19 N, 4, is greater than permitted
  3%|▉                             | 316649/10000000 [11:56<6:11:09, 434.83it/s][18:29:19] SMILES Parse Error: unclosed ring for input: 'C\CC(/C=C/C(=O)/C(=C/CC1)/C)(C)C'
  3%|▉                             | 316921/10000000 [11:57<6:10:49, 435.20it/s][18:29:20] SMILES Parse Error:

  4%|█▏                            | 380405/10000000 [14:28<6:31:53, 409.12it/s][18:31:50] SMILES Parse Error: unclosed ring for input: 'C\CC(/C=C/C(=O)/C(=C/CC1)/C)(C)C'
  4%|█▏                            | 382101/10000000 [14:32<6:04:47, 439.42it/s][18:31:54] Explicit valence for atom # 22 N, 4, is greater than permitted
[18:31:54] SMILES Parse Error: syntax error while parsing: N/ACCC1(C(=O)NCNC1=O)c2ccccc2
[18:31:54] SMILES Parse Error: Failed parsing SMILES 'N/ACCC1(C(=O)NCNC1=O)c2ccccc2' for input: 'N/ACCC1(C(=O)NCNC1=O)c2ccccc2'
  4%|█▏                            | 382564/10000000 [14:33<6:04:18, 439.99it/s][18:31:56] Can't kekulize mol.  Unkekulized atoms: 10 11 12 14 16
  4%|█▏                            | 389310/10000000 [14:48<6:12:12, 430.34it/s][18:32:11] SMILES Parse Error: unclosed ring for input: 'C\CC(/C=C/C(=O)/C(=C/C(C1)O)/C)(C)C'
  4%|█▏                            | 390154/10000000 [14:50<5:54:44, 451.49it/s][18:32:13] Can't kekulize mol.  Unkekulized atoms: 10 11 1

  5%|█▍                            | 460111/10000000 [17:34<6:12:09, 427.23it/s][18:34:57] Explicit valence for atom # 19 N, 4, is greater than permitted
  5%|█▍                            | 462144/10000000 [17:39<6:28:51, 408.79it/s][18:35:01] Explicit valence for atom # 19 N, 4, is greater than permitted
  5%|█▍                            | 463406/10000000 [17:42<6:28:49, 408.77it/s][18:35:04] Explicit valence for atom # 17 N, 4, is greater than permitted
  5%|█▍                            | 464650/10000000 [17:45<6:17:12, 421.32it/s][18:35:07] Explicit valence for atom # 22 N, 4, is greater than permitted
  5%|█▍                            | 465137/10000000 [17:46<6:22:53, 415.04it/s][18:35:09] Explicit valence for atom # 22 N, 4, is greater than permitted
  5%|█▍                            | 465709/10000000 [17:47<6:38:04, 399.18it/s][18:35:10] Can't kekulize mol.  Unkekulized atoms: 10 11 12 14 16
  5%|█▍                            | 468069/10000000 [17:53<5:52:53, 450.19it/s][18:

  5%|█▌                            | 540826/10000000 [20:45<6:42:54, 391.29it/s][18:38:07] SMILES Parse Error: syntax error while parsing: N/ACCC1(C(=O)NCNC1=O)c2ccccc2
[18:38:07] SMILES Parse Error: Failed parsing SMILES 'N/ACCC1(C(=O)NCNC1=O)c2ccccc2' for input: 'N/ACCC1(C(=O)NCNC1=O)c2ccccc2'
  5%|█▋                            | 543094/10000000 [20:50<6:33:50, 400.20it/s][18:38:13] Explicit valence for atom # 22 N, 4, is greater than permitted
  5%|█▋                            | 545890/10000000 [20:57<6:35:57, 397.94it/s][18:38:20] Explicit valence for atom # 22 N, 4, is greater than permitted
  5%|█▋                            | 548994/10000000 [21:04<5:52:10, 447.27it/s][18:38:27] SMILES Parse Error: unclosed ring for input: 'C\CC(/C=C/C(=O)/C(=C/C(C1)O)/C)(C)C'
  6%|█▋                            | 550880/10000000 [21:08<5:50:58, 448.70it/s][18:38:31] SMILES Parse Error: syntax error while parsing: N/ACCC1(C(=O)NCNC1=O)c2ccccc2
[18:38:31] SMILES Parse Error: Failed parsing SMILES

  6%|█▉                            | 631656/10000000 [24:18<5:44:13, 453.60it/s][18:41:41] SMILES Parse Error: extra open parentheses for input: 'COC(=O)C1=CCCC2C3(C)CC(OC(=O)C3CC(OC3OC(CO)C(O)C(O)C3O'
  6%|█▉                            | 636115/10000000 [24:29<5:58:06, 435.80it/s][18:41:51] Can't kekulize mol.  Unkekulized atoms: 10 11 12 14 16
  6%|█▉                            | 636531/10000000 [24:30<6:10:15, 421.49it/s][18:41:52] SMILES Parse Error: syntax error while parsing: N/ACCC1(C(=O)NCNC1=O)c2ccccc2
[18:41:52] SMILES Parse Error: Failed parsing SMILES 'N/ACCC1(C(=O)NCNC1=O)c2ccccc2' for input: 'N/ACCC1(C(=O)NCNC1=O)c2ccccc2'
[18:41:52] SMILES Parse Error: unclosed ring for input: 'C\CC(/C=C/C(=O)/C(=C/C(C1)O)/C)(C)C'
  6%|█▉                            | 637924/10000000 [24:33<6:04:33, 428.01it/s][18:41:55] Explicit valence for atom # 22 N, 4, is greater than permitted
  6%|█▉                            | 638981/10000000 [24:35<6:09:11, 422.59it/s][18:41:58] SMILES Parse Err

  7%|██                            | 692697/10000000 [26:44<6:24:40, 403.25it/s][18:44:07] Explicit valence for atom # 22 N, 4, is greater than permitted
  7%|██                            | 693061/10000000 [26:45<5:50:17, 442.81it/s][18:44:08] Explicit valence for atom # 19 N, 4, is greater than permitted
  7%|██                            | 693793/10000000 [26:47<6:34:31, 393.14it/s][18:44:09] Can't kekulize mol.  Unkekulized atoms: 10 11 12 14 16
  7%|██                            | 694974/10000000 [26:50<5:51:30, 441.20it/s][18:44:12] SMILES Parse Error: unclosed ring for input: 'C\CC(/C=C/C(=O)/C(=C/C(C1)O)/C)(C)C'
  7%|██                            | 695314/10000000 [26:51<6:24:55, 402.89it/s][18:44:13] Can't kekulize mol.  Unkekulized atoms: 10 11 12 14 16
  7%|██                            | 696608/10000000 [26:54<6:17:42, 410.51it/s][18:44:16] SMILES Parse Error: unclosed ring for input: 'C\CC(/C=C/C(=O)/C(=C/C(C1)O)/C)(C)C'
  7%|██                            | 696970/10000000

  7%|██▏                           | 745066/10000000 [28:45<5:41:18, 451.93it/s][18:46:07] Explicit valence for atom # 22 N, 4, is greater than permitted
  7%|██▏                           | 747708/10000000 [28:51<5:39:13, 454.58it/s][18:46:13] Explicit valence for atom # 22 N, 4, is greater than permitted
  7%|██▏                           | 748423/10000000 [28:52<5:43:50, 448.43it/s][18:46:15] Explicit valence for atom # 19 N, 4, is greater than permitted
  8%|██▎                           | 750197/10000000 [28:56<5:54:06, 435.35it/s][18:46:19] Can't kekulize mol.  Unkekulized atoms: 10 11 12 14 16
  8%|██▎                           | 751170/10000000 [28:59<6:14:16, 411.86it/s][18:46:21] SMILES Parse Error: unclosed ring for input: 'C\CC(/C=C/C(=O)/C(=C/C(C1)O)/C)(C)C'
  8%|██▎                           | 751867/10000000 [29:00<5:37:42, 456.42it/s][18:46:23] SMILES Parse Error: unclosed ring for input: 'C\CC(/C=C/C(=O)/C(=C/C(C1)O)/C)(C)C'
  8%|██▎                           | 758473/

  8%|██▌                           | 847051/10000000 [32:45<6:08:46, 413.66it/s][18:50:08] SMILES Parse Error: syntax error while parsing: N/ACCC1(C(=O)NCNC1=O)c2ccccc2
[18:50:08] SMILES Parse Error: Failed parsing SMILES 'N/ACCC1(C(=O)NCNC1=O)c2ccccc2' for input: 'N/ACCC1(C(=O)NCNC1=O)c2ccccc2'
  8%|██▌                           | 848701/10000000 [32:49<5:45:33, 441.38it/s][18:50:12] SMILES Parse Error: syntax error while parsing: InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,17,18)
[18:50:12] SMILES Parse Error: Failed parsing SMILES 'InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,17,18)' for input: 'InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,17,18)'
  8%|██▌                           | 848890/10000000 [32:50<5:32:19, 458.95it/s][18:50:12] SMILES Parse Error: unclosed ring for input: 'C\CC(/C=C/C(=O)/C(=C/C(C1)O)/C)(C)C'
 

 10%|██▉                          | 1027484/10000000 [39:38<5:53:30, 423.02it/s][18:57:01] Can't kekulize mol.  Unkekulized atoms: 10 11 12 14 16
 10%|██▉                          | 1029366/10000000 [39:42<6:06:45, 407.65it/s][18:57:05] Explicit valence for atom # 35 O, 3, is greater than permitted
 10%|██▉                          | 1030865/10000000 [39:46<5:42:55, 435.92it/s][18:57:08] SMILES Parse Error: unclosed ring for input: 'C\CC[C@@]2([C@@H](O2)CC(/C=C/C1=O)(C)C)C'
 10%|██▉                          | 1033774/10000000 [39:52<5:25:03, 459.72it/s][18:57:15] SMILES Parse Error: syntax error while parsing: InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,17,18)
[18:57:15] SMILES Parse Error: Failed parsing SMILES 'InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,17,18)' for input: 'InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,1

 11%|███▏                         | 1105127/10000000 [42:33<5:28:07, 451.81it/s][18:59:56] Explicit valence for atom # 17 N, 4, is greater than permitted
 11%|███▏                         | 1111029/10000000 [42:47<5:20:03, 462.89it/s][19:00:09] SMILES Parse Error: unclosed ring for input: 'C\CC[C@@]2([C@@H](O2)CC(/C=C/C1=O)(C)C)C'
[19:00:09] Can't kekulize mol.  Unkekulized atoms: 10 11 12 14 16
 11%|███▏                         | 1111313/10000000 [42:47<5:31:14, 447.24it/s][19:00:10] SMILES Parse Error: syntax error while parsing: InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,17,18)
[19:00:10] SMILES Parse Error: Failed parsing SMILES 'InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,17,18)' for input: 'InChI=1S/C16H21NO2/c1-2-3-4-5-6-11-14-16(19)15(18)12-9-7-8-10-13(12)17-14/h7-10,19H,2-6,11H2,1H3,(H,17,18)'
 11%|███▏                         | 1115902/10000000 [42:57<5:31:27, 446.

 12%|███▍                         | 1192348/10000000 [45:50<5:16:31, 463.77it/s][19:03:12] Explicit valence for atom # 35 O, 3, is greater than permitted
 12%|███▍                         | 1192784/10000000 [45:51<5:19:52, 458.89it/s][19:03:13] SMILES Parse Error: unclosed ring for input: 'C\CC[C@@]2([C@@H](O2)CC(/C=C/C1=O)(C)C)C'
 12%|███▍                         | 1192880/10000000 [45:51<5:18:04, 461.49it/s][19:03:14] SMILES Parse Error: extra open parentheses for input: 'COC(=O)C1=CCCC2C3(C)CC(OC(=O)C3CC(OC3OC(CO)C(O)C(O)C3O'
 12%|███▍                         | 1193394/10000000 [45:52<6:04:37, 402.54it/s][19:03:15] Explicit valence for atom # 17 N, 4, is greater than permitted
 12%|███▍                         | 1195787/10000000 [45:58<5:37:29, 434.79it/s][19:03:21] Explicit valence for atom # 17 N, 4, is greater than permitted
 12%|███▍                         | 1196678/10000000 [46:00<5:17:53, 461.56it/s][19:03:23] SMILES Parse Error: extra open parentheses for input: 'COC(=O)C1=C

 13%|███▋                         | 1282900/10000000 [49:12<5:09:14, 469.82it/s][19:06:34] Explicit valence for atom # 19 N, 4, is greater than permitted
 13%|███▋                         | 1283041/10000000 [49:12<5:30:21, 439.78it/s][19:06:35] Explicit valence for atom # 22 N, 4, is greater than permitted
 13%|███▋                         | 1285581/10000000 [49:18<5:00:21, 483.57it/s][19:06:40] Explicit valence for atom # 22 N, 4, is greater than permitted
 13%|███▋                         | 1290614/10000000 [49:29<5:05:46, 474.72it/s][19:06:51] SMILES Parse Error: syntax error while parsing: N/ACCC1(C(=O)NCNC1=O)c2ccccc2
[19:06:51] SMILES Parse Error: Failed parsing SMILES 'N/ACCC1(C(=O)NCNC1=O)c2ccccc2' for input: 'N/ACCC1(C(=O)NCNC1=O)c2ccccc2'
 13%|███▊                         | 1294521/10000000 [49:37<5:20:00, 453.41it/s][19:07:00] Explicit valence for atom # 35 O, 3, is greater than permitted
 13%|███▊                         | 1299528/10000000 [49:48<5:12:38, 463.82it/s][19:07:

 14%|███▉                         | 1356104/10000000 [51:59<5:09:22, 465.66it/s][19:09:22] Can't kekulize mol.  Unkekulized atoms: 10 11 12 14 16
 14%|███▉                         | 1357364/10000000 [52:02<5:22:53, 446.12it/s][19:09:25] SMILES Parse Error: unclosed ring for input: 'C\CC[C@@]2([C@@H](O2)CC(/C=C/C1=O)(C)C)C'
 14%|███▉                         | 1359493/10000000 [52:07<5:51:17, 409.93it/s][19:09:29] Explicit valence for atom # 19 N, 4, is greater than permitted
 14%|███▉                         | 1359669/10000000 [52:07<5:49:47, 411.70it/s][19:09:30] SMILES Parse Error: syntax error while parsing: N/ACCC1(C(=O)NCNC1=O)c2ccccc2
[19:09:30] SMILES Parse Error: Failed parsing SMILES 'N/ACCC1(C(=O)NCNC1=O)c2ccccc2' for input: 'N/ACCC1(C(=O)NCNC1=O)c2ccccc2'
 14%|███▉                         | 1361574/10000000 [52:11<5:09:27, 465.24it/s][19:09:34] Explicit valence for atom # 22 N, 4, is greater than permitted
 14%|███▉                         | 1367136/10000000 [52:23<5:19:05, 4

In [None]:
# Normalize scores
if similarities['weighted_modified_cosine'].max() > 1:
    similarities['weighted_modified_cosine'] = similarities['weighted_modified_cosine']/similarities['weighted_modified_cosine'].max()
similarities['weighted_difference'] = abs(similarities['weighted_modified_cosine'] - similarities['modified_cosine'])

In [None]:
similarities["tanimoto_interval"] = pd.cut(
    similarities["tanimoto"],
    5,
    labels=["0.0–0.2", "0.2–0.4", "0.4–0.6", "0.6–0.8", "0.8–1.0"],
)
similarities_tanimoto = pd.melt(
    similarities,
    id_vars="tanimoto_interval",
    value_vars=["weighted_modified_cosine", "modified_cosine"],
)

In [None]:
# Look at scores for tanimoto intervals
stats = similarities_tanimoto[similarities_tanimoto.variable != 'neutral_loss']
stats.groupby(['tanimoto_interval', 'variable']).agg(['mean', 'std'])


In [None]:
# Look at mean difference between cosine scores and tanimoto scores
similarities['weighted_tanimoto_diff'] = abs(similarities['weighted_modified_cosine'] - similarities['tanimoto'])
similarities['modified_tanimoto_diff'] = abs(similarities['modified_cosine'] - similarities['tanimoto'])
print('mean difference between weighted cosine and tanimoto scores: ', similarities['weighted_tanimoto_diff'].mean())
print('mean difference between modified cosine and tanimoto scores: ', similarities['modified_tanimoto_diff'].mean())
similarities[['tanimoto_interval', 'weighted_tanimoto_diff', 'modified_tanimoto_diff']].groupby(['tanimoto_interval']).agg(['mean'])


In [None]:
# Look at weighted vs. unweighted scores colored by tanimoto scores
similarities.plot.hexbin('weighted_modified_cosine', 'modified_cosine', C='tanimoto', gridsize=30, cmap='inferno')
plt.plot([0,1], [0,1], c='red')
plt.show()

# Show violin plots

In [None]:
mosaic = """
11111.
222223
222223
222223
222223
222223
"""

bins = 100
tick_locators = mticker.FixedLocator(np.arange(0, bins + 1, bins / 4))
tick_labels = np.asarray([f"{a:.2f}" for a in np.arange(0, 1.01, 0.25)])

with sns.plotting_context("paper", font_scale=1.6):
    fig = plt.figure(constrained_layout=True, figsize=(7.2 * 2, 7.2 / 1.618 * 3))
    gs = GridSpec(3, 3, figure=fig)
    
    # Top panel: Compare different similarities.
    axes_left = fig.add_subfigure(gs[0, 0]).subplot_mosaic(mosaic)
    axes_middle = fig.add_subfigure(gs[0, 1]).subplot_mosaic(mosaic)
    axes_right = fig.add_subfigure(gs[0, 2]).subplot_mosaic(mosaic)
    cbar_ax = fig.add_axes([-0.04, 0.75, 0.02, 0.15])
    
    labels = np.asarray([
        ["weighted_modified_cosine", "modified_cosine"],
    ])

    for i, (axes, (xlabel, ylabel)) in enumerate(
        zip([axes_left, axes_middle, axes_right], labels)
    ):
        # Plot heatmaps.
        hist, _, _ = np.histogram2d(
            similarities[xlabel],
            similarities[ylabel],
            bins=bins,
            range=[[0, 1], [0, 1]],
        )
        hist /= len(similarities)
        heatmap = sns.heatmap(
            np.rot90(hist),
            vmin=0.0,
            vmax=0.001,
            cmap="viridis",
            cbar=i == 2,
            cbar_kws={"format": mticker.StrMethodFormatter("{x:.3%}")},
            cbar_ax=cbar_ax if i == 2 else None,
            square=True,
            xticklabels=False,
            yticklabels=False,
            ax=axes["2"],
            norm=LogNorm(vmin=0.00001,vmax=0.001),
        )
        axes["2"].yaxis.set_major_locator(tick_locators)
        axes["2"].set_yticklabels(tick_labels[::-1])
        axes["2"].xaxis.set_major_locator(tick_locators)
        axes["2"].set_xticklabels(tick_labels)
        for _, spine in heatmap.spines.items():
            spine.set_visible(True)
        axes["2"].set_xlabel(xlabel.replace("_", " ").capitalize())
        axes["2"].set_ylabel(ylabel.replace("_", " ").capitalize())

        axes["2"].plot(
            [0, bins], [bins, 0], color="black", linestyle="dashed"
        )

        sns.despine(ax=axes["2"])

        # Plot density plots.
        sns.kdeplot(
            data=similarities,
            x=xlabel,
            clip=(0, 1),
            legend=True,
            color="black",
            fill=True,
            ax=axes["1"],
        )
        axes["1"].set_xlim(0, 1)
        axes["1"].xaxis.set_ticklabels([])
        axes["1"].yaxis.set_major_locator(tick_locators)
        axes["1"].set_yticks([])
        sns.despine(ax=axes["1"], left=True)
        sns.kdeplot(
            data=similarities,
            y=ylabel,
            clip=(0, 1),
            legend=True,
            color="black",
            fill=True,
            ax=axes["3"],
        )
        axes["3"].set_ylim(0, 1)
        axes["3"].yaxis.set_ticklabels([])
        axes["3"].xaxis.set_major_locator(tick_locators)
        axes["3"].set_xticks([])
        sns.despine(ax=axes["3"], bottom=True)
        for ax in [axes[c] for c in "13"]:
            ax.set_xlabel("")
            ax.set_ylabel("")
            
    cbar_ax.set_ylabel("Proportion of pairs")
    cbar_ax.yaxis.set_label_position("left")
    # cbar_ax.spines["outline"].set(visible=True, lw=.8, edgecolor="black")
    
    # Middle panel: Compare similarities vs explained intensity.
    axes_left = fig.add_subfigure(gs[1, 0]).subplot_mosaic(mosaic)
    axes_middle = fig.add_subfigure(gs[1, 1]).subplot_mosaic(mosaic)
    axes_right = fig.add_subfigure(gs[1, 2]).subplot_mosaic(mosaic)
    cbar_ax = fig.add_axes([-0.04, 0.45, 0.02, 0.15])
    
    labels = np.asarray([
        ["weighted_modified_cosine_explained", "weighted_modified_cosine"],
        ["modified_cosine_explained", "modified_cosine"],
    ])

    for i, (axes, (xlabel, ylabel)) in enumerate(
        zip([axes_left, axes_middle, axes_right], labels)
    ):
        # Plot heatmaps.
        hist, _, _ = np.histogram2d(
            similarities[xlabel],
            similarities[ylabel],
            bins=bins,
            range=[[0, 1], [0, 1]],
        )
        hist /= len(similarities)
        heatmap = sns.heatmap(
            np.rot90(hist),
            vmin=0.0,
            vmax=0.001,
            cmap="viridis",
            cbar=i == 2,
            cbar_kws={"format": mticker.StrMethodFormatter("{x:.3%}")},
            cbar_ax=cbar_ax if i == 2 else None,
            square=True,
            xticklabels=False,
            yticklabels=False,
            ax=axes["2"],
            norm=LogNorm(vmin=0.00001,vmax=0.001),
        )
        axes["2"].yaxis.set_major_locator(tick_locators)
        axes["2"].set_yticklabels(tick_labels[::-1])
        axes["2"].xaxis.set_major_locator(tick_locators)
        axes["2"].set_xticklabels(tick_labels)
        axes["2"].xaxis.set_major_formatter(mticker.PercentFormatter())
        for _, spine in heatmap.spines.items():
            spine.set_visible(True)
        axes["2"].set_xlabel("Explained intensity")
        axes["2"].set_ylabel(ylabel.replace("_", " ").capitalize())

        sns.despine(ax=axes["2"])

        # Plot density plots.
        sns.kdeplot(
            data=similarities,
            x=xlabel,
            clip=(0, 1),
            legend=True,
            color="black",
            fill=True,
            ax=axes["1"],
        )
        axes["1"].set_xlim(0, 1)
        axes["1"].xaxis.set_ticklabels([])
        axes["1"].yaxis.set_major_locator(tick_locators)
        axes["1"].set_yticks([])
        sns.despine(ax=axes["1"], left=True)
        sns.kdeplot(
            data=similarities,
            y=ylabel,
            clip=(0, 1),
            legend=True,
            color="black",
            fill=True,
            ax=axes["3"],
        )
        axes["3"].set_ylim(0, 1)
        axes["3"].yaxis.set_ticklabels([])
        axes["3"].xaxis.set_major_locator(tick_locators)
        axes["3"].set_xticks([])
        sns.despine(ax=axes["3"], bottom=True)
        for ax in [axes[c] for c in "13"]:
            ax.set_xlabel("")
            ax.set_ylabel("")
            
    cbar_ax.set_ylabel("Proportion of pairs")
    cbar_ax.yaxis.set_label_position("left")
    # cbar_ax.spines["outline"].set(visible=True, lw=.8, edgecolor="black")
    
    # Bottom panel: Evaluate similarities in terms of the Tanimoto index.
    ax = fig.add_subplot(gs[2, :])
    
    sns.violinplot(
        data=similarities_tanimoto,
        x="tanimoto_interval",
        y="value",
        hue="variable",
        hue_order=["weighted_modified_cosine", "modified_cosine"],
        cut=0,
        scale="width",
        scale_hue=False,
        ax=ax,
    )
    ax.set_xlabel("Tanimoto index")
    ax.set_ylabel("Spectrum similarity")
    for label in ax.legend().get_texts():
        label.set_text(label.get_text().replace("_", " ").capitalize())
    sns.move_legend(
        ax,
        "lower center",
        bbox_to_anchor=(.5, 1),
        ncol=3,
        title=None,
        frameon=False,
    )

    sns.despine(ax=ax)
    
    # Subplot labels.
    for y, label in zip([1, 2/3, 0.35], "abc"):
        fig.text(
            -0.05, y, label, fontdict=dict(fontsize="xx-large", weight="bold")
        )

    # Save figure.
    plt.savefig("gnps_libraries.png", dpi=300, bbox_inches="tight")
    plt.show()
    plt.close()

In [19]:
similarities.to_csv('../../data/network_method/similarities_10m_stein_0.6_2.csv')

In [20]:
similarities.to_csv('s3://enveda-data-user/chloe.engler/cosine_similarity/network_method/similarities_10m_stein_0.6_2.csv')