In [None]:
import os
import pandas as pd

from matchms.filtering import add_losses
from matchms.filtering import add_parent_mass
from matchms.filtering import default_filters
from matchms.filtering import normalize_intensities
from matchms.filtering import reduce_to_number_of_peaks
from matchms.filtering import require_minimum_number_of_peaks
from matchms.filtering import select_by_mz
from matchms.importing import load_from_mgf
from matchms.importing import load_from_mzml
from spec2vec import SpectrumDocument
from spec2vec.model_building import train_new_word2vec_model

# Import the raw GC-MS data

Note: The GC-MS Agilent .M data was converted to mgf format using OpenChrom

In [None]:
# With GC-MS data, matchms logs
# "WARNING:matchms:add_precursor_mz:No precursor_mz found in metadata."
# Setting metadata_harmonization=False seems to suppress the warning.

# The spectrums list was observed to contain double the number of scans in the raw file,
# as two identical sets. I assume that it is trying to load the MS-MS data. Spot check:
# list(spectrums[-1].peaks)[10]==list(spectrums[1766].peaks)[10] ::: array([ True,  True])
# Only the first half is kept, but only after confirming that the second half really is a
# peak-for-peak copy of the first, so a file without duplication is not silently halved.

spectrums = list(load_from_mgf("mgf/file_1.mgf", metadata_harmonization=False))

half = len(spectrums) // 2
halves_identical = (
    half > 0
    and len(spectrums) == 2 * half
    and all(
        list(first.peaks.mz) == list(second.peaks.mz)
        and list(first.peaks.intensities) == list(second.peaks.intensities)
        for first, second in zip(spectrums[:half], spectrums[half:])
    )
)
if halves_identical:
    spectrums = spectrums[:half]
elif spectrums:
    print(f"WARNING: the two halves of the spectrum list are not identical; "
          f"keeping all {len(spectrums)} spectra.")

# Only do minor processing because I am using my own pre-selected list of features
import matchms.filtering as msfilters
def spectrum_processing(s):
    """Apply the minimal peak clean-up used in this notebook.

    The spectrum is first passed through ``normalize_intensities``; the result
    is then handed to ``reduce_to_number_of_peaks`` with ``n_required=20`` and
    ``n_max=200``. Whatever that last filter returns is returned unchanged.
    """
    normalized = normalize_intensities(s)
    return reduce_to_number_of_peaks(normalized, n_required=20, n_max=200)
# matchms filters signal "spectrum rejected" by returning None (reduce_to_number_of_peaks
# does so for spectra with fewer than n_required peaks). Drop those here so later cells
# that read s.metadata never receive a None entry.
spectrums = [processed for processed in map(spectrum_processing, spectrums) if processed is not None]

# Import the list of pre-selected feature RTs

In [None]:
# The table contains the metabolite names and retention times
spectra = pd.read_csv("spectra_info.txt", sep="\t")

# Map retention time -> known metabolite name.
# Missing values are checked only in the two columns used here, so a gap in an
# unrelated column cannot drop a row that does carry a known name.
# Overlapping metabolites (same RT) will be overwritten by the later row.
known_rows = spectra[["RTINSECONDS", "known"]].dropna()
known_dict = dict(zip(known_rows["RTINSECONDS"].tolist(), known_rows["known"].tolist()))

# One entry per table row, in table order
rt_list = spectra["RTINSECONDS"].tolist()
short_names = spectra["short"].tolist()

# Keep the pre-selected spectra and add metabolite names

In [None]:
selected_spectrums = []
unmatched_rts = []
for rt in rt_list:
    # There might be some features with overlapping RTs: only the first spectrum whose
    # integer-truncated retention time equals the table RT is used for each row.
    match = next((s for s in spectrums if rt == int(s.metadata['retention_time'])), None)
    if match is None:
        # No scan at this RT; remember it instead of skipping silently
        unmatched_rts.append(rt)
        continue
    if rt in known_dict:
        match.set("compound_name", known_dict[rt])
    selected_spectrums.append(match)

if unmatched_rts:
    # Later cells label spectra by position using short_names (one entry per table row),
    # so any skipped row shifts the labels of every spectrum after it.
    print(f"WARNING: {len(unmatched_rts)} retention time(s) had no matching spectrum "
          f"and were skipped: {unmatched_rts}")
len(selected_spectrums)

# Make a plot of a selected metabolite

In [None]:
# selected_spectrums[384].plot(grid=False,annotate_ions=True)
# plt.savefig("spectrum-plot-example_1.png", dpi=300)  # If you want to save a plot

# Calculate the all-vs-all cosine similarity scores

In [None]:
import numpy as np
from matchms import Spectrum, calculate_scores
from matchms.similarity import CosineGreedy
from matchms.networking import SimilarityNetwork

# Build the similarity measure (greedy cosine, tolerance 0.2)
cosinegreedy = CosineGreedy(tolerance=0.2)
# modified_cosine = ModifiedCosine(tolerance=0.2)

# Compare the selected spectra against themselves; is_symmetric=True tells matchms
# that references and queries are the same collection.
scores = calculate_scores(
    references=selected_spectrums,
    queries=selected_spectrums,
    similarity_function=cosinegreedy,
    is_symmetric=True,
)

# Convert the similarity scores to a network file for Cytoscape

In [None]:
import networkx as nx

# Pull the first field of every score entry into a plain 2-D table.
# NOTE(review): assumes each entry of scores.scores is indexable and that field 0 is the
# cosine score - confirm against the installed matchms version.
sim_matrix = pd.DataFrame([[entry[0] for entry in row] for row in scores.scores])

# A zero in the adjacency matrix means "no edge": keep only similarities >= 0.9
# and discard exact-1 scores.
sim_matrix[sim_matrix < 0.9] = 0
sim_matrix[sim_matrix == 1] = 0
# Self-comparisons sit on the diagonal. Clear them explicitly, because a self-score
# computed in floating point is not guaranteed to be exactly 1 and would otherwise
# survive the line above as a self-loop.
for i in range(min(sim_matrix.shape)):
    sim_matrix.iat[i, i] = 0
G = nx.from_pandas_adjacency(sim_matrix)

# Node i is labelled with short_names[i]; that is only correct when the label list and
# the score matrix describe the same spectra in the same order.
if len(short_names) != len(sim_matrix):
    print(f"WARNING: {len(short_names)} short names for {len(sim_matrix)} spectra; "
          f"node labels may be misaligned.")
mapping = dict(enumerate(short_names))
G = nx.relabel_nodes(G, mapping)

print(len(G.edges))
nx.write_edgelist(G, "similarity.cosine.networkx", comments='#', data=False, delimiter='\t', encoding='utf-8')

# Generate word2vec model

In [None]:
model_file = "references.model"

# Wrap every selected spectrum in a SpectrumDocument (n_decimals=2) for training
reference_documents = [
    SpectrumDocument(spectrum, n_decimals=2) for spectrum in selected_spectrums
]

# Train a new word2vec model on those documents; the model is written to model_file.
model = train_new_word2vec_model(
    reference_documents,
    iterations=[10, 20, 30],
    filename=model_file,
    workers=8,
    progress_logger=True,
    learning_rate_initial=0.25,
    learning_rate_decay=0.025,
)

# Import spec2vec model to get the all-vs-all similarity scores

In [None]:
import gensim
from matchms import calculate_scores
from spec2vec import Spec2Vec

# Import pre-trained word2vec model (see code example above)
model_file = "references.model"
model = gensim.models.Word2Vec.load(model_file)

# Define similarity_function
spec2vec_similarity = Spec2Vec(model=model, intensity_weighting_power=0.5,
                               allowed_missing_percentage=5.0)

# Calculate scores on all combinations of the selected spectra.
# References and queries are the same list, so is_symmetric=True is passed (as in the
# cosine scoring cell) to let the similarity function skip redundant work.
scores = calculate_scores(selected_spectrums, selected_spectrums, spec2vec_similarity,
                          is_symmetric=True)

import networkx as nx

# Spec2Vec scores are used directly as a dense similarity table
sim_matrix = pd.DataFrame(scores.scores)

# A zero in the adjacency matrix means "no edge": keep only similarities >= 0.85
# and discard exact-1 scores.
sim_matrix[sim_matrix < 0.85] = 0
sim_matrix[sim_matrix == 1] = 0
# Self-comparisons sit on the diagonal. Clear them explicitly, because a self-similarity
# computed in floating point is not guaranteed to be exactly 1 and would otherwise
# survive the line above as a self-loop.
for i in range(min(sim_matrix.shape)):
    sim_matrix.iat[i, i] = 0
G = nx.from_pandas_adjacency(sim_matrix)

# Node i is labelled with short_names[i]; that is only correct when the label list and
# the score matrix describe the same spectra in the same order.
if len(short_names) != len(sim_matrix):
    print(f"WARNING: {len(short_names)} short names for {len(sim_matrix)} spectra; "
          f"node labels may be misaligned.")
mapping = dict(enumerate(short_names))
G = nx.relabel_nodes(G, mapping)

print(len(G.edges))
nx.write_edgelist(G, "similarity.spec2vec.networkx", comments='#', data=False, delimiter='\t', encoding='utf-8')