In [21]:
import os
import sys
import numpy as np

# Input data is expected in a "data" folder one level above the current
# working directory.
path = os.path.join(os.getcwd(), "../data")

# Put the parent of the working directory at the front of the import search
# path so that modules located there can be imported from this notebook.
ROOT = os.path.dirname(os.getcwd())
sys.path.insert(0, ROOT)

In [2]:
# Location of the MoNA GC-MS export (.msp file) inside the data folder defined above.
msp_file = os.path.join(path, "MoNA-export-GC-MS.msp")

In [3]:
from matchms.filtering import normalize_intensities
from matchms.filtering import reduce_to_number_of_peaks
from matchms.filtering import require_minimum_number_of_peaks
from matchms.filtering import select_by_mz
from matchms.importing import load_from_msp

def apply_my_filters(s):
    """Run one spectrum through the notebook's matchms filter chain.

    The filters are applied in this fixed order:

    1. ``normalize_intensities``
    2. ``reduce_to_number_of_peaks`` with ``n_required=10, n_max=100``
    3. ``select_by_mz`` with ``mz_from=0, mz_to=1000``
    4. ``require_minimum_number_of_peaks`` with ``n_required=5``

    Parameters
    ----------
    s
        Spectrum object as yielded by ``load_from_msp``.

    Returns
    -------
    The result of the last filter. The notebook drops ``None`` results right
    after calling this function, so presumably a spectrum that fails a
    requirement comes back as ``None`` -- confirm against the matchms docs.
    """
    normalized = normalize_intensities(s)
    peak_limited = reduce_to_number_of_peaks(normalized, n_required=10, n_max=100)
    mz_windowed = select_by_mz(peak_limited, mz_from=0, mz_to=1000)
    return require_minimum_number_of_peaks(mz_windowed, n_required=5)

# Stream every spectrum from the MSP file through the filter chain and keep
# only those for which the chain returned a spectrum (i.e. not None).
spectrums = [
    s for s in map(apply_my_filters, load_from_msp(msp_file)) if s is not None
]

print("number of spectrums:", len(spectrums))

number of spectrums: 14361


In [4]:
from spec2vec import SpectrumDocument

# Wrap each filtered spectrum in a SpectrumDocument, the input type used for
# model training and similarity scoring further down.
reference_documents = list(map(SpectrumDocument, spectrums))

In [5]:
from spec2vec.model_building import train_new_word2vec_model

# Train a new Word2Vec model on the reference documents.
# The logged output shows 30 epochs in total and a checkpoint written after
# epoch 10 as "references_MoNA_iter_10.model"; presumably `iterations` lists
# the epochs at which a model file is saved -- confirm in the spec2vec docs.
model_file = "references_MoNA.model"
model = train_new_word2vec_model(
    documents=reference_documents,
    filename=model_file,
    iterations=[10, 20, 30],
    workers=4,
    progress_logger=True,
)

The value of workers is set from 4 (default) to 4
  Epoch 1 of 30.Change in loss after epoch 1: 319330.71875
  Epoch 2 of 30.Change in loss after epoch 2: 294614.03125
  Epoch 3 of 30.Change in loss after epoch 3: 290521.1875
  Epoch 4 of 30.Change in loss after epoch 4: 255603.9375
  Epoch 5 of 30.Change in loss after epoch 5: 266488.75
  Epoch 6 of 30.Change in loss after epoch 6: 248227.125
  Epoch 7 of 30.Change in loss after epoch 7: 246471.75
  Epoch 8 of 30.Change in loss after epoch 8: 250849.0
  Epoch 9 of 30.Change in loss after epoch 9: 238872.25
  Epoch 10 of 30.Change in loss after epoch 10: 218278.25
Saving model with name: references_MoNA_iter_10.model
  Epoch 11 of 30.Change in loss after epoch 11: 219644.0
  Epoch 12 of 30.Change in loss after epoch 12: 230599.0
  Epoch 13 of 30.Change in loss after epoch 13: 213788.75
  Epoch 14 of 30.Change in loss after epoch 14: 242115.75
  Epoch 15 of 30.Change in loss after epoch 15: 224038.25
  Epoch 16 of 30.Change in loss afte

In [6]:
import gensim

# Load the model from the file name used during training. This rebinds
# `model`, replacing the object returned by the training call.
model = gensim.models.Word2Vec.load(model_file)

In [7]:
def get_time():
    """Return the current local time as an ``HH:MM:SS`` string.

    ``datetime`` is imported inside the function so that this cell is
    self-contained: in the notebook the name is otherwise only imported by a
    later cell, which made calling this function before running that cell
    fail with ``NameError``.

    Returns
    -------
    str
        Current wall-clock time formatted with ``"%H:%M:%S"``.
    """
    from datetime import datetime

    return datetime.now().strftime("%H:%M:%S")

In [8]:
import time
from datetime import datetime
from spec2vec import Spec2Vec
from matchms import calculate_scores

# Similarity measure built on the loaded Word2Vec model. The same object is
# used below both directly (`.matrix`) and through matchms' `calculate_scores`.
spec2vec = Spec2Vec(
    model=model,
    intensity_weighting_power=0.5,
    allowed_missing_percentage=5.0,
)

In [10]:
# Time the all-vs-all similarity computation: the wall-clock time is printed
# immediately before and after the call.
print("Start", get_time())
# The reference documents are compared against themselves, so the result is a
# square matrix.
similarity_matrix = spec2vec.matrix(reference_documents, reference_documents)
print("Finish", get_time())

Start 21:08:05
Finish 21:08:24


In [18]:
# Check the dimensions of the computed similarity matrix.
similarity_matrix.shape

(14361, 14361)

In [19]:
from matchms import calculate_scores

# Same all-vs-all comparison as the direct `spec2vec.matrix` call, this time
# routed through matchms' `calculate_scores`; timed the same way.
print("Start", get_time())
scores = calculate_scores(reference_documents, reference_documents, spec2vec)
print("Finish scores", get_time())

# `scores.scores` holds the score matrix itself.
print("Shape of score matrix:", scores.scores.shape)

Start 21:18:20
Finish scores 21:18:38
Shape of score matrix: (14361, 14361)


### Example of how to get the 10 highest scores for reference[0]

In [35]:
# Indices of the 10 largest entries in column 0 of the similarity matrix.
# np.argpartition only guarantees that the last 10 positions hold the 10
# largest values; they are NOT sorted among themselves.
index_array = np.argpartition(similarity_matrix[:, 0], -10)[-10:]

In [36]:
# Show the selected indices.
index_array

array([13711, 12691, 13710, 12690, 11556,  5966, 13709,  5435, 10815,
           0], dtype=int64)

In [38]:
# Look up the scores at the selected indices. Index 0 is the comparison of
# document 0 with itself, which is why a score of 1.0 shows up in the output.
similarity_matrix[:, 0][index_array]

array([0.36647712, 0.36993555, 0.37748032, 0.37931818, 0.37478992,
       0.38239878, 0.39283566, 0.40967875, 0.41016526, 1.        ])

## Note: looping through `scores`, as shown in the matchms code examples, is very slow and therefore not very helpful here.
- Better: use numpy functions such as ``np.argpartition`` (see above).
- Or: use ``scores.scores_by_query`` or ``scores.scores_by_reference`` for the analysis.

In [43]:
# Demonstration of iterating over `scores` directly: only the first 11
# (reference, query, score) triples are inspected, and pairs where reference
# and query are the same object are left out.
print("Start", get_time())
filtered = []
i = 0
for reference, query, score in scores:
    if i > 10:
        break
    i += 1
    if reference != query:
        filtered.append((reference, query, score))
print("Finish filtering", get_time())

# Order the kept triples by score, highest first.
sorted_by_score = list(filtered)
sorted_by_score.sort(key=lambda elem: elem[2], reverse=True)

Start 21:35:32
Finish filtering 21:35:46


In [44]:
# Show the kept (reference, query, score) triples, highest score first.
sorted_by_score

[(<spec2vec.SpectrumDocument.SpectrumDocument at 0x2202b122908>,
  <spec2vec.SpectrumDocument.SpectrumDocument at 0x22037b19048>,
  0.2404680228525137),
 (<spec2vec.SpectrumDocument.SpectrumDocument at 0x2202b122908>,
  <spec2vec.SpectrumDocument.SpectrumDocument at 0x22037b19508>,
  0.1565977629618161),
 (<spec2vec.SpectrumDocument.SpectrumDocument at 0x2202b122908>,
  <spec2vec.SpectrumDocument.SpectrumDocument at 0x22037b1b2c8>,
  0.08878629694049822),
 (<spec2vec.SpectrumDocument.SpectrumDocument at 0x2202b122908>,
  <spec2vec.SpectrumDocument.SpectrumDocument at 0x22037b1b048>,
  0.05156770381845285),
 (<spec2vec.SpectrumDocument.SpectrumDocument at 0x2202b122908>,
  <spec2vec.SpectrumDocument.SpectrumDocument at 0x22037b1d6c8>,
  0.03682022147514922),
 (<spec2vec.SpectrumDocument.SpectrumDocument at 0x2202b122908>,
  <spec2vec.SpectrumDocument.SpectrumDocument at 0x22037b1f188>,
  0.009745449479609163),
 (<spec2vec.SpectrumDocument.SpectrumDocument at 0x2202b122908>,
  <spec2vec.