# Obtaining the data from the Mona File

## Getting the path for Mona file

The spectra stored in the MoNA file are later read with the `load_from_msp` function; this first cell only builds the path to that file.

In [1]:
import os

# The "data" directory is a sibling of the current working directory
# (it lives inside the parent of the cwd).
path = os.path.join(
    os.path.dirname(os.getcwd()),
    "data",
)

# Full location of the MoNA GC-MS export inside that directory.
msp_file = os.path.join(path, "MoNA-export-GC-MS.msp")

## Applying filters to the spectra

Applied filters are:
* normalize_intensities(s)
* select_by_mz(s, mz_from=0, mz_to=1000)
* select_by_relative_intensity(s, intensity_from=0.05, intensity_to=1.0)

In [2]:
from matchms.importing import load_from_msp

# Materialise everything load_from_msp yields so the spectra can be counted
# and sliced later on.
spectrums = list(load_from_msp(msp_file))
print("Number of Spectra:", len(spectrums))

Number of Spectra: 14847


In [15]:
import numpy as np

# Restrict the analysis to the first 300 spectra.
spectrums_sample = spectrums[0:300]
all_differences = []

for spectrum in spectrums_sample:
    mzs, intensities = spectrum.peaks
    mz_values = np.array(mzs, dtype=float)

    # delta[i, k] = mz[i] - mz[k], built by broadcasting a column against a row
    # instead of a Python double loop.
    delta = mz_values[:, np.newaxis] - mz_values[np.newaxis, :]

    # Keep the non-positive orientation of every pair, i.e. -|mz[i] - mz[k]|:
    # take delta where it is negative, otherwise the mirrored entry
    # mz[k] - mz[i] (which is delta transposed).
    differences = np.where(delta < 0, delta, delta.T)

    all_differences.append(differences)

# There is one square matrix per spectrum and the matrices may differ in size.
# NumPy >= 1.24 raises ValueError when asked to build an array from such a
# ragged list unless dtype=object is given explicitly; older versions produced
# the same object array implicitly (with a deprecation warning).
all_differences = np.array(all_differences, dtype=object)



In [24]:
# For every spectrum: count the strictly negative entries in each row of its
# difference matrix, then keep the mean of those per-row counts.
all_shifts = []
for spectrum in all_differences:
    shifts = [
        sum(1 for difference in matrix_differences if difference < 0)
        for matrix_differences in spectrum
    ]
    all_shifts.append(np.mean(shifts))
all_shifts = np.array(all_shifts)

In [30]:
# Show the mean shift count per spectrum, followed by the shape of the first
# element of `peaks` (the m/z array) for every spectrum in the sample.
print(all_shifts)
for sample_spectrum in spectrums_sample:
    mz_array = sample_spectrum.peaks[0]
    print(mz_array.shape)

[ 74.  63.  35.  43.  32.  41.  36.  31.  64.  65.  35.  37.  43.  25.
  32.  30.  31.  33.  24.  19.  18.  25.  22.  24.  24.  14.  16. 117.
  21.  29.  24.  28.  18.  30.  41.   9.  11.  10.  21.  17.  18.  19.
  37.  22.  21.  31.  15.  25.  31.  21.  20.  25.  24.  20.  25.  20.
  26.  20.  33.  21.  27.  27.  29.  27.  20.  20.  16.  15.  19.  15.
  19.  16.  19.  22.  17.  21.  19.  23.  23.  25.  19.  17.  24.  19.
  19.  22.  21.  53.  44.  46.  37.  77.  43.  42.  24.  49.  26.  18.
   6.  37.  34.  18.  41.  54.  53.  41.  46.  77.  69.  86.  67.  53.
 111.  34.  40.  53.  36.  34.  32.  59.  55.  33.  46.  20.  25.  49.
  44.  52.  45.  26.  38.  66.  35.  34.  36.  35. 160. 140. 153.  44.
 187.  65.  59.  89.  45.  79. 159.  48.  24.  24.  22.  20.  24.  19.
  24.  21.  29.  23.  19.  27.  24.  28.  24.  27.  30.  29.  23.  25.
  29.  27.  32.  25.  23.  22.  27.  25.  29.  16.  22.  34.  26.  21.
  22.  22.  28.  30.  36.  24.  34.  27.  31.  36.  39.  38.  34.  37.
  29. 

In [None]:
# Row-wise mean of every difference matrix: one 1-D float array per spectrum.
all_means = []
for spectrum in all_differences:
    means = np.array([np.mean(difference) for difference in spectrum], dtype=float)
    all_means.append(means)

# The per-spectrum arrays may have different lengths. NumPy >= 1.24 refuses to
# build an array from a ragged list unless dtype=object is requested, so ask
# for the object array explicitly.
all_means = np.array(all_means, dtype=object)

In [None]:
# Per spectrum, count the row means that are not whole numbers, i.e. whose
# integer truncation differs from the value itself.
all_shift_counts = []
for row_means in all_means:
    fractional_total = sum(
        1 for value in row_means if int(value) - value != 0
    )
    all_shift_counts.append(fractional_total)

all_shift_counts = np.array(all_shift_counts)

## Adding random precursor mz from 150 to 650

In [None]:
def add_random_loss(spectrum, mz_low=150, mz_high=650):
    """Store a random integer ``precursor_mz`` on *spectrum* and return it.

    The value is drawn with ``np.random.randint(mz_low, mz_high)``, so
    ``mz_low`` is inclusive and ``mz_high`` is exclusive (with the defaults
    the largest possible value is 649).

    Parameters
    ----------
    spectrum
        Object exposing ``set(key, value)``; it receives the new metadata
        entry. No copy is made here: the same object is returned.
    mz_low : int, optional
        Smallest value that can be drawn. Defaults to 150.
    mz_high : int, optional
        Exclusive upper bound of the draw. Defaults to 650.

    Returns
    -------
    The *spectrum* object that was passed in.
    """
    # np.random.randint returns a NumPy integer scalar, which is not an
    # instance of the built-in int. Store a plain int so that any downstream
    # isinstance(value, (int, float)) check on the metadata succeeds.
    random_precursor_mz = int(np.random.randint(mz_low, mz_high))
    spectrum.set("precursor_mz", random_precursor_mz)
    return spectrum

# Give every loaded spectrum a random precursor m/z (add_random_loss returns
# the spectrum it was handed).
spectrums_w_losses = list(map(add_random_loss, spectrums))

In [None]:
from matchms.filtering import add_losses
from matchms.filtering import normalize_intensities
from matchms.filtering import select_by_mz
from matchms.filtering import select_by_relative_intensity


def apply_my_filters(s):
    """Run the notebook's filter pipeline on one spectrum and return the result.

    Filters are applied in this order:

    1. ``normalize_intensities``
    2. ``select_by_mz`` with ``mz_from=0`` and ``mz_to=1000``
    3. ``select_by_relative_intensity`` with ``intensity_from=0.05`` and
       ``intensity_to=1.0``
    4. ``add_losses`` with ``loss_mz_from=10.0`` and ``loss_mz_to=200.0``

    The value returned by the last filter is passed back unchanged. Callers
    in this notebook discard ``None`` results afterwards.
    """
    s = normalize_intensities(s)
    s = select_by_mz(s, mz_from=0, mz_to=1000)
    s = select_by_relative_intensity(s, intensity_from=0.05, intensity_to=1.0)
    # add_losses was previously used without being imported, which raised a
    # NameError on the first call.
    s = add_losses(s, loss_mz_from=10.0, loss_mz_to=200.0)
    return s

# Push every spectrum that received a random precursor m/z through the filters.
spectrums = list(map(apply_my_filters, spectrums_w_losses))

# Keep only spectra that are not None and still contain at least one peak.
spectrums = [
    s for s in spectrums
    if s is not None and len(s.peaks.intensities) > 0
]

print("Number of Spectra:", len(spectrums))

## Converting Spectrum objects to Spectrum Documents

In [None]:
from spec2vec import SpectrumDocument

# Wrap each filtered spectrum in a SpectrumDocument.
reference_documents = list(map(SpectrumDocument, spectrums))

print(len(reference_documents))

## Training the word2vec model

Data used for training the model:\
**Iterations=10, 20, 30**\
**Workers=2**

Once the model is trained the file "references.model" is stored for further use in the score calculation

In [None]:
from spec2vec.model_building import train_new_word2vec_model
    
# Target file name for the trained model, inside the data directory.
model_file = os.path.join(path, "references.model")

# Train word2vec on the reference documents: vector size 500, iteration
# counts 10/20/30, two workers, no progress logging.
model = train_new_word2vec_model(
    documents=reference_documents,
    filename=model_file,
    size=500,
    iterations=[10, 20, 30],
    workers=2,
    progress_logger=False,
)

## Setting the reference and query data

All spectra in the MoNA file are used as queries, and the Spec2VecParallel method is used to try to find each of them among the references.

In [None]:
# Path of the MoNA GC-MS export inside the data directory; it is loaded again
# below for both the reference and the query spectra.
spectrums_file = os.path.join(path, "MoNA-export-GC-MS.msp")

### Setting the reference Spectrums

In [None]:
# Load the MoNA export again and run every spectrum through the filters.
# NOTE(review): these spectra are not passed through add_random_loss, so they
# only carry a precursor_mz if the file itself provides one -- confirm that
# add_losses behaves as intended here.
reference_spectrums = list(map(apply_my_filters, load_from_msp(spectrums_file)))

# Keep only spectra that are not None and still contain at least one peak.
reference_spectrums = [
    s for s in reference_spectrums
    if s is not None and len(s.peaks.intensities) > 0
]

### Converting the reference Spectrums to Spectrums Documents

In [None]:
# One SpectrumDocument per filtered reference spectrum.
reference_documents = list(map(SpectrumDocument, reference_spectrums))

### Setting the query Spectrums

In [None]:
# The query set is built from the same file and the same filters as the
# reference set.
query_spectrums = list(map(apply_my_filters, load_from_msp(spectrums_file)))

# Keep only spectra that are not None and still contain at least one peak.
query_spectrums = [
    s for s in query_spectrums
    if s is not None and len(s.peaks.intensities) > 0
]

### Converting the query Spectrums to Spectrums Documents

In [None]:
# One SpectrumDocument per filtered query spectrum.
query_documents = list(map(SpectrumDocument, query_spectrums))

In [None]:
# Report how many reference and query documents were built.
print("Ref docs:", len(reference_documents), "Query docs:", len(query_documents))

### Loading the model file to word2vec for the similarity function

Parameters for the similarity function:\
**intensity_weighting_power=0.5**\
**allowed_missing_percentage=5.0**

In [None]:
import gensim
from matchms import calculate_scores_parallel
from spec2vec import Spec2VecParallel

# Load the word2vec model stored as "references_iter_10.model" in the data
# directory; this rebinds both `model_file` and `model`.
model_file = os.path.join(path, "references_iter_10.model")
model = gensim.models.Word2Vec.load(model_file)

# Similarity function built on the loaded model.
spec2vec = Spec2VecParallel(
    model=model,
    intensity_weighting_power=0.5,
    allowed_missing_percentage=5.0,
)

### Calculating the similarity of the reference and query documents using Spec2vec parallel method

For the calculation the same Spectrums are being used as reference and query

In [None]:
# Call the Spec2VecParallel instance on the reference and query documents and
# keep whatever it returns as the similarity scores.
scores_spec2vec = spec2vec(reference_documents, query_documents)

In [None]:
import numpy as np

# Write the similarity scores to a .npy file in the data directory.
filename = os.path.join(
    path, "similarities_filter05_spec2vec_3iter_fake_losses.npy"
)
np.save(filename, scores_spec2vec)