# Obtaining the data from the Mona File

## Getting the path for Mona file

The contents of the MoNA file are read with the `load_from_msp` function; this cell only builds the path to that file.

In [1]:
import os

# `path` is the "data" directory located in the parent of the current working
# directory, so the result depends on where the notebook kernel was started.
path = os.path.join(os.path.dirname(os.getcwd()), "data")
# Full path of the MoNA GC-MS export (MSP format) inside that data directory.
spectrums_file = os.path.join(path, "MoNA-export-GC-MS.msp")

## Applying filters to the spectra

Applied filters are:
* normalize_intensities(s)
* reduce_to_number_of_peaks(s, **n_required=10**, **ratio_desired=0.5**)
* select_by_mz(s, **mz_from=0**, **mz_to=1000**)
* require_minimum_number_of_peaks(s, **n_required=10**)

In [2]:
from matchms.filtering import normalize_intensities
from matchms.filtering import reduce_to_number_of_peaks
from matchms.filtering import select_by_mz
from matchms.filtering import require_minimum_number_of_peaks
from matchms.importing import load_from_msp

def apply_my_filters(s):
    """Run the notebook's four-step filter pipeline on a single spectrum.

    The steps are applied in this fixed order, each receiving the previous
    step's output:

    1. ``normalize_intensities``
    2. ``reduce_to_number_of_peaks`` with ``n_required=10``, ``ratio_desired=0.5``
    3. ``select_by_mz`` with ``mz_from=0``, ``mz_to=1000``
    4. ``require_minimum_number_of_peaks`` with ``n_required=10``

    Args:
        s: The spectrum to process, as yielded by ``load_from_msp``.

    Returns:
        Whatever the last filter returns. Callers in this notebook check the
        result against ``None`` and drop such entries.
    """
    normalized = normalize_intensities(s)
    reduced = reduce_to_number_of_peaks(normalized, n_required=10, ratio_desired=0.5)
    within_mz_window = select_by_mz(reduced, mz_from=0, mz_to=1000)
    return require_minimum_number_of_peaks(within_mz_window, n_required=10)

# Filter every spectrum read from the MoNA export in a single pass; entries
# for which apply_my_filters returned None are left out of the list.
spectrums = [
    filtered
    for filtered in map(apply_my_filters, load_from_msp(spectrums_file))
    if filtered is not None
]

## Converting Spectrum objects to Spectrum Documents

In [3]:
from spec2vec import SpectrumDocument

# Wrap each filtered spectrum in a SpectrumDocument; this full list is the
# corpus handed to the word2vec training call later in the notebook.
reference_documents = [SpectrumDocument(spectrum) for spectrum in spectrums]

## Training the word2vec model

Parameters used for training the model:\
**Iterations=10, 20, 30**\
**Workers=2**

Once the model is trained, it is saved as "spec2vec_mona_gc_ms.model" (with intermediate "_iter_10" and "_iter_20" checkpoints) for further use in the score calculation

In [4]:
from spec2vec.model_building import train_new_word2vec_model

# Train a word2vec model on all filtered MoNA documents. With a list of
# iteration counts the trainer also writes intermediate models: the recorded
# output below shows "_iter_10" and "_iter_20" files in addition to the final
# model stored under `model_file`.
model_file = os.path.join(path, "spec2vec_mona_gc_ms.model")
model = train_new_word2vec_model(
    documents=reference_documents,
    filename=model_file,
    iterations=[10, 20, 30],
    workers=2,
    progress_logger=False,
)

The value of workers is set from 4 (default) to 2
Saving model with name: /Users/efra/dev/spec2vec_gcms_data_analysis/data/spec2vec_mona_gc_ms_iter_10.model
Saving model with name: /Users/efra/dev/spec2vec_gcms_data_analysis/data/spec2vec_mona_gc_ms_iter_20.model
Saving model with name: /Users/efra/dev/spec2vec_gcms_data_analysis/data/spec2vec_mona_gc_ms.model


## Setting the reference and query data

The first 10 spectra in the MoNA file will be used to try to find them using the Spec2VecParallel method

In [5]:
# Path to a second MSP file in the same data directory. NOTE(review): judging
# by its name it holds the first 10 spectra of the full export — verify that
# the file exists and matches before running the cells below.
spectrums_file_10 = os.path.join(path, "MoNA-export-GC-MS-first10.msp")

### Setting the reference Spectrums

In [6]:
# Load the small MSP file, run each spectrum through apply_my_filters and keep
# only the results that are not None.
reference_spectrums = [
    filtered
    for filtered in map(apply_my_filters, load_from_msp(spectrums_file_10))
    if filtered is not None
]

### Converting the reference Spectrums to Spectrums Documents

In [7]:
# Rebinds `reference_documents`: from here on it holds only the documents
# built from `reference_spectrums`, not the full training corpus.
reference_documents = [SpectrumDocument(reference) for reference in reference_spectrums]

### Setting the query Spectrums

In [8]:
# The queries are read from the same file as the references and pass through
# the same filters; results that are None are dropped.
query_spectrums = [
    filtered
    for filtered in map(apply_my_filters, load_from_msp(spectrums_file_10))
    if filtered is not None
]

### Converting the query Spectrums to Spectrums Documents

In [9]:
# One SpectrumDocument per filtered query spectrum, in the same order.
query_documents = [SpectrumDocument(query) for query in query_spectrums]

### Loading the model file to word2vec for the similarity function

Parameters for the similarity function:\
**intensity_weighting_power=0.5**\
**allowed_missing_percentage=5.0**

In [10]:
import gensim
from matchms import calculate_scores_parallel
from spec2vec import Spec2VecParallel

# Load the "_iter_20" model file from the data directory instead of the final
# model; note that this rebinds both `model_file` and `model`.
model_file = os.path.join(path, "spec2vec_mona_gc_ms_iter_20.model")
model = gensim.models.Word2Vec.load(model_file)

# Similarity function backed by the loaded word2vec model.
spec2vec_similarity = Spec2VecParallel(
    model=model,
    intensity_weighting_power=0.5,
    allowed_missing_percentage=5.0,
)

### Calculating the similarity of the reference and query documents using Spec2vec parallel method

For the calculation the same Spectrums are being used as reference and query

In [11]:
# Call the Spec2VecParallel instance with the reference documents first and the
# query documents second. NOTE(review): the result is presumably a
# reference-by-query score matrix — confirm against the spec2vec version in use.
similarity_matrix = spec2vec_similarity(reference_documents, query_documents)

### Store similarity matrix

In [12]:
import numpy as np

# Persist the scores next to the input data; the "20iter" in the file name
# matches the "_iter_20" model loaded above.
filename = os.path.join(path, "similarities_spec2vec_mona_gc_ms_20iter.npy")
np.save(file=filename, arr=similarity_matrix)