# Obtaining the data from the MoNA file

## Getting the path to the MoNA file

The spectra stored in the MoNA file are read with the `load_from_msp` function.

In [71]:
import os

# The data folder is looked up in the parent of the current working directory.
project_root = os.path.dirname(os.getcwd())
path = os.path.join(project_root, "data")
msp_file = os.path.join(path, "MoNA-export-GC-MS.msp")

## Applying filters to the spectra

Applied filters are:
* normalize_intensities(s)
* select_by_mz(s, mz_from=0, mz_to=1000)
* select_by_relative_intensity(s, intensity_from=0.05, intensity_to=1.0)

In [72]:
from matchms.importing import load_from_msp

# Materialise everything load_from_msp yields from the MSP file as a list.
spectrums = list(load_from_msp(msp_file))
print("Number of Spectra:", len(spectrums))

Number of Spectra: 14847


In [73]:
from matchms.filtering import normalize_intensities
from matchms.filtering import select_by_mz
from matchms.filtering import select_by_relative_intensity

def apply_my_filters(s):
    """Run one spectrum through the notebook's filter chain.

    The filters are applied in this order:

    1. ``normalize_intensities``
    2. ``select_by_mz`` with ``mz_from=0`` and ``mz_to=1000``
    3. ``select_by_relative_intensity`` with ``intensity_from=0.01`` and
       ``intensity_to=1.0``

    Args:
        s: The spectrum to filter (as yielded by ``load_from_msp``).

    Returns:
        Whatever the last filter returns. Callers in this notebook discard
        ``None`` results afterwards, so presumably a filter can return
        ``None`` -- verify against the matchms documentation.
    """
    # NOTE(review): the filter list in the notebook text states
    # intensity_from=0.05 and the saved scores file is named "...filter05...",
    # but the threshold used here is 0.01 -- confirm which value is intended.
    normalized = normalize_intensities(s)
    mz_limited = select_by_mz(normalized, mz_from=0, mz_to=1000)
    return select_by_relative_intensity(mz_limited, intensity_from=0.01, intensity_to=1.0)

# Apply the filter chain, then keep only the spectra that were not turned
# into None and that still contain at least one peak.
spectrums = [
    filtered
    for filtered in map(apply_my_filters, spectrums)
    if filtered is not None and len(filtered.peaks.intensities) > 0
]

print("Number of Spectra:", len(spectrums))

Number of Spectra: 14844


## Converting Spectrum objects to Spectrum Documents

In [74]:
from spec2vec import SpectrumDocument

# Wrap every filtered spectrum in a SpectrumDocument.
reference_documents = list(map(SpectrumDocument, spectrums))

print(len(reference_documents))

14844


## Training the word2vec model

Data used for training the model:\
**Iterations=10, 20, 30**\
**Workers=2**

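Once the model is trained, the final model is stored as "references.model", and the intermediate checkpoints are stored as "references_iter_10.model" and "references_iter_20.model". The score calculation further below loads "references_iter_10.model".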

In [75]:
from spec2vec.model_building import train_new_word2vec_model
    
# Training settings. With iterations=[10, 20, 30] the run reported saving
# references_iter_10.model, references_iter_20.model and references.model.
training_options = {
    "size": 675,
    "iterations": [10, 20, 30],
    "workers": 2,
    "progress_logger": False,
}

model_file = os.path.join(path, "references.model")
model = train_new_word2vec_model(
    documents=reference_documents,
    filename=model_file,
    **training_options,
)

The value of size is set from 300 (default) to 675
The value of workers is set from 4 (default) to 2
Saving model with name: /Users/efra/dev/spec2vec_gcms_data_analysis/data/references_iter_10.model
Saving model with name: /Users/efra/dev/spec2vec_gcms_data_analysis/data/references_iter_20.model
Saving model with name: /Users/efra/dev/spec2vec_gcms_data_analysis/data/references.model


## Setting the reference and query data

All spectra in the MoNA file are used both as references and as queries, and are matched against each other with the Spec2VecParallel method.

In [76]:
# Reference and query spectra are both read from this MSP file.
msp_name = "MoNA-export-GC-MS.msp"
spectrums_file = os.path.join(path, msp_name)

### Setting the reference Spectrums

In [77]:
# Load and filter the reference spectra, keeping only those that were not
# turned into None and that still contain at least one peak.
reference_spectrums = [
    filtered
    for filtered in map(apply_my_filters, load_from_msp(spectrums_file))
    if filtered is not None and len(filtered.peaks.intensities) > 0
]

### Converting the reference Spectrums to Spectrum Documents

In [78]:
# One SpectrumDocument per reference spectrum.
reference_documents = list(map(SpectrumDocument, reference_spectrums))

### Setting the query Spectrums

In [79]:
# Load and filter the query spectra, keeping only those that were not
# turned into None and that still contain at least one peak.
query_spectrums = [
    filtered
    for filtered in map(apply_my_filters, load_from_msp(spectrums_file))
    if filtered is not None and len(filtered.peaks.intensities) > 0
]

### Converting the query Spectrums to Spectrum Documents

In [80]:
# One SpectrumDocument per query spectrum.
query_documents = list(map(SpectrumDocument, query_spectrums))

In [81]:
# Sanity check: report how many documents are on each side of the comparison.
n_reference_docs = len(reference_documents)
n_query_docs = len(query_documents)
print("Ref docs:", n_reference_docs, "Query docs:", n_query_docs)

Ref docs: 14844 Query docs: 14844


### Loading the word2vec model file for the similarity function

Parameters for the similarity function:\
**intensity_weighting_power=0.5**\
**allowed_missing_percentage=5.0**

In [82]:
import gensim
from matchms import calculate_scores_parallel
from spec2vec import Spec2VecParallel

# The scores are computed with the checkpoint saved after 10 training
# iterations, not with the final "references.model".
model_file = os.path.join(path, "references_iter_10.model")
model = gensim.models.Word2Vec.load(model_file)

# Define similarity_function
similarity_options = {
    "intensity_weighting_power": 0.5,
    "allowed_missing_percentage": 5.0,
}
spec2vec = Spec2VecParallel(model=model, **similarity_options)

### Calculating the similarity of the reference and query documents using the Spec2VecParallel method

For the calculation, the same Spectrums are used as both reference and query.

In [83]:
# Call the similarity object with the reference documents first and the
# query documents second.
# NOTE(review): presumably the result is a references-by-queries score
# matrix -- confirm against the spec2vec documentation.
scores_spec2vec = spec2vec(reference_documents, query_documents)

In [84]:
import numpy as np

# Persist the computed scores as a .npy file inside the data folder.
scores_name = 'similarities_filter05_spec2vec_3iter_reference10_size675.npy'
filename = os.path.join(path, scores_name)
np.save(filename, scores_spec2vec)