# spec2vec for GCMS
## Training a spec2vec model with GC/MS data

An example of how to train a spec2vec model on GC/MS data, together with the output produced during training.

In [21]:
import os

# The "data" directory sits next to the current working directory,
# i.e. inside the parent of wherever the notebook is being run from.
parent_dir = os.path.dirname(os.getcwd())
path = os.path.join(parent_dir, "data")

# MSP export that is read further down in the notebook.
msp_file = os.path.join(path, "MoNA-export-GC-MS.msp")

### Reading the msp file and converting it into Spectrum objects

+ normalize intensities
+ reduce number of peaks, required: 10, ratio desired: 0.5
+ select by mz, from 0 to 1000
+ require a minimum number of peaks, n required: 5

In [22]:
from matchms.filtering import normalize_intensities
from matchms.filtering import reduce_to_number_of_peaks
from matchms.filtering import require_minimum_number_of_peaks
from matchms.filtering import select_by_mz

def apply_my_filters(s):
    """Run one spectrum through the notebook's matchms filter chain.

    The filters are applied in this fixed order:

    1. ``normalize_intensities``
    2. ``reduce_to_number_of_peaks`` with ``n_required=10`` and
       ``ratio_desired=0.5``
    3. ``select_by_mz`` with ``mz_from=0`` and ``mz_to=1000``
    4. ``require_minimum_number_of_peaks`` with ``n_required=5``

    Parameters
    ----------
    s
        The spectrum to process, as yielded by the MSP loader.

    Returns
    -------
    Whatever the last filter returns. The notebook discards ``None``
    results afterwards, so a filter presumably returns ``None`` for a
    rejected spectrum -- confirm against the matchms documentation.
    """
    normalized = normalize_intensities(s)
    reduced = reduce_to_number_of_peaks(normalized, n_required=10, ratio_desired=0.5)
    in_mz_window = select_by_mz(reduced, mz_from=0, mz_to=1000)
    return require_minimum_number_of_peaks(in_mz_window, n_required=5)

### Apply the filters to the GCMS file

In [23]:
from matchms.importing import load_from_msp

# Filter every spectrum lazily as it is read from the MSP file ...
filtered = (apply_my_filters(spectrum) for spectrum in load_from_msp(msp_file))

# ... and keep only those that were not turned into None by the filters.
spectrums = [spectrum for spectrum in filtered if spectrum is not None]

print("number of spectrums:", len(spectrums))

number of spectrums: 14361


### Convert the spectra to Spectrum documents

In [24]:
from spec2vec import SpectrumDocument

# Wrap each filtered spectrum in a SpectrumDocument (default settings).
reference_documents = list(map(SpectrumDocument, spectrums))

# Print the first document as a quick sanity check of the conversion.
print(reference_documents[0])

['peak@51.0', 'peak@55.0', 'peak@57.0', 'peak@58.0', 'peak@59.0', 'peak@60.0', 'peak@61.0', 'peak@62.0', 'peak@63.0', 'peak@66.0', 'peak@68.0', 'peak@70.0', 'peak@72.0', 'peak@73.0', 'peak@74.0', 'peak@75.0', 'peak@76.0', 'peak@78.0', 'peak@80.0', 'peak@81.0', 'peak@82.0', 'peak@83.0', 'peak@86.0', 'peak@87.0', 'peak@92.0', 'peak@93.0', 'peak@94.0', 'peak@98.0', 'peak@99.0', 'peak@100.0', 'peak@104.0', 'peak@107.0', 'peak@108.0', 'peak@110.0', 'peak@112.0', 'peak@113.0', 'peak@115.0', 'peak@116.0', 'peak@120.0', 'peak@122.0', 'peak@123.0', 'peak@124.0', 'peak@125.0', 'peak@126.0', 'peak@134.0', 'peak@135.0', 'peak@137.0', 'peak@147.0', 'peak@149.0', 'peak@150.0', 'peak@151.0', 'peak@159.0', 'peak@162.0', 'peak@163.0', 'peak@173.0', 'peak@174.0', 'peak@175.0', 'peak@177.0', 'peak@187.0', 'peak@188.0', 'peak@189.0', 'peak@190.0', 'peak@191.0', 'peak@198.0', 'peak@199.0', 'peak@200.0', 'peak@201.0', 'peak@202.0', 'peak@203.0', 'peak@207.0', 'peak@214.0', 'peak@217.0', 'peak@218.0', 'peak@

### Train the word2vec model with the Spectrum documents

In [25]:
from spec2vec.model_building import train_new_word2vec_model

model_file = "references.model"

# Options passed to the training call. The recorded output of this cell
# shows a checkpoint named "references_iter_10.model" written after epoch 10
# of 30, so each entry of `iterations` presumably produces its own saved
# model -- confirm against the spec2vec documentation.
training_options = {
    "filename": model_file,
    "iterations": [10, 20, 30],
    "workers": 2,
    "progress_logger": True,
}
model = train_new_word2vec_model(reference_documents, **training_options)

The value of workers is set from 4 (default) to 2
  Epoch 1 of 30.Change in loss after epoch 1: 630247.5625
  Epoch 2 of 30.Change in loss after epoch 2: 582928.4375
  Epoch 3 of 30.Change in loss after epoch 3: 553430.375
  Epoch 4 of 30.Change in loss after epoch 4: 523650.625
  Epoch 5 of 30.Change in loss after epoch 5: 478426.75
  Epoch 6 of 30.Change in loss after epoch 6: 471096.75
  Epoch 7 of 30.Change in loss after epoch 7: 473565.5
  Epoch 8 of 30.Change in loss after epoch 8: 472290.5
  Epoch 9 of 30.Change in loss after epoch 9: 441157.0
  Epoch 10 of 30.Change in loss after epoch 10: 434242.5
Saving model with name: references_iter_10.model
  Epoch 11 of 30.Change in loss after epoch 11: 425502.5
  Epoch 12 of 30.Change in loss after epoch 12: 438653.0
  Epoch 13 of 30.Change in loss after epoch 13: 439195.5
  Epoch 14 of 30.Change in loss after epoch 14: 437625.0
  Epoch 15 of 30.Change in loss after epoch 15: 414303.0
  Epoch 16 of 30.Change in loss after epoch 16: 4331