# 1.  Creating a chromatogram and fragment pickle file from an mzML

In [1]:
# Display matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Load the autoreload extension; mode 2 reloads all modules before each cell
# runs, so edits to the library source take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from pathlib import Path

In [4]:
import sys

# Put the directory two levels up on the import path so the `vimms` package
# imported below can be found (presumably the repository root; the relative
# path assumes the notebook is run from its own folder).
# The membership test keeps sys.path free of duplicate entries when this cell
# is executed more than once.
if '../..' not in sys.path:
    sys.path.append('../..')

In [5]:
from vimms.DataGenerator import  get_data_source, get_spectral_feature_database
from vimms.Common import *

In [6]:
# Turn on DEBUG-level logging so the cells below report their progress in detail.
# For less verbose output, use the commented-out call instead:
# set_log_level_info()
set_log_level_debug()

### Download beer and urine files

In [7]:
# Where the zipped example data set is downloaded from.
url = 'http://researchdata.gla.ac.uk/870/2/example_data.zip'

# Absolute path of the 'example_data' folder inside the current working directory.
base_dir = os.path.abspath('example_data')

In [8]:
# Reuse an existing example_data folder when there is one; otherwise download
# the archive and unpack it.
if os.path.isdir(base_dir):
    print('Found %s' % base_dir)
else:
    print('Creating %s' % base_dir)
    out_file = 'example_data.zip'
    download_file(url, out_file)
    extract_zip_file(out_file, delete=True)

Found /Users/simon/git/vimms/demo/01. Data/example_data


### Generate Spectral Feature Database

In this section we demonstrate how ViMMS constructs the spectral feature database from the example Beer mzML files. The database contains information such as the densities of m/z, RT and intensity values, scan durations, and MS2 peaks. It will be used to sample various features during the simulation later.

The following two functions from ViMMS, `get_data_source` and `get_spectral_feature_database`, will be used:

- `get_data_source` loads a `DataSource` object that stores information on a set of .mzML files.
- `get_spectral_feature_database` extracts relevant features from the .mzML files that have been loaded into the `DataSource`.

The parameters below should work for most cases; however, for different data it might be necessary to adjust the `min_rt` and `max_rt` values.

In [9]:
# Settings passed to get_data_source() and get_spectral_feature_database() in the cells below.
# NOTE(review): RT values are presumably in seconds (max_rt = 1440 would be 24 minutes) — confirm.
filename = None                    # if None, use all mzML files found
min_ms1_intensity = 0              # min MS1 intensity threshold to include a data point for density estimation
min_ms2_intensity = 0              # min MS2 intensity threshold to include a data point for density estimation
min_rt = 0                         # min RT to include a data point for density estimation
max_rt = 1440                      # max RT to include a data point for density estimation
bandwidth_mz_intensity_rt = 1.0    # kernel bandwidth parameter to sample (mz, RT, intensity) values during simulation
bandwidth_n_peaks = 1.0            # kernel bandwidth parameter to sample number of peaks per scan during simulation

Load fullscan data and train spectral feature database

In [10]:
# Folder with the fullscan beer mzML files, the XCMS peak list stored next to
# them, and the output (.p) file handed to get_spectral_feature_database().
mzml_path = Path(base_dir) / 'beers' / 'fullscan' / 'mzML'
xcms_output = mzml_path / 'extracted_peaks_ms1.csv'
out_file = Path(base_dir) / 'peak_sampler_mz_rt_int_19_beers_fullscan.p'

In [11]:
# Build a DataSource from the fullscan mzML folder (filename is None, so all
# mzML files found are used) together with the XCMS peak list.
ds_fullscan = get_data_source(mzml_path, filename, xcms_output)

2020-09-10 19:29:02.712 | INFO     | vimms.DataGenerator:load_data:161 - Loading Beer_multibeers_18_fullscan1.mzML
2020-09-10 19:29:03.808 | INFO     | vimms.DataGenerator:load_data:161 - Loading Beer_multibeers_10_fullscan1.mzML
2020-09-10 19:29:04.979 | INFO     | vimms.DataGenerator:load_data:161 - Loading Beer_multibeers_17_fullscan1.mzML
2020-09-10 19:29:06.109 | INFO     | vimms.DataGenerator:load_data:161 - Loading Beer_multibeers_16_fullscan1.mzML
2020-09-10 19:29:07.139 | INFO     | vimms.DataGenerator:load_data:161 - Loading Beer_multibeers_11_fullscan1.mzML
2020-09-10 19:29:08.374 | INFO     | vimms.DataGenerator:load_data:161 - Loading Beer_multibeers_19_fullscan1.mzML
2020-09-10 19:29:09.454 | INFO     | vimms.DataGenerator:load_data:161 - Loading Beer_multibeers_13_fullscan1.mzML
2020-09-10 19:29:10.566 | INFO     | vimms.DataGenerator:load_data:161 - Loading Beer_multibeers_14_fullscan1.mzML
2020-09-10 19:29:11.696 | INFO     | vimms.DataGenerator:load_data:161 - Loading

In [12]:
# Train the spectral feature database on the fullscan data using the
# thresholds and kernel bandwidths configured above. out_file is passed last
# (presumably the destination of the saved database).
ps_fullscan = get_spectral_feature_database(
    ds_fullscan,
    filename,
    min_ms1_intensity,
    min_ms2_intensity,
    min_rt,
    max_rt,
    bandwidth_mz_intensity_rt,
    bandwidth_n_peaks,
    out_file,
)

2020-09-10 19:29:25.345 | DEBUG    | vimms.DataGenerator:__init__:436 - Extracted 0 MS2 scans
2020-09-10 19:29:25.346 | DEBUG    | vimms.DataGenerator:_compute_intensity_props:614 - Computing parent intensity proportions
2020-09-10 19:29:25.346 | DEBUG    | vimms.DataGenerator:__init__:445 - Extracting scan durations
2020-09-10 19:29:25.348 | DEBUG    | vimms.DataGenerator:_kde:626 - Training KDEs for ms_level=1
2020-09-10 19:29:25.350 | DEBUG    | vimms.DataGenerator:_kde:637 - Retrieving mz_intensity_rt values from <vimms.DataGenerator.DataSource object at 0x11723b2d0>
2020-09-10 19:29:25.351 | INFO     | vimms.DataGenerator:get_data:278 - Using values from XCMS peaklist
2020-09-10 19:29:25.443 | DEBUG    | vimms.DataGenerator:_kde:637 - Retrieving n_peaks values from <vimms.DataGenerator.DataSource object at 0x11723b2d0>
2020-09-10 19:30:16.997 | DEBUG    | vimms.DataGenerator:_kde:626 - Training KDEs for ms_level=2
2020-09-10 19:30:16.998 | DEBUG    | vimms.DataGenerator:_kde:637 -

In [13]:
# Sanity check: try to sample 10 MS1 peaks (ms_level=1) from the fullscan database.
ps_fullscan.get_peak(1, 10)

[Peak mz=263.0090 rt=712.48 intensity=4312825.37 ms_level=1,
 Peak mz=1012.1795 rt=896.08 intensity=622684.57 ms_level=1,
 Peak mz=209.1711 rt=282.29 intensity=293156.40 ms_level=1,
 Peak mz=258.0727 rt=249.11 intensity=141313.06 ms_level=1,
 Peak mz=175.9108 rt=1262.87 intensity=2096907.69 ms_level=1,
 Peak mz=308.6492 rt=243.19 intensity=198660.24 ms_level=1,
 Peak mz=347.4767 rt=282.54 intensity=27455.68 ms_level=1,
 Peak mz=122.3940 rt=288.40 intensity=374172.28 ms_level=1,
 Peak mz=228.6600 rt=501.78 intensity=185453.10 ms_level=1,
 Peak mz=374.3074 rt=635.84 intensity=779903.98 ms_level=1]

Load fragmentation data and train spectral feature database

In [14]:
# Folder with the fragmentation beer mzML files, the XCMS peak list stored next
# to them, and the output (.p) file handed to get_spectral_feature_database().
mzml_path = Path(base_dir) / 'beers' / 'fragmentation' / 'mzML'
xcms_output = mzml_path / 'extracted_peaks_ms1.csv'
out_file = Path(base_dir) / 'peak_sampler_mz_rt_int_19_beers_fragmentation.p'

In [15]:
# Build a DataSource from the fragmentation mzML folder (filename is None, so
# all mzML files found are used) together with the XCMS peak list.
ds_fragmentation = get_data_source(mzml_path, filename, xcms_output)

2020-09-10 19:30:37.152 | INFO     | vimms.DataGenerator:load_data:161 - Loading Beer_multibeers_5_T10_POS.mzML
2020-09-10 19:30:45.613 | INFO     | vimms.DataGenerator:load_data:161 - Loading Beer_multibeers_2_T10_POS.mzML
2020-09-10 19:30:53.611 | INFO     | vimms.DataGenerator:load_data:161 - Loading Beer_multibeers_19_T10_POS.mzML
2020-09-10 19:31:03.083 | INFO     | vimms.DataGenerator:load_data:161 - Loading Beer_multibeers_3_T10_POS.mzML
2020-09-10 19:31:11.053 | INFO     | vimms.DataGenerator:load_data:161 - Loading Beer_multibeers_4_T10_POS.mzML
2020-09-10 19:31:21.253 | INFO     | vimms.DataGenerator:load_data:161 - Loading Beer_multibeers_18_T10_POS.mzML
2020-09-10 19:31:30.155 | INFO     | vimms.DataGenerator:load_data:161 - Loading Beer_multibeers_12_T10_POS.mzML
2020-09-10 19:31:38.435 | INFO     | vimms.DataGenerator:load_data:161 - Loading Beer_multibeers_15_T10_POS.mzML
2020-09-10 19:31:47.215 | INFO     | vimms.DataGenerator:load_data:161 - Loading Beer_multibeers_9_T

In [None]:
# Train the spectral feature database on the fragmentation data using the
# thresholds and kernel bandwidths configured above. out_file is passed last
# (presumably the destination of the saved database).
ps = get_spectral_feature_database(
    ds_fragmentation,
    filename,
    min_ms1_intensity,
    min_ms2_intensity,
    min_rt,
    max_rt,
    bandwidth_mz_intensity_rt,
    bandwidth_n_peaks,
    out_file,
)

2020-09-10 19:33:27.192 | DEBUG    | vimms.DataGenerator:__init__:436 - Extracted 138969 MS2 scans
2020-09-10 19:33:27.193 | DEBUG    | vimms.DataGenerator:_compute_intensity_props:614 - Computing parent intensity proportions
2020-09-10 19:33:38.120 | DEBUG    | vimms.DataGenerator:__init__:445 - Extracting scan durations
2020-09-10 19:33:38.130 | DEBUG    | vimms.DataGenerator:_kde:626 - Training KDEs for ms_level=1
2020-09-10 19:33:38.131 | DEBUG    | vimms.DataGenerator:_kde:637 - Retrieving mz_intensity_rt values from <vimms.DataGenerator.DataSource object at 0x149031a10>
2020-09-10 19:33:38.131 | INFO     | vimms.DataGenerator:get_data:278 - Using values from XCMS peaklist
2020-09-10 19:33:38.215 | DEBUG    | vimms.DataGenerator:_kde:637 - Retrieving n_peaks values from <vimms.DataGenerator.DataSource object at 0x149031a10>
2020-09-10 19:36:55.036 | DEBUG    | vimms.DataGenerator:_kde:626 - Training KDEs for ms_level=2
2020-09-10 19:36:55.037 | DEBUG    | vimms.DataGenerator:_kde:

In [None]:
# Try to sample 10 MS1 peaks (ms_level=1) from the fragmentation database.
ps.get_peak(1, 10)

In [None]:
# Try to sample 10 MS2 peaks (ms_level=2) from the fragmentation database.
ps.get_peak(2, 10)