# 2. Creating data from an existing mzML file

In [1]:
%matplotlib inline

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from pathlib import Path
import glob

In [4]:
import sys

# Extend the module search path with the directory two levels above the
# notebook's working directory, so the `vimms` imports in the next cell
# resolve (presumably the repository root - confirm if the notebook is moved).
sys.path += ['../..']

In [5]:
from vimms.DataGenerator import extract_hmdb_metabolite, get_data_source, get_spectral_feature_database
from vimms.MassSpec import IndependentMassSpectrometer
from vimms.Controller import SimpleMs1Controller
from vimms.Common import *
from vimms.Roi import make_roi, RoiToChemicalCreator, extract_roi

In [6]:
# Emit DEBUG-level log messages for the remainder of the notebook.
# For less verbose output, call set_log_level_info() here instead.
set_log_level_debug()

### Load Existing Chromatogram and Fragment Pickle File

To do this, you can either use the existing pickle file or generate your own by following Section 1.

In [8]:
# Absolute path of the test fixtures directory, resolved relative to the
# notebook's current working directory (two levels up, then tests/integration/fixtures).
data_dir = os.path.abspath(
    os.path.join('..', '..', 'tests', 'integration', 'fixtures')
)

# Load the previously saved peak sampler object used by the extraction cells below.
# NOTE(review): the '.p' file is presumably a pickle - only load files from a trusted source.
ps = load_obj(Path(data_dir) / 'peak_sampler_mz_rt_int_beerqcb_fullscan.p')

### Download beer and urine files

In [8]:
# Local directory (inside the current working directory) expected to hold the example data
base_dir = os.path.abspath('example_data')

# Archive containing the example beer and urine files
url = 'http://researchdata.gla.ac.uk/870/2/example_data.zip'

In [9]:
# Download and unpack the example data only when base_dir is not already present;
# otherwise just report that it was found.
if os.path.isdir(base_dir):
    print('Found %s' % base_dir)
else:
    print('Creating %s' % base_dir)
    out_file = 'example_data.zip'
    download_file(url, out_file)
    # delete=True is passed so the helper can remove the archive after extraction
    extract_zip_file(out_file, delete=True)

Found C:\Users\Vinny\work\vimms\demo\01. Data\example_data


### Extract data from files

In [10]:
# Input selection: if None, use all mzML files found.
filename = None

# Minimum MS1 / MS2 intensity thresholds for a data point to be included
# in density estimation.
min_ms1_intensity = 0
min_ms2_intensity = 0

# Retention-time window (min, max) for a data point to be included
# in density estimation.
min_rt, max_rt = 0, 1440

# Kernel bandwidths used during simulation: the first for sampling
# (mz, RT, intensity) values, the second for sampling the number of peaks per scan.
bandwidth_mz_intensity_rt = 1.0
bandwidth_n_peaks = 1.0

In [11]:
# Thresholds for region-of-interest (ROI) extraction.
# NOTE(review): units are not stated in this notebook (the m/z tolerance is
# presumably in ppm) - confirm against the ROI code.
roi_mz_tol, roi_min_length, roi_min_intensity = 10, 2, 1.75E5

# The ROI retention-time window reuses the density-estimation RT window.
roi_start_rt, roi_stop_rt = min_rt, max_rt

# NOTE(review): the extract_roi(...) calls visible in this notebook do not
# reference any of these roi_* variables - confirm whether they are meant to be passed.

Beer files

In [12]:
# Beer inputs: the folder of fragmentation mzML files and the folder that
# receives the extracted datasets.
mzml_path = Path(base_dir) / 'beers' / 'fragmentation' / 'mzML'
out_dir = Path(base_dir) / 'beers' / 'datasets'

# Lazy generator over every .mzML file in the input folder; it is consumed
# by the list() call below.
file_names = mzml_path.glob('*.mzML')

# Run extraction for each file, using the loaded peak sampler `ps`;
# results are saved under out_dir using the 'beer_%d.p' name pattern.
extract_roi(list(file_names), out_dir, 'beer_%d.p', mzml_path, ps)

2020-07-17 16:23:56.760 | DEBUG    | vimms.Roi:__init__:382 -      0/ 11373
2020-07-17 16:23:57.817 | INFO     | vimms.Roi:__init__:406 - Found 11373 ROIs above thresholds
2020-07-17 16:23:57.818 | INFO     | vimms.Common:create_if_not_exist:48 - Created C:\Users\Vinny\work\vimms\demo\01. Data\example_data\beers\datasets
2020-07-17 16:23:57.819 | INFO     | vimms.Common:save_obj:61 - Saving <class 'list'> to C:\Users\Vinny\work\vimms\demo\01. Data\example_data\beers\datasets\beer_10.p
2020-07-17 16:24:14.280 | DEBUG    | vimms.Roi:__init__:382 -      0/  9306
2020-07-17 16:24:15.096 | INFO     | vimms.Roi:__init__:406 - Found 9306 ROIs above thresholds
2020-07-17 16:24:15.168 | INFO     | vimms.Common:save_obj:61 - Saving <class 'list'> to C:\Users\Vinny\work\vimms\demo\01. Data\example_data\beers\datasets\beer_11.p
2020-07-17 16:24:31.935 | DEBUG    | vimms.Roi:__init__:382 -      0/ 10706
2020-07-17 16:24:32.879 | INFO     | vimms.Roi:__init__:406 - Found 10706 ROIs above thresholds


Urine files

In [13]:
# Urine inputs: the folder of fragmentation mzML files and the folder that
# receives the extracted datasets.
mzml_path = Path(base_dir) / 'urines' / 'fragmentation' / 'mzML'
out_dir = Path(base_dir) / 'urines' / 'datasets'

# Lazy generator over every .mzML file in the input folder; it is consumed
# by the list() call below.
file_names = mzml_path.glob('*.mzML')

# Run extraction for each file, using the loaded peak sampler `ps`;
# results are saved under out_dir using the 'urine_%d.p' name pattern.
extract_roi(list(file_names), out_dir, 'urine_%d.p', mzml_path, ps)

2020-07-17 16:30:23.745 | DEBUG    | vimms.Roi:__init__:382 -      0/ 15887
2020-07-17 16:30:25.319 | INFO     | vimms.Roi:__init__:406 - Found 15887 ROIs above thresholds
2020-07-17 16:30:25.320 | INFO     | vimms.Common:create_if_not_exist:48 - Created C:\Users\Vinny\work\vimms\demo\01. Data\example_data\urines\datasets
2020-07-17 16:30:25.321 | INFO     | vimms.Common:save_obj:61 - Saving <class 'list'> to C:\Users\Vinny\work\vimms\demo\01. Data\example_data\urines\datasets\urine_2.p
2020-07-17 16:30:46.326 | DEBUG    | vimms.Roi:__init__:382 -      0/ 18082
2020-07-17 16:30:48.183 | INFO     | vimms.Roi:__init__:406 - Found 18082 ROIs above thresholds
2020-07-17 16:30:48.272 | INFO     | vimms.Common:save_obj:61 - Saving <class 'list'> to C:\Users\Vinny\work\vimms\demo\01. Data\example_data\urines\datasets\urine_3.p
2020-07-17 16:31:07.731 | DEBUG    | vimms.Roi:__init__:382 -      0/ 15626
2020-07-17 16:31:09.471 | INFO     | vimms.Roi:__init__:406 - Found 15626 ROIs above thresho