# 3. Generating a set of random chemicals

In [1]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

In [2]:
# Load IPython's autoreload extension; mode 2 re-imports all modules before
# each cell executes, so edits to imported source files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from pathlib import Path

In [4]:
import sys

# Add the directory two levels above the working directory to the import
# search path so that the `vimms` package can be imported below.
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path again.
if '../..' not in sys.path:
    sys.path.append('../..')

In [5]:
from vimms.DataGenerator import extract_hmdb_metabolite
from vimms.Common import *
from vimms.Chemicals import ChemicalCreator

In [6]:
# Use debug-level logging so that progress messages from the sampling step
# are shown. For quieter output, call set_log_level_info() here instead
# (NOTE(review): assumed to be provided by vimms.Common alongside
# set_log_level_debug -- confirm).
set_log_level_debug()

### Load Existing Chromatogram and Fragment Pickle File

You can either load the existing pickle file provided with the repository or generate your own by following Section 1.

In [8]:
# Absolute path of <working dir>/../../tests/fixtures, kept as a plain string.
data_dir = str(Path.cwd().parent.parent / 'tests' / 'fixtures')

# Load the pickled object stored in the fixtures folder.
ps = load_obj(Path(data_dir) / 'peak_sampler_mz_rt_int_beerqcb_fullscan.p')

Define the output folder where the results will be saved.

In [9]:
# Results are written to <working dir>/results/MS1_single.
out_dir = Path.cwd() / 'results' / 'MS1_single'

### Download HMDB sample

In [10]:
# Location of the cached (pickled) HMDB compounds inside the fixtures folder.
compound_file = Path(data_dir, 'hmdb_compounds.p')

# Try the cached copy first; a None result means the file does not exist.
hmdb_compounds = load_obj(compound_file)
if hmdb_compounds is None:

    # download the entire HMDB metabolite database
    url = 'http://www.hmdb.ca/system/downloads/current/hmdb_metabolites.zip'

    out_file = download_file(url)
    compounds = extract_hmdb_metabolite(out_file, delete=True)
    save_obj(compounds, compound_file)

    # Bind the freshly extracted compounds to hmdb_compounds as well, so the
    # variable holds the compounds on both branches instead of staying None
    # after a download.
    hmdb_compounds = compounds

else:
    print('Loaded %d DatabaseCompounds from %s' % (len(hmdb_compounds), compound_file))

Loaded 114087 DatabaseCompounds from /Users/simon/git/vimms/tests/fixtures/hmdb_compounds.p


In [11]:
# Read the pickled HMDB compounds from the fixtures folder.
hmdb = load_obj(Path(data_dir) / 'hmdb_compounds.p')

### Create a new HMDB sample

In [12]:
# Single-entry list holding the path (as a string) of the
# 'beer_t10_simulator_files' folder inside the fixtures directory.
ROI_Sources = [str(Path(data_dir) / 'beer_t10_simulator_files')]

# --- sampling parameters ---

# lower bound on the MS1 intensity of the chemicals
min_ms1_intensity = 1.75e5

# (min, max) RT window and (min, max) m/z window of the chemicals
rt_range = [(0, 1440)]
mz_range = [(0, 1050)]

# how many chemicals the sample should contain
n_chems = 6500

# highest MS level; with a value of 1 no fragmentation peaks are generated
ms_level = 1

In [13]:
# Build the creator from the loaded peak sampler, the ROI source folders
# and the HMDB compounds.
chems = ChemicalCreator(ps, ROI_Sources, hmdb)

# Sample the chemicals using the parameters defined in the previous cell.
hmdb_dataset = chems.sample(
    mz_range,
    rt_range,
    min_ms1_intensity,
    n_chems,
    ms_level,
)

# Persist the sampled dataset in the output folder.
save_obj(hmdb_dataset, out_dir / 'hmdb_dataset.p')

2020-09-10 19:35:10.620 | DEBUG    | vimms.Chemicals:__init__:209 - Sorting database compounds by masses
2020-09-10 19:35:13.583 | DEBUG    | vimms.Chemicals:sample:244 - 6500 chemicals to be created.
2020-09-10 19:35:14.888 | DEBUG    | vimms.Chemicals:_sample_formulae:318 - Sampling formula 0/6500
2020-09-10 19:35:18.066 | DEBUG    | vimms.Chemicals:_sample_formulae:318 - Sampling formula 500/6500
2020-09-10 19:35:21.185 | DEBUG    | vimms.Chemicals:_sample_formulae:318 - Sampling formula 1000/6500
2020-09-10 19:35:24.383 | DEBUG    | vimms.Chemicals:_sample_formulae:318 - Sampling formula 1500/6500
2020-09-10 19:35:27.606 | DEBUG    | vimms.Chemicals:_sample_formulae:318 - Sampling formula 2000/6500
2020-09-10 19:35:30.913 | DEBUG    | vimms.Chemicals:_sample_formulae:318 - Sampling formula 2500/6500
2020-09-10 19:35:34.052 | DEBUG    | vimms.Chemicals:_sample_formulae:318 - Sampling formula 3000/6500
2020-09-10 19:35:37.883 | DEBUG    | vimms.Chemicals:_sample_formulae:318 - Sampli

In [14]:
# Show the first ten entries of the sampled dataset, one per line.
for chem in hmdb_dataset[:10]:
    print(str(chem))

KnownChemical - 'C10H20N2OS' rt=1303.86 max_intensity=752114.64
KnownChemical - 'C17H36' rt=483.17 max_intensity=854236.05
KnownChemical - 'C25H36O2' rt=196.12 max_intensity=918508.13
KnownChemical - 'C12H22O8' rt=478.12 max_intensity=1663121.94
KnownChemical - 'C18H17NO3' rt=734.31 max_intensity=331080.62
KnownChemical - 'C9H9N' rt=903.02 max_intensity=386942.42
KnownChemical - 'C3H7O4P' rt=276.06 max_intensity=965863.16
KnownChemical - 'C15H17NO10' rt=434.00 max_intensity=699439.57
KnownChemical - 'C23H26N2O6' rt=601.19 max_intensity=222509.67
KnownChemical - 'C48H82NO8P' rt=619.80 max_intensity=179245.03
