# 3. Generating a set of random chemicals

In [1]:
# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline

In [2]:
# Load the autoreload extension and use mode 2, which re-imports all modules
# before each cell executes, so edits to imported source are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from pathlib import Path

In [4]:
# Add the directory two levels above the working directory to the module
# search path (presumably the repository root, so that the local `vimms`
# package imported below can be found -- confirm against the repo layout).
import sys
sys.path.append('../..')

In [5]:
from vimms.DataGenerator import extract_hmdb_metabolite
from vimms.Common import *
from vimms.Chemicals import ChemicalCreator

In [6]:
# Use debug-level logging so that progress messages are shown while sampling.
# Call set_log_level_info() instead to switch to info-level logging.
set_log_level_debug()

### Load Existing Chromatogram and Fragment Pickle File

To load the chromatogram and fragment data, you can either use the existing pickle file or generate your own pickle file by following Section 1.

In [7]:
# Absolute path of the test fixtures folder, located two levels above the
# current working directory. Kept as a plain string, as in the original.
fixtures_rel = os.path.join(os.getcwd(), '..', '..', 'tests', 'integration', 'fixtures')
data_dir = os.path.abspath(fixtures_rel)

# Load the pickled peak sampler object shipped with the fixtures.
peak_sampler_file = Path(data_dir) / 'peak_sampler_mz_rt_int_beerqcb_fullscan.p'
ps = load_obj(peak_sampler_file)

Define the output folder where our results will be saved.

In [9]:
# Results are written to <current working directory>/results/MS1_single.
out_dir = Path(os.getcwd()) / 'results' / 'MS1_single'

### Download HMDB sample

In [11]:
# Load the cached HMDB compounds from the fixtures folder if they are there;
# otherwise download the database, extract the compounds and cache them.
compound_file = Path(data_dir, 'hmdb_compounds.p')
hmdb_compounds = load_obj(compound_file)
if hmdb_compounds is None:  # if file does not exist

    # download the entire HMDB metabolite database
    url = 'http://www.hmdb.ca/system/downloads/current/hmdb_metabolites.zip'
    out_file = download_file(url)

    # Bind the extracted compounds to `hmdb_compounds` (not a separate name) so
    # the variable is populated on both branches and never left as None.
    # NOTE(review): delete=True presumably removes the downloaded archive after
    # extraction -- confirm against extract_hmdb_metabolite.
    hmdb_compounds = extract_hmdb_metabolite(out_file, delete=True)
    save_obj(hmdb_compounds, compound_file)
    print('Downloaded %d DatabaseCompounds and saved them to %s' % (len(hmdb_compounds), compound_file))

else:
    print('Loaded %d DatabaseCompounds from %s' % (len(hmdb_compounds), compound_file))

Loaded 114087 DatabaseCompounds from C:\Users\Vinny\work\vimms\tests\integration\fixtures\hmdb_compounds.p


In [13]:
# Read the pickled HMDB compounds from the fixtures folder.
hmdb_pickle = Path(data_dir) / 'hmdb_compounds.p'
hmdb = load_obj(hmdb_pickle)

### Create a new HMDB sample

In [20]:
# List (of path strings) with the single folder used as the ROI source.
ROI_Sources = [str(Path(data_dir) / 'beer_t10_simulator_files')]

# Chemicals must reach at least this MS1 intensity.
min_ms1_intensity = 1.75e5

# Allowed RT and m/z windows for the chemicals, each given as a list of
# (lower, upper) pairs.
rt_range = [(0, 1440)]
mz_range = [(0, 1050)]

# How many chemicals the sample contains.
n_chems = 6500

# Highest MS level to generate; with a value of 1 no fragmentation peaks
# are generated.
ms_level = 1

In [21]:
# Build the chemical creator from the peak sampler, the ROI sources and the
# HMDB compounds.
chems = ChemicalCreator(ps, ROI_Sources, hmdb)

# Sample the chemicals using the ranges, intensity threshold, count and MS
# level defined above.
hmdb_dataset = chems.sample(
    mz_range,
    rt_range,
    min_ms1_intensity,
    n_chems,
    ms_level,
)

# Save the sampled dataset into the output folder.
dataset_file = Path(out_dir) / 'hmdb_dataset.p'
save_obj(hmdb_dataset, dataset_file)

2020-08-17 11:19:01.805 | DEBUG    | vimms.Chemicals:__init__:242 - Sorting database compounds by masses
2020-08-17 11:19:04.369 | DEBUG    | vimms.Chemicals:sample:276 - 6500 chemicals to be created.
2020-08-17 11:19:05.627 | DEBUG    | vimms.Chemicals:_sample_formulae:350 - Sampling formula 0/6500
2020-08-17 11:19:08.278 | DEBUG    | vimms.Chemicals:_sample_formulae:350 - Sampling formula 500/6500
2020-08-17 11:19:10.878 | DEBUG    | vimms.Chemicals:_sample_formulae:350 - Sampling formula 1000/6500
2020-08-17 11:19:13.914 | DEBUG    | vimms.Chemicals:_sample_formulae:350 - Sampling formula 1500/6500
2020-08-17 11:19:17.182 | DEBUG    | vimms.Chemicals:_sample_formulae:350 - Sampling formula 2000/6500
2020-08-17 11:19:20.476 | DEBUG    | vimms.Chemicals:_sample_formulae:350 - Sampling formula 2500/6500
2020-08-17 11:19:23.746 | DEBUG    | vimms.Chemicals:_sample_formulae:350 - Sampling formula 3000/6500
2020-08-17 11:19:27.282 | DEBUG    | vimms.Chemicals:_sample_formulae:350 - Sampli

In [22]:
# Preview the first ten sampled chemicals, one per line.
for chem in hmdb_dataset[:10]:
    print(chem)

KnownChemical - 'C2HCl3O2' rt=582.09 max_intensity=1616270.76
KnownChemical - 'H4Sn' rt=219.42 max_intensity=452090.88
KnownChemical - 'C11H16O9' rt=495.29 max_intensity=299551.79
KnownChemical - 'C25H44O7P2' rt=268.84 max_intensity=283682.78
KnownChemical - 'C7H9N' rt=391.54 max_intensity=1801275.67
KnownChemical - 'C5H6O4' rt=224.69 max_intensity=293003.09
KnownChemical - 'C24H30O6' rt=634.02 max_intensity=2469915.41
KnownChemical - 'C20H14O3' rt=418.58 max_intensity=207972.85
KnownChemical - 'C30H49N6O10' rt=262.84 max_intensity=465245.01
KnownChemical - 'C12H20N4O3' rt=670.49 max_intensity=468316.50
