## bbSelectBuild usage example
- bbSelectBuild is a package for building the bbSelect databases
- It can be used quickly (see full_usage_example.ipynb) or each step can be run separately, as exemplified below. 

In [1]:
import sys
import os
import logging
sys.path.append('../')
from bbSelectBuild import bbSelectDBbuilder

In [2]:
## mGAP parameters
# Grid settings handed to bbSelectDBbuilder below.
cell_size = 1
num_cells = 20

# Worker count: use every CPU available to this process except one.
# os.sched_getaffinity only exists on some Unix platforms, so fall back to
# os.cpu_count() elsewhere, and never drop below one worker (a single-CPU
# machine would otherwise yield ncpu = 0).
try:
    available_cpus = len(os.sched_getaffinity(0))
except AttributeError:
    available_cpus = os.cpu_count() or 1
ncpu = max(1, available_cpus - 1)

# Reaction SMARTS for clipping: rewrites the acid [OH] to a labelled [15CH3].
# Only needed when clipped SMILES are enumerated rather than loaded from file.
rxn_smarts = '[C:1](=[O:2])[OH]>>[C:1](=[O:2])[15CH3]'

# Input building-block SMILES and its column separator.
smiles_file = '../data/enamine_acids/enamine_acids_filtered.csv'
smiles_file_sep = ','

# Destination SDF for the aligned conformers.
align_mols_sdf_path = '../data/enamine_acids/enamine_acids_filtered_aligned_test.sdf'
# Root path passed to write_bbSelect_DB for the final database files.
output_root = '../data/enamine_acids/enamine_acids_filtered_omega_test'
# Pre-generated conformers and pre-clipped SMILES, loaded instead of being recomputed.
conformer_sdf = '../data/enamine_acids/enamine_acids_filtered_clipped_conformers.sdf'
clipped_smiles_path = '../data/enamine_acids/enamine_acids_filtered_clipped.tsv'

# Create the database builder from the grid settings and worker count defined above.
bbSelectDB = bbSelectDBbuilder(cell_size = cell_size, 
                num_cells = num_cells,
                pharmacophore_family_dict = 'default', 
                ncpu = ncpu)

# Load in the smiles file
logging.info('loading smiles file')
bbSelectDB.load_smiles_file(smiles_file, sep = smiles_file_sep)

# Clipped compounds: this example loads a previously clipped (tab-separated) file.
# To clip from scratch instead, uncomment enumerate_clipped_smiles and drop the load call.
logging.info('loading clipped smiles file')
#bbSelectDB.enumerate_clipped_smiles(rxn_smarts)
bbSelectDB.load_clipped_smiles_file(clipped_smiles_path, sep = '\t')

2024-02-20 17:01:51,001 - root - INFO - loading smiles file
2024-02-20 17:01:51,060 - root - INFO - loading clipped smiles file


In [3]:
# Conformers can either be generated by the builder (generate_conformers, commented out below)
# or loaded from an existing SDF, as done here with conformer_sdf.
# If aligned conformers have already been saved, they can be loaded later and this part skipped entirely

#bbSelectDB.generate_conformers()
bbSelectDB.load_conformer_sdf(conformer_sdf)

2024-02-20 17:01:51,084 - root - INFO - loading conformer sdf
2024-02-20 17:03:05,922 - root - INFO - 563029 conformers loaded in 74.83570504188538 seconds


In [4]:
# Align the loaded conformers and write the aligned molecules to align_mols_sdf_path.
# If the alignment has already been performed and saved, the aligned SDF can be loaded instead.

logging.info('aligning mols')
bbSelectDB.align_mols(output_file_path = align_mols_sdf_path)

# Alternative to align_mols: load a saved aligned SDF.
# NOTE(review): presumably the file written to align_mols_sdf_path by a previous run - confirm.
#bbSelectDB.load_aligned_conformer_sdf(align_mols_sdf_path)

2024-02-20 17:03:05,929 - root - INFO - aligning mols
2024-02-20 17:03:05,931 - root - INFO - Running alignment over 29 cores
2024-02-20 17:04:09,305 - root - INFO - 563029 compounds aligned over 29 CPU cores in 63.37388229370117 seconds
2024-02-20 17:04:20,425 - root - INFO - writing aligned mols to ../data/enamine_acids/enamine_acids_filtered_aligned_test.sdf
2024-02-20 17:04:59,752 - root - INFO - wrote aligned mols in 39.33 seconds


In [5]:
# Capture pharmacophore features; in this build sequence the step follows alignment.
logging.info('getting pharmacophore_features')
bbSelectDB.get_pharmacophore_features()

2024-02-20 17:04:59,779 - root - INFO - getting pharmacophore_features
2024-02-20 17:04:59,781 - root - INFO - Capturing pharmacophores over 29 cores
2024-02-20 17:06:18,203 - root - INFO - 563029 pharmacophores captured over 29 cores in 78.42165303230286 seconds


In [6]:
# Build the fingerprint dictionary on the builder; run after the pharmacophore step in this example.
logging.info('generating fingerprints')
bbSelectDB.get_fingerprint_dictionary()

2024-02-20 17:06:18,297 - root - INFO - generating fingerprints
2024-02-20 17:06:22,661 - root - INFO - 19275 fingerprints generated over 1 CPU core in 4.361240386962891 seconds


In [8]:
# Write the finished database using output_root; the recorded log shows the fingerprints
# and a reference file being written.
# NOTE(review): the log text says 'bbgap' while the method is write_bbSelect_DB - confirm
# whether the message should be renamed.
logging.info('writing bbgap DB')
bbSelectDB.write_bbSelect_DB(output_root)

2024-02-20 17:09:51,032 - root - INFO - writing bbgap DB
2024-02-20 17:09:56,117 - root - INFO - 19275 fingerprints written in 5.081454753875732 seconds
2024-02-20 17:09:56,118 - root - INFO - Writing reference file
2024-02-20 17:09:59,823 - root - INFO - Reference file written
