# Add Missing Tags
Some of the molecules in our database are part of existing collections but, for some reason, are not labelled as such.
This notebook goes through and assigns all of the molecules to each of our collections

In [14]:
from moldesign.store.mongo import MoleculePropertyDB
from multiprocessing import Pool
from functools import partial
from typing import List, Iterator, Set
from pathlib import Path
from rdkit import Chem
from tqdm import tqdm

Turn off RDKit's logging

In [15]:
import rdkit.rdBase as rkrb
import rdkit.RDLogger as rkl
# Raise the RDKit logger's level to ERROR
logger = rkl.logger()
logger.setLevel(rkl.ERROR)
# Additionally disable the 'rdApp.error' log channel
rkrb.DisableLog('rdApp.error')

Configuration

In [16]:
# Root directory of the nCov-group canonical SMILES release. Collection files are read
# from `<ncov_path>/<key>/<key>.csv` further down in this notebook.
# NOTE(review): absolute, site-specific path - adjust when running on another system.
ncov_path = Path('/eagle/CVD-Mol-AI/release/v1.0/canonical_smiles/')

## Connect to the database
Get a connection to the MongoDB

In [17]:
# Open the molecule property database (MongoDB on port 27855) and report how many documents it holds
mongo = MoleculePropertyDB.from_connection_info(port=27855)
print(f'Connected to database with {mongo.collection.count_documents({})} molecules')

Connected to database with 25246 molecules


In [22]:
# Look up the record for this SMILES string and save the XYZ stored under
# data['small_basis']['reduced'] to disk. The cell output is the return value
# of `write_text`, i.e. the number of characters written.
nmp = mongo.get_molecule_record(smiles='CN1C(=O)c2ccccc2C1=O')
Path('nmp-reduced.xyz').write_text(nmp.data['small_basis']['reduced'].xyz)

1441

In [33]:
# Fetch the record in this cell so that it runs on a fresh kernel: `nmpp` is otherwise
# only assigned by a cell that appears further down in the notebook.
nmpp = mongo.get_molecule_record(smiles='COCCOCC(CN1C(=O)c2ccccc2C1=O)OC')
nmpp.reduction_potential

{'dfb-acn-smb-geom': 1.713531530879095,
 'dfb-vacuum-smb-geom': 0.12035400022383365,
 'smb-acn-vertical': 2.1774512017101717,
 'smb-vacuum-no-zpe': -0.4035548207926126,
 'smb-vacuum-vertical': 0.44476412682949634}

In [34]:
# Display the reduction potentials stored on the `nmp` record
nmp.reduction_potential

{'dfb-acn-smb-geom': 2.877379962527396,
 'dfb-vacuum-smb-geom': 1.0718877101356208,
 'smb-acn-vertical': 2.118791342425684,
 'smb-vacuum-no-zpe': 0.41305979852865576,
 'smb-vacuum-vertical': 0.16575085973157608}

In [31]:
# Save the JSON form of the `nmp` record to nmp.json in the working directory
with open('nmp.json', 'w') as fp:
    print(nmp.json(indent=True), file=fp)

In [28]:
# Look up the record for this SMILES string and save the XYZ stored under
# data['small_basis']['reduced'] to disk. The cell output is the return value
# of `write_text`, i.e. the number of characters written.
nmpp = mongo.get_molecule_record(smiles='COCCOCC(CN1C(=O)c2ccccc2C1=O)OC')
Path('nmp-plus-reduced.xyz').write_text(nmpp.data['small_basis']['reduced'].xyz)

3018

In [30]:
# Save the JSON form of the `nmpp` record to nmp-plus.json in the working directory
with open('nmp-plus.json', 'w') as fp:
    print(nmpp.json(indent=True), file=fp)

## Match to Enumerations from [nCov-group](https://2019-ncovgroup.github.io/data/)
These are located on ALCF's filesystems in a known format. They are huge files, so we are going to implement an out-of-core algorithm that reads them in chunks.

In [None]:
def read_in_chunks(path: Path, chunk_size: int = 10000) -> Iterator[List[str]]:
    """Iterate over chunks of SMILES strings from an nCov-format file

    The SMILES string is taken to be the last comma-separated field of each line.

    Args:
        path: Path to the file
        chunk_size: Maximum number of molecules per chunk
    Yields:
        Lists of at most `chunk_size` SMILES strings. The final list may be
        shorter when the number of lines is not a multiple of `chunk_size`.
    """

    chunk = []  # Chunk currently being filled
    with path.open() as fp:
        for line in fp:  # Iterate over each line in the file
            smiles = line.split(",")[-1]
            chunk.append(smiles.strip())

            # If the chunk is big enough, yield it!
            if len(chunk) >= chunk_size:
                yield chunk
                chunk = []

    # Emit whatever is left over. A `return chunk` inside a generator only sets
    # StopIteration.value, which `for` loops ignore, so the
    # trailing molecules would otherwise be silently dropped.
    if chunk:
        yield chunk

In [None]:
def match_molecules(source_mols: List[str], keys_to_match: Set[str]) -> List[str]:
    """Report which of the target InChI keys occur among a batch of source molecules

    Args:
        source_mols: SMILES strings taken from a certain collection
        keys_to_match: InChI keys to look for among the source molecules
    Returns:
        The InChI key of every source molecule whose key is in `keys_to_match`,
        in the order encountered (one entry per matching source molecule)
    """

    matched_keys = []
    for candidate in source_mols:
        parsed = Chem.MolFromSmiles(candidate)

        # Only molecules that RDKit could parse are considered
        if parsed is not None:
            try:
                inchi_key = Chem.MolToInchiKey(parsed)
            except Chem.KekulizeException:
                # No key could be generated for this molecule; move on to the next
                continue

            if inchi_key in keys_to_match:
                matched_keys.append(inchi_key)

    return matched_keys

Get all of the existing molecules as InChI keys

In [None]:
# Fetch the 'key' field of the molecules in the database; these are the InChI keys
# that the collection files are matched against below
inchi_keys = mongo.get_molecules(output_key='key')
print(f'Found {len(inchi_keys)} molecules')

Loop over a few collections

In [None]:
# (file key, subset label) pairs: the key names the directory and CSV file under
# `ncov_path`, the label is the tag written to the `subsets` field in MongoDB
collections_to_tag = [('QM9', 'QM9'), ('PCH', 'PubChem'), ('ZIN', 'ZINC15')]

for coll_key, coll_name in collections_to_tag:
    # Report how many records already carry this label
    count_first = mongo.collection.count_documents({'subsets': {'$in': [coll_name]}})
    print(f'Found {count_first} in {coll_name} to start with')

    # Lazily stream the collection's CSV file as chunks of SMILES strings
    source_csv = ncov_path / coll_key / f'{coll_key}.csv'
    smiles_chunks = read_in_chunks(source_csv)

    # Hand the chunks to a pool of worker processes and gather every matched key
    matcher = partial(match_molecules, keys_to_match=inchi_keys)
    all_hits = set()
    with Pool(32) as workers:
        chunk_results = workers.imap_unordered(matcher, smiles_chunks)
        for chunk_hits in tqdm(chunk_results, desc=coll_key):
            all_hits |= set(chunk_hits)
    print(f'Matched {len(all_hits)} molecules to those in the {coll_key} file')

    # Label the matched records; $addToSet leaves records that already have the label unchanged
    tag_filter = {'key': {'$in': list(all_hits)}}
    result = mongo.collection.update_many(tag_filter, {'$addToSet': {'subsets': coll_name}})
    print(f'Updated {result.modified_count} of {result.matched_count} matched records')

## Match to Substructures
See which molecules in the database match a certain substructure

In [None]:
def matches_substruct(smiles: str, substruct: str) -> bool:
    """Determine whether a SMILES string matches a certain substructure

    Args:
        smiles: SMILES string to match
        substruct: Structure to match against, parsed as a SMARTS pattern
    Returns:
        Whether the molecule contains the substructure. ``False`` is returned
        for SMILES strings that RDKit cannot parse.
    Raises:
        ValueError: If `substruct` cannot be parsed as a SMARTS pattern
    """

    # RDKit signals a parse failure by returning None rather than raising
    smarts = Chem.MolFromSmarts(substruct)
    if smarts is None:
        raise ValueError(f'Invalid SMARTS pattern: {substruct}')

    # An unparsable molecule cannot match anything, so skip it instead of
    # crashing with an AttributeError on None (mirrors `match_molecules`)
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False

    return mol.HasSubstructMatch(smarts)

Get the SMILES strings from the database

In [None]:
# Fetch the 'identifier.smiles' field of the molecules in the database; these
# strings are tested against each substructure pattern below
smiles_strings = mongo.get_molecules(output_key='identifier.smiles')
print(f'Found {len(smiles_strings)} molecules')

Loop over a few different patterns

In [None]:
# (pattern, subset label) pairs: every molecule containing the pattern gets the label
substructure_tags = [
    ('O=C2c1ccccc1C(=O)N2', 'phthalimide'),
    ('c1cc1', 'cyclopropenium'),
    ('Oc1ccc(O)cc1', 'DMB'),
]

for pattern, name in substructure_tags:
    # Collect the SMILES strings that contain this pattern
    matches = []
    for candidate in smiles_strings:
        if matches_substruct(candidate, pattern):
            matches.append(candidate)

    # Label the matching records; $addToSet leaves records that already have the label unchanged
    tag_filter = {'identifier.smiles': {'$in': matches}}
    result = mongo.collection.update_many(tag_filter, {'$addToSet': {'subsets': name}})
    print(f'Updated {result.modified_count} of {result.matched_count} records that match {name}')

## Print out the Summary
List membership for molecules

In [None]:
# Count how many records belong to each subset: $unwind emits one document per
# entry of `subsets`, and $group then tallies the documents per subset label
list(mongo.collection.aggregate([
    {'$unwind': '$subsets'},
    {'$group': {
        '_id': '$subsets',
        'count': {'$sum': 1},
    }}
]))