In [1]:
# get the freesolv database
import requests
from openeye import oechem
from perses.utils.openeye import createOEMolFromSMILES, createOEMolFromIUPAC
import tqdm


In [2]:
def separate_cycles(smiles_list, nrings = [0,1,2]):
    from openeye import oechem
    """
    -input list is a list of smiles
    -outputs a dictionary with keys as nrings and the smiles associated with each
    """
    ring_mols = {i:[] for i in nrings}
    for item_index in tqdm.trange(len(smiles_list)):
        mol = oechem.OEGraphMol()
        smile = smiles_list[item_index]
        oechem.OESmilesToMol(mol, smile)
        
        num_rings, parts = oechem.OEDetermineRingSystems(mol)
        
        try:
            ring_mols[num_rings].append(smile)
        except:
            print(f"{smile} does not live in any of the {nrings} ring systems.  Skipping...")
    
    return ring_mols 
            

In [4]:
def freesolv_to_smiles(url = 'https://raw.githubusercontent.com/MobleyLab/FreeSolv/master/database.txt'):
    """
    the following will turn the freesolv dataset (given url) into a smiles list (tuples with (smile, iupac))
    """
    smiles_list = []
    whole_dataset = requests.get(url).text.split('\n')[:-1]
    whole_dataset = [i for i in whole_dataset if i[0] != '#']
    for item_index in tqdm.trange(len(whole_dataset)):
        line = whole_dataset[item_index]
        details = line.split(';')
        smile = details[1]
        smiles_list.append(smile)
    
    return smiles_list
        

Begin parsing data and organizing

In [5]:
smiles_list = freesolv_to_smiles()

100%|████████████████████████████████████████████████████████████████████████████| 642/642 [00:00<00:00, 672849.37it/s]


In [6]:
ring_mols = separate_cycles(smiles_list)

100%|█████████████████████████████████████████████████████████████████████████████| 642/642 [00:00<00:00, 17399.37it/s]


begin the process of finding all halide matches

In [7]:
def natoms_nbonds_bool(molA, molB, check_halides = True, check_connectivity = True, hydrogens_halides = [1,9,17,35,53]):
    """
    given two smiles, the function will determine whether the molecules share the same number of molecules and bonds.
    if check_halides == True, the function will then check whether the molecules share the same number of elements 
    (minus hydrogens and halides)
    """
    #first check
    if not (molA.NumAtoms() == molB.NumAtoms() and molA.NumBonds() == molB.NumBonds()):
        return False
    
    if check_connectivity:
        #will check whether the elements are connected in the right order...
        molA_connectivity = [sorted([bond.GetBgn().GetAtomicNum(), bond.GetEnd().GetAtomicNum()]) for bond in molA.GetBonds()]
        molB_connectivity = [sorted([bond.GetBgn().GetAtomicNum(), bond.GetEnd().GetAtomicNum()]) for bond in molB.GetBonds()]

        molA_connectivity = [i for i in molA_connectivity if not bool(set(i).intersection(hydrogens_halides))]
        molB_connectivity = [i for i in molB_connectivity if not bool(set(i).intersection(hydrogens_halides))]
        if len(molA_connectivity) != len(molB_connectivity):
            return False
        if not all(a in molB_connectivity for a in molA_connectivity):
            return False
    
#     elif check_halides:
#         atomic_numsA = [atom.GetAtomicNum() for atom in molA.GetAtoms() if atom.GetAtomicNum() not in hydrogens_halides]
#         atomic_numsB = [atom.GetAtomicNum() for atom in molB.GetAtoms() if atom.GetAtomicNum() not in hydrogens_halides]
#         if sorted(atomic_numsA) != sorted(atomic_numsB):
    return True
        
        
    

In [8]:
def molecule_neighbors(mol, neglect_atoms = [1,9,17,35,53]):
    """
    -given a mol, the function will loop through every atom in the molecule and return a list of neighbor atomic numbers
    (neglecting hydrogens and halides if specified).  the loop will neglect hydrogens and halides if specified.
    -returns a list of list of atom atomic numbers
    """
    atoms = mol.GetAtoms()
    neighbors_list = []
    for atom in atoms:
        if atom.GetAtomicNum() in neglect_atoms:
            continue
        neighbors = atom.GetAtoms()
        neighbor_atomic_nums = [atom.GetAtomicNum() for atom in neighbors if atom.GetAtomicNum() not in neglect_atoms]
        neighbors_list.append(sorted(neighbor_atomic_nums))
        
    return neighbors_list

In [9]:
def remove_common_elements(a, b):
    new_a, new_b = [], []
    #we iteratively remove the common elements between two lists a and b, returning the resultant lists
    for element in a:
        if element not in b:
            new_a.append(element)
        else:
            b.remove(element)
    return new_a, b



In [10]:
def check_halide_substitution(molA, molB, check_substitution = [1,9,17,35,53]):
    """
    -will check a tuple for whether it has the given substitution
    -bool
    """
    
    #the first thing we have to do is check that the molecules have the same number of atoms and bonds
    #and that the connectivity is the same (with the exception of the given substitution)
    if natoms_nbonds_bool(molA, molB, check_halides = True, check_connectivity = True, hydrogens_halides = check_substitution):
        
        #now we can go deeper and check if both molecules share the neighbors lists (with exceptions)
        neighbors_listA, neighbors_listB = molecule_neighbors(molA, neglect_atoms = check_substitution), molecule_neighbors(molB, neglect_atoms = check_substitution)
        new_a, new_b = remove_common_elements(neighbors_listA, neighbors_listB)
        if new_a == [] and new_b == []:
            return True
    return False
    

we need to separate the 0 ring list (from the ring_mols dict) into another list of carbons


In [11]:
def separate_by_ncarbons(smiles_list):
    """
    -the following function will separate a smiles list into a dict of keys = number_of_carbons
    """
    ncarbons_dict = {} 
    mol = oechem.OEGraphMol()
    for smile in smiles_list:
        oechem.OESmilesToMol(mol, smile)
        atoms = mol.GetAtoms()
        carbons = 0
        for atom in atoms:
            if atom.IsCarbon():
                carbons += 1
        
        if carbons not in ncarbons_dict.keys():
            ncarbons_dict[carbons] = [smile]
        else:
            ncarbons_dict[carbons].append(smile)
    return ncarbons_dict
        
        
        

In [12]:
chain_dict = separate_by_ncarbons(ring_mols[0])

now we can start (i think)


In [13]:
def run(smiles_list):
    """
    the following function will execute the smiles list into combinations of 2 and return a list of tuples (of oemols) as 
    viable halide jumps...
    """
    from itertools import combinations
    from perses.utils import openeye
    
    molA, molB = oechem.OEGraphMol(), oechem.OEGraphMol()

    substitution_smiles = []
    combos = list(combinations(smiles_list, 2))
    
    for i in tqdm.trange(len(combos)):
        smileA, smileB = combos[i][0], combos[i][1]
        oechem.OESmilesToMol(molA, smileA)
        oechem.OESmilesToMol(molB, smileB)
        oechem.OEAssignAromaticFlags(molA, oechem.OEAroModelOpenEye); oechem.OEAddExplicitHydrogens(molA)
        oechem.OEAssignAromaticFlags(molB, oechem.OEAroModelOpenEye); oechem.OEAddExplicitHydrogens(molB)

        halogens = 0
        for atom in molA.GetAtoms():
            if atom.IsHalogen():
                halogens += 1
        for atom in molA.GetAtoms():
            if atom.IsHalogen():
                halogens += 1

        if halogens == 0:
            continue

        if check_halide_substitution(molA, molB, check_substitution = [1,9,17,35,53]):
            substitution_smiles.append((molA, molB))
    
    return substitution_smiles
    
    
    

In [14]:
#make a combo dict for the chain_dict
chain_dict_combinations = {}
for key, value in chain_dict.items():
    comb_list = run(value)
    chain_dict_combinations[key] = comb_list
    
    

100%|██████████████████████████████████████████████████████████████████████████████| 378/378 [00:00<00:00, 5670.74it/s]
100%|███████████████████████████████████████████████████████████████████████████| 1035/1035 [00:00<00:00, 11530.83it/s]
100%|███████████████████████████████████████████████████████████████████████████| 1081/1081 [00:00<00:00, 13137.32it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 171/171 [00:00<00:00, 7143.61it/s]
100%|███████████████████████████████████████████████████████████████████████████| 1035/1035 [00:00<00:00, 12985.77it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 703/703 [00:00<00:00, 10673.34it/s]
100%|████████████████████████████████████████████████████████████████████████████| 1275/1275 [00:00<00:00, 9279.66it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 28/28 [00:00<00:00, 4679.09it/s]
100%|███████████████████████████████████

In [15]:
import pickle
with open('carbon_chain_halide_subs.pkl', 'wb') as handle:
    pickle.dump(chain_dict_combinations, handle)
    

In [16]:
#make a combo dict for the 1 and 2 ring systems
ring_dict_combinations = {}
for key, value in ring_mols.items():
    if key != 0:
        comb_list = run(value)
        ring_dict_combinations[key] = comb_list

100%|██████████████████████████████████████████████████████████████████████████| 44253/44253 [00:05<00:00, 7859.80it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 276/276 [00:00<00:00, 3179.02it/s]


In [17]:
with open('ring_halide_subs.pkl', 'wb') as handle:
    pickle.dump(ring_dict_combinations, handle)

We now want to depict all of the molecules...


In [20]:
chain_dict_combo_lengths = [(key, len(value)) for key, value in chain_dict_combinations.items()]

In [22]:
chain_dict_combo_lengths

[(7, 6),
 (4, 13),
 (2, 178),
 (8, 0),
 (3, 38),
 (5, 7),
 (6, 5),
 (10, 0),
 (9, 0),
 (1, 104),
 (0, 0),
 (12, 0),
 (11, 0)]

In [23]:
ring_dict_combo_lengths = [(key, len(value)) for key, value in ring_dict_combinations.items()]

In [24]:
ring_dict_combo_lengths 

[(1, 229), (2, 79)]