# Scanning Freesolv for Heterocycles

We are going to import the ring_mols dict and only look at 1- and 2-ring systems.  Then we will make another dict where we separate by the number of members in the ring...

After this, the entries in the dict will be checked to make sure that they have the same number of heavy atoms.  Perturbations will be conducted on systems where (among the ring members) there is a different neighborhood list.

In [1]:
# get the freesolv database
import requests
from openeye import oechem
from perses.utils.openeye import createOEMolFromSMILES, createOEMolFromIUPAC
import tqdm

In [2]:
# Build a test molecule from its IUPAC name using the perses helper imported above.
mol = createOEMolFromIUPAC('naphthalene')

In [3]:
# Run OpenEye ring perception on mol; the call's return value is not used here.
oechem.OEFindRingAtomsAndBonds(mol)

In [4]:
# OEDetermineRingSystems yields a (count, per-atom list) pair: nrings is the
# number of ring systems, parts is indexed by atom index.
# NOTE(review): parts appears to hold 0 for atoms outside any ring and the
# 1-based ring-system id otherwise -- confirm against the OEChem docs.
nrings, parts = oechem.OEDetermineRingSystems(mol)

In [5]:
# Parse a SMILES with two ring systems joined by a single bond, plus a
# carboxylic acid substituent, into a fresh molecule.
mol = oechem.OEGraphMol()
oechem.OESmilesToMol(mol, "C(O)(=O)c1cccc2c1[nH]c(C3CCCc4c3cccc4)c2")

# NOTE(review): despite its name, nraromsystems is produced by
# OEDetermineRingSystems, so it counts every ring system rather than only the
# aromatic ones -- confirm whether OEDetermineAromaticRingSystems was intended.
nraromsystems, parts = oechem.OEDetermineRingSystems(mol)

# Bare expression so the notebook displays the (count, per-atom list) pair.
(nraromsystems, parts)

(2, [0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1])

In [6]:
# Report which atoms fall outside every ring system (parts value 0) and which
# atoms belong to each numbered ring system.
# NOTE(review): parts comes from OEDetermineRingSystems, so the "Aliphatic" /
# "aromatic" wording in the labels really means "non-ring" / "ring system";
# the labels are kept verbatim so the recorded output stays valid.
non_ring_ids = [atom.GetIdx() for atom in mol.GetAtoms() if parts[atom.GetIdx()] == 0]
print("Aliphatic atoms:", *non_ring_ids, end=" \n")

print("Number of aromatic ring systems =", nraromsystems)

for ringidx in range(1, nraromsystems + 1):
    member_ids = [atom.GetIdx() for atom in mol.GetAtoms() if parts[atom.GetIdx()] == ringidx]
    # The trailing " \n" reproduces the space-after-last-index layout.
    print(ringidx, ". aromatic ring system:", *member_ids, end=" \n")

Aliphatic atoms: 0 1 2 
Number of aromatic ring systems = 2
1 . aromatic ring system: 3 4 5 6 7 8 9 10 21 
2 . aromatic ring system: 11 12 13 14 15 16 17 18 19 20 


In [7]:
# Rebind mol to naphthalene (built from its IUPAC name) for the per-atom ring-size check.
mol = createOEMolFromIUPAC('naphthalene')

In [8]:
# For every atom: flag ring membership, then report the size of the smallest
# ring containing it (a size of 0 is reported as acyclic).
for atom in mol.GetAtoms():
    if atom.IsInRing():
        print('yes')

    smallest = oechem.OEAtomGetSmallestRingSize(atom)
    if smallest != 0:
        print(atom.GetIdx(), "smallest ring size=", smallest)
        continue
    print(atom.GetIdx(), "acyclic")

yes
0 smallest ring size= 6
yes
1 smallest ring size= 6
yes
2 smallest ring size= 6
yes
3 smallest ring size= 6
yes
4 smallest ring size= 6
yes
5 smallest ring size= 6
yes
6 smallest ring size= 6
yes
7 smallest ring size= 6
yes
8 smallest ring size= 6
yes
9 smallest ring size= 6
10 acyclic
11 acyclic
12 acyclic
13 acyclic
14 acyclic
15 acyclic
16 acyclic
17 acyclic


In [9]:
def check_for_same_ring_structure(molA, molB):
    """
    Decide whether two molecules share a ring architecture but differ in ring elements.

    -argument: molA, molB -- molecules exposing GetAtoms(); each atom is passed to
               oechem.OEAtomGetSmallestRingSize and queried via IsInRing()/GetAtomicNum()
    -return: bool, True only if (1) the sorted non-zero smallest-ring sizes of the two
             molecules are equal and (2) the sorted atomic numbers of their ring atoms
             have equal length but are NOT identical; False in every other case
             (including identical ring compositions)
    """

    def _ring_sizes(mol):
        # Smallest-ring size of every atom, keeping only non-zero sizes.
        all_sizes = (oechem.OEAtomGetSmallestRingSize(atom) for atom in mol.GetAtoms())
        return sorted(size for size in all_sizes if size != 0)

    def _ring_elements(mol):
        # Atomic numbers of the atoms flagged as ring members.
        return sorted(atom.GetAtomicNum() for atom in mol.GetAtoms() if atom.IsInRing())

    # Different ring-size profiles mean the ring architectures cannot match.
    if _ring_sizes(molA) != _ring_sizes(molB):
        return False

    elementsA, elementsB = _ring_elements(molA), _ring_elements(molB)

    # Same number of ring atoms, but at least one element differs.
    return len(elementsA) == len(elementsB) and elementsA != elementsB
        
        
    
    

In [16]:
def scan_for_heterocycles(combinations):
    """
    Keep the candidate SMILES pairs accepted by check_for_same_ring_structure.

    -argument: iterable of SMILES pairs (each indexable as pair[0], pair[1]) of potential matches
    -return: list of [smileA, smileB] lists that have the same ring system but differ by
             1 or more constituent element; a pair is skipped when either SMILES
             fails to parse
    """
    subs_list = []

    # Iterate the pairs directly; tqdm still shows progress for sized inputs.
    for pair in tqdm.tqdm(combinations):
        smileA, smileB = pair[0], pair[1]

        # Fresh molecules for every pair, so no atoms from a previously parsed
        # SMILES can leak into the current comparison.
        molA, molB = oechem.OEGraphMol(), oechem.OEGraphMol()

        # OESmilesToMol signals a parse failure through its return value;
        # comparing a molecule built from an unparsable SMILES is meaningless.
        parsed = oechem.OESmilesToMol(molA, smileA) and oechem.OESmilesToMol(molB, smileB)
        if not parsed:
            continue

        if check_for_same_ring_structure(molA, molB):
            subs_list.append([smileA, smileB])
    return subs_list
            
        
        

In [17]:
import pickle

import numpy as np
from itertools import combinations

# ring_mols.pkl is a plain pickle, so read it with pickle directly: np.load
# refuses pickled payloads unless allow_pickle=True is passed.
# NOTE: unpickling executes arbitrary code -- only load files you produced yourself.
with open('ring_mols.pkl', 'rb') as handle:
    ring_mols = pickle.load(handle)

In [18]:
# Map each ring_mols key (except key 0) to the list of SMILES pairs accepted by
# scan_for_heterocycles.
heterocycle_subs = {}
for group_key, smiles_group in ring_mols.items():
    # Key 0 is skipped entirely (presumably the molecules without rings -- verify).
    if group_key == 0:
        continue

    # Drop any SMILES containing '@', '+' or '-'
    # (presumably stereo-annotated or charged species -- verify).
    plain_smiles = [
        smiles
        for smiles in smiles_group
        if all(token not in smiles for token in ['@', '+', '-'])
    ]

    # Every unordered pair of the remaining SMILES is a candidate.
    candidate_pairs = list(combinations(plain_smiles, 2))
    heterocycle_subs[group_key] = scan_for_heterocycles(candidate_pairs)
    

100%|██████████████████████████████████████████████████████████████████████████| 32896/32896 [00:05<00:00, 6352.90it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 210/210 [00:00<00:00, 3239.56it/s]


In [19]:
# Bare expression so the notebook displays the key -> list-of-SMILES-pairs mapping.
heterocycle_subs


{1: [[' C1CCC(CC1)N', ' Cc1cc(cnc1)C'],
  [' C1CCC(CC1)N', ' Cc1ccncc1'],
  [' C1CCC(CC1)N', ' C1COCCN1'],
  [' C1CCC(CC1)N', ' Cc1ccncc1C'],
  [' C1CCC(CC1)N', ' CC(C)Cc1cnccn1'],
  [' C1CCC(CC1)N', ' c1c(c(=O)[nH]c(=O)[nH]1)I'],
  [' C1CCC(CC1)N', ' CN1CCNCC1'],
  [' C1CCC(CC1)N', ' c1cc(cnc1)Cl'],
  [' C1CCC(CC1)N', ' c1ccncc1'],
  [' C1CCC(CC1)N', ' CCNc1nc(nc(n1)SC)NC(C)C'],
  [' C1CCC(CC1)N', ' Cc1c(c(=O)n(c(=O)[nH]1)C(C)(C)C)Cl'],
  [' C1CCC(CC1)N', ' Cc1c(nc(nc1OC(=O)N(C)C)N(C)C)C'],
  [' C1CCC(CC1)N', ' c1c(c(=O)[nH]c(=O)[nH]1)F'],
  [' C1CCC(CC1)N', ' C1CCOCC1'],
  [' C1CCC(CC1)N', ' c1c(c(=O)[nH]c(=O)[nH]1)Br'],
  [' C1CCC(CC1)N', ' CN1CCN(CC1)C'],
  [' C1CCC(CC1)N', ' Cc1ccccn1'],
  [' C1CCC(CC1)N', ' c1ccnc(c1)Cl'],
  [' C1CCC(CC1)N', ' c1c(=O)[nH]c(=O)[nH]c1Cl'],
  [' C1CCC(CC1)N', ' Cc1cccc(n1)C'],
  [' C1CCC(CC1)N', ' CCc1cccnc1'],
  [' C1CCC(CC1)N', ' c1cnccc1C#N'],
  [' C1CCC(CC1)N', ' CCc1cnccn1'],
  [' C1CCC(CC1)N', ' CCOP(=S)(OCC)Oc1cc(nc(n1)C(C)C)C'],
  [' C1CCC(C

In [20]:
# Print how many accepted SMILES pairs were collected under each key.
for accepted_pairs in heterocycle_subs.values():
    print(len(accepted_pairs))

7697
19


In [21]:
import pickle
# Persist the key -> SMILES-pair mapping to heterocycle_subs.pkl; the file is
# opened in binary mode because pickle writes bytes.
with open('heterocycle_subs.pkl', 'wb') as handle:
    pickle.dump(heterocycle_subs, handle)