In [61]:
from __future__ import print_function

import glob
import gzip
import os

try:
    import cPickle  # Python 2
except ImportError:
    import pickle as cPickle  # Python 3 has no cPickle module; keep the same name

from rdkit import Chem
from rdkit.Chem import AllChem, Lipinski

In [33]:
# Per-class collections of canonical SMILES, filled by the scan below.
# The lists are (re)created here so that re-running this cell starts clean.
alkene = []
alkyne = []
alcohol_1 = []   # subset of `alcohol` that also matches s_alcohol_1
alcohol = []
ald_ket = []     # aldehydes and ketones share one bucket
amine_1 = []
acid = []
ester = []
amide_1 = []     # subset of `amide` that also matches s_amide_1
amide = []
nitrile = []
halide = []

# SMARTS queries for each functional group.
# `[c,C&!H0]` = aromatic carbon, or aliphatic carbon carrying at least one H;
# it is used on the neighbouring carbon to exclude sterically hindered
# structures (e.g. neopentyl, tertiary carbon).
s_alkene = Chem.MolFromSmarts('[CX3]=[CX3]')
s_alkyne = Chem.MolFromSmarts('[CX2]#[CX2]')
s_alcohol = Chem.MolFromSmarts('[OX2H1][CX4]')
s_alcohol_1 = Chem.MolFromSmarts('[OX2H1][CX4H2][c,C&!H0]')
s_ketone = Chem.MolFromSmarts('[#6][CX3](=O)[#6]')
s_aldehyde = Chem.MolFromSmarts('[CX3H1](=O)[#6]')
s_amine_1 = Chem.MolFromSmarts('[NX3H2][CX4H2][c,C&!H0]') # primary amine, N linked to primary carbon
s_acid = Chem.MolFromSmarts('[CX3](=O)[OX2H1]')
s_ester = Chem.MolFromSmarts('[c,C&!H0][CX3](=O)[OX2H0][c,C&!H0]')
s_amide = Chem.MolFromSmarts('[NX3][CX3](=[OX1])[c,C&!H0]')
s_amide_1 = Chem.MolFromSmarts('[NX3H2][CX3](=[OX1])[c,C&!H0]')
s_nitrile = Chem.MolFromSmarts('[NX1]#[CX2][c,C&!H0]')
s_halide = Chem.MolFromSmarts('[CX4][F,Cl,Br,I]') # but only fluoride in GDB


# Scan every .smi file in data/gdb11/. The glob itself excludes nothing:
# size 11 is left out only if its file is absent from that directory.
# Paths are sorted so the order of the collected lists (and of the pickles
# written from them) is reproducible rather than filesystem-dependent.
for smi_path in sorted(glob.glob("data/gdb11/*.smi")):
    print(smi_path)
    with open(smi_path) as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue  # blank line: nothing to parse

            mol = Chem.MolFromSmiles(fields[0])
            if mol is None:
                # RDKit returns None for a SMILES it cannot parse;
                # skip the record instead of crashing in MolToSmiles.
                continue

            smi = Chem.MolToSmiles(mol)
            cnt_hetatm = Lipinski.NumHeteroatoms(mol)

            # Number of aliphatic C=C and C#C bonds found in the molecule.
            double = len(mol.GetSubstructMatches(s_alkene))
            triple = len(mol.GetSubstructMatches(s_alkyne))

            if double == 0 and triple == 0:
                # No C=C / C#C: classify by heteroatom count.
                # The first matching class wins within each chain.
                if cnt_hetatm == 2:
                    if mol.HasSubstructMatch(s_acid):
                        acid.append(smi)
                    elif mol.HasSubstructMatch(s_ester):
                        ester.append(smi)
                    elif mol.HasSubstructMatch(s_amide):
                        if mol.HasSubstructMatch(s_amide_1):
                            amide_1.append(smi)
                        amide.append(smi)

                elif cnt_hetatm == 1:
                    if mol.HasSubstructMatch(s_alcohol):
                        if mol.HasSubstructMatch(s_alcohol_1):
                            alcohol_1.append(smi)
                        alcohol.append(smi)
                    elif (mol.HasSubstructMatch(s_aldehyde)
                          or mol.HasSubstructMatch(s_ketone)):
                        ald_ket.append(smi)
                    elif mol.HasSubstructMatch(s_amine_1):
                        amine_1.append(smi)
                    elif mol.HasSubstructMatch(s_nitrile):
                        nitrile.append(smi)
                    elif mol.HasSubstructMatch(s_halide):
                        halide.append(smi)

            elif cnt_hetatm == 0:
                # Pure hydrocarbons: keep only those with exactly one
                # C=C (and no C#C) or exactly one C#C (and no C=C).
                if double == 1 and triple == 0:
                    alkene.append(smi)
                elif double == 0 and triple == 1:
                    alkyne.append(smi)

data/gdb11/gdb11_size05.smi
data/gdb11/gdb11_size02.smi
data/gdb11/gdb11_size07.smi
data/gdb11/gdb11_size03.smi
data/gdb11/gdb11_size04.smi
data/gdb11/gdb11_size08.smi
data/gdb11/gdb11_size10.smi
data/gdb11/gdb11_size06.smi
data/gdb11/gdb11_size01.smi
data/gdb11/gdb11_size09.smi


In [34]:
# Map each functional-group label to the list of SMILES collected for it.
# The keys double as file-name stems when the lists are written to disk.
subst_dict = {
    'alkene': alkene,
    'alkyne': alkyne,
    'alcohol_1': alcohol_1,
    'alcohol': alcohol,
    'ald_ket': ald_ket,
    'amine_1': amine_1,
    'acid': acid,
    'ester': ester,
    'amide_1': amide_1,
    'amide': amide,
    'nitrile': nitrile, 
    'halide': halide
}

# Report how many structures landed in each class.
# `.items()` is used instead of the Python-2-only `.iteritems()` so the
# cell runs unchanged on both Python 2 and Python 3.
for name, subst in subst_dict.items():
    print(name, len(subst), sep='     \t')

alkene     	7781
ald_ket     	3398
alcohol_1     	1026
nitrile     	997
acid     	357
amide_1     	274
alcohol     	6097
amide     	2479
alkyne     	1862
halide     	6097
ester     	1036
amine_1     	1026


In [60]:
# Write each class to data/subst/<name>.pkl.gz as a gzip-compressed pickle.
# `gzip`, `os` and `cPickle` come from the import cell at the top.
subst_dir = 'data/subst'
if not os.path.isdir(subst_dir):
    # Create the output directory on a fresh checkout instead of failing
    # inside gzip.open.
    os.makedirs(subst_dir)

for name, subst in subst_dict.items():
    with gzip.open('data/subst/'+name+'.pkl.gz', 'wb') as f:
        # Pickle protocol 2 is passed explicitly, as in the original cell.
        cPickle.dump(subst, f, 2)

In [73]:
# Atom-swap reactions that turn a mapped fluorine into Cl, Br or I.
_CHLORO = AllChem.ReactionFromSmarts('[F:1]>>[Cl:1]')
_BROMO = AllChem.ReactionFromSmarts('[F:1]>>[Br:1]')
_IODO = AllChem.ReactionFromSmarts('[F:1]>>[I:1]')

# F on a CH2 whose neighbour is an aromatic carbon or an aliphatic carbon
# with at least one H. Compiled once here rather than on every iteration.
s_halide_1 = Chem.MolFromSmarts('[F][CX4H2][c,C&!H0]')

halide_1 = []

for halide_smi in halide:
    fluoride = Chem.MolFromSmiles(halide_smi)
    if not fluoride.HasSubstructMatch(s_halide_1):
        continue

    # Emit the Cl, Br and I analogues, in that order, for each fluoride kept.
    # [0][0] takes the first product of the first product set.
    # NOTE(review): RunReactants products are not sanitized by RDKit; the
    # SMILES is written from the raw product, as in the original cell.
    for swap in (_CHLORO, _BROMO, _IODO):
        product = swap.RunReactants((fluoride,))[0][0]
        halide_1.append(Chem.MolToSmiles(product))

print(len(halide_1))

3078


In [74]:
# Store the generated Cl/Br/I SMILES list as a gzip-compressed pickle
# (protocol 2).
with gzip.open('data/subst/halide_1.pkl.gz', 'wb') as halide_out:
    cPickle.dump(halide_1, halide_out, 2)