<h1>Extract SMARTS</h1>

In [1]:
import pandas as pd
import numpy as np
import pymongo
import sys
import os
from __future__ import print_function
from datetime import datetime

TOP = '/'.join(os.getcwd().split('/')[:-2])+'/'
LIB = TOP+'lib'
if not LIB in sys.path: 
    sys.path.insert(0,LIB)

DAT_DIR = TOP + 'data/'
FIG_DIR = TOP + 'figs/'

if not os.path.exists(DAT_DIR): os.mkdir(DAT_DIR)
if not os.path.exists(FIG_DIR): os.mkdir(FIG_DIR)
    
from db.mongo import *

from rax.genrapred import *
import db.etl as etl
from db.fpsim import *
from rdkit import Chem

import pymongo
mongocon=pymongo.MongoClient("mongodb://ghelman:ghelman@pb.epa.gov/genra_dev_v4")
DB=mongocon['genra_dev_v4']
dsstox=DB['compound']
physprop=DB['physprop']

In [2]:
with open(DAT_DIR + 'epa_categories.xml','r') as f:
    xml=f.read()

In [3]:
xml=xml.replace('\n','')

In [4]:
xml

'<Scheme z:Id="1" xmlns="http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine" xmlns:i="http://www.w3.org/2001/XMLSchema-instance" xmlns:z="http://schemas.microsoft.com/2003/10/Serialization/"><CounterProfile z:Id="2">Not categorized</CounterProfile><Credits z:Id="3"><Adopted z:Id="4">QSAR Toolbox 2.0 beta, April 2010</Adopted><Author z:Id="5"/><Changelog z:Id="6">{\\rtf1\\ansi\\ansicpg1252\\uc1\\htmautsp\\deff2{\\fonttbl{\\f0\\fcharset0 Times New Roman;}{\\f2\\fcharset0 Segoe UI;}{\\f3\\fcharset0 Calibri;}}{\\colortbl\\red0\\green0\\blue0;\\red255\\green255\\blue255;}\\loch\\hich\\dbch\\pard\\plain\\ltrpar\\itap0{\\lang1033\\fs22\\f3\\cf0 \\cf0\\ql{\\f3 {\\ltrch SMARTS language for describing molecular patterns, i.e. structural boundaries, structural alerts has been implemented in OECD QSAR Toolbox 4.0.\\~ As a result "US-EPA New Chemical Categories" profiler has been rewritten but without modifying the knowledge and/or the logic it is based on. Only small distinctions are exp

In [5]:
import xml.etree.ElementTree as ET
e=ET.parse(DAT_DIR+'epa_categories.xml').getroot()

In [6]:
parent_map = {c:p for p in e.iter() for c in p}

In [7]:
import operator as op
op_dict={
    'GreaterThan': op.gt,
    'GreaterThanOrEqualTo': op.ge,
    'LessThan': op.lt,
    'LessThanOrEqualTo': op.le
}
prop_dict={
    'log Kow':'logp',
    'Molecular Weight':'mol_weight',
    'Molecular weight':'mol_weight',
    'Water Solubility': 'ws'
}

In [8]:
def define_smart_match(smart):
    pattern=Chem.MolFromSmarts(smart)
    if not pattern:
        return None
    def smart_match(x):
        mol=x['mol']
        ret=True if mol.GetSubstructMatches(pattern) else False
        return ret
    return smart_match
def define_compare(prop,operand,value):
    def compare(x):
        ret = op_dict[operand](x[prop_dict[prop]],value)
        return ret
    return compare

In [9]:
class Query:
    
    def __init__(self,xml,qid=None):
        self.xml=xml
        self.id=qid
        self.logic=None
        self.subqueries=[]
        self.category=None
        
    def write_query(self,qtype,tree):
        self.type=qtype
        if qtype=='b:StructureQuery':
            qstring=self.xml.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}ComplexSearch').text
            qstring=re.sub('false','False',qstring)
            qstring=re.sub('true','True',qstring)
            qdict=ast.literal_eval(qstring)
            smart=qdict['queries'][0]['smart']
            self.smart=smart
            if '[Ch3,#1]' in self.smart:
                split=re.search(r'(.*)\[([^\(\)]*),([^\(\)].*)\]$',self.smart)
                split1=split.group(1)
                split2=split.group(1)+'['+split.group(2)+']'
                smart_match1=define_smart_match(split1)
                smart_match2=define_smart_match(split2)
                def smart_match(x):
                    return any([smart_match1(x),smart_match2(x)])
            else:
                smart_match=define_smart_match(smart)
            self.query=smart_match
        elif qtype=='b:ParameterQuery':
            self.operand=self.xml.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}Operand').text
            self.prop=self.xml.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}ParameterName').text
            self.value=float(self.xml.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}Value').text)
            compare=define_compare(self.prop,self.operand,self.value)
            self.query=compare
        elif qtype=='LogicalQuery':
            self.logic=self.xml.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Logic').text
            elements=self.xml.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Elements')
            node_ids=[elem.attrib['{http://schemas.microsoft.com/2003/10/Serialization/}Ref']\
                      for elem in elements.findall('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Query')\
                      if '{http://schemas.microsoft.com/2003/10/Serialization/}Ref' in elem.attrib]
            if self.logic=='Not':
                node_id=node_ids[0] #Should only be one
                sq=tree[node_id]
                self.subqueries=[sq]
                def func(x):
                    return not(sq.query(x))
                self.query=func
            elif self.logic=='And':
                sqs=[tree[node_id] for node_id in node_ids]
                self.subqueries=sqs
                def func(x):
                    return all([sq.query(x) for sq in self.subqueries])
                self.query=func
            else:
                sqs=[tree[node_id] for node_id in node_ids]
                self.subqueries=sqs
                for orquery in elements.findall('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Query'):
                    if '{http://www.w3.org/2001/XMLSchema-instance}type' in orquery.attrib:
                        extra_sq=Query(orquery)
                        extra_sq.write_query('b:StructureQuery',tree)
                        sqs.append(extra_sq)      
                def func(x):
                    return any([sq.query(x) for sq in self.subqueries])
                self.query=func
    
    def print_tree(self,x,tabs=0):
        qinfo=(self.id,self.type)
        if self.type=='b:StructureQuery':
            qinfo=qinfo+(self.smart,)
        elif self.type=='b:ParameterQuery':
            qinfo=qinfo+(self.prop,self.value,self.operand)
        elif self.type=='LogicalQuery':
            qinfo=qinfo+(self.logic,)
        try:
            qinfo=qinfo+(self.query(x),)
        except:
            qinfo=qinfo+('does not process',)
        print('\t'*tabs+str(qinfo))
        for sq in self.subqueries:
            sq.print_tree(x,tabs+1)

In [10]:
all_tests={}
bad_smarts=set()
bad_cats=set()
import re
import ast
for elem in e.iter('{http://schemas.microsoft.com/2003/10/Serialization/Arrays}anyType'):
    category=elem.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Caption').text
    queries=elem.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Expression')\
        .find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Queries')\
        .findall('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Query')
    contents=[query.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Content') for query in queries]
    query_tree={}
    for query in contents:
        attributes=query.attrib
        if '{http://schemas.microsoft.com/2003/10/Serialization/}Id' not in attributes:
            continue
        query_id=attributes['{http://schemas.microsoft.com/2003/10/Serialization/}Id']
        query_type=attributes['{http://www.w3.org/2001/XMLSchema-instance}type']
        q=Query(query,query_id)
        q.category=category
        q.write_query(query_type,query_tree)
        if not q.query or not all([sq.query for sq in q.subqueries]): #Smarts did not compile, sqs needed bc of hidden sqs in or queries
            bad_cats.add(category)
            if q.type=='b:StructureQuery':
                bad_smarts.add(q.smart)
        query_tree[query_id]=q
    all_tests[category]=query_tree[query_id] #Final one should always be the top level query hopefully

In [11]:
bad_cats

{'Aliphatic Amines',
 'Alkoxysilanes',
 'Aminobenzothiazole Azo Dyes',
 'Anionic Surfactants',
 'Dianilines',
 'Dithiocarbamates (Acute toxicity)',
 'Dithiocarbamates (Chronic toxicity)',
 'Ethylene Glycol Ethers',
 'Neutral Organics',
 'Nonionic Surfactants',
 'Organotins (Acute toxicity)',
 'Organotins (Chronic toxicity)',
 'Persistent, Bioaccumulative and Toxic (PBT) Chemicals',
 'Polynitroaromatics (Acute toxicity)',
 'Polynitroaromatics (Chronic toxicity)',
 'Substituted Triazines (Acute toxicity)',
 'Substituted Triazines (Chronic toxicity)',
 'Triarylmethane Pigments/Dyes with Non-solubilizing Groups',
 'beta-Naphthylamines, Sulfonated'}

In [238]:
tests=all_tests.copy()
for category in bad_cats:
    del tests[category]

In [184]:
# import dill
# with open(DAT_DIR+'tests.pkl','w') as f:
#     dill.dump(tests,f)

<h1>Test on OECD_NCC_ToxVal</h1>

In [62]:
tv=pd.read_excel(DAT_DIR+'toxval_epa_categories.xlsx')
len(tv)

11299

In [63]:
casns=list(tv['CAS Number'])
toxval_sids=dsstox.find({'casrn':{'$in':casns}})
sids_dict={record['casrn']:record['dsstox_sid'] for record in toxval_sids}
sids=sids_dict.values()
toxval_logp=physprop.find({'dsstox_sid':{'$in':sids}},{'_id':0,'dsstox_sid':1,'predicted_props.OPERA_LogP':1})
logp_dict={record['dsstox_sid']:record.get('predicted_props',{})['OPERA_LogP'][0] for record in toxval_logp \
           if 'OPERA_LogP' in record.get('predicted_props',{}) and record.get('dsstox_sid',None)}
toxval_ws=physprop.find({'dsstox_sid':{'$in':sids}},{'_id':0,'dsstox_sid':1,'predicted_props.OPERA_WS':1})
ws_dict={record['dsstox_sid']:record.get('predicted_props',{})['OPERA_WS'][0] for record in toxval_ws \
           if 'OPERA_WS' in record.get('predicted_props',{}) and record.get('dsstox_sid',None)}
toxval_weight=dsstox.find({'dsstox_sid':{'$in':sids}})
weight_dict={record['dsstox_sid']:record['mol_weight'] for record in toxval_weight}

In [64]:
tv['dsstox_sid']=tv['CAS Number'].map(sids_dict)
tv['logp']=tv['dsstox_sid'].map(logp_dict)
tv['ws']=tv['dsstox_sid'].map(ws_dict)
tv['mol_weight']=tv['dsstox_sid'].map(weight_dict)
tv=tv.drop(['Molecular Formula','Predefined substance type','Additional Ids','Composition','CAS Smiles relation'],'columns')
tv=tv.rename(columns={'SMILES':'smiles'})
tv=tv[pd.notnull(tv).all(axis=1)]
tv=tv.drop_duplicates()
len(tv)

9837

In [65]:
#from pymongo import InsertOne
toxval_cats={}
for i,row in tv.iterrows():
    record=dict(row)
    sid=record['dsstox_sid']
    mol=Chem.MolFromSmiles(record['smiles'])
    if not mol:
        print(i)
        continue
    record['mol']=mol
    categories=[category for category,test in tests.iteritems() if test.query(record)]
    if len(categories)==0:
        toxval_cats[sid]='Not categorized'
    else:
        toxval_cats[sid]=categories='|'.join(categories)

777


In [66]:
tv['categories']=tv['dsstox_sid'].map(toxval_cats)
tv=tv[pd.notnull(tv['categories'])]
len(tv)

9836

In [67]:
len(toxval_cats)

9836

In [25]:
mymatchset=set(tv[~tv['categories'].str.contains('\|')]['categories'].unique())
theirmatchset=set(tv[~tv['US-EPA New Chemical Categories'].str.contains('\|')]['US-EPA New Chemical Categories'].unique())
missmatchset=theirmatchset-mymatchset

In [26]:
import re
esc_bad_cats=[re.escape(c) for c in bad_cats]
mismatched=tv[~(tv['US-EPA New Chemical Categories'].str.contains('|'.join(esc_bad_cats))) & (tv['US-EPA New Chemical Categories']!=tv['categories'])]
with pd.option_context('display.max_rows', None):
    mismatched

Unnamed: 0,CAS Number,smiles,US-EPA New Chemical Categories,dsstox_sid,logp,ws,mol_weight,categories
179,117-84-0,CCCCCCCCOC(=O)c1ccccc1C(=O)OCCCCCCCC,Not categorized,DTXSID1021956,7.74397,1.98749e-06,390.564,Esters (Chronic toxicity)
265,103-23-1,CCCCC(CC)COC(=O)CCCCC(=O)OCC(CC)CCCC,Not categorized,DTXSID0020606,6.85289,3.90355e-06,370.574005,Esters (Chronic toxicity)
267,117-81-7,CCCCC(CC)COC(=O)c1ccccc1C(=O)OCC(CC)CCCC,Not categorized,DTXSID5020607,7.60297,2.40728e-06,390.564,Esters (Chronic toxicity)
462,82657-04-3,Cc1c(COC(=O)C2C(C=C(Cl)C(F)(F)F)C2(C)C)cccc1-c...,Not categorized,DTXSID9020160,6.58958,2.57909e-05,422.87,Esters (Chronic toxicity)
486,10453-86-8,CC(C)=CC1C(C(=O)OCc2coc(Cc3ccccc3)c2)C1(C)C,Esters (Chronic toxicity),DTXSID7022253,4.94434,6.17537e-06,338.446991,Esters (Acute toxicity)
510,39515-41-8,CC1(C)C(C(=O)OC(C#N)c2cccc(Oc3ccccc3)c2)C1(C)C,Esters (Chronic toxicity),DTXSID0024002,4.20779,1.15486e-05,349.429993,Esters (Acute toxicity)
524,1689-99-2,CCCCCCCC(=O)Oc1c(Br)cc(cc1Br)C#N,Esters (Chronic toxicity),DTXSID7023932,4.37739,9.86796e-06,403.114014,Esters (Acute toxicity)
714,630-08-0,C=O,Aldehydes (Acute toxicity),DTXSID5027273,0.237403,11.8802,28.01,Not categorized
747,87-29-6,Nc1ccccc1C(=O)OCC=Cc1ccccc1,Anilines (Acute toxicity)|Esters (Acute toxicity),DTXSID3020330,3.90328,8.61323e-05,253.300995,Esters (Acute toxicity)|Anilines (Acute toxicity)
794,4221-68-5,Oc1ccc(cc1C1CCCCC1)C1(CCCCC1)c1ccc(O)c(c1)C1CC...,Not categorized,DTXSID6051306,8.3388,2.50594e-06,432.64801,Phenols (Chronic toxicity)


In [27]:
len(mismatched)

308

In [139]:
#mismatched.to_excel(DAT_DIR+'mismatched_categories.xlsx')

In [136]:
# with open(DAT_DIR+'bad_categories.txt','w') as f:
#     f.write('\n'.join(list(bad_cats)))

<h1>Get Category Fingerprints</h1>
Moved to script

There was a bug where some substances got skipped. Need to fill in the blanks or rerun

In [69]:
import pymongo
mongocon=pymongo.MongoClient("mongodb://ghelman:ghelman@pb.epa.gov/genra_dev_v4")
DB=mongocon['genra_dev_v4']
dsstox=DB['compound']
physprop=DB['physprop']
epa_cats=DB['epa_categories']

In [109]:
# dsstox_smiles=dsstox.find({},{'_id':0,'dsstox_sid':1,'smiles':1,'mol_weight':1})
# dsstox_logp=physprop.find({},{'_id':0,'dsstox_sid':1,'predicted_props.OPERA_LogP':1})
# #logp_dict_temp={record['dsstox_sid']:record.get('predicted_props',{}) for record in dsstox_logp if 'dsstox_sid' in record.keys()}
# #logp_dict={sid:props['OPERA_LogP'][0] for sid,props in logp_dict_temp.iteritems() if 'OPERA_LogP' in props.keys()}
# logp_dict={record['dsstox_sid']:record.get('predicted_props',{})['OPERA_LogP'][0] for record in dsstox_logp \
#            if 'OPERA_LogP' in record.get('predicted_props',{}) and record.get('dsstox_sid',None)}
# dsstox_ws=physprop.find({},{'_id':0,'dsstox_sid':1,'predicted_props.OPERA_WS':1})
# ws_dict={record['dsstox_sid']:record.get('predicted_props',{})['OPERA_WS'][0] for record in dsstox_ws \
#            if 'OPERA_WS' in record.get('predicted_props',{}) and record.get('dsstox_sid',None)}

In [117]:
dsstox_smiles=dsstox.find({'dsstox_sid':'DTXSID60871632'},{'_id':0,'dsstox_sid':1,'smiles':1,'mol_weight':1})

In [118]:
dsstox_smiles.count()
epa_cats.count()

1

729994

In [119]:
from pymongo import InsertOne
i=0
inserts=[]
while i<dsstox_smiles.count():
#     if inserts:
#         epa_cats.bulk_write(inserts)
#         inserts=[]
    j=0
    dsstox_smiles=dsstox.find({'dsstox_sid':'DTXSID60871632'},{'_id':0,'dsstox_sid':1,'smiles':1,'mol_weight':1},batch_size=20000)
    dsstox_smiles.skip(i)
    while j<=5000: #Must do it this way because cursor will time out
        i+=1
        j+=1
        record=dsstox_smiles.next()
        sid=record['dsstox_sid']
        try:
            record['logp']=logp_dict[sid]
            record['ws']=ws_dict[sid]
        except:
            continue
        smiles=record['smiles']
        try:
            mol=Chem.MolFromSmiles(smiles)
        except:
            print(sid + ' no smiles')
            continue
        if not mol:
            continue
        record['mol']=mol
        categories=[category for category,test in tests.iteritems() if test.query(record)]
        categories_record={'dsstox_sid':sid,'categories':categories}
        print(categories_record)
        break
        #inserts.append(InsertOne(categories_record))

<pymongo.cursor.Cursor at 0x7faf8d5c4910>

{'dsstox_sid': u'DTXSID60871632', 'categories': []}


In [78]:
notebook_categories={}
for i,row in tv.iterrows():
    record=dict(row)
    sid=record['dsstox_sid']
    mol=Chem.MolFromSmiles(record['smiles'])
    if not mol:
        print(i)
        continue
    record['mol']=mol
    notebook_categories[sid]={category for category,test in tests.iteritems() if test.query(record)} 

In [74]:
db_categories=epa_cats.find({'dsstox_sid':{'$in':list(tv['dsstox_sid'])}})

In [75]:
script_categories={record['dsstox_sid']:set(record['categories']) for record in db_categories}

In [80]:
len(notebook_categories)
len(script_categories)

9836

9793

In [104]:
missing_sids=set(notebook_categories.keys())-set(script_categories.keys())
#{record['dsstox_sid']:record['smiles'] for record in dsstox.find({'dsstox_sid':{'$in':list(missing_sids)}})}
[(sid,dsstox.find_one({'dsstox_sid':sid})['smiles'],tv[tv['dsstox_sid']==sid].iloc[0]['smiles']) for sid in missing_sids]

[(u'DTXSID0038338',
  u'FAIL',
  u'CC1(C)C(C=C(Cl)Cl)C1C(=O)OCc1cccc(Oc2ccccc2)c1'),
 (u'DTXSID20873576',
  u'CC(C)CC(=O)O[C@@H]1C[C@@H]2CC[C@@]1(C)C2(C)C',
  u'CC(C)C=C(O)OC1CC2CCC1(C)C2(C)C'),
 (u'DTXSID10872513',
  u'Cl.CC1=CC=C(C=C1)C(=C/CN1CCCC1)\\C1=CC=CC=N1',
  u'Cc1ccc(cc1)C(=CCN1CCCC1)c1ccccn1'),
 (u'DTXSID4023466', u'FAIL', u'CC(N)C(O)c1ccccc1'),
 (u'DTXSID90858931', u'FAIL', u'COc1cccc(c1)C1(O)CCCCC1CN(C)C'),
 (u'DTXSID9044166',
  u'FAIL',
  u'ClC1C2C(C3OC13Cl)C1(Cl)C(Cl)=C(Cl)C2(Cl)C1(Cl)Cl'),
 (u'DTXSID10873601',
  u'CCSC(C)C1(CC)C(=O)NC(=S)NC1=O',
  u'CCSC(C)C1(CC)C(=O)NC(=S)NC1=O'),
 (u'DTXSID0042612', u'FAIL', u'CNc1cc(OC)c(cc1Cl)C(=O)NC1CCN(Cc2ccccc2)C1C'),
 (u'DTXSID20870444', u'CCCCC(CC)COC=C', u'CCCCC(CC)COC=C'),
 (u'DTXSID20239158',
  u'FAIL',
  u'COC1(NC(=O)CSC(F)F)C2OCC(CSc3nnnn3CCO)=C(N2C1=O)C(O)=O'),
 (u'DTXSID10873949',
  u'O=C(OCCCOC(=O)C1=CC=CC=C1)C1=CC=CC=C1',
  u'O=C(OCCCOC(=O)c1ccccc1)c1ccccc1'),
 (u'DTXSID60194898',
  u'FAIL',
  u'CC1(C)SC2C(NC(=O)C(NC(=

In [123]:
for sid in set(notebook_categories.keys())&set(script_categories.keys()):
    if notebook_categories[sid]!=script_categories[sid]:
        print (sid,notebook_categories[sid],script_categories[sid])

DTXSID7035725 set(['Phenols (Acute toxicity)']) set([])
DTXSID30217583 set([]) set([u'Hydrazines and Related Compounds'])
DTXSID5058432 set(['Thiols (Acute toxicity)']) set([])
DTXSID7042061 set([]) set([u'Esters (Acute toxicity)'])
DTXSID8021270 set(['Phenols (Acute toxicity)']) set([])
DTXSID9020794 set(['Thiols (Acute toxicity)']) set([])
DTXSID7041336 set(['Phenols (Acute toxicity)']) set([])
DTXSID60186011 set(['Thiols (Acute toxicity)']) set([])
DTXSID0052124 set([]) set([u'Boron Compounds'])
DTXSID9042174 set([]) set([u'Hydrazines and Related Compounds'])
DTXSID8028898 set(['Phenols (Acute toxicity)']) set([])
DTXSID0058354 set(['Phenols (Acute toxicity)']) set([])
DTXSID3051672 set([]) set([u'Boron Compounds'])
DTXSID4021343 set(['Thiols (Acute toxicity)']) set([])
DTXSID40236129 set(['Thiols (Acute toxicity)']) set([])
DTXSID90177497 set(['Phenols (Acute toxicity)']) set([u'Phenols (Acute toxicity)', u'Azides (Acute toxicity)'])
DTXSID10217122 set(['Thiols (Acute toxicity)']) 

In [139]:
dsstox.find_one({'dsstox_sid':'DTXSID7035725'},{'smiles':1})
record=dict(tv[tv['dsstox_sid']=='DTXSID60873772'].iloc[0])
record['mol']=Chem.MolFromSmiles(record['smiles'])

{u'_id': ObjectId('58fe61a6f0e291b4c06a6805'),
 u'smiles': u'[Na+].CCC(C)(C)C1=CC=C([O-])C=C1'}

In [141]:
tests['Phenols (Acute toxicity)'].print_tree(record)

('605', 'LogicalQuery', 'And', False)
	('595', 'LogicalQuery', 'And', False)
		('585', 'b:StructureQuery', 'c1ccccc1[#8h]', False)
		('589', 'LogicalQuery', 'Not', True)
			('587', 'b:StructureQuery', 'c12c(cccc1)cccc2', False)
		('593', 'LogicalQuery', 'Not', True)
			('591', 'b:StructureQuery', 'c1(-c2ccccc2)ccccc1', False)
	('603', 'LogicalQuery', 'And', True)
		('597', 'b:ParameterQuery', 'Molecular weight', 1000.0, 'LessThan', True)
		('600', 'b:ParameterQuery', 'log Kow', 7.38000011444092, 'LessThan', True)


<h1>Test on ToxCast</h1>

In [13]:
tc=pd.read_excel(DAT_DIR+'OECD_NCC_TXCST.xlsx')

In [14]:
len(tc)
tc.head()

4746

Unnamed: 0,#,CAS Number,Chemical name(s),SMILES,Predefined substance type,Additional Ids,Composition,CAS Smiles relation,US-EPA New Chemical Categories
0,1,18699-02-0,[4-(acetylamino)phenyl]acetic acid|4-acetamido...,CC(=O)Nc1ccc(CC(O)=O)cc1,Mono constituent,EC Number:3810891,,High,Not categorized
1,2,50594-66-6,5-[2-chloro-4-(trifluoromethyl)phenoxy]-2-nitr...,OC(=O)c1cc(Oc2ccc(cc2Cl)C(F)(F)F)ccc1[N+]([O-])=O,Mono constituent,EC Number:3823358,,High,Neutral Organics
2,3,2113-61-3,4-Aminodiphenyl.HCl|4-Biphenylamine hydrochlor...,Cl.Nc1ccc(cc1)-c1ccccc1,Multi constituent,,C:2; A:0; I:0,Low,Anilines (Acute toxicity)|Not categorized
3,4,61-82-5,"1H-[1,2,4]Triazol-3-ylamine|1H-1,2,4-Triazol-3...",Nc1nc[nH]n1,Mono constituent,EC Number:3774339,,High,Not categorized
4,5,134-03-2,(+)-sodium l-ascorbate (sodium ascorbate) (l-a...,[Na+].OCC(O)C1OC(=O)C(O)=C1[O-],Mono constituent,EC Number:3778578,,High,Not categorized


In [16]:
casns=list(tc['CAS Number'])
toxval_sids=dsstox.find({'casrn':{'$in':casns}})
sids_dict={record['casrn']:record['dsstox_sid'] for record in toxval_sids}
sids=sids_dict.values()
toxval_logp=physprop.find({'dsstox_sid':{'$in':sids}},{'_id':0,'dsstox_sid':1,'predicted_props.OPERA_LogP':1})
logp_dict={record['dsstox_sid']:record.get('predicted_props',{})['OPERA_LogP'][0] for record in toxval_logp \
           if 'OPERA_LogP' in record.get('predicted_props',{}) and record.get('dsstox_sid',None)}
toxval_ws=physprop.find({'dsstox_sid':{'$in':sids}},{'_id':0,'dsstox_sid':1,'predicted_props.OPERA_WS':1})
ws_dict={record['dsstox_sid']:record.get('predicted_props',{})['OPERA_WS'][0] for record in toxval_ws \
           if 'OPERA_WS' in record.get('predicted_props',{}) and record.get('dsstox_sid',None)}
toxval_weight=dsstox.find({'dsstox_sid':{'$in':sids}})
weight_dict={record['dsstox_sid']:record['mol_weight'] for record in toxval_weight}

In [17]:
tc['dsstox_sid']=tc['CAS Number'].map(sids_dict)
tc['logp']=tc['dsstox_sid'].map(logp_dict)
tc['ws']=tc['dsstox_sid'].map(ws_dict)
tc['mol_weight']=tc['dsstox_sid'].map(weight_dict)
tc=tc.drop(['Predefined substance type','Additional Ids','Composition','CAS Smiles relation'],'columns')
tc=tc.rename(columns={'SMILES':'smiles'})
tc=tc[pd.notnull(tc).all(axis=1)]
tc=tc.drop_duplicates()
len(tc)

4257

In [166]:
toxcast_cats={}
for i,row in tc.iterrows():
    record=dict(row)
    sid=record['dsstox_sid']
    mol=Chem.MolFromSmiles(record['smiles'])
    if not mol:
        print(i)
        continue
    record['mol']=mol
    categories=[category for category,test in tests.iteritems() if test.query(record)]
    if len(categories)==0:
        toxcast_cats[sid]='Not categorized'
    else:
        toxcast_cats[sid]=categories='|'.join(categories)

498
965
1616
1753
1900
2282
2487
2756
3772
4056
4144
4453
4481
4681


In [224]:
4257-818

3439

In [214]:
tc['categories']=tc['dsstox_sid'].map(toxcast_cats)
tc=tc[pd.notnull(tc['categories'])]

In [223]:
esc_bad_cats=[re.escape(c) for c in bad_cats]
tc[tc['US-EPA New Chemical Categories'].str.contains('|'.join(esc_bad_cats))]

Unnamed: 0,#,CAS Number,Chemical name(s),smiles,US-EPA New Chemical Categories,dsstox_sid,logp,ws,mol_weight,categories
1,2,50594-66-6,5-[2-chloro-4-(trifluoromethyl)phenoxy]-2-nitr...,OC(=O)c1cc(Oc2ccc(cc2Cl)C(F)(F)F)ccc1[N+]([O-])=O,Neutral Organics,DTXSID0020022,4.558690,6.620120e-03,361.660004,Not categorized
5,6,22839-47-0,"Aspartame|L-Phenylalanine, L-.alpha.-aspartyl-...",COC(=O)C(Cc1ccccc1)NC(=O)C(N)CC(O)=O,Aliphatic Amines|Esters (Acute toxicity),DTXSID0020107,-0.506647,1.145820e-01,294.307007,Esters (Acute toxicity)
9,10,88-73-3,1-Chloro-2-nitrobenzene|2-CHLORO-1-NITROBENZEN...,[O-][N+](=O)c1ccccc1Cl,Neutral Organics,DTXSID0020280,2.641930,2.903340e-02,157.550003,Not categorized
14,15,150-68-5,"3-(4-chlorophenyl)-1,1-dimethylurea|3-(p-Chlor...",CN(C)C(=O)Nc1ccc(Cl)cc1,Neutral Organics,DTXSID0020311,2.145090,1.257080e-01,198.649994,Not categorized
16,17,1897-45-6,"1,3-Benzenedicarbonitrile, 2,4,5,6-tetrachloro...",Clc1c(Cl)c(C#N)c(Cl)c(C#N)c1Cl,Neutral Organics,DTXSID0020319,2.890060,1.300940e-03,265.899994,Not categorized
17,18,4998-76-9,"Cyclohexanamine, hydrochloride (1:1)|Cyclohexy...",Cl.NC1CCCCC1,Aliphatic Amines|Not categorized,DTXSID0020361,0.929745,5.209240e+00,135.639999,Not categorized
19,20,52-89-1,cysteine hydrochloride|Cysteine hydrochloride ...,Cl.NC(CS)C(O)=O,Aliphatic Amines|Not categorized|Thiols (Acute...,DTXSID0020367,-2.336350,1.699750e+00,157.610001,Thiols (Acute toxicity)
20,21,120-36-5,"(RS)-2-(2,4-dichlorophenoxy)propionic acid|2-(...",CC(Oc1ccc(Cl)cc1Cl)C(O)=O,Neutral Organics,DTXSID0020440,2.833250,4.801180e-02,235.059998,Not categorized
21,22,94-75-7,"(2,4-Dichloro-phenoxy)-acetic acid|(2,4-dichlo...",OC(=O)COc1ccc(Cl)cc1Cl,Neutral Organics,DTXSID0020442,2.500740,5.585260e-02,221.029999,Not categorized
22,23,330-54-1,"3-(3,4-dichlorophenyl)-1,1-dimethylurea|dicycl...",CN(C)C(=O)Nc1ccc(Cl)c(Cl)c1,Neutral Organics,DTXSID0020446,2.619970,1.590820e-01,233.089996,Not categorized


In [215]:
mismatched_indexes=[]
for i,row in tc.iterrows():
    theirset=set(row['US-EPA New Chemical Categories'].split('|'))
    if theirset&bad_cats:
        continue
    myset=set(row['categories'].split('|'))
    if myset!=theirset:
        mismatched_indexes.append(i)  

In [217]:
with pd.option_context('display.max_colwidth',100):
    tc.loc[mismatched_indexes]

Unnamed: 0,#,CAS Number,Chemical name(s),smiles,US-EPA New Chemical Categories,dsstox_sid,logp,ws,mol_weight,categories
2,3,2113-61-3,4-Aminodiphenyl.HCl|4-Biphenylamine hydrochloride|DTXSID0020072,Cl.Nc1ccc(cc1)-c1ccccc1,Anilines (Acute toxicity)|Not categorized,DTXSID0020072,3.038600,6.784230e-03,205.690002,Anilines (Acute toxicity)
13,14,3165-93-3,4-Chloro-2-methylaniline hydrochloride|4-Chloro-o-toluidine hydrochloride|4-Chloro-o-toluidine.H...,Cl.Cc1cc(Cl)ccc1N,Anilines (Acute toxicity)|Not categorized,DTXSID0020288,1.893530,6.452560e-02,178.059998,Anilines (Acute toxicity)
31,32,103-23-1,"|bis(2-ethylhexyl)|ester|bis(2-ethylhexyl)adipate|1,6-bis(2-ethylhexyl)|hexanedioic acid, bis(2-...",CCCCC(CC)COC(=O)CCCCC(=O)OCC(CC)CCCC,Not categorized,DTXSID0020606,6.852890,3.903550e-06,370.574005,Esters (Chronic toxicity)
103,104,79902-63-9,"2,2-dimethylbutanoic acid (1s,3r,7s,8s,8ar)-1,2,3,7,8,8a-hexahydro-3,7-dimethyl-8-[2-[(2r,4r)-te...",CCC(C)(C)C(=O)OC1CC(C)C=C2C=CC(C)C(CCC3CC(O)CC(=O)O3)C12,Esters (Chronic toxicity),DTXSID0023581,4.312860,1.434020e-07,418.574005,Esters (Acute toxicity)
115,116,39515-41-8,"�-cyano-3-phenoxybenzyl 2,2,3,3-tetramethylcyclopropanecarboxylate|(RS)-α-cyano-3-phenoxybenzyl ...",CC1(C)C(C(=O)OC(C#N)c2cccc(Oc3ccccc3)c2)C1(C)C,Esters (Chronic toxicity),DTXSID0024002,4.207790,1.154860e-05,349.429993,Esters (Acute toxicity)
125,126,532-82-1,"1,3-Benzenediamine, 4-(phenylazo)-, monohydrochloride|4-[(Z)-phenyldiazenyl]benzene-1,3-diamine ...",Cl.Nc1ccc(N=Nc2ccccc2)c(N)c1,Anilines (Acute toxicity)|Not categorized,DTXSID0024559,2.379850,1.375280e-03,248.710007,Anilines (Acute toxicity)
217,218,79241-46-6,(2r)-2-[4-[[5-(trifluoromethyl)-2-pyridinyl]oxy]phenoxy]propanoic acid butyl ester|Butyl (2R)-2-...,CCCCOC(=O)C(C)Oc1ccc(Oc2ccc(cn2)C(F)(F)F)cc1,Esters (Chronic toxicity),DTXSID0034855,4.493030,1.207630e-03,383.367004,Esters (Acute toxicity)
379,380,139340-56-0,Darbufelone mesylate|DTXSID0047246,CS(O)(=O)=O.CC(C)(C)c1cc(C=C2SC(=N)NC2=O)cc(c1O)C(C)(C)C,Not categorized|Phenols (Acute toxicity),DTXSID0047246,3.837250,1.154720e-02,428.559998,Phenols (Acute toxicity)
503,504,20325-40-0,"3,3'-Dimethoxybenzidine dihydrochloride|3,3'-Dimethoxybenzidine.2HCl|3,3'-dimethoxybiphenyl-4,4'...",Cl.Cl.COc1cc(ccc1N)-c1ccc(N)c(OC)c1,Anilines (Acute toxicity)|Not categorized,DTXSID1020485,1.990030,5.305470e-03,317.209991,Anilines (Acute toxicity)
534,535,989-38-8,"2-[6-(Ethylamino)-3-(ethylimino)-2,7-dimethyl-3H-xanthen-9-yl]benzoic acid, Ethyl ester, Monohyd...",[Cl-].CCNc1cc2[o+]c3cc(NCC)c(C)cc3c(-c3ccccc3C(=O)OCC)c2cc1C,Not categorized,DTXSID1021243,5.819210,7.965290e-04,479.019989,Esters (Chronic toxicity)


In [218]:
record=tc.loc[614]
record['mol']=Chem.MolFromSmiles(record['smiles'])
tests['Acrylates/Methacrylates (Acute toxicity)'].print_tree(record)

('70', 'LogicalQuery', 'And', True)
	('67', 'b:ParameterQuery', 'Molecular weight', 1000.0, 'LessThan', True)
	('64', 'b:ParameterQuery', 'log Kow', 5.0, 'LessThan', True)
	('62', 'b:StructureQuery', '[#6h2]=[#6]([#6](=[#8])[#8])[Ch3,#1]', True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.loc[key] = value


In [219]:
tc.loc[mismatched_indexes].to_excel(DAT_DIR+'mismatched_toxcast.xlsx')

<h1>Fix bad SMARTS</h1>

In [489]:
bad_cats

{'Aliphatic Amines',
 'Alkoxysilanes',
 'Aminobenzothiazole Azo Dyes',
 'Anionic Surfactants',
 'Dianilines',
 'Dithiocarbamates (Acute toxicity)',
 'Dithiocarbamates (Chronic toxicity)',
 'Ethylene Glycol Ethers',
 'Neutral Organics',
 'Nonionic Surfactants',
 'Organotins (Acute toxicity)',
 'Organotins (Chronic toxicity)',
 'Persistent, Bioaccumulative and Toxic (PBT) Chemicals',
 'Polynitroaromatics (Acute toxicity)',
 'Polynitroaromatics (Chronic toxicity)',
 'Substituted Triazines (Acute toxicity)',
 'Substituted Triazines (Chronic toxicity)',
 'Triarylmethane Pigments/Dyes with Non-solubilizing Groups',
 'beta-Naphthylamines, Sulfonated'}

In [19]:
record=tc.loc[614]
record['mol']=Chem.MolFromSmiles(record['smiles'])
for cat in bad_cats:
    print(cat+'\n')
    print(all_tests[cat].print_tree(record))
    print('\n')

Ethylene Glycol Ethers

('1438', 'b:StructureQuery', '[#8]{>-1}$[[#1],$[[#6X4]]{..7}]{1..;x}.$[c,$[[#6X4]]{..7},[#1]]{1..;x}{<-1}.$[[#6h2]{<-1}[#6h2][#8]{>-1}]{1..3}', 'does not process')
None


Substituted Triazines (Acute toxicity)

('690', 'LogicalQuery', 'And', 'does not process')
	('682', 'b:StructureQuery', '[a~1]1[a~1][a~1][a~1][a~1][a~1]1.$[$[[#6]]{3},$[[#7]]{3}]{~1}', 'does not process')
	('684', 'b:ParameterQuery', 'log Kow', 5.0, 'LessThan', True)
	('687', 'b:ParameterQuery', 'Molecular weight', 1000.0, 'LessThan', True)
None


Dithiocarbamates (Acute toxicity)

('923', 'LogicalQuery', 'And', 'does not process')
	('913', 'LogicalQuery', 'Or', 'does not process')
		('909', 'b:StructureQuery', '[#16v2][#6](=[#16])[#7v3]([#6X4]$[[#6X4]]{1..;x})[#6X4]$[[#6X4]]{1..;x}', 'does not process')
		('911', 'b:StructureQuery', '[#16v2][#6](=[#16])[#7hv3][#6h2][#6h2][#7hv3][#6](=[#16])[#16v2]', False)
	('921', 'LogicalQuery', 'And', True)
		('915', 'b:ParameterQuery', 'log Kow', 5.0, 'Les

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [None]:
new_tests={}

In [539]:
#Aliphatic amines
alphamine=Chem.MolFromSmarts('N[!c]')
aniline=Chem.MolFromSmiles('NC1=CC=CC=C1') #Must be MolFromSmiles bc MolFromSmarts does not recognize aromatic atoms in kekule forms
aniline.GetSubstructMatches(alphamine)
def test(x):
    mol=x['mol']
    return True if mol.GetSubstructMatches(alphamine)
new_tests['Aliphatic amines']=test

()

In [546]:
#Alkoxysilanes
alkoxy=Chem.MolFromSmarts('[CX4]O[SiX4]')
tetraethyl_orthosilicate=Chem.MolFromSmiles('CCO[Si](OCC)(OCC)OCC')
tetraethyl_orthosilicate.GetSubstructMatches(alkoxy)
def test(x):
    mol=x['mol']

((1, 2, 3), (5, 4, 3), (8, 7, 3), (11, 10, 3))

In [672]:
dsstox.find_one({'name':'2-Aminobenzothiazole'})

{u'_id': ObjectId('58fe6153f0e291b4c06a4381'),
 u'casrn': u'136-95-8',
 u'chemspider_id': 8382,
 u'created_at': datetime.datetime(2017, 4, 24, 20, 32, 19),
 u'dsstox_cid': u'DTXCID804467',
 u'dsstox_sid': u'DTXSID1024467',
 u'gsid': 24467,
 u'inchi_key': u'UHGULLIUJBCTEF-UHFFFAOYSA-N',
 u'iupac': u'1,3-Benzothiazol-2-amine',
 u'mol_weight': 150.1999969482422,
 u'name': u'2-Aminobenzothiazole',
 u'pubchem_cid': 8706,
 u'smiles': u'NC1=NC2=C(S1)C=CC=C2',
 u'synonyms': [u'2-Benzothiazolamine',
  u'1,3-Benzothiazol-2-ylamine',
  u'2(3H)-Benzothiazolimine',
  u'2-Aminobenzo[d]thiazole',
  u'2-Aminobenzothiazol',
  u'2-AMINOBENZTHIAZOL',
  u'2-Benzothiazol amine',
  u'2-Benzothiazolylamine',
  u'2-Iminobenzothiazoline',
  u'Benzo[d]thiazol-2-amine',
  u'Benzothiazol-2-ylamin',
  u'Benzothiazol-2-ylamine',
  u'Benzothiazole, 1-amino-',
  u'Benzothiazole, 2-amino-',
  u'benzothiazole-2-ylamine',
  u'Benzothiazoline, 2-imino-',
  u'benzotiazol-2-ilamina',
  u'NSC 4670',
  u'o-Aminobenzothiazole

In [673]:
#Aminobenzothiazole Azo Dyes
azodye=Chem.MolFromSmiles('NC1=NC2=C(S1)C=CC=C2')
aminobenzothiazole=Chem.MolFromSmiles('NC1=NC2=C(S1)C=CC=C2')
aminobenzothiazole.GetSubstructMatches(azodye)
def test(x):
    mol=x['mol']

((0, 1, 2, 3, 4, 5, 6, 7, 8, 9),)

In [49]:
#Anionic Surfactants
sulfate=Chem.MolFromSmiles('C.OS(O)(O)O')
sulfonate=Chem.MolFromSmiles('C.OS(=O)=O')
phosphate=Chem.MolFromSmiles('C.O=P(O)(O)O')
carboxylic=Chem.MolFromSmiles('C.OC(=O)')
#Or these together
acetic=Chem.MolFromSmiles('CC(O)=O')
acetic.GetSubstructMatches(carboxylic)
def test(x):
    mol=x['mol']

((0, 2, 1, 3),)

In [678]:
#Dianilines
dianiline1=Chem.MolFromSmarts('c1ccccc1[C,O,N,S]c1ccccc1')
dianiline2=Chem.MolFromSmarts('[$(c1[cH1][cH1]c(N)[cH1][cH1]1),$(c1[cH1]c(N)[cH1][cH1][cH1]1)]')
methylenedianiline=Chem.MolFromSmiles('Cl.Cl.NC2=C(C=C(CC1=CC=C(N)C=C1)C=C2)')
methylenedianiline.GetSubstructMatches(dianiline1)
methylenedianiline.GetSubstructMatches(dianiline2)
def test(x):
    mol=x['mol']

((5, 4, 3, 16, 15, 6, 7, 8, 9, 10, 11, 13, 14),)

((6,), (8,))

In [28]:
#Ethylene Glycol Ethers
ege=Chem.MolFromSmarts('[$(OCC),$(OCCOCCC),$(OCCOCC)][$(C),$(CC),$(CCC),$(CCCC),$(CCCCC),$(CCCCCC),$(CCCCCCC)]'\
                       '.[$(C),$(CC),$(CCC),$(CCCC),$(CCCCC),$(CCCCCC),$(CCCCCCC)]')
c=Chem.MolFromSmiles('CCCCOCC')
c.GetSubstructMatches(ege)
def test(x):
    mol=x['mol']

((4, 3, 0),
 (4, 3, 1),
 (4, 3, 2),
 (4, 3, 5),
 (4, 3, 6),
 (4, 5, 0),
 (4, 5, 1),
 (4, 5, 2),
 (4, 5, 6))

In [139]:
#It doesn't care about overlap!!!!
co=Chem.MolFromSmarts('[$(CCO),$(N)][$(CO),$(N)]')
m=Chem.MolFromSmiles('CCO')
m.GetSubstructMatches(co)

((0, 1),)

In [33]:
#Have to enumerate then
smarts=[]
for i in range(1,7):
    for j in range(1,7):
        for k in range(1,3):
            smarts.append('C'*i+'OCC'*k+'C'*j)
def test(x):
    mol=x['mol']

In [54]:
#Neutral Organics
#Contains alcohols,ketons,ethers,alkyl halides,aryl halides,aromatic hydrocarbons
alcohol=Chem.MolFromSmarts('C.C[OX2H]')
ether=Chem.MolFromSmarts('[OD2](C)C')
ketone=Chem.MolFromSmarts('C[CX3](=O)C')
halide=Chem.MolFromSmarts('C[Cl,Br]')
aromatichydrocarbon=Chem.MolFromSmarts('c')
def test(x):
    mol=x['mol']

()

In [37]:
#Nonionic Surfactants
nonsurf1=Chem.MolFromSmarts('COCCO')
nonsurf2=Chem.MolFromSmarts('COCCOC')
def test(x):
    mol=x['mol']

In [None]:
#Organotins (Acute toxicity) and Organotins (Chronic toxicity)
organotin1=Chem.MolFromSmarts('CSn')
def test(x):
    mol=x['mol']

In [57]:
#Persistent, Bioaccumulative and Toxic (PBT) Chemicals
#MW<1000
#OPERA_HL > np.log(60)
#Ready biodegradability ?????
#LogP>4.2
def test(x):
    mol=x['mol']

4.0943445622221004

In [91]:
#Polynitroaromatics (Acute toxicity) and Polynitroaromatics (Chronic toxicity)
#MW < 1000
polynitroaromatic=Chem.MolFromSmarts('N[$(c1c(N)cccc1),$(c1cc(N)ccc1),$(c1ccc(N)cc1)]')
polynitroaromatic
eg=Chem.MolFromSmiles('NC1=C(C=C(C=C1Br)[N+]([O-])=O)[N+]([O-])=O')
eg.GetSubstructMatches(polynitroaromatic)
def test(x):
    mol=x['mol']

ArgumentError: Python argument types in
    Mol.GetSubstructMatches(Mol, NoneType)
did not match C++ signature:
    GetSubstructMatches(RDKit::ROMol self, RDKit::ROMol query, bool uniquify=True, bool useChirality=False, bool useQueryQueryMatches=False, unsigned int maxMatches=1000)

In [84]:
#Substituted Triazines (Acute toxicity) and Substituted Triazines (Chronic toxicity)
#logp<5
#MW<1000
subtriazine=Chem.MolFromSmarts('[$(n1nnccc1.[!H]),$(n1ncncc1.[!H]),$(n1cncnc1.[!H])]')
eg=Chem.MolFromSmiles('n1nnc(N)cc1')
eg.GetSubstructMatches(subtriazine)
def test(x):
    mol=x['mol']

((0,), (2,))

In [88]:
#Triarylmethane Pigments/Dyes with Non-solubilizing Groups
triphenylmethane=Chem.MolFromSmarts('c1cc([N,O])ccc1C(c1ccc([N,O])cc1)(c1ccc([N,O])cc1)')
diphenylnaphthylmethane=Chem.MolFromSmarts('c1cc([N,O])ccc1C(c1ccc([N,O])cc1)c1ccc([N,O])c2ccccc21')
eg=Chem.MolFromSmiles('c1ccccc1O')
eg.GetSubstructMatches(triphenylmethane)
def test(x):
    mol=x['mol']

((6, 5, 0, 1, 2, 3, 4),)

In [132]:
#beta-Naphthylamines, Sulfonated
sulfnaph=Chem.MolFromSmarts('Nc1c([H,OH])c2cccccc2cc1')
smarts=[]
prefix='Nc1c([H,OH])'
suffix='cc1'
for c1 in range(1,4):
    for c2 in range(c1+1,5):
        smarts.append(prefix+'c2'+'c'*c1+'([H,O,N])'+'c'*(c2-c1)+'([O,OOOSCC])'+'c'*(4-c2)+'c2'+suffix)
        smarts.append(prefix+'c2'+'c'*c1+'([O,OOOSCC])'+'c'*(c2-c1)+'([H,O,N])'+'c'*(4-c2)+'c2'+suffix)
def test(x):
    mol=x['mol']

In [133]:
smarts

['Nc1c([H,OH])c2c([H,O,N])c([O,OOOSCC])ccc2cc1',
 'Nc1c([H,OH])c2c([O,OOOSCC])c([H,O,N])ccc2cc1',
 'Nc1c([H,OH])c2c([H,O,N])cc([O,OOOSCC])cc2cc1',
 'Nc1c([H,OH])c2c([O,OOOSCC])cc([H,O,N])cc2cc1',
 'Nc1c([H,OH])c2c([H,O,N])ccc([O,OOOSCC])c2cc1',
 'Nc1c([H,OH])c2c([O,OOOSCC])ccc([H,O,N])c2cc1',
 'Nc1c([H,OH])c2cc([H,O,N])c([O,OOOSCC])cc2cc1',
 'Nc1c([H,OH])c2cc([O,OOOSCC])c([H,O,N])cc2cc1',
 'Nc1c([H,OH])c2cc([H,O,N])cc([O,OOOSCC])c2cc1',
 'Nc1c([H,OH])c2cc([O,OOOSCC])cc([H,O,N])c2cc1',
 'Nc1c([H,OH])c2ccc([H,O,N])c([O,OOOSCC])c2cc1',
 'Nc1c([H,OH])c2ccc([O,OOOSCC])c([H,O,N])c2cc1']