<h1>Extract SMARTS</h1>

In [None]:
import pandas as pd
import numpy as np
import pymongo
import sys
import os
from __future__ import print_function
from datetime import datetime

# Project root: the directory two levels above the notebook's working directory.
TOP = '/'.join(os.getcwd().split('/')[:-2]) + '/'
LIB = TOP + 'lib'
# Put the project's lib/ directory first on the import path (once).
if LIB not in sys.path:
    sys.path.insert(0, LIB)

DAT_DIR = TOP + 'data/'
FIG_DIR = TOP + 'figs/'

# Create the data and figure directories if they do not exist yet.
if not os.path.exists(DAT_DIR):
    os.mkdir(DAT_DIR)
if not os.path.exists(FIG_DIR):
    os.mkdir(FIG_DIR)
    
from db.mongo import *

from rax.genrapred import *
import db.etl as etl
from db.fpsim import *
from rdkit import Chem

import pymongo
# SECURITY: the connection string embeds a username and password. Supply it
# through the GENRA_MONGO_URI environment variable instead; the literal is kept
# only as a fallback so existing runs behave exactly as before.
# NOTE(review): rotate these credentials and remove the fallback from the notebook.
MONGO_URI=os.environ.get('GENRA_MONGO_URI',"mongodb://ghelman:ghelman@pb.epa.gov/genra_dev_v4")
mongocon=pymongo.MongoClient(MONGO_URI)
DB=mongocon['genra_dev_v4']
# Collections queried below: compound records and physicochemical properties.
dsstox=DB['compound']
physprop=DB['physprop']

In [None]:
# Read the category-definition XML as one string so its raw text can be inspected.
with open(DAT_DIR + 'epa_categories.xml','r') as f:
    xml=f.read()

In [None]:
xml=xml.replace('\n','')

In [None]:
xml[:2000]  # preview only; displaying the entire document floods the notebook output

In [None]:
import xml.etree.ElementTree as ET
# Parse the same file into an element tree; `e` is the document root used by the cells below.
e=ET.parse(DAT_DIR+'epa_categories.xml').getroot()

In [None]:
# Map every element under the root to its parent element.
parent_map = {c:p for p in e.iter() for c in p}

In [None]:
import operator as op
# Operand name (as read from a parameter query) -> comparison called as
# operator(record_value, threshold).
op_dict=dict(
    GreaterThan=op.gt,
    GreaterThanOrEqualTo=op.ge,
    LessThan=op.lt,
    LessThanOrEqualTo=op.le,
)
# Parameter name (as read from a parameter query) -> key of that property in a
# chemical record. Both capitalisations of "Molecular weight" are accepted.
prop_dict=dict([
    ('log Kow','logp'),
    ('Molecular Weight','mol_weight'),
    ('Molecular weight','mol_weight'),
    ('Water Solubility','ws'),
])

In [None]:
def define_smart_match(smart):
    """Compile a SMARTS string into a predicate over chemical records.

    Returns None when RDKit cannot compile `smart`; otherwise a function that
    takes a record and reports whether record['mol'] has at least one
    substructure match for the pattern.
    """
    compiled=Chem.MolFromSmarts(smart)
    if not compiled:
        return None
    def smart_match(x):
        return bool(x['mol'].GetSubstructMatches(compiled))
    return smart_match
def define_compare(prop,operand,value):
    """Return a predicate comparing one record property against `value`.

    `prop` is looked up in prop_dict to find the record key and `operand` in
    op_dict to find the comparison; both lookups happen when the predicate is
    called, not when it is created.
    """
    def compare(x):
        comparison=op_dict[operand]
        return comparison(x[prop_dict[prop]],value)
    return compare

In [None]:
class Query:
    """One node of a category's profiling-query tree, compiled into a predicate.

    write_query() turns the node's XML into self.query, a callable that takes a
    chemical record (a dict; structure queries read record['mol']) and returns
    a truth value. self.query is None when the node has not been written, has
    an unrecognised type, or is a structure query whose SMARTS did not compile.
    """

    # Namespaced tag / attribute prefixes read from the XML.
    _QUERIES_NS='{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}'
    _ENGINE_NS='{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}'
    _REF_ATTR='{http://schemas.microsoft.com/2003/10/Serialization/}Ref'
    _TYPE_ATTR='{http://www.w3.org/2001/XMLSchema-instance}type'

    def __init__(self,xml,qid=None):
        self.xml=xml          # element holding this node's definition
        self.id=qid           # serialization Id; None for inline sub-queries
        self.type=None        # query type string, set by write_query
        self.logic=None       # logic name of a logical query ('Not', 'And', ...)
        self.subqueries=[]    # child Query objects of a logical query
        self.category=None    # filled in by the caller
        self.query=None       # predicate, set by write_query

    def write_query(self,qtype,tree):
        """Compile this node into self.query.

        qtype -- 'b:StructureQuery', 'b:ParameterQuery' or 'LogicalQuery';
                 any other value leaves self.query untouched.
        tree  -- dict of already-built Query objects keyed by serialization Id,
                 used to resolve the children referenced by a logical query.
        """
        self.type=qtype
        if qtype=='b:StructureQuery':
            self._write_structure_query()
        elif qtype=='b:ParameterQuery':
            self._write_parameter_query()
        elif qtype=='LogicalQuery':
            self._write_logical_query(tree)

    def _write_structure_query(self):
        """Build a SMARTS substructure predicate from the ComplexSearch payload."""
        qstring=self.xml.find(self._QUERIES_NS+'ComplexSearch').text
        # Capitalise the lowercase booleans so the payload parses as a Python literal.
        qstring=re.sub('false','False',qstring)
        qstring=re.sub('true','True',qstring)
        qdict=ast.literal_eval(qstring)
        smart=qdict['queries'][0]['smart']
        self.smart=smart
        split=None
        if '[Ch3,#1]' in smart:
            split=re.search(r'(.*)\[([^\(\)]*),([^\(\)].*)\]$',smart)
        if split is None:
            # Ordinary SMARTS (or no trailing two-way atom found): compile as-is.
            self.query=define_smart_match(smart)
            return
        # Trailing two-alternative atom: accept either the prefix on its own or
        # the prefix followed by the first alternative.
        prefix=split.group(1)
        matchers=[define_smart_match(prefix),
                  define_smart_match(prefix+'['+split.group(2)+']')]
        if not all(matchers):
            # One half did not compile: report it like any other bad SMARTS
            # instead of installing a predicate that fails when called.
            self.query=None
            return
        def smart_match(x):
            return any([matcher(x) for matcher in matchers])
        self.query=smart_match

    def _write_parameter_query(self):
        """Build a numeric comparison predicate from Operand / ParameterName / Value."""
        self.operand=self.xml.find(self._QUERIES_NS+'Operand').text
        self.prop=self.xml.find(self._QUERIES_NS+'ParameterName').text
        self.value=float(self.xml.find(self._QUERIES_NS+'Value').text)
        self.query=define_compare(self.prop,self.operand,self.value)

    def _write_logical_query(self,tree):
        """Combine child queries with Not / And; any other logic is treated as Or."""
        self.logic=self.xml.find(self._ENGINE_NS+'Logic').text
        elements=self.xml.find(self._ENGINE_NS+'Elements')
        children=elements.findall(self._ENGINE_NS+'Query')
        node_ids=[child.attrib[self._REF_ATTR] for child in children
                  if self._REF_ATTR in child.attrib]
        if self.logic=='Not':
            sq=tree[node_ids[0]] #Should only be one
            self.subqueries=[sq]
            def func(x):
                return not(sq.query(x))
        elif self.logic=='And':
            self.subqueries=[tree[node_id] for node_id in node_ids]
            def func(x):
                return all([sq.query(x) for sq in self.subqueries])
        else:
            self.subqueries=[tree[node_id] for node_id in node_ids]
            # Children carrying a type attribute are defined inline rather than
            # by reference; they are compiled here as structure queries.
            for child in children:
                if self._TYPE_ATTR in child.attrib:
                    extra_sq=Query(child)
                    extra_sq.write_query('b:StructureQuery',tree)
                    self.subqueries.append(extra_sq)
            def func(x):
                return any([sq.query(x) for sq in self.subqueries])
        self.query=func

    def print_tree(self,x,tabs=0):
        """Print this node and, indented below it, its sub-queries.

        Each line shows the node's id, type, type-specific details and the
        result of evaluating it on record `x` ('does not process' when the
        evaluation raises).
        """
        qinfo=(self.id,self.type)
        if self.type=='b:StructureQuery':
            qinfo=qinfo+(self.smart,)
        elif self.type=='b:ParameterQuery':
            qinfo=qinfo+(self.prop,self.value,self.operand)
        elif self.type=='LogicalQuery':
            qinfo=qinfo+(self.logic,)
        try:
            qinfo=qinfo+(self.query(x),)
        except Exception:
            # Exception rather than a bare except, so Ctrl-C still interrupts.
            qinfo=qinfo+('does not process',)
        print('\t'*tabs+str(qinfo))
        for sq in self.subqueries:
            sq.print_tree(x,tabs+1)

In [None]:
all_tests={}      # category caption -> top-level Query of that category
bad_smarts=set()  # SMARTS strings that failed to compile
bad_cats=set()    # categories containing at least one query without a predicate
import re
import ast
ENGINE_NS='{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}'
ID_ATTR='{http://schemas.microsoft.com/2003/10/Serialization/}Id'
TYPE_ATTR='{http://www.w3.org/2001/XMLSchema-instance}type'
for elem in e.iter('{http://schemas.microsoft.com/2003/10/Serialization/Arrays}anyType'):
    category=elem.find(ENGINE_NS+'Caption').text
    queries=elem.find(ENGINE_NS+'Expression')\
        .find(ENGINE_NS+'Queries')\
        .findall(ENGINE_NS+'Query')
    query_tree={}   # serialization Id -> Query, for resolving references between nodes
    top_query=None  # most recently built query of THIS category
    for wrapper in queries:
        query=wrapper.find(ENGINE_NS+'Content')
        if query is None:
            continue
        attributes=query.attrib
        if ID_ATTR not in attributes:
            continue
        query_id=attributes[ID_ATTR]
        query_type=attributes[TYPE_ATTR]
        q=Query(query,query_id)
        q.category=category
        q.write_query(query_type,query_tree)
        if not q.query or not all([sq.query for sq in q.subqueries]): #Smarts did not compile, sqs needed bc of hidden sqs in or queries
            bad_cats.add(category)
            if q.type=='b:StructureQuery':
                bad_smarts.add(q.smart)
        query_tree[query_id]=q
        top_query=q
    if top_query is None:
        # Nothing with an Id in this category. Skip it rather than reuse the
        # query_id left over from the previous category.
        print('No identifiable query for category:',category)
        continue
    all_tests[category]=top_query #Final one should always be the top level query hopefully

In [None]:
# Working set of category tests: everything parsed, minus the categories that
# contain a query which could not be compiled.
tests=dict(all_tests)
for category in bad_cats:
    tests.pop(category)

In [None]:
# import dill
# with open(DAT_DIR+'tests.pkl','w') as f:
#     dill.dump(tests,f)

<h1>Test on OECD_NCC_ToxVal</h1>

In [None]:
# Load the ToxVal spreadsheet; the cells below read its 'CAS Number', 'SMILES'
# and 'US-EPA New Chemical Categories' columns.
tv=pd.read_excel(DAT_DIR+'toxval_epa_categories.xlsx')
len(tv)

In [None]:
def rowify(i):
    """Return row `i` of `tv` as a dict, with 'mol' added from its 'smiles' value."""
    record=dict(tv.loc[i])
    record['mol']=Chem.MolFromSmiles(record['smiles'])
    return record

In [None]:
# CAS number -> DSSTox substance id for every chemical in the sheet.
casns=list(tv['CAS Number'])
toxval_sids=dsstox.find({'casrn':{'$in':casns}})
sids_dict={record['casrn']:record['dsstox_sid'] for record in toxval_sids}
# $in needs a real list; on Python 3 dict.values() is only a view.
sids=list(sids_dict.values())
# Substance id -> first OPERA LogP value, skipping records without one.
toxval_logp=physprop.find({'dsstox_sid':{'$in':sids}},{'_id':0,'dsstox_sid':1,'predicted_props.OPERA_LogP':1})
logp_dict={record['dsstox_sid']:record.get('predicted_props',{})['OPERA_LogP'][0] for record in toxval_logp \
           if 'OPERA_LogP' in record.get('predicted_props',{}) and record.get('dsstox_sid',None)}
# Substance id -> first OPERA water-solubility value, skipping records without one.
toxval_ws=physprop.find({'dsstox_sid':{'$in':sids}},{'_id':0,'dsstox_sid':1,'predicted_props.OPERA_WS':1})
ws_dict={record['dsstox_sid']:record.get('predicted_props',{})['OPERA_WS'][0] for record in toxval_ws \
           if 'OPERA_WS' in record.get('predicted_props',{}) and record.get('dsstox_sid',None)}
# Substance id -> molecular weight. Only the two needed fields are fetched, and
# records without a weight are skipped (their rows are dropped later as incomplete).
toxval_weight=dsstox.find({'dsstox_sid':{'$in':sids}},{'_id':0,'dsstox_sid':1,'mol_weight':1})
weight_dict={record['dsstox_sid']:record['mol_weight'] for record in toxval_weight if 'mol_weight' in record}

In [None]:
# Attach the substance id and the looked-up properties to every row.
tv['dsstox_sid']=tv['CAS Number'].map(sids_dict)
tv['logp']=tv['dsstox_sid'].map(logp_dict)
tv['ws']=tv['dsstox_sid'].map(ws_dict)
tv['mol_weight']=tv['dsstox_sid'].map(weight_dict)
# axis is passed by keyword: recent pandas no longer accepts it positionally.
tv=tv.drop(['Molecular Formula','Predefined substance type','Additional Ids','Composition','CAS Smiles relation'],axis='columns')
tv=tv.rename(columns={'SMILES':'smiles'})
# Keep only fully populated rows (every test input must be present), then de-duplicate.
tv=tv[pd.notnull(tv).all(axis=1)]
tv=tv.drop_duplicates()
len(tv)

In [None]:
#from pymongo import InsertOne
# dsstox_sid -> '|'-joined names of every matching category, or 'Not categorized'.
toxval_cats={}
for i,row in tv.iterrows():
    record=dict(row)
    sid=record['dsstox_sid']
    mol=Chem.MolFromSmiles(record['smiles'])
    if not mol:
        # SMILES that RDKit could not parse: report the row index and leave it uncategorized.
        print(i)
        continue
    record['mol']=mol
    # .items() runs on Python 2 and 3 alike (iteritems exists only on Python 2).
    categories=[category for category,test in tests.items() if test.query(record)]
    toxval_cats[sid]='|'.join(categories) if categories else 'Not categorized'

In [None]:
# Attach the computed labels; rows with no label (skipped in the loop above) are dropped.
tv['categories']=tv['dsstox_sid'].map(toxval_cats)
tv=tv[pd.notnull(tv['categories'])]
len(tv)

In [None]:
len(toxval_cats)

In [None]:
# Labels that occur on their own (no '|' separator) in our output and in the
# reference column. regex=False matches the literal character and avoids the
# invalid '\|' escape of a non-raw string.
mymatchset=set(tv[~tv['categories'].str.contains('|',regex=False)]['categories'].unique())
theirmatchset=set(tv[~tv['US-EPA New Chemical Categories'].str.contains('|',regex=False)]['US-EPA New Chemical Categories'].unique())
# Single-category reference labels that we never produce on their own.
missmatchset=theirmatchset-mymatchset

In [None]:
import re
esc_bad_cats=[re.escape(c) for c in bad_cats]
reference_labels=tv['US-EPA New Chemical Categories']
# Rows whose reference label mentions a category we could not compile are not
# comparable. With no bad categories the joined pattern would be '' and match
# every row, so treat that case as "no row is excluded".
if esc_bad_cats:
    in_bad_cats=reference_labels.str.contains('|'.join(esc_bad_cats))
else:
    in_bad_cats=pd.Series(False,index=tv.index)
mismatched=tv[~in_bad_cats & (reference_labels!=tv['categories'])]
# A bare expression inside a `with` block is never displayed by the notebook,
# so print while the row limit is lifted.
with pd.option_context('display.max_rows', None):
    print(mismatched)

In [None]:
len(mismatched)

In [None]:
#mismatched.to_excel(DAT_DIR+'mismatched_categories.xlsx')

In [None]:
# with open(DAT_DIR+'bad_categories.txt','w') as f:
#     f.write('\n'.join(list(bad_cats)))

<h1>Test on ToxCast</h1>

In [None]:
tc=pd.read_excel(DAT_DIR+'OECD_NCC_TXCST.xlsx')

In [None]:
len(tc)
tc.head()

In [None]:
# CAS number -> DSSTox substance id for every chemical in the ToxCast sheet.
casns=list(tc['CAS Number'])
toxval_sids=dsstox.find({'casrn':{'$in':casns}})
sids_dict={record['casrn']:record['dsstox_sid'] for record in toxval_sids}
# $in needs a real list; on Python 3 dict.values() is only a view.
sids=list(sids_dict.values())
# Substance id -> first OPERA LogP value, skipping records without one.
toxval_logp=physprop.find({'dsstox_sid':{'$in':sids}},{'_id':0,'dsstox_sid':1,'predicted_props.OPERA_LogP':1})
logp_dict={record['dsstox_sid']:record.get('predicted_props',{})['OPERA_LogP'][0] for record in toxval_logp \
           if 'OPERA_LogP' in record.get('predicted_props',{}) and record.get('dsstox_sid',None)}
# Substance id -> first OPERA water-solubility value, skipping records without one.
toxval_ws=physprop.find({'dsstox_sid':{'$in':sids}},{'_id':0,'dsstox_sid':1,'predicted_props.OPERA_WS':1})
ws_dict={record['dsstox_sid']:record.get('predicted_props',{})['OPERA_WS'][0] for record in toxval_ws \
           if 'OPERA_WS' in record.get('predicted_props',{}) and record.get('dsstox_sid',None)}
# Substance id -> molecular weight. Only the two needed fields are fetched, and
# records without a weight are skipped (their rows are dropped later as incomplete).
toxval_weight=dsstox.find({'dsstox_sid':{'$in':sids}},{'_id':0,'dsstox_sid':1,'mol_weight':1})
weight_dict={record['dsstox_sid']:record['mol_weight'] for record in toxval_weight if 'mol_weight' in record}

In [None]:
# Attach the substance id and the looked-up properties to every row.
tc['dsstox_sid']=tc['CAS Number'].map(sids_dict)
tc['logp']=tc['dsstox_sid'].map(logp_dict)
tc['ws']=tc['dsstox_sid'].map(ws_dict)
tc['mol_weight']=tc['dsstox_sid'].map(weight_dict)
# axis is passed by keyword: recent pandas no longer accepts it positionally.
tc=tc.drop(['Predefined substance type','Additional Ids','Composition','CAS Smiles relation'],axis='columns')
tc=tc.rename(columns={'SMILES':'smiles'})
# Keep only fully populated rows (every test input must be present), then de-duplicate.
tc=tc[pd.notnull(tc).all(axis=1)]
tc=tc.drop_duplicates()
len(tc)

In [None]:
# dsstox_sid -> '|'-joined names of every matching category, or 'Not categorized'.
toxcast_cats={}
for i,row in tc.iterrows():
    record=dict(row)
    sid=record['dsstox_sid']
    mol=Chem.MolFromSmiles(record['smiles'])
    if not mol:
        # SMILES that RDKit could not parse: report the row index and leave it uncategorized.
        print(i)
        continue
    record['mol']=mol
    # .items() runs on Python 2 and 3 alike (iteritems exists only on Python 2).
    categories=[category for category,test in tests.items() if test.query(record)]
    toxcast_cats[sid]='|'.join(categories) if categories else 'Not categorized'

In [None]:
# Attach the computed labels; rows with no label (skipped in the loop above) are dropped.
tc['categories']=tc['dsstox_sid'].map(toxcast_cats)
tc=tc[pd.notnull(tc['categories'])]

In [None]:
esc_bad_cats=[re.escape(c) for c in bad_cats]
# Rows whose reference label mentions a category we could not compile.
# With no bad categories the joined pattern would be '' and select every row,
# so select nothing in that case.
if esc_bad_cats:
    in_bad_cats=tc['US-EPA New Chemical Categories'].str.contains('|'.join(esc_bad_cats))
else:
    in_bad_cats=pd.Series(False,index=tc.index)
tc[in_bad_cats]

In [None]:
# Index labels of rows where our category set differs from the reference set.
# Labels are compared as sets, so the order inside the '|'-joined strings is
# irrelevant. Rows whose reference set includes an uncompilable category are skipped.
mismatched_indexes=[]
for i,row in tc.iterrows():
    theirset=set(row['US-EPA New Chemical Categories'].split('|'))
    if not (theirset&bad_cats):
        myset=set(row['categories'].split('|'))
        if myset!=theirset:
            mismatched_indexes.append(i)

In [None]:
# A bare expression inside a `with` block is never displayed by the notebook,
# so print while the wider column setting is active.
with pd.option_context('display.max_colwidth',100):
    print(tc.loc[mismatched_indexes])

<h1>Fix bad SMARTS</h1>

In [None]:
bad_cats

In [None]:
new_tests={}

In [None]:
#Aliphatic amines
def create_test():
    """Build the 'Aliphatic Amines' predicate over a chemical record."""
    # Primary, secondary and tertiary amine patterns; each excludes nitrogens
    # bonded to C=O/C=N/C=S and N-C-N arrangements.
    amine_patterns=[Chem.MolFromSmarts(smarts) for smarts in (
        '[NX3;H2;!$(NC=[O,N,S]);!$(NCN)][CX3]',
        '[NX3;H1;!$(NC=[O,N,S]);!$(NCN)](C)[CX3]',
        '[N;!$(NC=[O,N,S]);!$(NCN)](C)(C)[CX3]',
    )]
    def test(x):
        mol=x['mol']
        smiles=x['smiles']
        mw=x['mol_weight']
        # Text-level screen: no 'c' and no '1' anywhere in the SMILES, weight under 1000.
        passes_screen='c' not in smiles and mw<1000 and '1' not in smiles
        return passes_screen and any(mol.HasSubstructMatch(pattern) for pattern in amine_patterns)
    return test
new_tests['Aliphatic Amines']=create_test()

In [None]:
#Alkoxysilanes
def create_test():
    """Build the 'Alkoxysilanes' predicate: weight under 1000 and a C-O-Si linkage."""
    alkoxysilane=Chem.MolFromSmarts('[CX4]O[SiX4]')
    def test(x):
        mol,mw=x['mol'],x['mol_weight']
        return mw<1000 and mol.HasSubstructMatch(alkoxysilane)
    return test
new_tests['Alkoxysilanes']=create_test()

In [None]:
#Aminobenzothiazole Azo Dyes
def create_test():
    """Build the 'Aminobenzothiazole Azo Dyes' predicate.

    The query is parsed as a molecule (MolFromSmiles), not as SMARTS.
    """
    scaffold=Chem.MolFromSmiles('N=NC1=NC2=C(S1)C=CC=C2')
    def test(x):
        return x['mol'].HasSubstructMatch(scaffold)
    return test
new_tests['Aminobenzothiazole Azo Dyes']=create_test()

In [None]:
#Anionic Surfactants
def create_test():
    """Build the 'Anionic Surfactants' predicate.

    A record passes when its SMILES uses only the allowed characters, has no
    short parenthesised carbon branch, carries one of the acid head groups and
    has all of its 'C' characters in one uninterrupted run.
    """
    # Sulfate, sulfonate, phosphate, carboxylic acid and silicic acid head groups.
    head_groups=[Chem.MolFromSmarts(smarts) for smarts in (
        'COS(=O)(=O)[OH,O-]',
        'CS(=O)(=O)[OH,O-]',
        'COP([OH1])([OH1])=O',
        '[CX3;!$(Cc)](=O)[OX2H1]',
        '[Si][OX2H]',
    )]
    allowed_chars=set(['C','c','O','P','S','i','[',']','(',')','='])
    # Compiled once; raw string so the escaped parentheses are not an invalid escape.
    short_branch=re.compile(r'\(.?C.?\)')
    def test(x):
        mol=x['mol']
        smiles=x['smiles']
        if set(smiles)-allowed_chars:
            return False
        if short_branch.search(smiles):
            return False
        rgroup_indexes=[i for i,atom in enumerate(smiles) if atom=='C']
        if not rgroup_indexes:
            # No aliphatic carbon at all, so there is no alkyl chain to test.
            # The original min()/max() raised ValueError here (e.g. silicic acid).
            return False
        # Tests for straight alkyl chains. list(...) keeps this a list-to-list
        # comparison on Python 3, where range() is not a list.
        is_straight_chain=rgroup_indexes==list(range(rgroup_indexes[0],rgroup_indexes[-1]+1))
        return any(mol.HasSubstructMatch(group) for group in head_groups) and is_straight_chain
    return test
new_tests['Anionic Surfactants']=create_test()

In [None]:
# Scratch check of the straight-chain rule on a branched sulfate: the positions
# of 'C' in the SMILES are contiguous only for an unbranched chain.
smiles='CCCCC(CC)CCC(CC(C)C)OS(O)(=O)=O'
rgroup_indexes=[i for i,atom in enumerate(smiles) if atom=='C']
# list(...) so the comparison is list-to-list on Python 3 as well.
is_contiguous=sorted(rgroup_indexes)==list(range(min(rgroup_indexes),max(rgroup_indexes)+1))
is_contiguous

In [None]:
#Benzotriazoles
def create_test():
    """Build a 'Benzotriazoles' predicate from an aromatic SMARTS.

    Note: the 'Benzotriazoles' key is assigned again by a later cell of this
    notebook, which replaces the predicate registered here.
    """
    ring_system=Chem.MolFromSmarts('n1c2ccccc2nn1')
    def test(x):
        return x['mol'].HasSubstructMatch(ring_system)
    return test
new_tests['Benzotriazoles']=create_test()

In [None]:
#Dianilines
def create_test():
    """Build the 'Dianilines' predicate.

    Requires exactly two matches of the bridged aminophenyl pattern and none
    of the two exclusion patterns.
    """
    dianiline=Chem.MolFromSmarts('c1cc([NH2])ccc1[CH2,O,N,S]c1ccccc1')
    exclusions=(
        Chem.MolFromSmarts('c1ccccc1[A]~[A]'),
        Chem.MolFromSmarts('c1ccccc1[A](c)c'),
    )
    def test(x):
        mol=x['mol']
        for excluded in exclusions:
            if mol.HasSubstructMatch(excluded):
                return False
        return len(mol.GetSubstructMatches(dianiline))==2 #lol
    return test
new_tests['Dianilines']=create_test()

In [None]:
def _dithiocarbamate_queries():
    """Query molecules shared by both dithiocarbamate tests.

    One C(i)-N-C(=S)-S-C(j) molecule for every i, j in 1..4, followed by
    ethylenebisdithiocarbamate.
    """
    queries=[Chem.MolFromSmiles('C'*i + 'NC(=S)S' + 'C'*j)
             for i in range(1,5) for j in range(1,5)]
    queries.append(Chem.MolFromSmiles('SC(=S)NCCNC(=S)S'))
    return queries

def create_test():
    """Acute variant: weight under 1000, logP under 5, any dithiocarbamate query matches."""
    dithiocarbamates=_dithiocarbamate_queries()
    def test(x):
        mol=x['mol']
        return x['mol_weight']<1000 and x['logp']<5 and any([mol.HasSubstructMatch(query) for query in dithiocarbamates])
    return test
new_tests['Dithiocarbamates (Acute toxicity)']=create_test()

def create_test():
    """Chronic variant: weight under 1000, logP in [5, 19), any dithiocarbamate query matches."""
    dithiocarbamates=_dithiocarbamate_queries()
    def test(x):
        mol=x['mol']
        return x['mol_weight']<1000 and x['logp']>=5 and x['logp']<19 and any([mol.HasSubstructMatch(query) for query in dithiocarbamates])
    return test
new_tests['Dithiocarbamates (Chronic toxicity)']=create_test()

In [None]:
#Ethylene Glycol Ethers
#Have to enumerate       
def create_test():
    """Build the 'Ethylene Glycol Ethers' predicate, which works on the SMILES text only.

    A record passes when its SMILES
      * uses only the characters C, c, O, 1 and parentheses,
      * has at least two 'O' and exactly 'CC' between each consecutive pair,
      * has no ring-closure digit on both sides of an oxygen,
      * has at most 7 'C' and at most 6 'c' before the first and after the last oxygen,
      * and has something before the first or after the last oxygen.

    The enumerated match molecules the original factory built were never used
    by the predicate, so they are no longer constructed.
    """
    allowed_chars=set(['C','c','O','1','(',')'])
    ring_through_oxygen=re.compile('1.*O.*1')
    def test(x):
        smiles=x['smiles']
        if set(smiles)-allowed_chars:
            return False
        if smiles.count('O')<2:
            return False
        # Named o_positions (not `os`) so the os module is not shadowed.
        o_positions=[i for i,char in enumerate(smiles) if char=='O']
        between_os=[smiles[(start+1):end] for start,end in zip(o_positions,o_positions[1:])]
        if any([between!='CC' for between in between_os]):
            return False
        if ring_through_oxygen.search(smiles):
            return False
        head=smiles[0:o_positions[0]]
        tail=smiles[(o_positions[-1]+1):]
        if head.count('C')>7 or tail.count('C')>7:
            return False
        if head.count('c')>6 or tail.count('c')>6:
            return False
        # A bare glycol (nothing on either end) is not an ether.
        return bool(head or tail)
    return test
new_tests['Ethylene Glycol Ethers']=create_test()

In [None]:
#Neutral Organics
#Verhaar scheme, see paper called Classifying Environmental Pollutants
def create_test():
    """Build the 'Neutral Organics' predicate (rule numbers follow the comments below).

    Every SMARTS is compiled once here instead of on each call of the
    predicate; the rule logic and its order are unchanged.
    """
    smarts={
        'disallowed_atom':'[!C;!c;!N;!O;!F;!Cl;!Br,I]',
        'non_carbon':'[!C;!c]',
        'not_c_or_halogen':'[!C;!c;!Cl;!Br;!F]',
        'activated_halide':'[Cl,Br,F]C[$(C=C),$(Cc)]',
        'not_c_o_or_halogen':'[!C;!c;!O;!Cl;!Br;!F]',
        'ether':'COC',
        'peroxide':'COOC',
        'epoxide':'C1OC1',
        'alcohol':'[C;!$(C=O)][OH]',
        'allylic_oxygen':'C=CCO',
        'propargylic_oxygen':'C#CCO',
        'benzylic_oxygen':'cCO',
        'carbonyl':'[C;!$(CO)]=O',
        'conjugated_carbonyl':'[$(cC),$(C=C)]C=O',
        'acyl_halide':'[Cl,Br]C=O',
        'alpha_halo_carbonyl':'[Cl,Br]CC=O',
        'not_aliphatic_c_or_n':'[!C;!N]',
        'amine':'C[NH,NH0]',
    }
    patterns=dict((name,Chem.MolFromSmarts(text)) for name,text in smarts.items())
    def test(x):
        mol=x['mol']
        def has(name):
            return mol.HasSubstructMatch(patterns[name])
        if has('disallowed_atom'): #Rule 0.1 and 1.1
            return False
        logp=x['logp']
        if logp>8:
            return False
        mw=x['mol_weight']
        if mw>1000:
            return False
        if not has('non_carbon'): #Rule 1.3
            return True
        if not has('not_c_or_halogen') and not has('activated_halide'): #Rule 1.4
            return True
        if not has('not_c_o_or_halogen'): #Rule 1.5
            if has('ether') and not has('peroxide') and not has('epoxide'): #Rule 1.5.1 and 1.7
                return True
            if has('alcohol') and not has('allylic_oxygen')\
            and not has('propargylic_oxygen') and not has('benzylic_oxygen'): #Rule 1.5.2, 1.5.3, and 1.7
                return True
            if has('carbonyl') and not has('conjugated_carbonyl')\
            and not has('acyl_halide') and not has('alpha_halo_carbonyl'): #Rule 1.5.4 and 1.7
                return True
            return False
        return not has('not_aliphatic_c_or_n') and has('amine') #Rule 1.6
    return test
new_tests['Neutral Organics']=create_test()

In [None]:
#Nonionic Surfactants
# nonsurf1=Chem.MolFromSmarts('COCCO')
# nonsurf2=Chem.MolFromSmarts('COCCOC')
# def test(x):
#     mol=x['mol']
#     return mol.HasSubstructMatch(nonsurf1) or mol.HasSubstructMatch(nonsurf2)
import re
def test(x):
    """First-pass 'Nonionic Surfactants' predicate.

    Passes an unbranched SMILES (no parentheses) that contains at least one
    'O', matches C-O-C, does not match C=O, and whose pieces between 'O'
    characters consist of 'C' only.

    Note: the 'Nonionic Surfactants' key is assigned again by the next cell of
    this notebook, which replaces the predicate registered here.
    """
    smiles=x['smiles']
    if '(' in smiles:
        return False
    segments=smiles.split('O')
    if len(segments)==1:
        return False
    mol=x['mol']
    if not mol.HasSubstructMatch(Chem.MolFromSmiles('COC')):
        return False
    if mol.HasSubstructMatch(Chem.MolFromSmiles('C=O')):
        return False
    for segment in segments:
        if re.search(r'[^C]',segment):
            return False
    return True
new_tests['Nonionic Surfactants']=test

In [None]:
#Nonionic Surfactants
import math
def create_test():
    """Build the 'Nonionic Surfactants' predicate.

    A record passes when its SMILES contains only carbon and oxygen letters,
    the molecule has a CH3 followed by five non-ring carbons, there is more
    than one oxygen letter, and (number of O-CH2-CH2 matches // 2) + 1 equals
    the number of aliphatic oxygen atoms.
    """
    # Compiled once instead of on every call.
    alkyl_tail=Chem.MolFromSmarts('[CH3][CR0][CR0][CR0][CR0][CR0]')
    ethoxy_unit=Chem.MolFromSmarts('O[CH2][CH2]')
    oxygen=Chem.MolFromSmarts('[O]')
    def test(x):
        mol=x['mol']
        atoms=[a for a in x['smiles'].lower() if a.isalpha()]
        if set(atoms)-set(['c','o']):
            return False
        # Explicit floor division: same value as floor(len/2) on Python 2 and 3.
        return mol.HasSubstructMatch(alkyl_tail) and\
        atoms.count('o')>1 and\
        (len(mol.GetSubstructMatches(ethoxy_unit))//2+1)==len(mol.GetSubstructMatches(oxygen))
    return test
new_tests['Nonionic Surfactants']=create_test()

In [None]:
#Organotins (Acute toxicity) and Organotins (Chronic toxicity)

def create_test():
    """Acute variant: weight under 1000, a C-Sn bond, logP at most 13.7."""
    carbon_tin=Chem.MolFromSmarts('C[Sn]')
    def test(x):
        mol=x['mol']
        return x['mol_weight']<1000 and mol.HasSubstructMatch(carbon_tin) and x['logp']<=13.7
    return test
new_tests['Organotins (Acute toxicity)']=create_test()

def create_test():
    """Chronic variant: weight under 1000, a C-Sn bond, logP at least 13.7.

    NOTE(review): a logP of exactly 13.7 satisfies both variants -- confirm
    whether one bound should be strict.
    """
    carbon_tin=Chem.MolFromSmarts('C[Sn]')
    def test(x):
        mol=x['mol']
        return x['mol_weight']<1000 and mol.HasSubstructMatch(carbon_tin) and x['logp']>=13.7
    return test
new_tests['Organotins (Chronic toxicity)']=create_test()

In [None]:
#Persistent, Bioaccumulative and Toxic (PBT) Chemicals
#MW<1000
#OPERA_HL > np.log(60)
#Ready biodegradability ?????
#LogP>4.2
#Not sure this one is worth doing without ready biodegradability
# def test(x):
#     mol=x['mol']
# new_tests['Persistent, Bioaccumulative and Toxic (PBT) Chemicals']=test

In [None]:
#Polynitroaromatics (Acute toxicity) and Polynitroaromatics (Chronic toxicity)
#MW < 1000

def create_test():
    """Acute variant: weight under 1000, logP under 7, and an O-N(=O) group on
    an aromatic ring that carries a second N(O)=O group."""
    nitro_pair=Chem.MolFromSmarts('ON(=O)[$(c1c(N(O)=O)cccc1),$(c1cc(N(O)=O)ccc1),$(c1ccc(N(O)=O)cc1),$(c1cncc(N(O)=O)c1)]')
    def test(x):
        mol=x['mol']
        return x['mol_weight']<1000 and mol.HasSubstructMatch(nitro_pair) and x['logp']<7
    return test
new_tests['Polynitroaromatics (Acute toxicity)']=create_test()

def create_test():
    """Chronic variant: weight under 1000, logP at least 10, and the ring pattern below.

    NOTE(review): unlike the acute pattern, this one only requires two
    nitrogens on the ring (no oxygens), and logP values in [7, 10) satisfy
    neither variant -- confirm both are intended.
    """
    nitrogen_pair=Chem.MolFromSmarts('N[$(c1c(N)cccc1),$(c1cc(N)ccc1),$(c1ccc(N)cc1),$(c1cncc(N)c1)]')
    def test(x):
        mol=x['mol']
        return x['mol_weight']<1000 and mol.HasSubstructMatch(nitrogen_pair) and x['logp']>=10
    return test
new_tests['Polynitroaromatics (Chronic toxicity)']=create_test()

In [None]:
#Substituted Triazines (Acute toxicity) and Substituted Triazines (Chronic toxicity)
#logp<5
#MW<1000
def create_test():
    """Acute variant: weight under 1000, a triazine ring pattern, logP under 5."""
    triazine=Chem.MolFromSmarts('[$(n1nnccc1.[!#1]),$(n1ncncc1.[!#1]),$(n1cncnc1.[!#1])]')#[!H] did not work as expected with aromatics
    def test(x):
        mol=x['mol']
        return x['mol_weight']<1000 and mol.HasSubstructMatch(triazine) and x['logp']<5
    return test
new_tests['Substituted Triazines (Acute toxicity)']=create_test()

def create_test():
    """Chronic variant: weight under 1000, a triazine ring pattern, logP in (5, 8].

    NOTE(review): a logP of exactly 5 satisfies neither variant -- confirm.
    """
    triazine=Chem.MolFromSmarts('[$(n1nnccc1.[!#1]),$(n1ncncc1.[!#1]),$(n1cncnc1.[!#1])]')#[!H] did not work as expected with aromatics
    def test(x):
        mol=x['mol']
        return x['mol_weight']<1000 and mol.HasSubstructMatch(triazine) and x['logp']>5 and x['logp']<=8
    return test
new_tests['Substituted Triazines (Chronic toxicity)']=create_test()

In [None]:
def convert_ppb(x): #OPERA results stored as mol/L
    """Return the record's water solubility times its molecular weight times 10**6.

    Reads x['ws'] and x['mol_weight'].
    """
    return x['ws']*x['mol_weight']*10**6

In [None]:
#Triarylmethane Pigments/Dyes with Non-solubilizing Groups
def create_test():
    """Build the triarylmethane predicate.

    Passes when convert_ppb(record) exceeds 1 and the molecule matches either
    the triphenylmethane or the diphenylnaphthylmethane pattern.
    """
    # Allowed para substituents: NH2, O, or N carrying methyl/ethyl groups.
    para_permutations='[NH2,O,$([NH1][CH3]),$([NH1][CH2][CH3]),$(N([CH3])[CH3]),$(N([CH3])[CH2][CH3]),$(N([CH2][CH3])[CH2][CH3])]'
    scaffold_templates=(
        '[cH]1[cH]c({})[cH][cH]c1C(c2[cH][cH]c({})[cH][cH]2)=C3[CH]=[CH]C(=[NH,O])[CH]=[CH]3',
        '[cH]1[cH]c({})[cH][cH]c1C(c2[cH][cH]c({})[cH]3[cH][cH][cH][cH][cH]32)=C3[CH]=[CH]C(=[NH,O])[CH]=[CH]3',
    )
    triphenylmethane,diphenylnaphthylmethane=[
        Chem.MolFromSmarts(template.format(para_permutations,para_permutations))
        for template in scaffold_templates
    ]
    def test(x):
        mol=x['mol']
        return convert_ppb(x)>1 and (mol.HasSubstructMatch(triphenylmethane) or mol.HasSubstructMatch(diphenylnaphthylmethane))
    return test
new_tests['Triarylmethane Pigments/Dyes with Non-solubilizing Groups']=create_test()

In [None]:
#beta-Naphthylamines, Sulfonated
def create_test():
    """Build the 'beta-Naphthylamines, Sulfonated' predicate.

    Enumerates naphthylamine SMARTS with a sulfonyl-bearing carbon and an
    optional OH/NH2-bearing carbon at every pair of positions on the second
    ring. A record passes when some pattern matches the molecule and the
    molecule, used as a query, also matches that pattern.
    """
    prefix='[NH2]c1[cH1,$(cO)]'
    suffix='[cH][cH]1'
    sulfonyl_carbon='c([$(S(=O)(=O)[OH]),$(S(=O)(=O)[CH2][CH2]S[OH3])])'
    smarts=[]
    for c1 in range(1,4):
        for c2 in range(c1+1,5):
            before='[cH]'*(c1-1)
            between='[cH]'*(c2-c1-1)
            after='[cH]'*(4-c2)
            # Optional substituent first, sulfonyl second ...
            smarts.append(prefix+'c2'+before+'[cH,$(c[OH]),$(c[NH2])]'+between+sulfonyl_carbon+after+'c2'+suffix)
            # ... and the mirrored arrangement.
            smarts.append(prefix+'c2'+before+sulfonyl_carbon+between+'[cH1,$(c[OH]),$(c[NH2])]'+after+'c2'+suffix)
    match_mols=[Chem.MolFromSmarts(smart) for smart in smarts]
    def test(x):
        mol=x['mol']
        return any([mol.HasSubstructMatch(pattern) and pattern.HasSubstructMatch(mol) for pattern in match_mols])
    return test
new_tests['beta-Naphthylamines, Sulfonated']=create_test()

In [None]:
#Aldehydes
#Turns out these are just wrong in the toolbox, although does compile
def create_test():
    """Acute variant: an aldehyde (or formaldehyde), weight under 1000, logP at most 6."""
    aldehyde_patterns=(
        Chem.MolFromSmarts('[CH2](=O)'), #Needs to be special case because buggy way RDKit handles hydrogens
        Chem.MolFromSmarts('[CH1](=[O])[C,c]'),
    )
    def test(x):
        mol=x['mol']
        mw=x['mol_weight']
        logp=x['logp']
        return any(mol.HasSubstructMatch(pattern) for pattern in aldehyde_patterns) and mw<1000 and logp<=6
    return test
new_tests['Aldehydes (Acute toxicity)']=create_test()

def create_test():
    """Chronic variant: an aldehyde (or formaldehyde), weight under 1000, logP above 6."""
    aldehyde_patterns=(
        Chem.MolFromSmarts('[CH2](=O)'), #Needs to be special case because buggy way RDKit handles hydrogens
        Chem.MolFromSmarts('[CH1](=[O])[C,c]'),
    )
    def test(x):
        mol=x['mol']
        mw=x['mol_weight']
        logp=x['logp']
        return any(mol.HasSubstructMatch(pattern) for pattern in aldehyde_patterns) and mw<1000 and logp>6
    return test
new_tests['Aldehydes (Chronic toxicity)']=create_test()

In [None]:
#Benzotriazoles
#Not a valid smarts from toolbox
def create_test():
    """Build the 'Benzotriazoles' predicate.

    The Kekule string is parsed as a molecule (MolFromSmiles), so RDKit
    aromatises it and it matches the aromatic ring system of real
    benzotriazoles. Parsed as SMARTS, its upper-case atoms would require
    aliphatic atoms and could not match them.
    """
    benzotriazole=Chem.MolFromSmiles('N1N=NC2=C1C=CC=C2')
    def test(x):
        mol=x['mol']
        return mol.HasSubstructMatch(benzotriazole)
    return test
new_tests['Benzotriazoles']=create_test()

In [None]:
#Imides
#Doesn't work if carbons are part of aromatic
def create_test():
    """Acute variant: an imide C(=O)-N-C(=O) group that is not excluded by the
    aromatic pattern, logP at most 5, weight under 1000."""
    imide=Chem.MolFromSmarts('C(=O)NC(=O)')
    excluded=Chem.MolFromSmarts('c1C(=O)NC(=O)ccccc1')
    def test(x):
        mol=x['mol']
        mw=x['mol_weight']
        logp=x['logp']
        if not mol.HasSubstructMatch(imide):
            return False
        if mol.HasSubstructMatch(excluded):
            return False
        return logp<=5 and mw<1000
    return test
new_tests['Imides (Acute toxicity)']=create_test()

def create_test():
    """Chronic variant: same imide group, logP in (5, 8), weight under 1000.

    NOTE(review): the exclusion pattern here differs from the acute one
    ('c1(C(=O)NC(=O))ccccc1' vs 'c1C(=O)NC(=O)ccccc1') -- confirm which is intended.
    """
    imide=Chem.MolFromSmarts('C(=O)NC(=O)')
    excluded=Chem.MolFromSmarts('c1(C(=O)NC(=O))ccccc1')
    def test(x):
        mol=x['mol']
        mw=x['mol_weight']
        logp=x['logp']
        if not mol.HasSubstructMatch(imide):
            return False
        if mol.HasSubstructMatch(excluded):
            return False
        return logp>5 and logp<8 and mw<1000
    return test
new_tests['Imides (Chronic toxicity)']=create_test()

In [None]:
#Hydrazines and related compounds
def create_test():
    """Build the 'Hydrazines and Related Compounds' predicate: any of the four
    N-N patterns below and a weight under 500."""
    hydrazine_patterns=[Chem.MolFromSmarts(smarts) for smarts in (
        '[NX3][NX3]',
        '[CX3]=[NX2][NX2]',
        '[CX3](=O)[NX2][NX3]',
        '[NX2][CX3](=O)[NX2][NX3]',
    )]
    def test(x):
        mol=x['mol']
        mw=x['mol_weight']
        return any(mol.HasSubstructMatch(pattern) for pattern in hydrazine_patterns) and mw<500
    return test
new_tests['Hydrazines and Related Compounds']=create_test()

In [None]:
#Thiols
def create_test():
    """Acute variant: a carbon-bound SH group, weight under 1000, logP under 6.5."""
    carbon_thiol=Chem.MolFromSmarts('[C,c][SX2H]')
    def test(x):
        mol=x['mol']
        mw=x['mol_weight']
        logp=x['logp']
        if not mol.HasSubstructMatch(carbon_thiol):
            return False
        return mw<1000 and logp<6.5
    return test
new_tests['Thiols (Acute toxicity)']=create_test()

def create_test():
    """Chronic variant: a carbon-bound SH group, weight under 1000, logP in [6.5, 9)."""
    carbon_thiol=Chem.MolFromSmarts('[C,c][SX2H]')
    def test(x):
        mol=x['mol']
        mw=x['mol_weight']
        logp=x['logp']
        if not mol.HasSubstructMatch(carbon_thiol):
            return False
        return mw<1000 and logp>=6.5 and logp<9
    return test
new_tests['Thiols (Chronic toxicity)']=create_test()

In [None]:
#Acrylamides
def create_test():
    """Build the 'Acrylamides' predicate: an acrylamide or methacrylamide group,
    weight under 1000, logP under 8."""
    amide_patterns=(
        Chem.MolFromSmarts('[CH2]=[CH1]C(=O)[NH,NH2]'),
        Chem.MolFromSmarts('[CH2]=C([CH3])C(=O)[NH,NH2]'),
    )
    def test(x):
        mol=x['mol']
        mw=x['mol_weight']
        logp=x['logp']
        return any(mol.HasSubstructMatch(pattern) for pattern in amide_patterns) and mw<1000 and logp<8
    return test
new_tests['Acrylamides']=create_test()

In [None]:
#Acrylates/Methacrylates
def create_test():
    """Acute variant: an acrylate or methacrylate group, logP at most 5, weight under 1000."""
    ester_patterns=(
        Chem.MolFromSmarts('[CH2]=[CH]C(=O)O'),
        Chem.MolFromSmarts('[CH2]=C([CH3])C(=O)O'),
    )
    def test(x):
        mol=x['mol']
        mw=x['mol_weight']
        logp=x['logp']
        return any(mol.HasSubstructMatch(pattern) for pattern in ester_patterns) and logp<=5 and mw<1000
    return test
new_tests['Acrylates/Methacrylates (Acute toxicity)']=create_test()

def create_test():
    """Chronic variant: an acrylate or methacrylate group, logP in (5, 8), weight under 1000."""
    ester_patterns=(
        Chem.MolFromSmarts('[CH2]=[CH]C(=O)O'),
        Chem.MolFromSmarts('[CH2]=C([CH3])C(=O)O'),
    )
    def test(x):
        mol=x['mol']
        mw=x['mol_weight']
        logp=x['logp']
        return any(mol.HasSubstructMatch(pattern) for pattern in ester_patterns) and logp>5 and logp<8 and mw<1000
    return test
new_tests['Acrylates/Methacrylates (Chronic toxicity)']=create_test()

In [None]:
#Epoxides
#Grace's advice
def create_test():
    """Build the 'Epoxides' predicate: weight under 1000 and either an oxirane
    ring or an aziridine ring whose nitrogen carries a methyl or ethyl group.

    The ring atoms are written as aliphatic atoms. The previous lower-case
    (aromatic) SMARTS could not match a three-membered ring, and the
    substituent is now a valid recursive SMARTS.
    NOTE(review): confirm that N-methyl / N-ethyl aziridines are the intended
    scope of the second pattern.
    """
    epoxide=Chem.MolFromSmarts('C1OC1')
    aziridine=Chem.MolFromSmarts('C1CN1[$([CH3]),$([CH2][CH3])]')
    def test(x):
        mol=x['mol']
        mw=x['mol_weight']
        return mw<1000 and (mol.HasSubstructMatch(epoxide) or mol.HasSubstructMatch(aziridine))
    return test
new_tests['Epoxides']=create_test()

In [None]:
# Add the XML-derived tests only for categories that have no hand-written
# replacement. A plain update() would overwrite the corrected predicates above
# with the toolbox versions for every category that happens to compile.
# .items() runs on Python 2 and 3 alike.
for k,q in tests.items():
    new_tests.setdefault(k,q.query)

In [None]:
#del new_tests['Persistent, Bioaccumulative and Toxic (PBT) Chemicals'] #Focus on others right now

In [None]:
import dill
# Persist the category predicates for reuse outside this notebook.
# dill writes a pickle byte stream, so the file must be opened in binary
# mode; text mode can corrupt the stream on platforms that translate newlines
# and is rejected outright on Python 3.
with open(DAT_DIR+'category_tests.dill','wb') as f:
    dill.dump(new_tests,f)

<h1>New tests on ToxVal</h1>

In [None]:
# Assign every ToxVal row to the categories whose predicate accepts it.
# Result: toxval_cats maps dsstox_sid -> '|'-joined category names, or
# 'Not categorized' when no predicate matches.
toxval_cats={}
for i,row in tv.iterrows():
    record=dict(row)
    sid=record['dsstox_sid']
    mol=Chem.MolFromSmiles(record['smiles'])
    if not mol:
        # SMILES could not be parsed: report the row index and leave the
        # sid without an entry.
        print(i)
        continue
    record['mol']=mol
    categories=[category for category,test in new_tests.iteritems() if test(record)]
    # 'Neutral Organics' is only kept when it is the sole match.
    if len(categories)>1 and 'Neutral Organics' in categories:
        categories.remove('Neutral Organics')
    # `categories` stays a list; the joined string goes only into the dict.
    toxval_cats[sid]='|'.join(categories) if categories else 'Not categorized'

In [None]:
# Attach the computed categories to each row, then keep only the rows that
# received one (sids absent from toxval_cats map to a null value).
tv['categories']=tv['dsstox_sid'].map(toxval_cats)
tv=tv.dropna(subset=['categories'])

In [None]:
#They're wrong
# tv=tv.loc[tv['categories']!="Acrylates/Methacrylates (Acute toxicity)|Anionic Surfactants"] #Acryl/methacryl are always anionic surfactants, they missed that
# tv=tv.drop(46) #It's a carboxylic acid
# tv=tv.drop(137) #It's a carboxylic acid
# tv=tv.drop(179) #It's an ester
# tv=tv.drop(220) #Not an EGE
# tv=tv.drop(87) #Not an aliphatic amine
# tv=tv.drop(4) #Is a carboxylic acid
# for i,row in tv.iterrows():
#     carboxylic=Chem.MolFromSmarts('[CX3;!$(Cc)](=O)[OX2H1]')
#     if Chem.MolFromSmiles(row['smiles']).HasSubstructMatch(carboxylic):
#         tv=tv.drop(i)
# tv=tv.drop(294) #Aliphatic amine and dithiocarbamate
# tv=tv.drop(387) #Not an aliphatic amine
# tv=tv.drop(419)
# tv=tv.drop(420)
# tv=tv.loc[tv.apply(lambda row: False if "Dithiocarbamates" in row['categories'] else True,axis=1)] #They suck at dithiocarbamates
# tv=tv.loc[tv.apply(lambda row: False if "Aliphatic Amines" in row['US-EPA New Chemical Categories'] else True,axis=1)] #They suck at these
# tv=tv.drop(609) #N in wrong place
# tv=tv.drop(685) #Substitutions not allowed on bridging atom
# tv=tv.drop(713) #Sulfide substitutions are okay
# tv=tv.drop(792) #Is an amine
# drops=[24,17,6,0,794,848,874,898,1097,837,1223,1655,1674,1737,1706,1750,3194,1957,1958,1971,2074,2185,2479,2466,2627,2849\
#        ,2977,3439,3441,3509,3851,3944,4010,4139,1900,4165,4491,4650,5215,5564,6109,6112,6115,6178,6846,6892,7405,7463\
#       ,7597,8811,8855,8997,9015,9020,9166,9228,9235,9290,9363,9366,9510,9515,9961,10562,10664,10783,10855,11151,11256\
#        ,11258]
# tv=tv.drop(drops,errors='ignore')

In [None]:
# Collect the index of every row whose computed categories disagree with the
# 'US-EPA New Chemical Categories' column, ignoring some categories.
# Both sets are loop-invariant, so they are built once up front.
different=set(['Esters (Acute toxicity)','Esters (Chronic toxicity)','Phenols (Chronic toxicity)',
               'Phenols (Acute toxicity)','Hydrazines and Related Compounds','Thiols (Acute toxicity)',
               'Thiols (Chronic toxicity)','Anionic Surfactants','Aliphatic Amines','Neutral Organics']) #Agree to disagree
excluded_reference=set(['Persistent, Bioaccumulative and Toxic (PBT) Chemicals'])
mismatched_indexes=[]
for i,row in tv.iterrows():
    theirset=set(row['US-EPA New Chemical Categories'].split('|'))-excluded_reference-different
    if not theirset:
        # Nothing left to compare against for this row.
        continue
    myset=set(row['categories'].split('|'))-different
    if not myset:
        myset=set(['Not categorized'])
    # Both sides already exclude `different`, so plain inequality is the
    # comparison. The earlier `myset^theirset-different` parsed as
    # `myset^(theirset-different)` and gave the same result.
    if myset!=theirset:
        mismatched_indexes.append(i)

In [None]:
from IPython.display import display
# Show all mismatched rows with wide columns. A bare expression nested in a
# `with` block is not the cell's final expression, so Jupyter never renders
# it; display() must be called while the temporary options are in effect.
with pd.option_context('display.max_colwidth',200,'display.max_rows',None):
    display(tv.loc[mismatched_indexes])

In [None]:
# Number of rows whose computed categories differ from the reference column.
len(mismatched_indexes)

In [None]:
# Ad-hoc inspection: fetch one document from the 'compound' collection by CASRN.
dsstox.find_one({'casrn':'15894-70-9'})

In [None]:
# Ad-hoc inspection of a single record against the 'Neutral Organics' test.
# NOTE(review): rowify and all_tests are defined outside this section;
# presumably rowify(4924) builds the record for index 4924 and print_tree
# traces how the query evaluates it - confirm against their definitions.
row=rowify(4924)
all_tests['Neutral Organics'].print_tree(row)

<h1>New tests on ToxCast</h1>

In [None]:
# Assign every ToxCast row to the categories whose predicate accepts it.
# Result: toxcast_cats maps dsstox_sid -> '|'-joined category names, or
# 'Not categorized' when no predicate matches.
toxcast_cats={}
for i,row in tc.iterrows():
    record=dict(row)
    sid=record['dsstox_sid']
    mol=Chem.MolFromSmiles(record['smiles'])
    if not mol:
        # SMILES could not be parsed: report the row index and leave the
        # sid without an entry.
        print(i)
        continue
    record['mol']=mol
    categories=[category for category,test in new_tests.iteritems() if test(record)]
    # `categories` stays a list; the joined string goes only into the dict.
    toxcast_cats[sid]='|'.join(categories) if categories else 'Not categorized'

In [None]:
# Attach the computed categories to each row, then keep only the rows that
# received one (sids absent from toxcast_cats map to a null value).
tc['categories']=tc['dsstox_sid'].map(toxcast_cats)
tc=tc.dropna(subset=['categories'])

In [None]:
# Collect the index of every row whose computed categories disagree with the
# 'US-EPA New Chemical Categories' column, ignoring some categories.
# NOTE(review): this cell sits in the ToxCast section but iterates `tv`, not
# `tc` - possibly a copy-paste leftover. Left unchanged because it is not
# visible here whether `tc` carries the reference column; please confirm.
# Both sets are loop-invariant, so they are built once up front.
different=set(['Esters (Acute toxicity)','Esters (Chronic toxicity)','Phenols (Chronic toxicity)',
               'Phenols (Acute toxicity)','Hydrazines and Related Compounds','Thiols (Acute toxicity)',
               'Thiols (Chronic toxicity)','Anionic Surfactants','Aliphatic Amines']) #Agree to disagree
excluded_reference=set(['Neutral Organics','Persistent, Bioaccumulative and Toxic (PBT) Chemicals','Triarylmethane Pigments/Dyes with Non-solubilizing Groups','Nonionic Surfactants'])
mismatched_indexes=[]
for i,row in tv.iterrows():
    theirset=set(row['US-EPA New Chemical Categories'].split('|'))-excluded_reference-different
    if not theirset:
        # Nothing left to compare against for this row.
        continue
    myset=set(row['categories'].split('|'))-different
    if not myset:
        myset=set(['Not categorized'])
    # Both sides already exclude `different`, so plain inequality is the
    # comparison. The earlier `myset^theirset-different` parsed as
    # `myset^(theirset-different)` and gave the same result.
    if myset!=theirset:
        mismatched_indexes.append(i)

In [None]:
from IPython.display import display
# Show all mismatched rows with wide columns. A bare expression nested in a
# `with` block is not the cell's final expression, so Jupyter never renders
# it; display() must be called while the temporary options are in effect.
with pd.option_context('display.max_colwidth',200,'display.max_rows',None):
    display(tv.loc[mismatched_indexes])