<h1>Extract SMARTS</h1>

In [None]:
import pandas as pd
import numpy as np
import pymongo
import sys
import os
from __future__ import print_function
from datetime import datetime

# Project root: the working directory with its last two path components removed.
TOP = '/'.join(os.getcwd().split('/')[:-2]) + '/'

# Put the project's lib directory first on the import path (once).
LIB = TOP + 'lib'
if LIB not in sys.path:
    sys.path.insert(0, LIB)

# Data and figure directories; each one is created if it does not exist yet.
DAT_DIR = TOP + 'data/'
FIG_DIR = TOP + 'figs/'
for _needed_dir in (DAT_DIR, FIG_DIR):
    if not os.path.exists(_needed_dir):
        os.mkdir(_needed_dir)
    
from db.mongo import *

from rax.genrapred import *
import db.etl as etl
from db.fpsim import *
from rdkit import Chem

import pymongo
# The connection string is read from the environment so that no user name or
# password is stored in the notebook, e.g.
#   export GENRA_MONGO_URI="mongodb://<user>:<password>@<host>/genra_dev_v4"
# A missing variable raises KeyError here instead of failing later on a query.
mongocon=pymongo.MongoClient(os.environ['GENRA_MONGO_URI'])
DB=mongocon['genra_dev_v4']
dsstox=DB['compound']    # queried below by 'casrn' / 'dsstox_sid' for smiles and mol_weight
physprop=DB['physprop']  # queried below for predicted_props.OPERA_LogP / OPERA_WS

In [None]:
# Read the category definition file into memory as raw text.
with open(DAT_DIR + 'epa_categories.xml','r') as f:
    xml=f.read()

In [None]:
# Remove every newline character from the raw XML text.
xml=xml.replace('\n','')

In [None]:
# Show the raw XML text (inspection only; the parsing cell re-reads the file itself).
xml

In [None]:
import xml.etree.ElementTree as ET
# Parse the category definitions and keep the root element of the tree.
e=ET.parse(DAT_DIR+'epa_categories.xml').getroot()

In [None]:
# Child element -> parent element, for every element reachable from the root.
parent_map = dict((child, parent) for parent in e.iter() for child in parent)

In [None]:
import operator as op
# Operand name as it appears in the XML -> comparison function.
op_dict=dict(
    GreaterThan=op.gt,
    GreaterThanOrEqualTo=op.ge,
    LessThan=op.lt,
    LessThanOrEqualTo=op.le,
)
# Parameter name as it appears in the XML -> key of the record being tested.
# 'Molecular Weight' is listed under both capitalisations.
prop_dict=dict([
    ('log Kow','logp'),
    ('Molecular Weight','mol_weight'),
    ('Molecular weight','mol_weight'),
    ('Water Solubility','ws'),
])

In [None]:
def define_smart_match(smart):
    """Build a predicate that tests a record's molecule against a SMARTS pattern.

    Parameters
    ----------
    smart : str
        SMARTS string to compile.

    Returns
    -------
    callable or None
        ``smart_match(x)`` returning True/False for a record ``x`` whose
        ``x['mol']`` is the molecule to test, or None when the SMARTS does
        not compile (callers use that None to flag the pattern as bad).
    """
    pattern=Chem.MolFromSmarts(smart)
    if not pattern:
        return None
    def smart_match(x):
        # Only existence matters, so stop at the first match instead of
        # enumerating every substructure match.
        return x['mol'].HasSubstructMatch(pattern)
    return smart_match
def define_compare(prop,operand,value):
    """Build a predicate comparing one property of a record against ``value``.

    ``prop`` is looked up in ``prop_dict`` to find the record key and
    ``operand`` in ``op_dict`` to find the comparison; both lookups happen
    each time the returned predicate is called.
    """
    def compare(x):
        comparator=op_dict[operand]
        observed=x[prop_dict[prop]]
        return comparator(observed,value)
    return compare

In [None]:
class Query:
    """One node of a category's query tree, built from an XML element.

    After ``write_query`` has run, ``self.query`` is a callable taking a
    record and returning a truth value, or None when the node could not be
    built (an uncompilable SMARTS or an unrecognised query type).
    """

    # ElementTree ``{namespace}`` prefixes used by the category XML.
    NS_QUERIES='{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}'
    NS_ENGINE='{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}'
    NS_SERIAL='{http://schemas.microsoft.com/2003/10/Serialization/}'
    NS_XSI='{http://www.w3.org/2001/XMLSchema-instance}'

    def __init__(self,xml,qid=None):
        """Store the XML element and id; the predicate is built by write_query."""
        self.xml=xml
        self.id=qid
        # type/query start as None so print_tree and ``not q.query`` checks are
        # safe on a node whose write_query has not run or did not recognise qtype.
        self.type=None
        self.query=None
        self.logic=None
        self.subqueries=[]
        self.category=None

    def write_query(self,qtype,tree):
        """Build ``self.query`` for this node.

        qtype : 'b:StructureQuery', 'b:ParameterQuery' or 'LogicalQuery';
                any other value leaves ``self.query`` untouched.
        tree  : mapping of query id -> already built Query, used to resolve
                the references of a logical query.
        """
        self.type=qtype
        if qtype=='b:StructureQuery':
            self.query=self._structure_query()
        elif qtype=='b:ParameterQuery':
            self.operand=self.xml.find(self.NS_QUERIES+'Operand').text
            self.prop=self.xml.find(self.NS_QUERIES+'ParameterName').text
            self.value=float(self.xml.find(self.NS_QUERIES+'Value').text)
            self.query=define_compare(self.prop,self.operand,self.value)
        elif qtype=='LogicalQuery':
            self._logical_query(tree)

    def _structure_query(self):
        """Return the SMARTS predicate for this node, or None if it cannot be compiled."""
        qstring=self.xml.find(self.NS_QUERIES+'ComplexSearch').text
        # The payload spells booleans in lower case; capitalise them so that
        # ast.literal_eval accepts the text.
        qstring=re.sub('false','False',qstring)
        qstring=re.sub('true','True',qstring)
        qdict=ast.literal_eval(qstring)
        self.smart=qdict['queries'][0]['smart']
        if '[Ch3,#1]' in self.smart:
            # A trailing two-alternative bracket atom is replaced by two
            # patterns: the SMARTS without that atom, and the SMARTS with only
            # the first alternative. A record matches if either one matches.
            split=re.search(r'(.*)\[([^\(\)]*),([^\(\)].*)\]$',self.smart)
            if split is not None:
                smart_match1=define_smart_match(split.group(1))
                smart_match2=define_smart_match(split.group(1)+'['+split.group(2)+']')
                if smart_match1 is None or smart_match2 is None:
                    # Report the failure now (query is None) instead of
                    # raising TypeError the first time a record is tested.
                    return None
                def smart_match(x):
                    return smart_match1(x) or smart_match2(x)
                return smart_match
            # The bracket atom is not in the expected trailing position:
            # fall through and compile the SMARTS unchanged.
        return define_smart_match(self.smart)

    def _logical_query(self,tree):
        """Combine referenced sub-queries with Not / And / (anything else =) Or."""
        self.logic=self.xml.find(self.NS_ENGINE+'Logic').text
        elements=self.xml.find(self.NS_ENGINE+'Elements')
        ref_key=self.NS_SERIAL+'Ref'
        node_ids=[elem.attrib[ref_key]
                  for elem in elements.findall(self.NS_ENGINE+'Query')
                  if ref_key in elem.attrib]
        if self.logic=='Not':
            sq=tree[node_ids[0]] #Should only be one
            self.subqueries=[sq]
            def func(x):
                return not(sq.query(x))
            self.query=func
        elif self.logic=='And':
            self.subqueries=[tree[node_id] for node_id in node_ids]
            def func(x):
                return all([sq.query(x) for sq in self.subqueries])
            self.query=func
        else:
            sqs=[tree[node_id] for node_id in node_ids]
            self.subqueries=sqs
            # Elements carrying an xsi:type attribute hold their query inline
            # rather than by reference; they are parsed as structure queries.
            # NOTE(review): the attribute's value is not checked - confirm
            # that inline elements are always structure queries.
            for orquery in elements.findall(self.NS_ENGINE+'Query'):
                if self.NS_XSI+'type' in orquery.attrib:
                    extra_sq=Query(orquery)
                    extra_sq.write_query('b:StructureQuery',tree)
                    sqs.append(extra_sq)
            def func(x):
                return any([sq.query(x) for sq in self.subqueries])
            self.query=func

    def print_tree(self,x,tabs=0):
        """Print this node and its sub-queries, one per line, evaluated on record ``x``.

        Each line is a tuple of the node's id, type, type-specific details and
        the query result; 'does not process' replaces the result when
        evaluating the node raises. Children are indented by one more tab.
        """
        qinfo=(self.id,self.type)
        if self.type=='b:StructureQuery':
            qinfo=qinfo+(self.smart,)
        elif self.type=='b:ParameterQuery':
            qinfo=qinfo+(self.prop,self.value,self.operand)
        elif self.type=='LogicalQuery':
            qinfo=qinfo+(self.logic,)
        try:
            qinfo=qinfo+(self.query(x),)
        except Exception:
            # Best-effort trace: keep printing the rest of the tree, but let
            # KeyboardInterrupt / SystemExit through.
            qinfo=qinfo+('does not process',)
        print('\t'*tabs+str(qinfo))
        for sq in self.subqueries:
            sq.print_tree(x,tabs+1)

In [None]:
import re
import ast

# ElementTree ``{namespace}`` prefixes used repeatedly below.
ENGINE_NS='{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}'
SERIAL_NS='{http://schemas.microsoft.com/2003/10/Serialization/}'

all_tests={}      # category caption -> top-level Query of that category
bad_smarts=set()  # SMARTS strings that failed to compile
bad_cats=set()    # categories containing at least one query that could not be built
for elem in e.iter('{http://schemas.microsoft.com/2003/10/Serialization/Arrays}anyType'):
    category=elem.find(ENGINE_NS+'Caption').text
    queries=elem.find(ENGINE_NS+'Expression')\
        .find(ENGINE_NS+'Queries')\
        .findall(ENGINE_NS+'Query')
    contents=[query.find(ENGINE_NS+'Content') for query in queries]
    query_tree={}
    # Reset per category so an id left over from the previous category can
    # never be mistaken for this category's top-level query.
    query_id=None
    for query in contents:
        attributes=query.attrib
        if SERIAL_NS+'Id' not in attributes:
            continue
        query_id=attributes[SERIAL_NS+'Id']
        query_type=attributes['{http://www.w3.org/2001/XMLSchema-instance}type']
        q=Query(query,query_id)
        q.category=category
        q.write_query(query_type,query_tree)
        if not q.query or not all([sq.query for sq in q.subqueries]): #Smarts did not compile, sqs needed bc of hidden sqs in or queries
            bad_cats.add(category)
            if q.type=='b:StructureQuery':
                bad_smarts.add(q.smart)
        query_tree[query_id]=q
    if query_id is None:
        # No query of this category carries an Id, so there is nothing to register.
        print('No identified queries found for category: '+category)
        continue
    all_tests[category]=query_tree[query_id] #Final one should always be the top level query hopefully

In [None]:
# Categories for which at least one query could not be built.
bad_cats

In [None]:
# Working set of tests: every parsed category except the ones flagged as bad.
tests=dict(all_tests)
for category in bad_cats:
    tests.pop(category)

In [None]:
# import dill
# with open(DAT_DIR+'tests.pkl','w') as f:
#     dill.dump(tests,f)

<h1>Test on OECD_NCC_ToxVal</h1>

In [None]:
# Load the ToxVal spreadsheet and report how many rows it has.
tv=pd.read_excel(DAT_DIR+'toxval_epa_categories.xlsx')
len(tv)

In [None]:
# CAS number -> DSSTox substance id for every CAS number in the spreadsheet.
casns=list(tv['CAS Number'])
toxval_sids=dsstox.find({'casrn':{'$in':casns}},{'_id':0,'casrn':1,'dsstox_sid':1})
sids_dict={record['casrn']:record['dsstox_sid'] for record in toxval_sids}
# A real list is needed for the $in filters (a dict view cannot be encoded).
sids=list(sids_dict.values())

# Substance id -> first OPERA_LogP value, for records that have one.
toxval_logp=physprop.find({'dsstox_sid':{'$in':sids}},{'_id':0,'dsstox_sid':1,'predicted_props.OPERA_LogP':1})
logp_dict={record['dsstox_sid']:record.get('predicted_props',{})['OPERA_LogP'][0] for record in toxval_logp \
           if 'OPERA_LogP' in record.get('predicted_props',{}) and record.get('dsstox_sid',None)}
# Substance id -> first OPERA_WS value, for records that have one.
toxval_ws=physprop.find({'dsstox_sid':{'$in':sids}},{'_id':0,'dsstox_sid':1,'predicted_props.OPERA_WS':1})
ws_dict={record['dsstox_sid']:record.get('predicted_props',{})['OPERA_WS'][0] for record in toxval_ws \
           if 'OPERA_WS' in record.get('predicted_props',{}) and record.get('dsstox_sid',None)}
# Substance id -> molecular weight. Only the two needed fields are fetched,
# and compounds without a stored weight are skipped instead of raising KeyError.
toxval_weight=dsstox.find({'dsstox_sid':{'$in':sids}},{'_id':0,'dsstox_sid':1,'mol_weight':1})
weight_dict={record['dsstox_sid']:record['mol_weight'] for record in toxval_weight if 'mol_weight' in record}

In [None]:
# Attach the substance id and the looked-up properties to each row.
tv['dsstox_sid']=tv['CAS Number'].map(sids_dict)
tv['logp']=tv['dsstox_sid'].map(logp_dict)
tv['ws']=tv['dsstox_sid'].map(ws_dict)
tv['mol_weight']=tv['dsstox_sid'].map(weight_dict)
# axis is passed by keyword: newer pandas versions reject it as a positional argument.
tv=tv.drop(['Molecular Formula','Predefined substance type','Additional Ids','Composition','CAS Smiles relation'],axis='columns')
tv=tv.rename(columns={'SMILES':'smiles'})
# Keep only rows with no missing value in any column, then de-duplicate.
tv=tv[pd.notnull(tv).all(axis=1)]
tv=tv.drop_duplicates()
len(tv)

In [None]:
#from pymongo import InsertOne
# Substance id -> '|'-joined names of every category whose test accepts the
# row, or 'Not categorized'. Rows whose SMILES cannot be parsed are skipped
# after printing their index.
toxval_cats={}
for i,row in tv.iterrows():
    record=dict(row)
    sid=record['dsstox_sid']
    mol=Chem.MolFromSmiles(record['smiles'])
    if mol:
        record['mol']=mol
        categories=[category for category,test in tests.iteritems() if test.query(record)]
        toxval_cats[sid]='|'.join(categories) if categories else 'Not categorized'
    else:
        print(i)

In [None]:
# Attach the predicted label to each row and keep only rows that received one
# (rows whose SMILES failed to parse were never added to toxval_cats).
tv['categories']=tv['dsstox_sid'].map(toxval_cats)
tv=tv[pd.notnull(tv['categories'])]
len(tv)

In [None]:
# Number of distinct substance ids that were categorised.
len(toxval_cats)

In [None]:
# Labels that consist of a single category (no '|' separator), for this
# notebook's predictions and for the reference column respectively.
# The pattern is a raw string: '\|' in a normal string is an invalid escape sequence.
mymatchset=set(tv[~tv['categories'].str.contains(r'\|')]['categories'].unique())
theirmatchset=set(tv[~tv['US-EPA New Chemical Categories'].str.contains(r'\|')]['US-EPA New Chemical Categories'].unique())
# Single-category reference labels that this notebook never produced on their own.
missmatchset=theirmatchset-mymatchset

In [None]:
import re
# Rows whose reference label differs from the predicted one, ignoring rows
# whose reference label mentions a category that could not be parsed.
esc_bad_cats=[re.escape(c) for c in bad_cats]
labels_differ=tv['US-EPA New Chemical Categories']!=tv['categories']
if esc_bad_cats:
    in_bad_cat=tv['US-EPA New Chemical Categories'].str.contains('|'.join(esc_bad_cats))
    mismatched=tv[~in_bad_cat & labels_differ]
else:
    # With no bad categories the joined pattern would be '' and match every
    # row, which would hide all mismatches.
    mismatched=tv[labels_differ]
# NOTE(review): a bare expression nested inside `with` is not the cell's last
# top-level expression, so it may not be rendered - confirm.
with pd.option_context('display.max_rows', None):
    mismatched

In [None]:
# Number of rows whose predicted label disagrees with the reference label.
len(mismatched)

In [None]:
#mismatched.to_excel(DAT_DIR+'mismatched_categories.xlsx')

In [None]:
# with open(DAT_DIR+'bad_categories.txt','w') as f:
#     f.write('\n'.join(list(bad_cats)))

<h1>Get Category Fingerprints</h1>
Moved to a script.

There was a bug where some substances got skipped, so the missing entries need to be filled in or the script rerun.

In [None]:
import pymongo
# The connection string is read from the environment so that no user name or
# password is stored in the notebook, e.g.
#   export GENRA_MONGO_URI="mongodb://<user>:<password>@<host>/genra_dev_v4"
mongocon=pymongo.MongoClient(os.environ['GENRA_MONGO_URI'])
DB=mongocon['genra_dev_v4']
dsstox=DB['compound']
physprop=DB['physprop']
epa_cats=DB['epa_categories']  # holds the per-substance categories compared against below

In [None]:
# dsstox_smiles=dsstox.find({},{'_id':0,'dsstox_sid':1,'smiles':1,'mol_weight':1})
# dsstox_logp=physprop.find({},{'_id':0,'dsstox_sid':1,'predicted_props.OPERA_LogP':1})
# #logp_dict_temp={record['dsstox_sid']:record.get('predicted_props',{}) for record in dsstox_logp if 'dsstox_sid' in record.keys()}
# #logp_dict={sid:props['OPERA_LogP'][0] for sid,props in logp_dict_temp.iteritems() if 'OPERA_LogP' in props.keys()}
# logp_dict={record['dsstox_sid']:record.get('predicted_props',{})['OPERA_LogP'][0] for record in dsstox_logp \
#            if 'OPERA_LogP' in record.get('predicted_props',{}) and record.get('dsstox_sid',None)}
# dsstox_ws=physprop.find({},{'_id':0,'dsstox_sid':1,'predicted_props.OPERA_WS':1})
# ws_dict={record['dsstox_sid']:record.get('predicted_props',{})['OPERA_WS'][0] for record in dsstox_ws \
#            if 'OPERA_WS' in record.get('predicted_props',{}) and record.get('dsstox_sid',None)}

In [None]:
# Cursor restricted to a single substance (DTXSID60871632), returning only
# its id, SMILES and molecular weight.
dsstox_smiles=dsstox.find({'dsstox_sid':'DTXSID60871632'},{'_id':0,'dsstox_sid':1,'smiles':1,'mol_weight':1})

In [None]:
# Number of records matched by the cursor, then the size of the epa_categories collection.
dsstox_smiles.count()
epa_cats.count()

In [None]:
from pymongo import InsertOne
# Walk the compound cursor in chunks, re-issuing the query with skip(i) for
# each chunk, and categorise every record that has logp, ws and a parsable
# SMILES. In this debugging form the result is printed and the chunk is
# abandoned after the first categorised record; the bulk insert is disabled.
i=0
inserts=[]
while i<dsstox_smiles.count():
#     if inserts:
#         epa_cats.bulk_write(inserts)
#         inserts=[]
    j=0
    dsstox_smiles=dsstox.find({'dsstox_sid':'DTXSID60871632'},{'_id':0,'dsstox_sid':1,'smiles':1,'mol_weight':1},batch_size=20000)
    dsstox_smiles.skip(i)
    while j<=5000: #Must do it this way because cursor will time out
        i+=1
        j+=1
        record=next(dsstox_smiles,None)
        if record is None:
            # Cursor exhausted: stop cleanly instead of raising StopIteration.
            break
        sid=record['dsstox_sid']
        try:
            record['logp']=logp_dict[sid]
            record['ws']=ws_dict[sid]
        except KeyError:
            # No predicted property for this substance: skip it.
            continue
        # .get so that a record without a SMILES field reaches the
        # 'no smiles' handler below rather than raising KeyError here.
        smiles=record.get('smiles')
        try:
            mol=Chem.MolFromSmiles(smiles)
        except Exception:
            print(sid + ' no smiles')
            continue
        if not mol:
            continue
        record['mol']=mol
        categories=[category for category,test in tests.iteritems() if test.query(record)]
        categories_record={'dsstox_sid':sid,'categories':categories}
        print(categories_record)
        break
        #inserts.append(InsertOne(categories_record))

In [None]:
# Substance id -> set of category names whose test accepts the row, computed
# here in the notebook. Rows with an unparsable SMILES are skipped after
# printing their index.
notebook_categories={}
for i,row in tv.iterrows():
    record=dict(row)
    sid=record['dsstox_sid']
    mol=Chem.MolFromSmiles(record['smiles'])
    if mol:
        record['mol']=mol
        notebook_categories[sid]=set(category for category,test in tests.iteritems() if test.query(record))
    else:
        print(i)

In [None]:
# Stored category records for the substances present in the ToxVal table.
db_categories=epa_cats.find({'dsstox_sid':{'$in':list(tv['dsstox_sid'])}})

In [None]:
# Substance id -> set of category names as stored in the epa_categories collection.
script_categories={record['dsstox_sid']:set(record['categories']) for record in db_categories}

In [None]:
# Compare how many substances each source covers.
len(notebook_categories)
len(script_categories)

In [None]:
# Substance ids categorised in this notebook but absent from the stored records.
missing_sids=set(notebook_categories.keys())-set(script_categories.keys())
#{record['dsstox_sid']:record['smiles'] for record in dsstox.find({'dsstox_sid':{'$in':list(missing_sids)}})}
# For each missing id: (id, SMILES in the compound collection, SMILES in the ToxVal table).
[(sid,dsstox.find_one({'dsstox_sid':sid})['smiles'],tv[tv['dsstox_sid']==sid].iloc[0]['smiles']) for sid in missing_sids]

In [None]:
# For substances present in both sources, print every id whose notebook
# categories differ from the stored ones.
shared_sids=set(notebook_categories.keys())&set(script_categories.keys())
for sid in shared_sids:
    from_notebook=notebook_categories[sid]
    from_script=script_categories[sid]
    if from_notebook!=from_script:
        print (sid,from_notebook,from_script)

In [None]:
# NOTE(review): the lookup on the next line uses DTXSID7035725 while the
# record below is built from DTXSID60873772 - confirm which one is intended.
dsstox.find_one({'dsstox_sid':'DTXSID7035725'},{'smiles':1})
# Build a test record from the first ToxVal row for that substance and attach
# the parsed molecule.
record=dict(tv[tv['dsstox_sid']=='DTXSID60873772'].iloc[0])
record['mol']=Chem.MolFromSmiles(record['smiles'])

In [None]:
# Trace every node of the 'Phenols (Acute toxicity)' query tree on the record built above.
tests['Phenols (Acute toxicity)'].print_tree(record)

<h1>Test on ToxCast</h1>

In [None]:
# Load the ToxCast spreadsheet.
tc=pd.read_excel(DAT_DIR+'OECD_NCC_TXCST.xlsx')

In [None]:
# Row count and a preview of the first rows.
len(tc)
tc.head()

In [None]:
# CAS number -> DSSTox substance id for every CAS number in the ToxCast table.
casns=list(tc['CAS Number'])
toxval_sids=dsstox.find({'casrn':{'$in':casns}},{'_id':0,'casrn':1,'dsstox_sid':1})
sids_dict={record['casrn']:record['dsstox_sid'] for record in toxval_sids}
# A real list is needed for the $in filters (a dict view cannot be encoded).
sids=list(sids_dict.values())

# Substance id -> first OPERA_LogP value, for records that have one.
toxval_logp=physprop.find({'dsstox_sid':{'$in':sids}},{'_id':0,'dsstox_sid':1,'predicted_props.OPERA_LogP':1})
logp_dict={record['dsstox_sid']:record.get('predicted_props',{})['OPERA_LogP'][0] for record in toxval_logp \
           if 'OPERA_LogP' in record.get('predicted_props',{}) and record.get('dsstox_sid',None)}
# Substance id -> first OPERA_WS value, for records that have one.
toxval_ws=physprop.find({'dsstox_sid':{'$in':sids}},{'_id':0,'dsstox_sid':1,'predicted_props.OPERA_WS':1})
ws_dict={record['dsstox_sid']:record.get('predicted_props',{})['OPERA_WS'][0] for record in toxval_ws \
           if 'OPERA_WS' in record.get('predicted_props',{}) and record.get('dsstox_sid',None)}
# Substance id -> molecular weight. Only the two needed fields are fetched,
# and compounds without a stored weight are skipped instead of raising KeyError.
toxval_weight=dsstox.find({'dsstox_sid':{'$in':sids}},{'_id':0,'dsstox_sid':1,'mol_weight':1})
weight_dict={record['dsstox_sid']:record['mol_weight'] for record in toxval_weight if 'mol_weight' in record}

In [None]:
# Attach the substance id and the looked-up properties to each row.
tc['dsstox_sid']=tc['CAS Number'].map(sids_dict)
tc['logp']=tc['dsstox_sid'].map(logp_dict)
tc['ws']=tc['dsstox_sid'].map(ws_dict)
tc['mol_weight']=tc['dsstox_sid'].map(weight_dict)
# axis is passed by keyword: newer pandas versions reject it as a positional argument.
tc=tc.drop(['Predefined substance type','Additional Ids','Composition','CAS Smiles relation'],axis='columns')
tc=tc.rename(columns={'SMILES':'smiles'})
# Keep only rows with no missing value in any column, then de-duplicate.
tc=tc[pd.notnull(tc).all(axis=1)]
tc=tc.drop_duplicates()
len(tc)

In [None]:
# Substance id -> '|'-joined names of every category whose test accepts the
# row, or 'Not categorized'. Rows whose SMILES cannot be parsed are skipped
# after printing their index.
toxcast_cats={}
for i,row in tc.iterrows():
    record=dict(row)
    sid=record['dsstox_sid']
    mol=Chem.MolFromSmiles(record['smiles'])
    if mol:
        record['mol']=mol
        categories=[category for category,test in tests.iteritems() if test.query(record)]
        toxcast_cats[sid]='|'.join(categories) if categories else 'Not categorized'
    else:
        print(i)

In [None]:
# NOTE(review): scratch arithmetic left from exploration; what the two counts
# refer to is not recorded in this notebook.
4257-818

In [None]:
# Attach the predicted label to each row and keep only rows that received one
# (rows whose SMILES failed to parse were never added to toxcast_cats).
tc['categories']=tc['dsstox_sid'].map(toxcast_cats)
tc=tc[pd.notnull(tc['categories'])]

In [None]:
# Rows whose reference label mentions one of the categories that could not be parsed.
# NOTE(review): if bad_cats is empty the joined pattern is '' - confirm how
# str.contains treats that case before relying on this view.
esc_bad_cats=[re.escape(c) for c in bad_cats]
tc[tc['US-EPA New Chemical Categories'].str.contains('|'.join(esc_bad_cats))]

In [None]:
# Index labels of the rows whose predicted category set differs from the
# reference set. Rows whose reference set includes an unparsable category
# are left out of the comparison.
mismatched_indexes=[]
for i,row in tc.iterrows():
    theirset=set(row['US-EPA New Chemical Categories'].split('|'))
    if not theirset&bad_cats:
        myset=set(row['categories'].split('|'))
        if myset!=theirset:
            mismatched_indexes.append(i)

In [None]:
# View the mismatched rows with wider columns.
# NOTE(review): a bare expression nested inside `with` is not the cell's last
# top-level expression, so it may not be rendered - confirm.
with pd.option_context('display.max_colwidth',100):
    tc.loc[mismatched_indexes]

In [None]:
# Trace the 'Acrylates/Methacrylates (Acute toxicity)' query tree on the row
# with index label 614, after attaching the parsed molecule to it.
record=tc.loc[614]
record['mol']=Chem.MolFromSmiles(record['smiles'])
tests['Acrylates/Methacrylates (Acute toxicity)'].print_tree(record)

In [None]:
# Save the mismatched ToxCast rows to a spreadsheet in the data directory.
tc.loc[mismatched_indexes].to_excel(DAT_DIR+'mismatched_toxcast.xlsx')

<h1>Fix bad SMARTS</h1>

In [None]:
# Categories that could not be parsed; the cells below define tests for them by hand.
bad_cats

In [None]:
# Trace the query tree of every unparsable category on the row with index label 614.
record=tc.loc[614]
record['mol']=Chem.MolFromSmiles(record['smiles'])
for cat in bad_cats:
    print(cat+'\n')
    # print_tree prints the tree itself and returns None, so wrapping it in
    # print() only added a stray "None" line after each tree.
    all_tests[cat].print_tree(record)
    print('\n')

In [None]:
# Category name -> predicate taking a record; filled in by the cells below.
new_tests={}

In [807]:
#Aliphatic amines
# One pattern covering three alternatives: carbon bonded to NH2, to NH that
# carries another carbon, or to N that carries two more carbons.
alphamine=Chem.MolFromSmarts('[$(C[NH2]),$(C[NH1]C),$(CN(C)C)]')
def test(x):
    """True when the record's molecular weight is below 1000 and its molecule matches ``alphamine``."""
    mol,mw=x['mol'],x['mol_weight']
    return mw<1000 and mol.HasSubstructMatch(alphamine)
new_tests['Aliphatic Amines']=test

In [None]:
#Alkoxysilanes
alkoxy=Chem.MolFromSmarts('[CX4]O[SiX4]')
def test(x):
    """True when the record's molecular weight is below 1000 and its molecule matches ``alkoxy``."""
    mol,mw=x['mol'],x['mol_weight']
    return mw<1000 and mol.HasSubstructMatch(alkoxy)
new_tests['Alkoxysilanes']=test

In [None]:
#Aminobenzothiazole Azo Dyes
# The query molecule is parsed from SMILES (not SMARTS) and used as the substructure.
azodye=Chem.MolFromSmiles('NC1=NC2=C(S1)C=CC=C2')
def test(x):
    """True when the record's molecule contains the ``azodye`` substructure."""
    return x['mol'].HasSubstructMatch(azodye)
new_tests['Aminobenzothiazole Azo Dyes']=test

In [None]:
# Look up one compound whose name matches the regex 'phosphate' (exploration).
dsstox.find_one({'name':{'$regex':'phosphate'}})

In [750]:
#Anionic Surfactants
sulfate=Chem.MolFromSmarts('C.OS(O)(O)O')
sulfonate=Chem.MolFromSmarts('C.OS(=O)=O')
phosphate=Chem.MolFromSmarts('C.O=P(O)(O)O')
carboxylic=Chem.MolFromSmarts('C.OC(=O)')
# NOTE(review): silicic is defined here but is not one of the patterns
# checked by test() below - confirm whether it was meant to be included.
silicic=Chem.MolFromSmarts('[Si][OX2H]')
#Or these together
def test(x):
    """True when the molecule matches the sulfate, sulfonate, phosphate or carboxylic pattern."""
    mol=x['mol']
    return any(mol.HasSubstructMatch(group) for group in (sulfate,sulfonate,phosphate,carboxylic))
new_tests['Anionic Surfactants']=test

In [751]:
# Check the silicic pattern against one example SMILES.
Chem.MolFromSmiles('CCO[Si](O)=O').GetSubstructMatches(silicic)

((3, 4),)

In [719]:
#Dianilines
# NOTE(review): the two alternatives inside the brackets are identical - confirm
# whether a second, different arrangement was intended.
dianiline=Chem.MolFromSmarts('[$(c1cc(N)ccc1[C,O,N,S]c1ccccc1),$(c1cc(N)ccc1[C,O,N,S]c1ccccc1)]')
def test(x):
    """True when the molecule has exactly four atoms matching ``dianiline``.

    The count of four is the value observed for the reference compound
    checked in the following cell.
    """
    mol=x['mol']
    return len(mol.GetSubstructMatches(dianiline))==4 #lol
# Registered at module level: when this line was indented after the return
# statement it was unreachable, so the test never reached new_tests.
new_tests['Dianilines']=test

In [722]:
# Count the dianiline matches for one reference compound.
len(Chem.MolFromSmiles('Nc1ccc(Cc2ccc(N)cc2)cc1').GetSubstructMatches(dianiline))

4

In [None]:
#Dithiocarbamates
dithiocarbamate=Chem.MolFromSmarts('NC(=S)S')
def test(x):
    """Acute: pattern match, molecular weight below 1000 and logp below 5."""
    mol=x['mol']
    if not mol.HasSubstructMatch(dithiocarbamate):
        return False
    return x['mol_weight']<1000 and x['logp']<5
new_tests['Dithiocarbamates (Acute toxicity)']=test
def test(x):
    """Chronic: pattern match, molecular weight below 1000 and logp in [5, 19)."""
    mol=x['mol']
    if not mol.HasSubstructMatch(dithiocarbamate):
        return False
    return x['mol_weight']<1000 and x['logp']>=5 and x['logp']<19
new_tests['Dithiocarbamates (Chronic toxicity)']=test

In [707]:
#Ethylene Glycol Ethers
# First attempt: a single pattern built from recursive-SMARTS alternatives,
# checked against one example SMILES.
ege=Chem.MolFromSmarts('[$(OCC),$(OCCOCCC),$(OCCOCC)][$(C),$(CC),$(CCC),$(CCCC),$(CCCCC),$(CCCCCC),$(CCCCCCC)]'\
                       '.[$(C),$(CC),$(CCC),$(CCCC),$(CCCCC),$(CCCCCC),$(CCCCCCC)]')
c=Chem.MolFromSmiles('CCCCOCCOCCO')
c.HasSubstructMatch(ege)

True

In [None]:
#It doesn't care about overlap!!!!
co=Chem.MolFromSmarts('[$(CCO),$(N)][$(CO),$(N)]')
m=Chem.MolFromSmiles('CCO')
# The boolean check is HasSubstructMatch; Mol has no HasSubstructMatches
# method (the plural form is GetSubstructMatches, which returns the matches).
m.HasSubstructMatch(co)

In [710]:
#Have to enumerate then
# Enumerate candidate structures: 1-6 carbons, then one or two 'OCC' units,
# an oxygen, and 0-6 trailing carbons.
ege_smiles=[]
ege_match_mols=[]
for i in range(1,7):
    for j in range(0,7):
        for k in range(1,3):
            smart='C'*i+'OCC'*k+'O'+'C'*j
            ege_match_mols.append(Chem.MolFromSmiles(smart))
            ege_smiles.append(smart)
# The original names are kept as aliases for any later inspection.
smarts=ege_smiles
match_mols=ege_match_mols

def test(x):
    """True when the molecule and one enumerated structure are substructures of each other.

    The candidates are read from ``ege_match_mols``, a name used only by this
    test: the generic ``match_mols`` is rebound by a later cell, which would
    otherwise silently change what this test matches.
    """
    mol=x['mol']
    return any(mol.HasSubstructMatch(match_mol) and match_mol.HasSubstructMatch(mol)
               for match_mol in ege_match_mols)
new_tests['Ethylene Glycol Ethers']=test

In [708]:
# Check that two molecules parsed from the same SMILES match each other in both directions.
mol=Chem.MolFromSmiles('CCCCOCCOCCO')
match_mol=Chem.MolFromSmiles('CCCCOCCOCCO')
mol.HasSubstructMatch(match_mol)
match_mol.HasSubstructMatch(mol)

True

True

In [703]:
#Neutral Organics
#Contains alcohols, ketones, ethers, alkyl halides, aryl halides, aromatic hydrocarbons
alcohol=Chem.MolFromSmarts('C.C[OX2H]')
ether=Chem.MolFromSmarts('[OD2](C)C')
ketone=Chem.MolFromSmarts('C[CX3](=O)C')
halide=Chem.MolFromSmarts('C[Cl,Br]')
aromatichydrocarbon=Chem.MolFromSmarts('c')
carboxylic=Chem.MolFromSmarts('C.OC(=O)')
def test(x):
    """True for molecular weight below 1000, logp below 8, at least one of the
    neutral-organic patterns present and the carboxylic pattern absent."""
    mol=x['mol']
    return x['mol_weight']<1000 and x['logp']<8 \
        and any(mol.HasSubstructMatch(group) for group in (alcohol,ether,ketone,halide,aromatichydrocarbon)) \
        and not mol.HasSubstructMatch(carboxylic)
new_tests['Neutral Organics']=test

In [704]:
# Check on one acid: the carboxylic pattern matches, so the most recently
# defined test() is expected to reject the record.
carboxylic=Chem.MolFromSmarts('C.OC(=O)')
mol=Chem.MolFromSmiles('OC(=O)C=C')
mol.HasSubstructMatch(carboxylic)
test({'mol':mol,'mol_weight':0,'logp':0,'smiles':'OC(=O)C=C'})

True

False

In [None]:
#Nonionic Surfactants
# nonsurf1=Chem.MolFromSmarts('COCCO')
# nonsurf2=Chem.MolFromSmarts('COCCOC')
# def test(x):
#     mol=x['mol']
#     return mol.HasSubstructMatch(nonsurf1) or mol.HasSubstructMatch(nonsurf2)
import re
def test(x):
    smiles=x['smiles']
    if '(' in smiles:
        return False
    split_smiles=smiles.split('O')
    return not any([re.search(r'[^C]',c) for c in split_smiles])
new_tests['Nonionic Surfactants']=test

In [None]:
#Organotins (Acute toxicity) and Organotins (Chronic toxicity)
organotin=Chem.MolFromSmarts('C[Sn]')
# NOTE(review): both thresholds below include logp == 13.7, so such a record
# is accepted by the acute and the chronic test - confirm that is intended.
def test(x):
    """Acute: molecular weight below 1000, pattern match and logp at most 13.7."""
    mol=x['mol']
    if not (x['mol_weight']<1000 and mol.HasSubstructMatch(organotin)):
        return False
    return x['logp']<=13.7
new_tests['Organotins (Acute toxicity)']=test
def test(x):
    """Chronic: molecular weight below 1000, pattern match and logp at least 13.7."""
    mol=x['mol']
    if not (x['mol_weight']<1000 and mol.HasSubstructMatch(organotin)):
        return False
    return x['logp']>=13.7
new_tests['Organotins (Chronic toxicity)']=test

In [None]:
# Check the organotin pattern against one example SMILES.
Chem.MolFromSmiles('Cc1cc(C)n(SC(Cl)(Cl)Cl)n1').HasSubstructMatch(organotin)

In [None]:
#Persistent, Bioaccumulative and Toxic (PBT) Chemicals
#MW<1000
#OPERA_HL > np.log(60)
#Ready biodegradability ?????
#LogP>4.2
#Not sure this one is worth doing without ready biodegradability
# def test(x):
#     mol=x['mol']
# new_tests['Persistent, Bioaccumulative and Toxic (PBT) Chemicals']=test

In [None]:
#Polynitroaromatics (Acute toxicity) and Polynitroaromatics (Chronic toxicity)
#MW < 1000
polynitroaromatic=Chem.MolFromSmarts('N[$(c1c(N)cccc1),$(c1cc(N)ccc1),$(c1ccc(N)cc1),$(c1cncc(N)c1)]')

# NOTE(review): records with 7 <= logp < 10 are accepted by neither test
# below - confirm that the gap is intended.
def test(x):
    """Acute: molecular weight below 1000, pattern match and logp below 7."""
    mol=x['mol']
    if not (x['mol_weight']<1000 and mol.HasSubstructMatch(polynitroaromatic)):
        return False
    return x['logp']<7
new_tests['Polynitroaromatics (Acute toxicity)']=test
def test(x):
    """Chronic: molecular weight below 1000, pattern match and logp at least 10."""
    mol=x['mol']
    if not (x['mol_weight']<1000 and mol.HasSubstructMatch(polynitroaromatic)):
        return False
    return x['logp']>=10
new_tests['Polynitroaromatics (Chronic toxicity)']=test

In [None]:
#Substituted Triazines (Acute toxicity) and Substituted Triazines (Chronic toxicity)
#logp<5
#MW<1000
subtriazine=Chem.MolFromSmarts('[$(n1nnccc1.[!#1]),$(n1ncncc1.[!#1]),$(n1cncnc1.[!#1])]')#[!H] did not work as expected with aromatics
def test(x):
    """Acute: molecular weight below 1000, pattern match and logp below 5."""
    mol=x['mol']
    if not (x['mol_weight']<1000 and mol.HasSubstructMatch(subtriazine)):
        return False
    return x['logp']<5
new_tests['Substituted Triazines (Acute toxicity)']=test
def test(x):
    """Chronic: molecular weight below 1000, pattern match and logp in (5, 8]."""
    mol=x['mol']
    if not (x['mol_weight']<1000 and mol.HasSubstructMatch(subtriazine)):
        return False
    return x['logp']>5 and x['logp']<=8
new_tests['Substituted Triazines (Chronic toxicity)']=test

In [None]:
# Check the triazine pattern against one example SMILES.
Chem.MolFromSmiles('Oc1nc(O)nc(O)n1').HasSubstructMatch(subtriazine)

In [None]:
# Experiment with the 'c.[!c,!H]' SMARTS against one example SMILES.
Chem.MolFromSmiles('C1=CC(O)=CC=C1').HasSubstructMatch(Chem.MolFromSmarts('c.[!c,!H]'))

In [None]:
def convert_ppb(x): #OPERA results stored as mol/L
    """Return ``x['ws'] * x['mol_weight'] * 10**6``.

    With ws in mol/L and, presumably, mol_weight in g/mol the product is in
    g/L; the factor of 10**6 then rescales it (to micrograms per litre under
    that assumption - confirm the units against the data source).
    """
    grams_per_litre=x['ws']*x['mol_weight']
    return grams_per_litre*10**6

In [None]:
#Triarylmethane Pigments/Dyes with Non-solubilizing Groups
triphenylmethane=Chem.MolFromSmarts('c1cc([N,O])ccc1C(c1ccc([N,O])cc1)=C1C=C[$(C(=N)),$(C(=O))]C=C1')
def test(x):
    """True when the converted water solubility exceeds 1 and the molecule matches ``triphenylmethane``."""
    mol=x['mol']
    if not convert_ppb(x)>1:
        return False
    return mol.HasSubstructMatch(triphenylmethane)
new_tests['Triarylmethane Pigments/Dyes with Non-solubilizing Groups']=test

In [None]:
# Display the compiled pattern.
triphenylmethane

In [None]:
# Check the triphenylmethane pattern against one example SMILES.
Chem.MolFromSmiles('Oc1nc(O)nc(O)n1').HasSubstructMatch(triphenylmethane)

In [None]:
#beta-Naphthylamines, Sulfonated
# Enumerate the patterns: for every pair of positions c1 < c2 on the second
# ring, place the [H,O,N] substituent and the sulfonyl substituent in both orders.
naph_smarts=[]
prefix='Nc1c([H,OH])'
suffix='cc1'
for c1 in range(1,4):
    for c2 in range(c1+1,5):
        naph_smarts.append(prefix+'c2'+'c'*c1+'([H,O,N])'+'c'*(c2-c1)+'([$(S(=O)(=O)O),$(S(=O)(=O)CCSO)])'+'c'*(4-c2)+'c2'+suffix)
        naph_smarts.append(prefix+'c2'+'c'*c1+'([$(S(=O)(=O)O),$(S(=O)(=O)CCSO)])'+'c'*(c2-c1)+'([H,O,N])'+'c'*(4-c2)+'c2'+suffix)
smarts=naph_smarts  # original name kept as an alias
# Stored under a name used only by this test. The generic name match_mols is
# deliberately not rebound here, because the Ethylene Glycol Ethers test reads
# it at call time and would otherwise start matching these patterns instead.
naph_match_mols=[Chem.MolFromSmarts(smart) for smart in naph_smarts]
def test(x):
    """True when the molecule and one enumerated pattern are substructures of each other."""
    mol=x['mol']
    return any(mol.HasSubstructMatch(pattern) and pattern.HasSubstructMatch(mol)
               for pattern in naph_match_mols)

new_tests['beta-Naphthylamines, Sulfonated']=test

In [None]:
# Run the most recently defined test() on one example molecule.
mol=Chem.MolFromSmiles('Nc1ccc(N=Nc2ccc(cc2)-c2ccc(cc2)N=Nc2c(N)c3c(O)c(N=Nc4ccccc4)c(cc3cc2S(O)(=O)=O)S(O)(=O)=O)c(N)c1')
test({'mol':mol})

In [None]:
# NOTE(review): no visible cell assigns a module-level name `match`, so this
# probably relies on leftover kernel state - confirm or remove.
match

In [None]:
# Add the predicate of every XML-derived test to new_tests. A hand-written
# entry with the same key would be overwritten by this update.
new_tests.update({k:q.query for k,q in tests.iteritems()})

<h1>New tests on ToxVal</h1>

In [None]:
# Substance id -> '|'-joined names of every category whose predicate in
# new_tests accepts the row, or 'Not categorized'. Rows whose SMILES cannot
# be parsed are skipped after printing their index.
toxval_cats={}
for i,row in tv.iterrows():
    record=dict(row)
    sid=record['dsstox_sid']
    mol=Chem.MolFromSmiles(record['smiles'])
    if mol:
        record['mol']=mol
        categories=[category for category,test in new_tests.iteritems() if test(record)]
        toxval_cats[sid]='|'.join(categories) if categories else 'Not categorized'
    else:
        print(i)

In [None]:
# Replace the predicted labels with the ones from new_tests and keep only
# rows that received a label.
tv['categories']=tv['dsstox_sid'].map(toxval_cats)
tv=tv[pd.notnull(tv['categories'])]

In [756]:
# First rows that this notebook assigns to the category although the
# reference label does not contain it.
category='Aliphatic Amines'
with pd.option_context('display.max_colwidth',200,'display.max_rows',None):
    tv[~(tv['US-EPA New Chemical Categories'].str.contains(category)) & (tv['categories'].str.contains(category))].head()
    #tv[(tv['categories'].str.contains(category)) & (tv['US-EPA New Chemical Categories'].str.contains(category))]

Unnamed: 0,CAS Number,smiles,US-EPA New Chemical Categories,dsstox_sid,logp,ws,mol_weight,categories
16,123-77-3,NC(=O)N=NC(N)=O,Not categorized,DTXSID0024553,-1.21083,0.225631,116.080002,Aliphatic amines|Aliphatic Amines
91,617-84-5,CCN(CC)C=O,Not categorized,DTXSID3020463,-0.233259,7.15855,101.149002,Aliphatic amines|Aliphatic Amines
97,121-69-7,CN(C)c1ccccc1,Not categorized,DTXSID2020507,2.11939,0.022527,121.182999,Neutral Organics|Aliphatic amines|Aliphatic Amines
99,68-12-2,CN(C)C=O,Not categorized,DTXSID6020515,-1.04481,8.57163,73.095001,Aliphatic amines|Aliphatic Amines
100,57-14-7,CN(C)N,Hydrazines and Related Compounds,DTXSID1020516,-0.748845,15.7877,60.099998,Aliphatic amines|Hydrazines and Related Compounds|Aliphatic Amines


In [None]:
# Categories that could not be parsed from the XML.
bad_cats

In [None]:
# Rows without a predicted label; an earlier cell already filtered these out,
# so this is presumably expected to be empty.
tv[pd.isnull(tv['categories'])]