<h1>Extract SMARTS</h1>

In [1]:
import pandas as pd
import numpy as np
import pymongo
import sys
import os
from __future__ import print_function
from datetime import datetime

TOP = '/'.join(os.getcwd().split('/')[:-2])+'/'
LIB = TOP+'lib'
if not LIB in sys.path: 
    sys.path.insert(0,LIB)

DAT_DIR = TOP + 'data/'
FIG_DIR = TOP + 'figs/'

if not os.path.exists(DAT_DIR): os.mkdir(DAT_DIR)
if not os.path.exists(FIG_DIR): os.mkdir(FIG_DIR)
    
from db.mongo import *

from rax.genrapred import *
import db.etl as etl
from db.fpsim import *

from rdkit import Chem

import pymongo
mongocon=pymongo.MongoClient("mongodb://ghelman:ghelman@pb.epa.gov/genra_dev_v4")
DB=mongocon['genra_dev_v4']
dsstox=DB['compound']
physprop=DB['physprop']

In [2]:
with open(DAT_DIR + 'epa_categories.xml','r') as f:
    xml=f.read()

In [3]:
xml=xml.replace('\n','')

In [4]:
xml

'<Scheme z:Id="1" xmlns="http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine" xmlns:i="http://www.w3.org/2001/XMLSchema-instance" xmlns:z="http://schemas.microsoft.com/2003/10/Serialization/"><CounterProfile z:Id="2">Not categorized</CounterProfile><Credits z:Id="3"><Adopted z:Id="4">QSAR Toolbox 2.0 beta, April 2010</Adopted><Author z:Id="5"/><Changelog z:Id="6">{\\rtf1\\ansi\\ansicpg1252\\uc1\\htmautsp\\deff2{\\fonttbl{\\f0\\fcharset0 Times New Roman;}{\\f2\\fcharset0 Segoe UI;}{\\f3\\fcharset0 Calibri;}}{\\colortbl\\red0\\green0\\blue0;\\red255\\green255\\blue255;}\\loch\\hich\\dbch\\pard\\plain\\ltrpar\\itap0{\\lang1033\\fs22\\f3\\cf0 \\cf0\\ql{\\f3 {\\ltrch SMARTS language for describing molecular patterns, i.e. structural boundaries, structural alerts has been implemented in OECD QSAR Toolbox 4.0.\\~ As a result "US-EPA New Chemical Categories" profiler has been rewritten but without modifying the knowledge and/or the logic it is based on. Only small distinctions are exp

In [5]:
#Mask analysis
import xml.etree.ElementTree as ET
e=ET.parse(DAT_DIR+'epa_categories.xml').getroot()

In [6]:
parent_map = {c:p for p in e.iter() for c in p}

In [197]:
import operator as op
op_dict={
    'GreaterThan': op.gt,
    'GreaterThanOrEqualTo': op.ge,
    'LessThan': op.lt,
    'LessThanOrEqualTo': op.le
}
prop_dict={
    'log Kow':'logp',
    'Molecular Weight':'mol_weight',
    'Molecular weight':'mol_weight',
    'Water Solubility': 'ws'
}

In [198]:
def define_smart_match(query):
    query_string=query.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}ComplexSearch').text
    query_string=re.sub('false','False',query_string)
    query_string=re.sub('true','True',query_string)
    query_dict=ast.literal_eval(query_string)
    smart=query_dict['queries'][0]['smart']
    def smart_match(x):
        mol=Chem.MolFromSmiles(x['smiles'])
        pattern=Chem.MolFromSmarts(smart)
        if not pattern:
            return None
        ret=True if mol.GetSubstructMatches(pattern) else False
        return ret
    return smart_match
def define_compare(query):
    operand=query.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}Operand').text
    prop=query.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}ParameterName').text
    value=query.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}Value').text
    def compare(x):
        ret = op_dict[operand](x[prop_dict[prop]],value)
        return ret
    return compare
def define_andit(funcs):
    def andit(x):
        return all([func(x) for func in funcs])
    return andit
def define_orit(funcs):
    def orit(x):
        return any([func(x) for func in funcs])
    return orit
def define_notit(func):
    def notit(x):
        return not(func(x))
    return notit

In [199]:
tests={}
import re
import ast
for elem in e.iter('{http://schemas.microsoft.com/2003/10/Serialization/Arrays}anyType'):
    category=elem.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Caption').text
    queries=elem.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Expression')\
        .find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Queries')\
        .findall('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Query')
    contents=[query.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Content') for query in queries]
    query_tree={}
    for query in contents:
        attributes=query.attrib
        if '{http://schemas.microsoft.com/2003/10/Serialization/}Id' not in attributes:
            continue
        query_id=attributes['{http://schemas.microsoft.com/2003/10/Serialization/}Id']
        query_type=attributes['{http://www.w3.org/2001/XMLSchema-instance}type']
        if query_type=='b:StructureQuery':
            smart_match=define_smart_match(query)
            if smart_match:
                query_tree[query_id]=smart_match
        elif query_type=='b:ParameterQuery':
            compare=define_compare(query)
            query_tree[query_id]=compare
        elif query_type=='LogicalQuery':
            logic=query.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Logic').text
            elements=query.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Elements')
            node_ids=[elem.attrib['{http://schemas.microsoft.com/2003/10/Serialization/}Ref']\
                      for elem in elements.findall('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Query')\
                      if '{http://schemas.microsoft.com/2003/10/Serialization/}Ref' in elem.attrib]
            if logic=='Not':
                node_id=node_ids[0] #Should only be one
                notit=define_notit(query_tree[node_id])
                query_tree[query_id]=notit
                del query_tree[node_id]
            elif logic=='And':
                funcs=[query_tree[node_id] for node_id in node_ids]
                andit=define_andit(funcs)
                query_tree[query_id]=andit
                for node_id in node_ids:
                    del query_tree[node_id]
            else:
                funcs=[query_tree[node_id] for node_id in node_ids]
                for orquery in elements.findall('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Query'):
                    if '{http://www.w3.org/2001/XMLSchema-instance}type' in orquery.attrib:
                        extra_func=define_smart_match(orquery)
                        if extra_func:
                            funcs.append(extra_func)       
                orit=define_orit(funcs)
                query_tree[query_id]=orit
                for node_id in node_ids:
                    del query_tree[node_id]
    tests[category]=query_tree
tests={k:v.values()[0] for k,v in tests.iteritems()}

<h1>Get Category Fingerprints</h1>

In [12]:
from rdkit import Chem
smarts_df=pd.read_csv(DAT_DIR+'epa_categories.tsv',sep='\t')

In [13]:
#Some rows aren't valid smarts
for i,row in smarts_df.iterrows():
     if not Chem.MolFromSmarts(row['Smarts']):
        smarts_df=smarts_df.drop(i)

In [14]:
import pymongo
mongocon=pymongo.MongoClient("mongodb://ghelman:ghelman@pb.epa.gov/genra_dev_v4")
DB=mongocon['genra_dev_v4']
dsstox=DB['compound']
epa_cats=DB['epa_categories']

In [16]:
dsstox_smiles=dsstox.find({},{'_id':0,'dsstox_sid':1,'smiles':1,'mol_weight':1})
dsstox_logp=physprop.find({},{'_id':0,'dsstox_sid':1,'predicted_props.OPERA_LogP':1})
#logp_dict_temp={record['dsstox_sid']:record.get('predicted_props',{}) for record in dsstox_logp if 'dsstox_sid' in record.keys()}
#logp_dict={sid:props['OPERA_LogP'][0] for sid,props in logp_dict_temp.iteritems() if 'OPERA_LogP' in props.keys()}
logp_dict={record['dsstox_sid']:record.get('predicted_props',{})['OPERA_LogP'][0] for record in dsstox_logp \
           if 'OPERA_LogP' in record.get('predicted_props',{}) and record.get('dsstox_sid',None)}
dsstox_ws=physprop.find({},{'_id':0,'dsstox_sid':1,'predicted_props.OPERA_WS':1})
ws_dict={record['dsstox_sid']:record.get('predicted_props',{})['OPERA_WS'][0] for record in dsstox_ws \
           if 'OPERA_WS' in record.get('predicted_props',{}) and record.get('dsstox_sid',None)}

In [467]:
from pymongo import InsertOne
i=0
inserts=[]
while i<dsstox_smiles.count():
    if inserts:
        epa_cats.bulk_write(inserts)
        inserts=[]
    j=0
    dsstox_smiles=dsstox.find({},{'_id':0,'dsstox_sid':1,'smiles':1,'mol_weight':1},batch_size=20000)
    dsstox_smiles.skip(i)
    while j<=5000: #Must do it this way because cursor will time out
        record=dsstox_smiles.next()
        sid=record['dsstox_sid']
        try:
            record['logp']=logp_dict[sid]
            record['ws']=ws_dict[sid]
        except:
            continue
        i+=1
        j+=1
        smiles=record['smiles']
        try:
            mol=Chem.MolFromSmiles(smiles)
        except:
            print(sid + ' no smiles')
        if not mol:
            continue
        fp_set=[]
        for category,test in tests.iteritems():
            if test(record):
                fp_set.append(category)
        categories_record={'dsstox_sid':sid,'fp':fp}
        inserts.append(InsertOne(category_record))

<pymongo.cursor.Cursor at 0x7f35b71a0050>

DTXSID6020010 no smiles


NameError: name 'mol' is not defined

In [202]:
import inspect
record=dsstox.find_one({'casrn':'79-10-7'})
sid=record['dsstox_sid']
record['logp']=logp_dict[sid]
record['ws']=ws_dict[sid]
for category,test in tests.iteritems():
    if test(record):
        print(category)

In [495]:
import inspect
inspect.getsource(tests['Neutral Organics'])

u'                def andit(x):\n                    return all([func(x) for func in funcs])\n'

In [484]:
record

{u'_id': ObjectId('58fe6131f0e291b4c06a2c1d'),
 u'casrn': u'83-32-9',
 u'chemspider_id': 6478,
 u'created_at': datetime.datetime(2017, 4, 24, 20, 32, 19),
 u'dsstox_cid': u'DTXCID201774',
 u'dsstox_sid': u'DTXSID3021774',
 u'gsid': 21774,
 u'inchi_key': u'CWRYPZZKDGJXCA-UHFFFAOYSA-N',
 u'iupac': u'1,2-Dihydroacenaphthylene',
 'logp': 0.882667,
 u'mol_weight': 154.21200561523438,
 u'name': u'Acenaphthene',
 u'pubchem_cid': 6734,
 u'smiles': u'C1CC2=C3C1=CC=CC3=CC=C2',
 u'synonyms': [u'Acenaphthylene, 1,2-dihydro-',
  u'1,2-Dihydroacenaphthylene',
  u'1,8-Ethylenenaphthalene',
  u'acenafteno',
  u'Acenaphtene',
  u'Acenaphthen',
  u'Naphthyleneethylene',
  u'NSC 7657',
  u'peri-Ethylenenaphthalene',
  u'1,8-Dihydroacenaphthalene',
  u'EINECS 201-469-6',
  u'Ethylenenaphthalene',
  u'UNII-V8UT1GAC5Y'],
 u'updated_at': datetime.datetime(2017, 8, 30, 19, 10, 40),
 u'viz': u'<?xml version="1.0" encoding="UTF-8"?>\n<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xl

In [9]:
with open(DAT_DIR+'category_records.txt','r') as f:
    records=f.readlines()

In [35]:
import ast
lit_records=[ast.literal_eval(record)[0] for record in records]

In [36]:
DB['epa_categories'].insert_many(lit_records)

<pymongo.results.InsertManyResult at 0x7f99f8154780>

In [369]:
epa_cats=DB['epa_categories']

In [118]:
existing_sids=set([record['dsstox_sid'] for record in epa_cats.find()])
all_sids=set([record['dsstox_sid'] for record in dsstox.find()])
missing_sids=all_sids-existing_sids
missing_records=[record for record in dsstox.find() if record['dsstox_sid'] in missing_sids]

In [119]:
len(missing_sids)

1199

In [120]:
#Missing records has duplicates
temp={record['dsstox_sid']:record for record in missing_records}
deduped=[v for k,v in temp.iteritems()]

In [121]:
for record in deduped:
    sid=record['dsstox_sid']
    smiles=record['smiles']
    try:
        mol=Chem.MolFromSmiles(smiles)
    except:
        print(sid + ' no smiles')
    if not mol:
        continue
    fp_set=set()
    for index,row in smarts_df.iterrows():
        smarts=row['Smarts']
        category=row['Category']
        if mol.GetSubstructMatches(Chem.MolFromSmarts(smarts)):
            fp_set.add(category)
    fp=list(fp_set)
    categories_record={'dsstox_sid':sid,'fp':fp}
    epa_cats.insert_one(categories_record)

DTXSID10872639 no smiles


<h1>ToxVal Testing</h1>

In [319]:
toxval_df=pd.read_excel(DAT_DIR+'toxval_epa_categories.xlsx')
toxval_df=toxval_df[toxval_df['CAS Number']!='No CAS number']

In [459]:
toxval_df.head()

Unnamed: 0,CAS Number,SMILES,Molecular Formula,Predefined substance type,Additional Ids,Composition,CAS Smiles relation,US-EPA New Chemical Categories
0,83-32-9,C1Cc2cccc3cccc1c23,C12H10,Mono constituent,EC Number:3775224,,High,Not categorized
1,75-86-5,CC(C)(O)C#N,C4H7NO,Mono constituent,EC Number:3774714,,High,Not categorized
2,98-86-2,CC(=O)c1ccccc1,C8H8O,Mono constituent,EC Number:3776351,,High,Neutral Organics
3,107-02-8,C=CC=O,C3H4O,Mono constituent,EC Number:3777034,,High,Aldehydes (Acute toxicity)
4,79-10-7,OC(=O)C=C,C3H4O2,Mono constituent,EC Number:3774957,,High,Acrylates/Methacrylates (Acute toxicity)


In [436]:
casrns=list(toxval_df['CAS Number'])
toolbox_records=dsstox.find({'casrn':{'$in':casrns}},{'casrn':1,'dsstox_sid':1})
toolbox_records.count()
sids=[record['dsstox_sid'] for record in toolbox_records]
dupe_sids=[sid for sid,count in collections.Counter(sids).items() if count > 1]
#for sid in dupe_sids:
#    list(dsstox.find({'dsstox_sid':sid},{'smiles':1,'casrn':1,'dsstox_sid':1}))
#I think these are just salts

In [441]:
toolbox_records=dsstox.find({'casrn':{'$in':casrns}},{'casrn':1,'dsstox_sid':1})
sidmap={doc['casrn']:doc['dsstox_sid'] for doc in toolbox_records}
toxval_df['dsstox_sid']=toxval_df['CAS Number'].map(sidmap)
toxval_df['dsstox_sid']=toxval_df['casrn'].map(sidmap)
sids=sidmap.values()
catmap={row['CAS Number']:set(row['US-EPA New Chemical Categories'].split('|')) for i,row in toxval_df.iterrows()}
toolbox_cats={sidmap[casrn]:catmap[casrn] for casrn in casrns if casrn in sidmap.keys() and casrn in catmap.keys()}

In [462]:
toxval_df.head()

Unnamed: 0,CAS Number,SMILES,Molecular Formula,Predefined substance type,Additional Ids,Composition,CAS Smiles relation,US-EPA New Chemical Categories,dsstox_sid
0,83-32-9,C1Cc2cccc3cccc1c23,C12H10,Mono constituent,EC Number:3775224,,High,Not categorized,DTXSID3021774
1,75-86-5,CC(C)(O)C#N,C4H7NO,Mono constituent,EC Number:3774714,,High,Not categorized,DTXSID7025427
2,98-86-2,CC(=O)c1ccccc1,C8H8O,Mono constituent,EC Number:3776351,,High,Neutral Organics,DTXSID6021828
3,107-02-8,C=CC=O,C3H4O,Mono constituent,EC Number:3777034,,High,Aldehydes (Acute toxicity),DTXSID5020023
4,79-10-7,OC(=O)C=C,C3H4O2,Mono constituent,EC Number:3774957,,High,Acrylates/Methacrylates (Acute toxicity),DTXSID0039229


In [498]:
toolbox_records=dsstox.find({'casrn':{'$in':casrns}},{'casrn':1,'dsstox_sid':1,'smiles':1})
smilesmap={record['dsstox_sid']:record['smiles'] for record in toolbox_records}
toxval_df['db_smiles']=toxval_df['dsstox_sid'].map(smilesmap)

In [501]:
toxval_df[['dsstox_sid','SMILES','db_smiles']]

Unnamed: 0,dsstox_sid,SMILES,db_smiles
0,DTXSID3021774,C1Cc2cccc3cccc1c23,C1CC2=C3C1=CC=CC3=CC=C2
1,DTXSID7025427,CC(C)(O)C#N,CC(C)(O)C#N
2,DTXSID6021828,CC(=O)c1ccccc1,CC(=O)C1=CC=CC=C1
3,DTXSID5020023,C=CC=O,C=CC=O
4,DTXSID0039229,OC(=O)C=C,OC(=O)C=C
5,DTXSID8020040,ClC1=C(Cl)C2(Cl)C3C4CC(C=C4)C3C1(Cl)C2(Cl)Cl,ClC1=C(Cl)[C@]2(Cl)[C@@H]3[C@@H]4C[C@@H](C=C4)...
6,DTXSID8020044,OCC=C,OCC=C
7,DTXSID8074312,Cc1c(cc(N)cc1[N+]([O-])=O)[N+]([O-])=O,CC1=C(C=C(N)C=C1[N+]([O-])=O)[N+]([O-])=O
8,DTXSID6044068,Cc1c(N)cc(cc1[N+]([O-])=O)[N+]([O-])=O,CC1=C(C=C(C=C1N)[N+]([O-])=O)[N+]([O-])=O
9,DTXSID3024497,Nc1cccc(O)c1,NC1=CC(O)=CC=C1


In [445]:
rdkit_records=epa_cats.find({'dsstox_sid':{'$in':sids}})
rdkit_records.count()
rdkit_sids=[record['dsstox_sid'] for record in rdkit_records]
dupe_sids=[sid for sid,count in collections.Counter(rdkit_sids).items() if count > 1]
len(rdkit_sids)
#for sid in set(sids)-set(rdkit_sids):
#    print(list(dsstox.find({'dsstox_sid':sid},{'smiles':1})))
#Discrepancy comes from FAIL smiles

10822

10822

In [447]:
rdkit_records=epa_cats.find({'dsstox_sid':{'$in':sids}})
[record['dsstox_sid'] for record in rdkit_records if record['fp']==None]

[]

In [457]:
rdkit_records=epa_cats.find({'dsstox_sid':{'$in':rdkit_sids}})
rdkit_cats={record['dsstox_sid']:set(record['fp']) for record in rdkit_records}

In [455]:
len(toolbox_cats.keys())
len(rdkit_cats.keys())

10834

10822

In [468]:
toolbox_cats['DTXSID7025427']
rdkit_cats['DTXSID7025427']

{u'Not categorized'}

{u'Phosphates, Inorganic'}

In [474]:
compare_sids=set(sids)&set(rdkit_sids)

In [533]:
diffs={}
for sid in compare_sids:
    diffs[sid]=toolbox_cats[sid]&rdkit_cats[sid]

In [537]:
[sid for sid,cats, in diffs.iteritems() if 'Alkoxysilanes' in cats]

[]

In [524]:
l=list(smarts_df['Category'].unique())
l.sort()
l

['Acid Chlorides',
 'Acrylamides',
 'Acrylates/Methacrylates (Acute toxicity)',
 'Acrylates/Methacrylates (Chronic toxicity)',
 'Aldehydes (Acute toxicity)',
 'Aldehydes (Chronic toxicity)',
 'Aliphatic Amines',
 'Alkoxysilanes',
 'Aluminum Compounds',
 'Anhydrides, Carboxylic acid',
 'Anilines (Acute toxicity)',
 'Anilines (Chronic toxicity)',
 'Anionic Surfactants',
 'Azides (Acute toxicity)',
 'Azides (Chronic toxicity)',
 'Benzotriazole-hindered phenols',
 'Benzotriazoles (Acute toxicity)',
 'Benzotriazoles (Chronic toxicity)',
 'Boron Compounds',
 'Cationic (quaternary ammonium) surfactants',
 'Cobalt',
 'Diazoniums (Acute toxicity)',
 'Diazoniums (Chronic toxicity)',
 'Dichlorobenzidine-based Pigments',
 'Diisocyanates',
 'Dithiocarbamates (Acute toxicity)',
 'Dithiocarbamates (Chronic toxicity)',
 'Epoxides',
 'Esters (Acute toxicity)',
 'Esters (Chronic toxicity)',
 'Ethylene Glycol Ethers',
 'Hindered Amines',
 'Hydrazines and Related Compounds',
 'Imides (Acute toxicity)',
 '

In [526]:
diffs

{u'DTXSID1020485': set(),
 u'DTXSID70207089': set(),
 u'DTXSID3037672': set(),
 u'DTXSID1020489': {u'(N/A)'},
 u'DTXSID8027749': set(),
 u'DTXSID6051518': set(),
 u'DTXSID20232907': {u'Not categorized'},
 u'DTXSID9058783': {u'Not categorized'},
 u'DTXSID8027741': set(),
 u'DTXSID8027747': {u'Not categorized'},
 u'DTXSID2021151': {u'Not categorized'},
 u'DTXSID2021153': {u'Not categorized'},
 u'DTXSID2021155': {u'Not categorized'},
 u'DTXSID0022480': set(),
 u'DTXSID90172245': set(),
 u'DTXSID5075072': set(),
 u'DTXSID10237188': {u'Not categorized'},
 u'DTXSID00176097': {u'Not categorized'},
 u'DTXSID10237183': set(),
 u'DTXSID4021426': {u'Not categorized'},
 u'DTXSID4021395': {u'Polynitroaromatics (Acute toxicity)'},
 u'DTXSID4021424': {u'(N/A)'},
 u'DTXSID4021397': {u'Not categorized'},
 u'DTXSID4024143': set(),
 u'DTXSID4021393': {u'Ethylene Glycol Ethers', u'Nonionic Surfactants'},
 u'DTXSID70207660': {u'Not categorized'},
 u'DTXSID6024466': set(),
 u'DTXSID6024464': set(),
 u'DTXSI

In [475]:
both_cats={sid:(toolbox_cats[sid],rdkit_cats[sid]) for sid in compare_sids}

In [491]:
both_cats['DTXSID6020199']

({u'Neutral Organics'}, {u'Phosphates, Inorganic'})

In [484]:
[sid for sid,cat in toolbox_cats.iteritems() if len(cat)>1]

[u'DTXSID70207089',
 u'DTXSID4025161',
 u'DTXSID4021393',
 u'DTXSID20196398',
 u'DTXSID7051507',
 u'DTXSID90194104',
 u'DTXSID4020113',
 u'DTXSID40184974',
 u'DTXSID4051900',
 u'DTXSID50190389',
 u'DTXSID7032634',
 u'DTXSID70221220',
 u'DTXSID3049673',
 u'DTXSID10224912',
 u'DTXSID9027021',
 u'DTXSID20183864',
 u'DTXSID3045356',
 u'DTXSID9021344',
 u'DTXSID7074953',
 u'DTXSID0045252',
 u'DTXSID1023029',
 u'DTXSID1020649',
 u'DTXSID1057612',
 u'DTXSID3022825',
 u'DTXSID3052852',
 u'DTXSID6023864',
 u'DTXSID3060405',
 u'DTXSID7072969',
 u'DTXSID1027007',
 u'DTXSID4047466',
 u'DTXSID50204394',
 u'DTXSID5073799',
 u'DTXSID6051986',
 u'DTXSID1024621',
 u'DTXSID3020758',
 u'DTXSID8021804',
 u'DTXSID8021806',
 u'DTXSID2075184',
 u'DTXSID7044267',
 u'DTXSID70175133',
 u'DTXSID3020334',
 u'DTXSID3020330',
 u'DTXSID60858923',
 u'DTXSID5020865',
 u'DTXSID5020867',
 u'DTXSID1021792',
 u'DTXSID8045438',
 u'DTXSID4027103',
 u'DTXSID30231183',
 u'DTXSID5026548',
 u'DTXSID50145051',
 u'DTXSID7023724',

In [481]:
len(set.union(*rdkit_cats.values()))
len(set.union(*toolbox_cats.values()))

52

53

In [530]:
sid='DTXSID5061746'
dsstox.find_one({'dsstox_sid':sid},{'casrn':1})
smarts_df[smarts_df['Category']=='Alkoxysilanes']

{u'_id': ObjectId('58fe63bff0e291b4c06afe56'), u'casrn': u'1516-80-9'}

Unnamed: 0,Category,Smarts
69,Alkoxysilanes,[Si](=[#8])([#8h])[#8][#6]


In [544]:
cat_retry={}
smarts_matches={}
toolbox_records=dsstox.find({'casrn':{'$in':casrns}},{'casrn':1,'dsstox_sid':1,'smiles':1})
for record in toolbox_records:
    sid=record['dsstox_sid']
    smiles=record['smiles']
    try:
        mol=Chem.MolFromSmiles(smiles)
    except:
        print(sid + ' no smiles')
    if not mol:
        continue
    fp_set=set()
    smarts_set=set()
    for index,row in smarts_df.iterrows():
        smarts=row['Smarts']
        category=row['Category']
        if mol.GetSubstructMatches(Chem.MolFromSmarts(smarts)):
            fp_set.add(category)
            smarts_set.add(smarts)
    fp=list(fp_set)
    cat_retry[sid]=fp
    smarts_fp=list(smarts_set)
    smarts_matches[sid]=smarts_fp

In [542]:
smarts

'[#7](=[#8])$[[#1],[#6X4]]{1..;xm}'

In [505]:
list(epa_cats.find({'dsstox_sid':'DTXSID6020199'}))

[{u'_id': ObjectId('5af4a4e80c7ea0ba1507804c'),
  u'dsstox_sid': u'DTXSID6020199',
  u'fp': [u'Phosphates, Inorganic']}]

In [507]:
smarts_df[smarts_df['Category']=='Phosphates, Inorganic']

Unnamed: 0,Category,Smarts
49,"Phosphates, Inorganic",[#8][#15v5](=[#8])([#8])[#8]
50,"Phosphates, Inorganic",[#6]


In [538]:
smarts_df

Unnamed: 0,Category,Smarts
0,Acid Chlorides,"[#6,#16](=[#8])(Cl)[#6X4,c]"
1,Acrylamides,"[#6](=[#8])([#7h])[#6](=[#6h2])[Ch3,#1]"
2,Acrylates/Methacrylates (Acute toxicity),"[#6h2]=[#6]([#6](=[#8])[#8])[Ch3,#1]"
3,Aldehydes (Acute toxicity),"[#6h](=[#8])[#6,#1]"
4,Aliphatic Amines,[#6X4][#7h2v3]
5,Aliphatic Amines,[#6X4][#7hv3][#6X4]
6,Aliphatic Amines,[#6X4][#7v3]([#6X4])[#6X4]
7,Aliphatic Amines,[#6X4][#7+h3]
8,Aliphatic Amines,[#6X4][#7+h2][#6X4]
9,Aliphatic Amines,[#6X4][#7+h]([#6X4])[#6X4]
