In [None]:
import pandas as pd
import numpy as np
import pymongo
import sys
import os
from __future__ import print_function
from datetime import datetime
import matplotlib.pyplot as plt

# Project root: the current working directory with its last two
# '/'-separated components removed, always ending in a slash.
_cwd_parts = os.getcwd().split('/')
TOP = '/'.join(_cwd_parts[:-2]) + '/'

# Put <TOP>lib at the front of the import path (only once, so re-running
# the cell does not add duplicate entries).
LIB = TOP + 'lib'
if LIB not in sys.path:
    sys.path.insert(0, LIB)

# toxref sub-directories for data files and figures.
DAT_DIR = TOP + 'data/toxref/'
FIG_DIR = TOP + 'figs/toxref/'

from rax.genrapred import *

In [None]:
# Open the MongoDB connection and grab the collections used in this notebook.
# The connection string can be supplied through the GENRA_MONGO_URI environment
# variable so that credentials do not have to live in the notebook.
# FIXME(security): the fallback URI below embeds a username and password in
# source; rotate that credential and remove the fallback once every user sets
# GENRA_MONGO_URI.
MONGO_URI=os.environ.get('GENRA_MONGO_URI',
                         "mongodb://ghelman:ghelman@pb.epa.gov/genra_dev_v4")
mongocon=pymongo.MongoClient(MONGO_URI)
DB=mongocon['genra_dev_v4']
dsstox=DB['compound']      # queried below for smiles and mol_weight
toxref=DB['toxrefdb2']
physprop=DB['physprop']    # queried below for predicted_props.OPERA_LogP / OPERA_WS

In [None]:
import operator as op

# Comparison keyword -> two-argument comparison function from `operator`.
# NOTE(review): nothing in the visible cells reads op_dict or prop_dict;
# presumably they are looked up by the deserialized category tests, so the
# names and keys must stay exactly as they are -- confirm before renaming.
op_dict = dict([
    ('GreaterThan', op.gt),
    ('GreaterThanOrEqualTo', op.ge),
    ('LessThan', op.lt),
    ('LessThanOrEqualTo', op.le),
])

# Property label -> key used in the per-chemical record dictionaries.
# Both capitalisations of "Molecular weight" map to the same record key.
prop_dict = dict([
    ('log Kow', 'logp'),
    ('Molecular Weight', 'mol_weight'),
    ('Molecular weight', 'mol_weight'),
    ('Water Solubility', 'ws'),
])

In [None]:
def convert_ppb(x): #OPERA results stored as mol/L
    """Scale a record's water solubility by its molecular weight and 10**6.

    Parameters
    ----------
    x : mapping
        Anything indexable by key that provides ``'ws'`` and ``'mol_weight'``
        (e.g. one of the per-chemical record dicts built in this notebook).
        ``ws`` is in mol/L per the note above; ``mol_weight`` is presumably
        g/mol -- TODO confirm.

    Returns
    -------
    number
        ``x['ws'] * x['mol_weight'] * 10**6``.
    """
    return x['ws']*x['mol_weight']*10**6

In [None]:
import dill
# Load the serialized category tests; later cells iterate it as a mapping of
# category name -> callable that takes a chemical record.
# The file is opened in binary mode because pickle/dill streams are bytes:
# text mode fails on Python 3 and can corrupt the stream on Windows.
# NOTE(security): dill.load can execute arbitrary code contained in the file;
# only load category_tests.dill from a trusted source.
with open(DAT_DIR+'../category_tests.dill','rb') as f:
    category_tests=dill.load(f)

In [None]:
# Build one {dsstox_sid: value} lookup per property. Every query projects only
# the fields it needs, and every comprehension skips documents that lack a
# substance id or the requested field instead of raising KeyError.

# SMILES strings from the compound collection (empty/missing SMILES skipped).
loael_smiles=dsstox.find({},{'_id':0,'dsstox_sid':1,'smiles':1})
smiles_dict={record['dsstox_sid']:record['smiles'] for record in loael_smiles \
             if record.get('dsstox_sid',None) and record.get('smiles',None)}

# First element of predicted_props.OPERA_LogP from the physprop collection.
loael_logp=physprop.find({},{'_id':0,'dsstox_sid':1,'predicted_props.OPERA_LogP':1})
logp_dict={record['dsstox_sid']:record.get('predicted_props',{})['OPERA_LogP'][0] for record in loael_logp \
           if 'OPERA_LogP' in record.get('predicted_props',{}) and record.get('dsstox_sid',None)}

# First element of predicted_props.OPERA_WS from the physprop collection.
loael_ws=physprop.find({},{'_id':0,'dsstox_sid':1,'predicted_props.OPERA_WS':1})
ws_dict={record['dsstox_sid']:record.get('predicted_props',{})['OPERA_WS'][0] for record in loael_ws \
           if 'OPERA_WS' in record.get('predicted_props',{}) and record.get('dsstox_sid',None)}

# Molecular weights from the compound collection. The projection avoids
# pulling entire compound documents just to read one field.
loael_weight=dsstox.find({},{'_id':0,'dsstox_sid':1,'mol_weight':1})
weight_dict={record['dsstox_sid']:record['mol_weight'] for record in loael_weight \
             if record.get('dsstox_sid',None) and 'mol_weight' in record}

In [None]:
from rdkit import Chem
# Substances for which logp, water solubility, molecular weight and SMILES
# are all available.
sids = set(logp_dict) & set(ws_dict) & set(weight_dict) & set(smiles_dict)

# One dict per substance; substances for which Chem.MolFromSmiles returns a
# falsy value are left out.
records = []
for sid in sids:
    mol = Chem.MolFromSmiles(smiles_dict[sid])
    if not mol:
        continue
    records.append({
        'dsstox_sid': sid,
        'smiles': smiles_dict[sid],
        'logp': logp_dict[sid],
        'ws': ws_dict[sid],
        'mol_weight': weight_dict[sid],
        'mol': mol,
    })

In [None]:
import math
# Tag every chemical record with the sorted tuple of category names whose
# test callable returns a truthy value for that record.
# .items() replaces the Python 2-only .iteritems() so the cell runs unchanged
# on both Python 2 and Python 3.
for record in records:
    epa_categories=sorted(category for category,test in category_tests.items() if test(record))
    # 'Neutral Organics' is kept only when it is the sole match; it is dropped
    # as soon as any other category also applies.
    if 'Neutral Organics' in epa_categories and len(epa_categories)>1:
        epa_categories.remove('Neutral Organics')
    # Stored as a tuple so the value is hashable (value_counts is run on it later).
    record['categories']=tuple(epa_categories)

In [None]:
# Per-chemical table: one row per record, including its 'categories' tuple.
dsstox_category_df=pd.DataFrame(records)
# Written to its own file: the category-count table produced further down is
# saved as 'dsstox_epa_categories.csv', so using that name here would mean this
# export is always overwritten.
# The 'mol' column holds in-memory molecule objects whose text form is not
# meaningful in a CSV, so it is left out of the export (the DataFrame itself
# keeps it; the 'smiles' column is exported for re-creating the molecules).
dsstox_category_df.drop('mol',axis=1,errors='ignore').to_csv(DAT_DIR+'dsstox_epa_categories_by_chemical.csv')

In [None]:
# Preview the first rows of the per-chemical category table.
dsstox_category_df.head()

In [None]:
# Count how many chemicals share each distinct 'categories' tuple, wrap the
# tally in a DataFrame and save it as CSV.
category_tally = dsstox_category_df['categories'].value_counts()
dsstox_counts = pd.DataFrame(category_tally)
dsstox_counts.to_csv(DAT_DIR+'dsstox_epa_categories.csv')

In [None]:
# Index the counts table with the empty tuple ().
# NOTE(review): presumably meant to show how many chemicals matched no
# category (their 'categories' value is the empty tuple). Plain [] indexing on
# a DataFrame selects by column label rather than by row, so this may raise a
# KeyError -- confirm the intent and whether a row lookup was meant.
pd.DataFrame(dsstox_counts)[()]