In [1]:
import pandas as pd
import numpy as np
import pymongo
import sys
import os
from __future__ import print_function
from datetime import datetime
import matplotlib.pyplot as plt

TOP = '/'.join(os.getcwd().split('/')[:-2])+'/'
LIB = TOP+'lib'
if not LIB in sys.path: 
    sys.path.insert(0,LIB)

DAT_DIR = TOP + 'data/toxref/'
FIG_DIR = TOP + 'figs/toxref/'

from rax.genrapred import *

In [2]:
mongocon=pymongo.MongoClient("mongodb://ghelman:ghelman@pb.epa.gov/genra_dev_v4")
DB=mongocon['genra_dev_v4']
dsstox=DB['compound']
toxref=DB['toxrefdb2']
physprop=DB['physprop']

In [3]:
def wtavg(df,name,k,s):
    df=df[df['jaccard']>s]
    df=df[df[name]!=np.inf]
    df=df[df[name].notnull()].iloc[0:k]
    if df.empty:
        return np.nan
    weights=list(df['jaccard'])
    values=list(df[name])
    return np.average(values,weights=weights)

In [4]:
def exact_k_wtavg(df,name,k,s):
    df=df[df['jaccard']>s]
    df=df[df[name]!=np.inf]
    df=df[df[name].notnull()].iloc[0:k]
    if len(df)<k:
        return np.nan
    weights=list(df['jaccard'])
    values=list(df[name])
    return np.average(values,weights=weights)

In [5]:
from __future__ import division
ks=range(1,20)
ss=[round(s/20,2) for s in range(1,20)]

<h1>EDA</h1>

In [69]:
print(str(toxref.count()) + ' total substances')

1076 total substances


In [7]:
def pod_record(document):
    pods=document['pods']
    for pod in pods:
        pod['dsstox_sid']=document['dsstox_sid']
    return pods

In [8]:
pods_df=pd.DataFrame([pod for document in toxref.find() for pod in pod_record(document)])
#pods_df=pods_df[pods_df['effect_profile_id']==2] #Turns out they all equal 2

In [9]:
#Need to convert to log molar
sids=list(pods_df['dsstox_sid'].unique())
weights={record['dsstox_sid']:record['mol_weight'] for record in dsstox.find({'dsstox_sid':{'$in':sids}})}
pods_df['mol_weight']=pods_df['dsstox_sid'].map(weights)
pods_df['pod_value_LM']=-np.log10(pods_df['pod_value']/pods_df['mol_weight']/1000)



In [83]:
loael_df=pods_df[pods_df['pod_type']=='loael']
loael_df=loael_df[loael_df['pod_unit']=='mg/kg/day']
loael_df.to_csv(DAT_DIR+'loael.csv',encoding='utf-8')

In [84]:
print(str(len(loael_df))+ ' LOAEL values')
print(str(loael_df['dsstox_sid'].nunique()) + ' unique chemicals')

27546 LOAEL values
1049 unique chemicals


In [85]:
loael_agg=loael_df.pivot_table(index='dsstox_sid',columns='endpoint_category',values='pod_value_LM',aggfunc='min')
print(str(len(loael_agg)) + ' chemicals found DSSTox matches to obtain their mol weights')
loael_agg.to_csv(DAT_DIR+'loaelagg.csv',encoding='utf-8')

1014 chemicals found DSSTox matches to obtain their mol weights


In [44]:
loael_sids=list(set(loael_agg.index.values))
categories=list(loael_agg.columns.values)

In [76]:
with pd.option_context('display.max_columns',None):
    loael_df[~loael_df['dsstox_sid'].isin(loael_sids)]

Unnamed: 0,casrn,chemical_id,chemical_id_type,direction,dose_level,dsstox_gsid,dsstox_sid,effect_comment,effect_desc,effect_desc_free,effect_id,effect_profile_group_id,effect_profile_id,endpoint_category,endpoint_id,endpoint_target,endpoint_type,group_description,group_id,group_name,life_stage,max_dose_level,mg_kg_day_value,no_quant_data_reported,pod_id,pod_tg_effect_id,pod_type,pod_unit,pod_value,preferred_name,qualifier,staggered_dosing,study_id,target_site,tg_effect_id,tg_id,timestamp,mol_weight,pod_value_LM
3951,8001-79-4,58436,DSSTox_GSID,-1,5,24742,DTXSID7024742,,body weight,,1768,31,2,systemic,52,body weight,in life observation,,6,systemic/in life observation,adult,5,15017.000,0,829057,257807,loael,mg/kg/day,15017.000,Castor oil,>,0,,,109618,21022,2015-01-30 13:19:22,,
3955,8001-79-4,58436,DSSTox_GSID,1,5,24742,DTXSID7024742,,alkaline phosphatase (alp/alk),,1971,29,2,systemic,109,alkaline phosphatase (alp/alk),clinical chemistry,,4,systemic/clinical chemistry,adult,5,5725.000,0,831965,268092,loael,mg/kg/day,5725.000,Castor oil,>,0,,,109609,21019,2015-01-30 13:19:22,,
3960,8001-79-4,58436,DSSTox_GSID,1,5,24742,DTXSID7024742,,absolute,,102,57,2,systemic,300,kidney,organ weight,,19,kidney,adult,5,16786.000,0,836177,277660,loael,mg/kg/day,16786.000,Castor oil,>,0,,,109619,21021,2015-01-30 13:19:22,,
3964,8001-79-4,58436,DSSTox_GSID,1,5,24742,DTXSID7024742,,absolute,,119,60,2,systemic,220,liver,organ weight,,20,liver,adult,5,5835.000,0,839141,285656,loael,mg/kg/day,5835.000,Castor oil,>,0,,,109616,21020,2015-01-30 13:19:22,,
3965,8001-79-4,58436,DSSTox_GSID,1,5,24742,DTXSID7024742,,relative to body weight,,1910,60,2,systemic,220,liver,organ weight,,20,liver,adult,5,5835.000,0,839141,285657,loael,mg/kg/day,5835.000,Castor oil,>,0,,,109617,21020,2015-01-30 13:19:22,,
3972,8001-79-4,58436,DSSTox_GSID,-1,5,24742,DTXSID7024742,,mean corpuscular hemoglobin concentration (mchc),,190,30,2,systemic,318,mean corpuscular hemoglobin concentration (mchc),hematology,,5,systemic/hematology,adult,5,5835.000,0,854829,320643,loael,mg/kg/day,5835.000,Castor oil,>,0,,,109614,21020,2015-01-30 13:19:22,,
3973,8001-79-4,58436,DSSTox_GSID,-1,5,24742,DTXSID7024742,,mean corpuscular (cell) volume (mcv),,238,30,2,systemic,217,mean corpuscular (cell) volume (mcv),hematology,,5,systemic/hematology,adult,5,5835.000,0,854829,320644,loael,mg/kg/day,5835.000,Castor oil,>,0,,,109612,21020,2015-01-30 13:19:22,,
3974,8001-79-4,58436,DSSTox_GSID,1,5,24742,DTXSID7024742,,platelet,,161,30,2,systemic,284,platelet,hematology,,5,systemic/hematology,adult,5,5835.000,0,854829,320645,loael,mg/kg/day,5835.000,Castor oil,>,0,,,109615,21020,2015-01-30 13:19:22,,
3975,8001-79-4,58436,DSSTox_GSID,-1,5,24742,DTXSID7024742,,mean corpuscular hemoglobin (mch),,1666,30,2,systemic,138,mean corpuscular hemoglobin (mch),hematology,,5,systemic/hematology,adult,5,5835.000,0,854829,320646,loael,mg/kg/day,5835.000,Castor oil,>,0,,,109613,21020,2015-01-30 13:19:22,,
11491,42615-29-2,57915,DSSTox_GSID,1,3,20041,DTXSID3020041,,dead fetuses,,2095,28,2,reproductive,211,dead fetuses,offspring survival early,,3,reproductive,adult-pregnancy,3,500.000,1,824557,242987,loael,mg/kg/day,500.000,"Alkylbenzenesulfonate, linear",'=',0,,,137408,28268,2017-11-16 14:01:32,,


<h1>Make Data</h1>

In [46]:
loael_neighbors_l=[]
for sid in loael_sids:
    sid_neighbors=searchCollByFP(sid,s0=.05,SID=loael_sids,DB=DB)
    if sid_neighbors:
        for neighbor in sid_neighbors:
            neighbor['target_sid']=sid
            neighbor['neighbor_sid']=neighbor.pop('dsstox_sid')
        loael_neighbors_l=loael_neighbors_l+sid_neighbors

In [47]:
loael_neighbors=pd.DataFrame(loael_neighbors_l)
loael_neighbors=loael_neighbors[loael_neighbors['target_sid']!=loael_neighbors['neighbor_sid']]
loael_neighbors=loael_neighbors.merge(loael_agg,left_on='neighbor_sid',right_index=True)
loael_neighbors=loael_neighbors.sort_values('jaccard',ascending=False)
loael_neighbors.to_csv(DAT_DIR+'loael_neighbors_mrgn.csv')
#loael_neighbors=pd.read_csv(DAT_DIR+'loael_neighbors_mrgn.csv')
loael_neighbors.head()

Unnamed: 0,casrn,jaccard,name,neighbor_sid,target_sid,cholinesterase,developmental,reproductive,systemic
78973,57837-19-1,1.0,Metalaxyl,DTXSID6024175,DTXSID8032671,,2.844067,,3.048187
92026,51218-45-2,1.0,Metolachlor,DTXSID4022448,DTXSID6032431,,,2.453012,2.7998
30784,15165-67-0,1.0,Dichlorprop-P,DTXSID0034851,DTXSID0020440,,3.371179,,2.982013
29786,28434-00-6,1.0,S-Bioallethrin,DTXSID2039336,DTXSID8035180,,3.781632,3.190567,2.706962
61556,93-65-2,1.0,Mecoprop,DTXSID9024194,DTXSID3032670,,,,3.65687


In [48]:
predictions={}
k=10
s=.05
for sid,group in loael_neighbors.groupby('target_sid'):
        predictions[sid]={category+'_p':wtavg(group,category,k,s) for category in categories}

In [49]:
loael_predictions.head()

Unnamed: 0,cholinesterase_p,developmental_p,reproductive_p,systemic_p,cholinesterase,developmental,reproductive,systemic
DTXSID4047672,3.889967,3.493957,3.282332,2.941796,,3.83621,,2.842984
DTXSID1020485,5.304208,3.041462,3.113375,2.607519,,,,3.46392
DTXSID3024235,4.360391,3.965492,3.873737,2.985669,,5.8968,4.255822,4.067497
DTXSID2044343,3.922568,4.112034,3.31904,2.991001,,4.026338,,2.964258
DTXSID7030066,5.82707,2.90158,3.068403,3.351817,,,,9.230972


In [50]:
loael_predictions=pd.DataFrame(predictions.values(),index=predictions.keys())
loael_predictions=loael_predictions.merge(loael_agg,right_index=True,left_index=True)
print(str(len(loael_predictions))+' chemicals successfully predicted')
print(str(len(loael_df[loael_df['dsstox_sid'].isin(loael_predictions.index)])) + ' associated LOAEL values')
loael_predictions.describe()
loael_predictions.to_csv(DAT_DIR+'loael_predictions_mrgn.csv')

1004 chemicals successfully predicted
26444 associated LOAEL values


Unnamed: 0,cholinesterase_p,developmental_p,reproductive_p,systemic_p,cholinesterase,developmental,reproductive,systemic
count,990.0,1002.0,1002.0,1003.0,84.0,472.0,431.0,996.0
mean,4.497893,3.387018,3.375043,2.919175,5.045443,3.520007,3.510849,3.031798
std,0.815091,0.515842,0.50977,0.534465,1.482639,0.99966,1.041681,0.997426
min,2.271297,1.734318,1.502154,1.554868,2.271297,0.618623,0.87565,0.047981
25%,3.943706,3.055121,3.053886,2.600838,3.937997,2.866933,2.85889,2.358246
50%,4.407326,3.357345,3.338559,2.857601,5.178954,3.456329,3.424777,2.87894
75%,5.009778,3.690819,3.653302,3.141854,6.411867,4.051531,4.001633,3.562107
max,6.989261,5.174729,5.054443,4.978817,7.566654,7.772923,8.490874,9.507802


<h1>Mean Aggregation</h1>

In [60]:
loael_agg_mean=loael_df.pivot_table(index='dsstox_sid',columns='endpoint_category',values='pod_value_LM',aggfunc='mean')
loael_agg_sd=loael_df.pivot_table(index='dsstox_sid',columns='endpoint_category',values='pod_value_LM',aggfunc='std')
loael_agg_sd=loael_agg_sd.loc[loael_agg_mean.index]
loael_agg_mean.to_csv(DAT_DIR+'loael_agg_mean_mrgn.csv')
loael_agg_sd.to_csv(DAT_DIR+'loael_agg_sd_mrgn.csv')

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  app.launch_new_instance()


In [61]:
loael_neighbors_mean=pd.DataFrame(loael_neighbors_l)
loael_neighbors_mean=loael_neighbors_mean[loael_neighbors_mean['target_sid']!=loael_neighbors_mean['neighbor_sid']]
loael_neighbors_mean=loael_neighbors_mean.merge(loael_agg_mean,left_on='neighbor_sid',right_index=True)
loael_neighbors_mean=loael_neighbors_mean.sort_values('jaccard',ascending=False)
loael_neighbors_mean.to_csv(DAT_DIR+'loael_neighbors_mean_mrgn.csv')

In [62]:
loael_neighbors_sd=pd.DataFrame(loael_neighbors_l)
loael_neighbors_sd=loael_neighbors_sd[loael_neighbors_sd['target_sid']!=loael_neighbors_sd['neighbor_sid']]
loael_neighbors_sd=loael_neighbors_sd.merge(loael_agg_sd,left_on='neighbor_sid',right_index=True)
loael_neighbors_sd=loael_neighbors_sd.loc[loael_neighbors_mean.index]
loael_neighbors_sd.to_csv(DAT_DIR+'loael_neighbors_sd_mrgn.csv')

In [63]:
predictions_mean={}
k=10
s=.05
for sid,group in loael_neighbors_mean.groupby('target_sid'):
    predictions_mean[sid]={category+'_p':wtavg(group,category,k,s) for category in categories}

In [64]:
predictions_sd={}
k=10
for sid,group in loael_neighbors_sd.groupby('target_sid'):
    predictions_sd[sid]={category:wtvar(group,category,k) for category in categories}

In [65]:
loael_predictions_mean=pd.DataFrame(predictions_mean.values(),index=predictions_mean.keys())
loael_predictions_mean=loael_predictions_mean.merge(loael_agg_mean,right_index=True,left_index=True)
len(loael_predictions_mean)
loael_predictions_mean.head()
loael_predictions_mean.to_csv(DAT_DIR+'loael_predictions_mean_mrgn.csv')

1004

Unnamed: 0,cholinesterase_p,developmental_p,reproductive_p,systemic_p,cholinesterase,developmental,reproductive,systemic
DTXSID4047672,3.889967,3.493957,3.282332,3.362421,,3.83621,,2.942306
DTXSID1020485,5.304208,3.041462,3.113375,2.914006,,,,4.13531
DTXSID3024235,4.360391,3.965492,3.873737,3.69586,,5.8968,4.255822,4.594689
DTXSID2044343,3.922568,4.112034,3.31904,3.573338,,4.026338,,3.681471
DTXSID7030066,5.82707,2.982922,3.068403,3.887838,,,,9.230972


In [66]:
loael_predictions_sd=pd.DataFrame(predictions_sd.values(),index=predictions_sd.keys())
loael_predictions_sd=loael_predictions_sd.merge(loael_agg_sd,right_index=True,left_index=True)
loael_predictions_sd.to_csv(DAT_DIR+'loael_predictions_sd_mrgn.csv')

<h1>Cluster Analysis</h1>

In [35]:
con=pymongo.MongoClient("mongodb://ghelman:ghelman@pb.epa.gov/genra_v3")
DB2 = con['genra_v3']
clusters_collection=DB2['clusters1']

In [36]:
clusters=list(clusters_collection.find({},{'_id':0,'chems':1,'cl_id':1}))

In [37]:
cid_list=[chem for cluster in [cluster['chems'] for cluster in clusters] for chem in cluster]

In [38]:
cid_to_sid={record['dsstox_cid']:record['dsstox_sid'] for record in dsstox.find({'dsstox_cid':{'$in':cid_list}})}

In [39]:
for cluster in clusters:
    cluster['chems']=[cid_to_sid[cid] for cid in cluster['chems'] if cid in cid_to_sid.keys()]

In [None]:
import pickle as pkl
with open(DAT_DIR+'clusters.pkl','w') as f:
    pkl.dump(clusters,f)

<h1>k,s grid search for LOAELS using mean aggregation</h1>

In [None]:
predictions=[]
for k in ks:
    for s in ss: 
        for sid,group in loael_neighbors_mean.groupby('target_sid'):
                prediction={category+'_p':wtavg(group,category,k,s) for category in categories}
                prediction['dsstox_sid']=sid
                prediction['k']=k
                prediction['s']=s
                predictions.append(prediction)

In [None]:
prediction_df=pd.DataFrame(predictions)
prediction_df=prediction_df.merge(loael_agg,left_on='dsstox_sid',right_index=True)
prediction_df.to_csv(DAT_DIR+'toxref_ks_gridsearch_mrgn.csv')

In [None]:
exact_k_predictions=[]
for k in ks:
    for s in ss: 
        for sid,group in loael_neighbors_mean.groupby('target_sid'):
                prediction={category+'_p':exact_k_wtavg(group,category,k,s) for category in categories}
                prediction['dsstox_sid']=sid
                prediction['k']=k
                prediction['s']=s
                exact_k_predictions.append(prediction)

In [None]:
exact_k_df=pd.DataFrame(exact_k_predictions)
exact_k_df=exact_k_df.merge(loael_agg,left_on='dsstox_sid',right_index=True)
exact_k_df.to_csv(DAT_DIR+'toxref_exact_ks_gridsearch_mrgn.csv')

<h1>k,s grid search over clusters</h1>

In [None]:
#Within cluster predictions
cluster_predictions=[]
for k in ks:
    for s in ss: 
        for cluster in clusters:
            chems=cluster['chems']
            cluster_df=loael_neighbors_mean[(loael_neighbors_mean['target_sid'].isin(chems)) & loael_neighbors_mean['neighbor_sid'].isin(chems)]
            for sid,group in cluster_df.groupby('target_sid'):
                prediction={category+'_p':wtavg(group,category,k,s) for category in categories}
                prediction['dsstox_sid']=sid
                prediction['k']=k
                prediction['s']=s
                prediction['cluster']=cluster['cl_id']
                cluster_predictions.append(prediction)

In [None]:
cluster_prediction_df=pd.DataFrame(cluster_predictions)
cluster_prediction_df=cluster_prediction_df.merge(loael_agg_mean,left_on='dsstox_sid',right_index=True)
cluster_prediction_df.to_csv(DAT_DIR+'cluster_ks_gridsearch_mrgn.csv')

In [None]:
#Within cluster predictions
exactk_cluster_predictions=[]
for k in ks:
    for s in ss:
        for cluster in clusters:
            chems=cluster['chems']
            cluster_df=loael_neighbors_mean[(loael_neighbors_mean['target_sid'].isin(chems)) & loael_neighbors_mean['neighbor_sid'].isin(chems)]
            for sid,group in cluster_df.groupby('target_sid'):
                prediction={category+'_p':exact_k_wtavg(group,category,k,s) for category in categories}
                prediction['dsstox_sid']=sid
                prediction['k']=k
                prediction['s']=s
                prediction['cluster']=cluster['cl_id']
                exactk_cluster_predictions.append(prediction)

In [None]:
exactk_cluster_prediction_df=pd.DataFrame(exactk_cluster_predictions)
exactk_cluster_prediction_df=exactk_cluster_prediction_df.merge(loael_agg_mean,left_on='dsstox_sid',right_index=True)
exactk_cluster_prediction_df.to_csv(DAT_DIR+'exactk_cluster_ks_gridsearch_mrgn.csv')

In [None]:
cluster_prediction_df=pd.read_csv(DAT_DIR+'cluster_ks_gridsearch.csv')
exactk_cluster_prediction_df=pd.read_csv(DAT_DIR+'exactk_cluster_ks_gridsearch.csv')

In [None]:
cluster_grid_r2s={}
cluster_grid_ns={}
for cluster in clusters:
    chems=cluster['chems']
    clid=int(cluster['cl_id'])
    cluster_grid_r2s[clid]=np.empty([len(ks),len(ss)])
    cluster_grid_ns[clid]=np.empty([len(ks),len(ss)])
    for (k,s),group in cluster_prediction_df.groupby(['k','s']):
        k_index=ks.index(k)
        s_index=ss.index(round(s,2))
        df=cluster_prediction_df[(cluster_prediction_df['dsstox_sid'].isin(chems))\
                                 & (cluster_prediction_df['s']==s) & (cluster_prediction_df['k']==k)]
        df=df[['systemic','systemic_p']]
        df=plot_worthy(df)
        if df.empty:
            cluster_grid_r2s[clid][k_index,s_index]=np.nan
            cluster_grid_ns[clid][k_index,s_index]=0   
            continue
        cluster_grid_r2s[clid][k_index,s_index]=r2_score(df['systemic'],df['systemic_p'])
        cluster_grid_ns[clid][k_index,s_index]=len(df)

In [None]:
with pd.option_context('display.float_format',lambda x: '%.3f' % x):
    for clid,grid in cluster_grid_r2s.iteritems():
        print(clid)
        print(str(cluster_grid_ns[clid].max())+ ' predictions')
        pd.DataFrame(grid)

In [None]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
i=1
fig=plt.figure(figsize=(12,300))
for clid,cluster_grid_r2 in cluster_grid_r2s.iteritems():
    fig.suptitle('k,s grid search for up to k neighbors',fontsize=20)
    ax=fig.add_subplot(50,2,i,projection='3d')
    #ax.text2D(.5,.95,'Global',transform=ax.transAxes,fontsize=20)
    X,Y=np.meshgrid(ss,ks)
    i+=1
    ax.plot_surface(X,Y,cluster_grid_r2)
    ax.set_ylabel('Maximum number of neighbors (k)',fontsize=16)
    ax.set_xlabel('Similarity threshold (s)',fontsize=16)
    ax.set_zlabel('R2')
    ax.set_title('Cluster '+ clid )
plt.subplots_adjust()
plt.savefig(FIG_DIR+'cluster_ksgrid_uptok')
plt.show()

In [None]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

for clid,cluster_grid_r2 in cluster_grid_r2s.iteritems():
    fig=plt.figure(figsize=(8,6))
    plt.title('k,s grid search for up to k neighbors for cluster ' + str(clid),fontsize=20)
    #ax.text2D(.5,.95,'Global',transform=ax.transAxes,fontsize=20)
    X,Y=np.meshgrid(ss,ks)
    plt.contour(X,Y,cluster_grid_r2)
    plt.ylabel('Maximum number of neighbors (k)',fontsize=16)
    plt.xlabel('Similarity threshold (s)',fontsize=16)
    #ax.set_title('Cluster '+ clid )
    plt.show()
#plt.savefig(FIG_DIR+'cluster_ksgrid_uptok')
#plt.show()

In [None]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
i=1
fig=plt.figure(figsize=(12,300))
for clid,cluster_grid_r2 in exactk_cluster_grid_r2s.iteritems():
    fig.suptitle('k,s grid search for exactly k neighbors',fontsize=20)
    ax=fig.add_subplot(2,50,i,projection='3d')
    #ax.text2D(.5,.95,'Global',transform=ax.transAxes,fontsize=20)
    X,Y=np.meshgrid(ss,ks)
    i+=1
    ax.plot_surface(X,Y,cluster_grid_r2)
    ax.set_ylabel('Maximum number of neighbors (k)',fontsize=16)
    ax.set_xlabel('Similarity threshold (s)',fontsize=16)
    ax.set_zlabel('R2')
    ax.set_title('Cluster '+ clid )
plt.subplots.adjust()
plt.savefig(FIG_DIR+'cluster_ksgrid_exactk')
plt.show()

<h1>EPA Categories</h1>

In [None]:
import operator as op
op_dict={
    'GreaterThan': op.gt,
    'GreaterThanOrEqualTo': op.ge,
    'LessThan': op.lt,
    'LessThanOrEqualTo': op.le
}
prop_dict={
    'log Kow':'logp',
    'Molecular Weight':'mol_weight',
    'Molecular weight':'mol_weight',
    'Water Solubility': 'ws'
}

In [None]:
def convert_ppb(x): #OPERA results stored as mol/L
    ws=x['ws']
    mol_weight=x['mol_weight']
    return ws*mol_weight*10**6

In [None]:
import dill
with open(DAT_DIR+'../category_tests.dill') as f:
    category_tests=dill.load(f)

In [19]:
loael_smiles=dsstox.find({'dsstox_sid':{'$in':loael_sids}},{'_id':0,'dsstox_sid':1,'smiles':1})
smiles_dict={record['dsstox_sid']:record['smiles'] for record in loael_smiles}
loael_logp=physprop.find({'dsstox_sid':{'$in':loael_sids}},{'_id':0,'dsstox_sid':1,'predicted_props.OPERA_LogP':1})
logp_dict={record['dsstox_sid']:record.get('predicted_props',{})['OPERA_LogP'][0] for record in loael_logp \
           if 'OPERA_LogP' in record.get('predicted_props',{}) and record.get('dsstox_sid',None)}
loael_ws=physprop.find({'dsstox_sid':{'$in':loael_sids}},{'_id':0,'dsstox_sid':1,'predicted_props.OPERA_WS':1})
ws_dict={record['dsstox_sid']:record.get('predicted_props',{})['OPERA_WS'][0] for record in loael_ws \
           if 'OPERA_WS' in record.get('predicted_props',{}) and record.get('dsstox_sid',None)}
loael_weight=dsstox.find({'dsstox_sid':{'$in':loael_sids}})
weight_dict={record['dsstox_sid']:record['mol_weight'] for record in loael_weight}

In [20]:
from rdkit import Chem
sids=set(logp_dict.keys())&set(ws_dict.keys())&set(weight_dict.keys())
records=[]
for sid in sids:
    records.append({'dsstox_sid':sid,'smiles':smiles_dict[sid],'logp':logp_dict[sid],'ws':ws_dict[sid],'mol_weight':weight_dict[sid],'mol':Chem.MolFromSmiles(smiles_dict[sid])})
records=[record for record in records if record['mol']]

In [21]:
import math
for record in records:
    if not record['mol']:
        continue
    epa_categories=sorted([category for category,test in category_tests.iteritems() if test(record)])
    if 'Neutral Organics' in epa_categories and len(epa_categories)>1:
        epa_categories.remove('Neutral Organics')
    record['categories']=tuple(epa_categories)

In [22]:
from collections import Counter
count=Counter(record['categories'] for record in records)
count

Counter({(): 569,
         ('Acrylamides',): 3,
         ('Acrylates/Methacrylates (Acute toxicity)',
          'Esters (Acute toxicity)'): 1,
         ('Aldehydes (Acute toxicity)',): 6,
         ('Alkoxysilanes', 'Cationic (quaternary ammonium) surfactants'): 1,
         ('Anhydrides, Carboxylic acid',): 2,
         ('Anilines (Acute toxicity)',): 26,
         ('Anilines (Acute toxicity)', 'Dianilines'): 2,
         ('Anilines (Acute toxicity)', 'Phenols (Acute toxicity)'): 2,
         ('Anilines (Acute toxicity)',
          'Polynitroaromatics (Acute toxicity)'): 1,
         ('Anionic Surfactants',): 1,
         ('Azides (Acute toxicity)',): 1,
         ('Benzotriazoles (Acute toxicity)',): 1,
         ('Cationic (quaternary ammonium) surfactants',): 9,
         ('Diisocyanates',): 1,
         ('Dithiocarbamates (Acute toxicity)',): 3,
         ('Epoxides',): 8,
         ('Esters (Acute toxicity)',): 82,
         ('Esters (Acute toxicity)', 'Imides (Acute toxicity)'): 2,
         ('

In [37]:
from collections import defaultdict
search_spaces=defaultdict(list)
for record in records:
    search_spaces[record['categories']].append(record['dsstox_sid'])
search_spaces={cat:l for cat,l in search_spaces.iteritems() if len(l)>1}
import pickle
with open(DAT_DIR+'search_spaces.pkl','w') as f:
    pickle.dump(search_spaces,f)

In [24]:
category_neighbors_l=[]
for record in records:
    sid=record['dsstox_sid']
    search_space=search_spaces[record['categories']][:]
    if len(search_space)==1:
        continue
    search_space.remove(sid)
    sid_neighbors=searchCollByFP(sid,s0=.05,SID=search_space,DB=DB)
    if sid_neighbors:
        for neighbor in sid_neighbors:
            neighbor['target_sid']=sid
            neighbor['neighbor_sid']=neighbor.pop('dsstox_sid')
        category_neighbors_l=category_neighbors_l+sid_neighbors

In [31]:
category_neighbors=pd.DataFrame(category_neighbors_l)
category_neighbors=category_neighbors[category_neighbors['target_sid']!=category_neighbors['neighbor_sid']]
category_neighbors=category_neighbors.merge(loael_agg_mean,left_on='neighbor_sid',right_index=True)
category_neighbors=category_neighbors.sort_values('jaccard',ascending=False)
category_neighbors.to_csv(DAT_DIR+'category_neighbors_mrgn.csv',index=False)
category_neighbors.head()

Unnamed: 0,casrn,jaccard,name,neighbor_sid,target_sid,cholinesterase,developmental,reproductive,systemic
52165,51630-58-1,1.0,Fenvalerate,DTXSID3020621,DTXSID4032667,,,,4.331634
48004,138-86-3,1.0,Limonene,DTXSID2029612,DTXSID1020778,,2.435328,,2.435328
38245,69806-50-4,1.0,Fluazifop-butyl,DTXSID3034612,DTXSID0034855,,4.583615,4.340577,4.300948
45772,60-57-1,1.0,Dieldrin,DTXSID9020453,DTXSID6020561,,,5.70576,5.581639
21584,28434-00-6,1.0,S-Bioallethrin,DTXSID2039336,DTXSID8035180,,3.781632,3.190567,3.518175


In [34]:
predictions={}
k=10
s=.05
for sid,group in category_neighbors.groupby('target_sid'):
    predictions[sid]={category+'_p':wtavg(group,category,k,s) for category in categories}

In [35]:
category_predictions=pd.DataFrame(predictions.values(),index=predictions.keys())
category_predictions=category_predictions.merge(loael_agg_mean,right_index=True,left_index=True)
category_predictions.to_csv(DAT_DIR+'category_predictions_mrgn.csv')
category_predictions.head()

Unnamed: 0,cholinesterase_p,developmental_p,reproductive_p,systemic_p,cholinesterase,developmental,reproductive,systemic
DTXSID0020076,5.266096,3.412768,3.369072,3.325482,,3.322643,2.322643,3.322643
DTXSID0020151,5.004252,2.896481,3.354341,2.573533,,,,2.146777
DTXSID0020232,5.158729,3.556275,3.260099,3.224968,,3.191326,3.191326,3.191326
DTXSID0020311,3.946856,3.296379,3.151303,3.521586,,,,2.662276
DTXSID0020315,5.550991,3.466629,2.720297,3.004631,,,5.136641,5.093944
