In [None]:
from __future__ import print_function

import os
import sys
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymongo
from sklearn.metrics import r2_score

# Repository root: two directory levels above the notebook's working directory.
# os.path is used instead of splitting on '/' so the lookup does not depend on
# the platform's path separator. The trailing separator is kept so that TOP can
# still be concatenated directly with a relative path.
TOP = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), '')
LIB = os.path.join(TOP, 'lib')
# Make the project library importable; the membership test keeps re-runs of
# this cell from inserting the same entry again.
if LIB not in sys.path:
    sys.path.insert(0, LIB)

# Data and figure directories (both end with a path separator).
DAT_DIR = os.path.join(TOP, 'data', '')
FIG_DIR = os.path.join(TOP, 'figs', '')

from rax.genrapred import *

In [None]:
# The connection string (including any username/password) is read from the
# environment so that credentials are never stored in the notebook.
# Expected form: mongodb://<user>:<password>@<host>/genra_dev_v4
MONGO_URI=os.environ.get('GENRA_MONGO_URI')
if not MONGO_URI:
    raise RuntimeError('Set the GENRA_MONGO_URI environment variable to the MongoDB connection string')
mongocon=pymongo.MongoClient(MONGO_URI)
DB=mongocon['genra_dev_v4']
# 'compound' is queried below by dsstox_sid for mol_weight;
# 'toxrefdb2' documents carry the 'pods' / 'bmds' lists used in this notebook.
dsstox=DB['compound']
toxref=DB['toxrefdb2']

<h1>EDA</h1>

In [None]:
# Number of documents in the toxref collection.
print('{} total substances'.format(toxref.count()))

In [None]:
# Union of the top-level keys over all toxref documents.
# Starting from an empty set means an empty collection yields set() instead of
# the TypeError raised by set.union() with no arguments, and the generator
# avoids materialising one set per document up front.
set().union(*(document.keys() for document in toxref.find()))

In [None]:
def pod_record(document):
    """Return the POD entries of one toxref document, each tagged with its substance id.

    Parameters
    ----------
    document : dict
        A toxref document. Its 'pods' value is read as a list of dicts and its
        'dsstox_sid' value is copied into every returned entry.

    Returns
    -------
    list of dict
        One new dict per POD with an added 'dsstox_sid' key. The list is empty
        when the document has no 'pods' key or the value is empty/None.

    The input document and its nested POD dicts are left unmodified.
    """
    pods=document.get('pods') or []
    # dict(pod, ...) builds a copy, so the caller's document is not mutated.
    return [dict(pod,dsstox_sid=document['dsstox_sid']) for pod in pods]

In [None]:
# Flatten the toxref collection into one row per POD entry.
pods_df=pd.DataFrame([
    pod
    for document in toxref.find()
    for pod in pod_record(document)
])

In [None]:
# Only the last expression of a cell is rendered, so the shape has to be
# printed explicitly to be visible alongside the preview.
print(pods_df.shape)
pods_df.head()

In [None]:
# Keep only the POD rows in which every column is populated.
pods_df=pods_df.loc[pods_df.notnull().all(axis=1)]
print('{} not null POD values'.format(len(pods_df)))

In [None]:
endpoint_cols=['pod_type','endpoint_category','endpoint_target','endpoint_type']
pods_df.set_index(endpoint_cols).head()

In [None]:
pods_df['pod_type'].value_counts()

In [None]:
pods_df['pod_value'].describe()

In [None]:
# Show the unit distribution before filtering. A bare expression that is not
# the last line of a cell is discarded, hence the explicit print.
print(pods_df['pod_unit'].value_counts())
# Only PODs reported in mg/kg/day are kept; other units are dropped rather than converted.
# .copy() gives an independent frame so that later column assignments do not
# trigger pandas' SettingWithCopyWarning.
pods_df=pods_df[pods_df['pod_unit']=='mg/kg/day'].copy()
print(str(len(pods_df))+' have units mg/kg/day')

In [None]:
# Print the qualifier distribution: it is followed by another statement, so as
# a bare expression it would never be rendered.
print(pods_df['qualifier'].value_counts())
# TODO: the handling of qualified POD values is still undecided.
print("Ask Imran what to do about this.")

In [None]:
import matplotlib.pyplot as plt
# Raw POD values, one box per POD type.
pods_df.boxplot(
    column='pod_value',
    by='pod_type',
    figsize=(8,6),
)
plt.show()

In [None]:
# Convert POD values to a negative log10 molar scale:
#   mg/kg/day / 1000 -> g/kg/day; / mol_weight -> mol/kg/day; then -log10.
# NOTE(review): this assumes mol_weight is in g/mol - confirm against the compound collection.
sids=list(pods_df['dsstox_sid'].unique())
# Fetch only the two fields that are used, and tolerate compound records
# without a mol_weight (they map to NaN instead of raising KeyError).
weights={record['dsstox_sid']:record.get('mol_weight')
         for record in dsstox.find({'dsstox_sid':{'$in':sids}},{'dsstox_sid':1,'mol_weight':1})}
# assign() returns a new frame, so nothing is written into a filtered slice.
pods_df=pods_df.assign(mol_weight=pods_df['dsstox_sid'].map(weights))
# NOTE(review): a pod_value of 0 produces +inf here - confirm whether such rows exist.
pods_df=pods_df.assign(pod_value_LM=-np.log10(pods_df['pod_value']/pods_df['mol_weight']/1000))

In [None]:
pods_df.head()

In [None]:
pods_df['pod_unit'].unique()

In [None]:
pods_df.boxplot('pod_value_LM',by='pod_type',figsize=(8,6))
plt.show()

In [None]:
pd.pivot_table(pods_df,index='endpoint_category',columns='pod_type',values='pod_value',aggfunc=len,fill_value=0)

In [None]:
pd.pivot_table(pods_df,index='endpoint_type',columns='pod_type',values='pod_value',aggfunc=len,fill_value=0)

In [None]:
# assay_counts=pods_df.groupby(endpoint_cols).size().reset_index().rename(columns={0:'count'}).sort_values('count',ascending=False)
# assay_counts.head()
# len(assay_counts)
# len(assay_counts[assay_counts['count']>30])
pd.DataFrame(pd.pivot_table(pods_df,index=['pod_type','endpoint_category','endpoint_target','endpoint_type'],values='pod_value',aggfunc=len,fill_value=0))\
.rename(columns={'pod_value':'pod_count'})

<h1>Analysis</h1>

In [None]:
loael_df=pods_df[pods_df['pod_type']=='loael']
lel_df=pods_df[pods_df['pod_type']=='lel']

In [None]:
loael_df.head()

In [None]:
loael_agg=loael_df.pivot_table(index='dsstox_sid',columns='endpoint_category',values='pod_value_LM',aggfunc='min')
lel_agg=lel_df.pivot_table(index='dsstox_sid',columns='endpoint_category',values='pod_value_LM',aggfunc='min')

In [None]:
loael_agg.head()

In [None]:
print(str(len(loael_agg))+' total substances')
loael_agg.describe()

In [None]:
loael_sids=list(set(loael_agg.index.values))
lel_sids=list(set(lel_agg.index.values))

In [None]:
# For every LOAEL substance, collect the records returned by searchCollByFP
# when the search is restricted to the LOAEL substances.
# NOTE(review): searchCollByFP comes from rax.genrapred; presumably a fingerprint
# similarity search with s0 as the minimum similarity - confirm.
loael_neighbors_l=[]
for sid in loael_sids:
    sid_neighbors=searchCollByFP(sid,s0=.05,SID=loael_sids,DB=DB)
    if not sid_neighbors:
        continue
    # Tag each hit with the query substance and rename its own id field.
    for neighbor in sid_neighbors:
        neighbor.update(target_sid=sid,neighbor_sid=neighbor.pop('dsstox_sid'))
    loael_neighbors_l.extend(sid_neighbors)

In [None]:
# For every LEL substance, collect the records returned by searchCollByFP.
# The candidate pool is the LEL substances themselves (lel_sids); the original
# cell passed loael_sids here, a copy-paste slip from the LOAEL cell that
# restricted LEL neighbours to substances with LOAEL data.
# NOTE(review): searchCollByFP comes from rax.genrapred; presumably a fingerprint
# similarity search with s0 as the minimum similarity - confirm.
lel_neighbors_l=[]
for sid in lel_sids:
    sid_neighbors=searchCollByFP(sid,s0=.05,SID=lel_sids,DB=DB)
    if sid_neighbors:
        # Tag each hit with the query substance and rename its own id field.
        for neighbor in sid_neighbors:
            neighbor['target_sid']=sid
            neighbor['neighbor_sid']=neighbor.pop('dsstox_sid')
        # extend() appends in place instead of rebuilding the list on every iteration.
        lel_neighbors_l.extend(sid_neighbors)

In [None]:
loael_neighbors=pd.DataFrame(loael_neighbors_l)
loael_neighbors=loael_neighbors[loael_neighbors['target_sid']!=loael_neighbors['neighbor_sid']]
loael_neighbors=loael_neighbors.merge(loael_agg,left_on='neighbor_sid',right_index=True)
loael_neighbors=loael_neighbors.sort_values('jaccard',ascending=False)
loael_neighbors.head()

In [None]:
lel_neighbors=pd.DataFrame(lel_neighbors_l)
lel_neighbors=lel_neighbors[lel_neighbors['target_sid']!=lel_neighbors['neighbor_sid']]
lel_neighbors=lel_neighbors.merge(lel_agg,left_on='neighbor_sid',right_index=True)
lel_neighbors=lel_neighbors.sort_values('jaccard',ascending=False)
lel_neighbors.head()

In [None]:
categories=list(loael_agg.columns.values)

In [None]:
def wtavg(df,name,k):
    """Jaccard-weighted average of column `name` over the k most similar rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'jaccard' column (used as weight) and the column `name`.
    name : str
        Column whose values are averaged; rows where it is null are ignored.
    k : int
        Maximum number of rows (highest 'jaccard' first) that enter the average.

    Returns
    -------
    float
        The weighted average, or np.nan when no row has a value for `name`
        or when the selected weights sum to zero.
    """
    usable=df[df[name].notnull()]
    # Rank by similarity here instead of relying on the caller's row order.
    # A stable sort keeps already-sorted input (and its tie order) unchanged.
    nearest=usable.sort_values('jaccard',ascending=False,kind='mergesort').iloc[0:k]
    if nearest.empty:
        return np.nan
    weights=nearest['jaccard'].astype(float).values
    # np.average raises ZeroDivisionError when the weights sum to zero;
    # treat that case like "no usable neighbour".
    if weights.sum()==0:
        return np.nan
    return np.average(nearest[name].astype(float).values,weights=weights)

In [None]:
predictions={}
k=10
for sid,group in loael_neighbors.groupby('target_sid'):
        predictions[sid]={category+'_p':wtavg(group,category,k) for category in categories}

In [None]:
loael_predictions=pd.DataFrame(predictions.values(),index=predictions.keys())
loael_predictions=loael_predictions.merge(loael_agg,right_index=True,left_index=True)
len(loael_predictions)
loael_predictions.describe()

In [None]:
# Observed vs predicted LOAEL, one panel per endpoint category on a 2x2 grid.
f=plt.figure(figsize=(12,12))
for i,category in enumerate(categories,start=1):
    plt.subplot(2,2,i)
    predicted_col=category+'_p'
    # Only substances with both an observed and a predicted value are plotted.
    paired=loael_predictions[[category,predicted_col]].dropna()
    plt.scatter(paired[category],paired[predicted_col])
    plt.title(category+ ' LOAEL Predictions')
    plt.xlabel('True')
    plt.ylabel('Predicted')
    r2_label='R2='+str(round(r2_score(paired[category],paired[predicted_col]),2))
    plt.annotate(r2_label,xy=(.8,-.15),xycoords='axes fraction')
plt.subplots_adjust(wspace=.5,hspace=.4)
plt.show()

In [None]:
predictions1={}
k=10
for sid,group in lel_neighbors.groupby('target_sid'):
        predictions1[sid]={category+'_p':wtavg(group,category,k) for category in categories}

In [None]:
lel_predictions=pd.DataFrame(predictions1.values(),index=predictions1.keys())
lel_predictions=lel_predictions.merge(lel_agg,right_index=True,left_index=True)
lel_predictions.head()

In [None]:
# Observed vs predicted LEL, one panel per endpoint category on a 2x2 grid.
i=1
f=plt.figure(figsize=(12,12))
for category in categories:
    plt.subplot(2,2,i)
    i+=1
    df=lel_predictions[[category,category+'_p']]
    # Only substances with both an observed and a predicted value are plotted.
    df=df[df.notnull().all(axis='columns')]
    plt.scatter(df[category],df[category+'_p'])
    # These panels show LEL data; the title previously said "LOAEL" (copied from the LOAEL cell).
    plt.title(category+ ' LEL Predictions')
    plt.xlabel('True')
    plt.ylabel('Predicted')
    plt.annotate('R2='+str(round(r2_score(df[category],df[category+'_p']),2)),xy=(.8,-.15),xycoords='axes fraction')
plt.subplots_adjust(wspace=.5,hspace=.4)
plt.show()

<h1>BMDs</h1>

In [None]:
def bmd_record(document):
    """Return the BMD entries of one toxref document, each tagged with its substance id.

    Parameters
    ----------
    document : dict
        A toxref document. Its 'bmds' value is read as a list of dicts and its
        'dsstox_sid' value is copied into every returned entry.

    Returns
    -------
    list of dict
        One new dict per BMD with an added 'dsstox_sid' key. The list is empty
        when the document has no 'bmds' key or the value is empty/None.

    The input document and its nested BMD dicts are left unmodified.
    """
    bmds=document.get('bmds') or []
    # dict(bmd, ...) builds a copy, so the caller's document is not mutated.
    return [dict(bmd,dsstox_sid=document['dsstox_sid']) for bmd in bmds]

In [None]:
bmd_df=pd.DataFrame([bmd for document in toxref.find({'bmds':{'$exists':True}}) for bmd in bmd_record(document)])

In [None]:
bmd_df=bmd_df[bmd_df['recommended']==True]
print(str(len(bmd_df)) +' BMD values')
print(str(len(bmd_df['dsstox_sid'].unique())) + ' unique substances')

In [None]:
bmd_df.describe()

In [None]:
# Show all columns of the preview. An expression nested inside a `with` block
# is never rendered by the notebook, so the frame is printed while the option
# is active.
with pd.option_context('display.max_columns',None):
    print(bmd_df.head())

In [None]:
# AIC arrives with non-numeric error markers (e.g. '1.#QNAN'); errors='coerce'
# turns anything unparseable into NaN instead of raising.
# assign() returns a new frame, so nothing is written into the filtered slice.
bmd_df=bmd_df.assign(AIC=pd.to_numeric(bmd_df['AIC'],errors='coerce'))
# TODO: convert other columns to numeric as needed.
# TODO(review): a previously drafted filter treated -999 in AIC/BMD/BMDU/BMDL as
# a failure code and dropped null AIC rows; decide whether to apply it.

In [None]:
bmd_df['bmr_type'].value_counts()

In [None]:
bmd_df['endpoint_category'].value_counts()

In [None]:
bmd_df['model_name'].value_counts()

In [None]:
# Look up the molecular weight of every substance that has a BMD.
sids=list(bmd_df['dsstox_sid'].unique())
# Fetch only the two fields that are used, and tolerate compound records
# without a mol_weight (they map to NaN instead of raising KeyError).
weights={record['dsstox_sid']:record.get('mol_weight')
         for record in dsstox.find({'dsstox_sid':{'$in':sids}},{'dsstox_sid':1,'mol_weight':1})}
# assign() returns a new frame, so nothing is written into a filtered slice.
bmd_df=bmd_df.assign(mol_weight=bmd_df['dsstox_sid'].map(weights))

In [None]:
import matplotlib.pyplot as plt
# Raw BMD values per BMR type. This plot is made *before* the log-molar
# conversion, so it must not carry the log-molar axis label.
bmd_df.boxplot('BMD',by='bmr_type')
plt.title('')
plt.ylabel('BMD (original units)')
plt.show()
# Convert to the same negative log10 molar scale used for the PODs:
# BMD / 1000 / mol_weight, then -log10. The mol_weight column was already
# mapped in the preceding cell, so it is not mapped a second time here.
# assign() returns a new frame instead of writing into a filtered slice.
bmd_df=bmd_df.assign(BMD_LM=-np.log10(bmd_df['BMD']/1000/bmd_df['mol_weight']))
bmd_df.boxplot('BMD_LM',by='bmr_type')
plt.title('')
plt.ylabel('BMD (log molar)')
plt.show()

In [None]:
# Order rows by ascending AIC; the 'first' aggregation in the following pivot
# therefore picks the lowest-AIC entry of each group.
bmd_df=bmd_df.sort_values('AIC')
# An expression nested inside a `with` block is never rendered by the
# notebook, so the preview is printed while the option is active.
with pd.option_context('display.max_columns',None):
    print(bmd_df.head())

In [None]:
bmd_agg=bmd_df.pivot_table(index=['dsstox_sid','bmr_type'],columns='endpoint_category',values='BMD_LM',aggfunc='first') #May want to average in the case of multiple models with same AIC

In [None]:
bmd_agg.head()

In [None]:
# For every substance with a BMD, collect the records returned by
# searchCollByFP when the search is restricted to the BMD substances.
# NOTE(review): searchCollByFP comes from rax.genrapred; presumably a fingerprint
# similarity search with s0 as the minimum similarity - confirm.
bmd_sids=list(bmd_agg.index.unique(level='dsstox_sid'))
bmd_neighbors_l=[]
for sid in bmd_sids:
    sid_neighbors=searchCollByFP(sid,s0=.05,SID=bmd_sids,DB=DB)
    if not sid_neighbors:
        continue
    # Tag each hit with the query substance and prefix its own identifying fields.
    for neighbor in sid_neighbors:
        neighbor['target_sid']=sid
        for field in ('dsstox_sid','name','casrn'):
            renamed='neighbor_sid' if field=='dsstox_sid' else 'neighbor_'+field
            neighbor[renamed]=neighbor.pop(field)
    bmd_neighbors_l.extend(sid_neighbors)

In [None]:
bmd_neighbors=pd.DataFrame(bmd_neighbors_l)
bmd_neighbors=bmd_neighbors[bmd_neighbors['target_sid']!=bmd_neighbors['neighbor_sid']]
bmd_neighbors=bmd_neighbors.merge(bmd_agg.reset_index(1),left_on='neighbor_sid',right_index=True)
bmd_neighbors=bmd_neighbors.sort_values('jaccard',ascending=False)
bmd_neighbors.head()

In [None]:
categories=list(bmd_agg.columns.values)

In [None]:
def wtavg(df,name,k):
    """Jaccard-weighted average of column `name` over the k most similar rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'jaccard' column (used as weight) and the column `name`.
    name : str
        Column whose values are averaged; rows where it is null are ignored.
    k : int
        Maximum number of rows (highest 'jaccard' first) that enter the average.

    Returns
    -------
    float
        The weighted average, or np.nan when no row has a value for `name`
        or when the selected weights sum to zero.
    """
    usable=df[df[name].notnull()]
    # Rank by similarity here instead of relying on the caller's row order.
    # A stable sort keeps already-sorted input (and its tie order) unchanged.
    nearest=usable.sort_values('jaccard',ascending=False,kind='mergesort').iloc[0:k]
    if nearest.empty:
        return np.nan
    weights=nearest['jaccard'].astype(float).values
    # np.average raises ZeroDivisionError when the weights sum to zero;
    # treat that case like "no usable neighbour".
    if weights.sum()==0:
        return np.nan
    return np.average(nearest[name].astype(float).values,weights=weights)

In [None]:
# Peek at one (target_sid, bmr_type) group. Bare expressions inside a for-loop
# are never rendered, so take a single group explicitly and show it.
# The default keeps the cell from failing when there are no groups at all.
index,group=next(iter(bmd_neighbors.groupby(['target_sid','bmr_type'])),(None,bmd_neighbors.iloc[0:0]))
print(index)
group.head()

In [None]:
predictions={}
k=10
for index,group in bmd_neighbors.groupby(['target_sid','bmr_type']):
    predictions[index]={category+'_p':wtavg(group,category,k) for category in categories}

In [None]:
bmd_predictions=pd.DataFrame(predictions.values(),index=pd.MultiIndex.from_tuples(predictions.keys(),names=['dsstox_sid','bmr_type']))
bmd_predictions=bmd_predictions.merge(bmd_agg,left_index=True,right_index=True)

In [None]:
from sklearn.metrics import r2_score
# Observed vs predicted BMD: one panel per (BMR type, endpoint category) pair
# that has data, laid out on a 4x2 grid.
f=plt.figure(figsize=(12,16))
i=1
for bmr_type,group in bmd_predictions.reset_index(level=1).groupby('bmr_type'):
    for category in categories:
        predicted_col=category+'_p'
        # Only substances with both an observed and a predicted value are plotted.
        paired=group[[category,predicted_col]].dropna()
        if paired.empty:
            continue
        plt.subplot(4,2,i)
        i+=1
        plt.scatter(paired[category],paired[predicted_col])
        plt.title(category+ ' ' + bmr_type+ ' predictions')
        plt.xlabel('True')
        plt.ylabel('Predicted')
        r2_label='R2='+str(round(r2_score(paired[category],paired[predicted_col]),2))
        plt.annotate(r2_label,xy=(.8,-.23),xycoords='axes fraction')
plt.subplots_adjust(wspace=.4,hspace=.6)
plt.show()

In [None]:
bmd_predictions.head()

In [None]:
ranked=bmd_predictions.loc[(slice(None),'sd'),:]
ranked['rank']=abs(ranked['reproductive_p']-ranked['reproductive'])
ranked=ranked.sort_values('rank')
ranked.head()

In [None]:
# Neighbours (BMR type 'sd') of one specific target substance.
sid='DTXSID5035957'
bmd_neighbors.loc[bmd_neighbors['target_sid'].eq(sid) & bmd_neighbors['bmr_type'].eq('sd')]

<h1>Case Study</h1>

In [None]:
#Butyl Benzyl Phthalate
sid='DTXSID3020205'
mol_weight=dsstox.find_one({'dsstox_sid':sid})['mol_weight']

In [None]:
bmd_predictions

In [None]:
len(bmd_neighbors[bmd_neighbors['target_sid']==sid]['neighbor_sid'].unique())

In [None]:
bbp_pods

In [None]:
bbp_pods['pod_value_LM']=-np.log10(bbp_pods['pod_value']/1000/mol_weight)

In [None]:
10**(-loael_predictions.loc[sid])*1000*mol_weight

In [None]:
10**(-lel_predictions.loc[sid])*1000*mol_weight

In [None]:
10**(-bmd_predictions.loc[sid])*1000*mol_weight

In [None]:
bmd_predictions.loc[sid]