In [None]:
import pandas as pd
import numpy as np
import pymongo
import sys
import os
from __future__ import print_function
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from __future__ import division

# Project root: the current working directory with its last three path
# components removed, always terminated by '/'.
_cwd_parts = os.getcwd().split('/')
TOP = '/'.join(_cwd_parts[:-3]) + '/'

# Put the project's lib directory at the front of the import path, but only
# once, so re-running this cell does not add duplicates.
LIB = TOP + 'lib'
if LIB not in sys.path:
    sys.path.insert(0, LIB)

# Directory the CSV inputs/outputs live in, and directory figures are saved to.
DAT_DIR = '{}data/toxref/'.format(TOP)
FIG_DIR = '{}figs/toxref/'.format(TOP)

from rax.genrapred import *

In [None]:
# The connection string (which embeds a username and password) is read from
# the environment so that credentials are never stored in the notebook, e.g.
#   export GENRA_MONGO_URI="mongodb://<user>:<password>@pb.epa.gov/genra_dev_v4"
MONGO_URI=os.environ.get('GENRA_MONGO_URI')
if not MONGO_URI:
    raise RuntimeError('Set the GENRA_MONGO_URI environment variable to the MongoDB connection string before running this notebook')
mongocon=pymongo.MongoClient(MONGO_URI)
DB=mongocon['genra_dev_v4']
# Collections used below: 'compound' is queried for mol_weight and name,
# keyed by dsstox_sid.
dsstox=DB['compound']
toxref=DB['toxrefdb2'] #Do not change! toxrefdb2 is the correct collection

In [None]:
def plot_worthy(pdobject):
    """Drop the entries of a pandas object that cannot be plotted.

    Parameters
    ----------
    pdobject : pandas.Series or pandas.DataFrame
        Values to clean.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        For a Series, the elements that are neither null nor infinite.
        For a DataFrame, the rows in which every column is neither null nor
        infinite.

    Raises
    ------
    TypeError
        If ``pdobject`` is neither a Series nor a DataFrame (previously this
        case fell through and silently returned None).
    """
    # Both signs of infinity are removed; the earlier version only removed
    # +inf, so -inf values could still reach the plotting/scoring code.
    infinities=[np.inf,-np.inf]
    if isinstance(pdobject,pd.Series):
        keep=pdobject.notnull() & ~pdobject.isin(infinities)
        return pdobject[keep]
    if isinstance(pdobject,pd.DataFrame):
        keep=pdobject.notnull().all(axis=1) & ~pdobject.isin(infinities).any(axis=1)
        return pdobject[keep]
    raise TypeError('plot_worthy expects a pandas Series or DataFrame, got '+type(pdobject).__name__)

In [None]:
def exact_k_wtavg(df,name,k,s):
    """Jaccard-weighted average of column ``name`` over exactly ``k`` rows.

    Rows are kept when their 'jaccard' value is strictly greater than ``s``
    and their ``name`` value is neither null nor +inf; the first ``k`` of
    those rows (in the order they appear in ``df``) are averaged with the
    'jaccard' values as weights.

    Returns np.nan when fewer than ``k`` usable rows are available.
    """
    usable=(df['jaccard']>s) & (df[name]!=np.inf) & df[name].notnull()
    top=df[usable].head(k)
    if len(top)<k:
        return np.nan
    return np.average(list(top[name]),weights=list(top['jaccard']))

In [None]:
def wtvar(df,name,k):
    """Combine the ``name`` values of the first ``k`` usable rows of ``df``.

    A row is usable when its ``name`` value is neither null nor +inf.
    With weights w_i taken from the 'jaccard' column and values v_i from
    ``name``, the result is  sum(w_i**2 * v_i) / (sum(w_i))**2.

    Returns np.nan when no usable row exists.
    """
    usable=df[name].notnull() & (df[name]!=np.inf)
    top=df[usable].iloc[0:k]
    if top.empty:
        return np.nan
    weights=list(top['jaccard'])
    values=list(top[name])
    numerator=sum([w**2*v for w,v in zip(weights,values)])
    return numerator/sum(weights)**2

In [None]:
loael_df=pd.read_csv(DAT_DIR+'loael.csv',index_col=0)
loael_agg=pd.read_csv(DAT_DIR+'loaelagg.csv',index_col='dsstox_sid')
loael_sids=list(set(loael_agg.index.values))
loael_neighbors=pd.read_csv(DAT_DIR+'loael_neighbors_mrgn.csv')
loael_predictions=pd.read_csv(DAT_DIR+'loael_predictions_mrgn.csv',index_col=0)
loael_agg_mean=pd.read_csv(DAT_DIR+'loael_agg_mean_mrgn.csv',index_col='dsstox_sid')
loael_agg_sd=pd.read_csv(DAT_DIR+'loael_agg_sd_mrgn.csv',index_col='dsstox_sid')
loael_neighbors_mean=pd.read_csv(DAT_DIR+'loael_neighbors_mean_mrgn.csv',index_col=0)
loael_neighbors_sd=pd.read_csv(DAT_DIR+'loael_neighbors_sd_mrgn.csv',index_col=0)
loael_predictions_mean=pd.read_csv(DAT_DIR+'loael_predictions_mean_mrgn.csv',index_col=0)
study_loaels=pd.read_csv(DAT_DIR+'study_loaels.csv',index_col=0)
categories=list(loael_agg.columns.values)
from __future__ import division
ks=range(1,20)
ss=[round(s/20,2) for s in range(1,20)]

In [None]:
# Only the last expression of a cell is echoed, so the count of distinct
# dsstox_sid values is printed explicitly; the head() preview is the cell output.
print(loael_df['dsstox_sid'].nunique())
loael_df.head()

<h2>Table 1</h2>

In [None]:
def f0(x):
    """Return the number of entries in ``x`` (nulls included)."""
    n_entries=len(x)
    return n_entries
def f1(x):
    """Return the number of distinct non-null values in ``x``."""
    n_distinct=x.nunique()
    return n_distinct
def f2(x):
    """Return how many distinct values occur more than 2 times in ``x``."""
    occurrences=x.value_counts()
    return (occurrences>2).sum()
def f3(x):
    """Return how many distinct values occur more than 3 times in ``x``."""
    occurrences=x.value_counts()
    return (occurrences>3).sum()
def f4(x):
    """Return the mean number of occurrences per distinct value of ``x``."""
    occurrences=x.value_counts()
    return occurrences.mean()

In [None]:
#Table 1
# One row per endpoint_category, aggregating the dsstox_sid column with
# f0..f4: number of records, number of distinct chemicals, chemicals with
# more than 2 records, chemicals with more than 3 records, and the mean
# number of records per chemical.
table1=loael_df.pivot_table(index='endpoint_category',values='dsstox_sid',aggfunc=(f0,f1,f2,f3,f4))
# f2 counts chemicals with MORE than 2 records, so its header is '>2'
# (it was labelled '2', inconsistent with the neighbouring '>3').
table1.columns=['Number of LOAELS','Number of chemicals','>2','>3','Mean number of LOAELS']
table1.to_csv(DAT_DIR+'table1.csv')
# Last expression of the cell, so the table is actually displayed.
table1

<h2>Table 2</h2>

In [None]:
# Map dsstox_sid -> mol_weight for every compound record whose dsstox_sid is in loael_sids.
weights=dict(
    (record['dsstox_sid'],record['mol_weight'])
    for record in dsstox.find({'dsstox_sid':{'$in':loael_sids}})
)

In [None]:
def convert_to_mgkg(lm,weight):
    """Return ``10**(-lm) * 1000 * weight``.

    NOTE(review): presumably ``lm`` is a negative-log10 molar value and
    ``weight`` a molecular weight in g/mol, making the result mg/kg - confirm
    the units against the callers' data.
    """
    unlogged=10**(-lm)
    return unlogged*1000*weight

In [None]:
#Good prediction
# Every intermediate result is printed explicitly: a notebook cell only echoes
# its final expression, so bare expressions in the middle of the cell were
# silently discarded and the labels below were printed without their values.
sid='DTXSID5020607'
print(dsstox.find_one({'dsstox_sid':sid},{'_id':0,'name':1}))
row=loael_predictions_mean.loc[sid]
print('Predictions')
print(row[[category+'_p' for category in categories]])
print('Measured')
print(row[categories])
print('mg/kg Predictions')
print({category:convert_to_mgkg(row[category+'_p'],weights[sid]) for category in categories})
print('mg/kg Measured')
print({category:convert_to_mgkg(row[category],weights[sid]) for category in categories})
# First ten neighbour rows of the target that have a non-null 'developmental' value.
print(loael_neighbors[(loael_neighbors['target_sid']==sid) & (pd.notnull(loael_neighbors['developmental']))].iloc[0:10])
# First ten neighbour rows with a non-null 'systemic' value; their ids are
# shown comma-separated as the cell output.
nhood=loael_neighbors[(loael_neighbors['target_sid']==sid) & (pd.notnull(loael_neighbors['systemic']))].iloc[0:10]
','.join(nhood['neighbor_sid'])

<h2>Table 3</h2>

In [None]:
#Table of endpoint categories vs study types. # unique chemicals (# studies)
# NOTE(review): table_function is not defined in this notebook; presumably it
# is provided by `from rax.genrapred import *` - confirm, otherwise this cell
# fails on a fresh kernel.
study_category_tally=study_loaels.pivot_table(index='endpoint_category',columns='study_type',values='dsstox_sid',aggfunc=table_function)
study_category_tally.to_csv(DAT_DIR+'study_category_tally.csv')
# Last expression of the cell, so the table is actually displayed.
study_category_tally

In [None]:
# Show the first rows with every column visible. An expression inside a
# `with` block is never echoed by the notebook, so it has to be printed; the
# print must stay inside the block for display.max_columns to apply.
with pd.option_context('display.max_columns',None):
    print(loael_df.head())

<h2>Figure 1</h2>

Figure 1 is a PowerPoint graphic.

<h2>Figure 2</h2>

In [None]:
# figsize must be an ordered (width, height) pair; the original passed the
# set {10,3}, which has no defined order and is not a valid size.
f = plt.figure(figsize=(10,3))

# Left panel: raw pod_value distribution (null and infinite values removed).
ax1=f.add_subplot(121)
ax1.hist(plot_worthy(loael_df['pod_value']),rwidth=.95)
ax1.set_title('Histogram of LOAEL values')
ax1.set_xlabel('LOAEL Value')
# Arrow pointing at the right-most bin; the '2 values' text is hard-coded.
ax1.annotate('2 values',xy=(.90,.02),xytext=(.85,.15),xycoords='axes fraction',arrowprops=dict(facecolor='red'))

# Right panel: pod_value_LM distribution.
ax2=f.add_subplot(122)
ax2.hist(plot_worthy(loael_df['pod_value_LM']),rwidth=.95)
ax2.set_title('Histogram of Log Molar LOAEL values')
ax2.set_xlabel('LOAEL Value (Log Molar)')
plt.savefig(FIG_DIR+'histograms.png')
plt.show()

<h2>Figure 3</h2>

Figure 3 was created by the nn graph service.

<h2>Figure 4</h2>

In [None]:
# Number of study_loaels rows per chemical.
nstudies=study_loaels.pivot_table(index='dsstox_sid',values='study_type',aggfunc=len)
# Number of distinct study types per chemical, flattened to a Series ...
ntypes_per_chemical=study_loaels.pivot_table(index='dsstox_sid',values='study_type',aggfunc=lambda x: x.nunique()).unstack()
# ... then tallied: nguide = number of distinct study types, nchem = number of
# chemicals having that many. The columns are named explicitly instead of
# renaming the auto-generated 'index'/0 labels, whose names differ between
# pandas versions (the rename silently did nothing when they did not match).
nguideline=ntypes_per_chemical.value_counts().rename_axis('nguide').reset_index(name='nchem')

In [None]:
# Two side-by-side panels on one 10x3 figure.
f,(ax1,ax2)=plt.subplots(1,2,figsize=(10,3))

# Left: histogram of the per-chemical study counts.
ax1.hist(nstudies.values,color='steelblue',rwidth=.85,bins=40)
ax1.set(
    title='Number of studies per chemical',
    xlabel='Number of studies',
    ylabel='Count of chemicals',
)

# Right: bar chart of nchem against nguide; labels are set after seaborn
# draws so they replace the defaults taken from the column names.
sns.barplot(x='nguide',y='nchem',data=nguideline,color='steelblue',ax=ax2)
ax2.set(
    title='Number of guideline study types per chemical',
    xlabel='Number of guideline study types',
    ylabel='Count of chemicals',
)

plt.savefig(FIG_DIR+'nguideline.png')
plt.show()

<h2>Figure 5</h2>

In [None]:
from __future__ import division
# Similarity thresholds 0.05, 0.10, ..., 0.95 (s/20 relies on the true
# division enabled by the __future__ import at the top of this cell).
ss=[round(s/20,2) for s in range(1,20)]
# For each threshold s: n = number of distinct target_sid values that have at
# least one row in loael_neighbors_mean with jaccard strictly above s.
gt1_neighbor=pd.DataFrame([
    {'s':s,'n':loael_neighbors_mean.loc[loael_neighbors_mean['jaccard']>s,'target_sid'].nunique()}
    for s in ss
])

In [None]:
# Line plot of n (targets with a neighbour above the threshold) against s.
coverage_fig,coverage_ax=plt.subplots()
coverage_ax.plot(gt1_neighbor['s'],gt1_neighbor['n'])
coverage_ax.set_title('Coverage vs Similarity')
coverage_ax.set_xlabel('Similarity threshold (s)')
coverage_ax.set_ylabel('Dataset Coverage')
coverage_fig.savefig(FIG_DIR+'coverage')
plt.show()

<h2>Figure 6</h2>

In [None]:
from sklearn.metrics import r2_score
# One scatter panel per category on a 2x2 grid: measured value on x,
# predicted value (column '<category>_p') on y, annotated with the R2 score.
# NOTE(review): unlike the mean-aggregation figure, infinite values are not
# dropped here before scoring - confirm loael_predictions contains none.
f=plt.figure(figsize=(12,12))
plt.suptitle('Min Aggregation Prediction')
for panel,category in enumerate(categories,1):
    predicted=category+'_p'
    plt.subplot(2,2,panel)
    # keep only rows where both the measured and the predicted value exist
    df=loael_predictions[[category,predicted]].dropna(how='any')
    plt.scatter(df[category],df[predicted])
    plt.title(category+ ' LOAEL Predictions')
    plt.xlabel('True')
    plt.ylabel('Predicted')
    score=round(r2_score(df[category],df[predicted]),2)
    plt.annotate('R2='+str(score),xy=(.03,.93),xycoords='axes fraction')
plt.subplots_adjust(wspace=.5,hspace=.4)
plt.savefig(FIG_DIR+'example_fit')
plt.show()

<h2>Figure 7</h2>

In [None]:
from sklearn.metrics import r2_score
# One scatter panel per category on a 2x2 grid: measured value on x,
# predicted value (column '<category>_p') on y, annotated with the R2 score.
f=plt.figure(figsize=(12,12))
f.suptitle('Mean Aggregation Predictions')
for panel,category in enumerate(categories,1):
    predicted=category+'_p'
    plt.subplot(2,2,panel)
    # keep rows where both values exist and neither is +inf
    df=loael_predictions_mean[[category,predicted]].dropna(how='any')
    df=df[(df!=np.inf).all(axis=1)]
    plt.scatter(df[category],df[predicted])
    plt.title(category+ ' LOAEL Predictions')
    plt.xlabel('True')
    plt.ylabel('Predicted')
    score=round(r2_score(df[category],df[predicted]),2)
    plt.annotate('R2='+str(score),xy=(.03,.9),xycoords='axes fraction')
plt.subplots_adjust(wspace=.5,hspace=.4)
plt.savefig(FIG_DIR+'example_fit_mean')
plt.show()

<h2>Figure 8</h2>

In [None]:
def wtavg(df,name,k,s):
    """Jaccard-weighted average of column ``name`` over at most ``k`` rows.

    Rows are kept when their 'jaccard' value is at least ``s`` and their
    ``name`` value is neither null nor +inf; the first ``k`` of those rows
    (in the order they appear in ``df``) are averaged with the 'jaccard'
    values as weights.

    Returns np.nan when no usable row exists.
    """
    usable=(df['jaccard']>=s) & (df[name]!=np.inf) & df[name].notnull()
    top=df[usable].head(k)
    if top.empty:
        return np.nan
    return np.average(list(top[name]),weights=list(top['jaccard']))

In [None]:
def genra_predict(ndf,tdf,category,k,s):
    """Predict ``category`` for every target in ``ndf`` and pair it with ``tdf``.

    Parameters
    ----------
    ndf : pandas.DataFrame
        Neighbour table with a 'target_sid' column plus the columns wtavg
        reads ('jaccard' and ``category``).
    tdf : pandas.DataFrame
        Table indexed by the same identifiers as 'target_sid', holding the
        ``category`` column to compare against.
    category : str
        Column to predict.
    k, s :
        Passed through to wtavg (maximum number of rows and minimum jaccard).

    Returns
    -------
    pandas.DataFrame
        Indexed by target identifier, with columns ``category`` (from ``tdf``)
        and ``category + '_p'`` (the wtavg prediction). Targets missing from
        ``tdf``'s index are dropped by the inner index merge.
    """
    predicted=category+'_p'
    predictions={}
    # Group by the bare column name: grouping by a one-element list makes
    # newer pandas yield 1-tuples as keys, which would never match tdf's index.
    for sid,group in ndf.groupby('target_sid'):
        predictions[sid]=wtavg(group,category,k,s)
    # Materialise keys/values as lists rather than passing dict views to pandas.
    prediction_df=pd.DataFrame({predicted:list(predictions.values())},index=list(predictions.keys()))
    prediction_df=prediction_df.merge(tdf,right_index=True,left_index=True)
    return prediction_df[[category,predicted]]

In [None]:
# Distinct target identifiers appearing in loael_neighbors, in order of first appearance.
sids=pd.unique(loael_neighbors['target_sid'])

In [None]:
# Seed NumPy's global generator: train_test_split is called without a
# random_state, so it draws from this generator and the splits are repeatable.
np.random.seed(6979)
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
mean_r2s={}
k=10
s=.05
for category in categories:
    mean_r2s[category]=[]
    # 100 random 90/10 splits of the identifiers per category
    for i in range(100):
        train,test=train_test_split(sids,test_size=.1)
        # predict each test target using only neighbours from the training set
        test_neighbors=loael_neighbors_mean[(loael_neighbors_mean['neighbor_sid'].isin(train)) & (loael_neighbors_mean['target_sid'].isin(test))]
        tts_predictions=plot_worthy(genra_predict(test_neighbors,loael_agg_mean,category,k,s))
        mean_r2s[category].append(r2_score(tts_predictions[category],tts_predictions[category+'_p']))
# .items() works on both Python 2 and 3 (.iteritems() exists only on Python 2)
mean_r2_df=pd.DataFrame([{'category':category,'mean':np.mean(r2list),'sd':np.std(r2list)} for category,r2list in mean_r2s.items()])

In [None]:
fig,ax=plt.subplots(2,2,figsize=(12,12))
fig.suptitle('R2 scores for 100 90-10 train-test splits')
ax=ax.reshape(-1)
# .items() works on both Python 2 and 3 (.iteritems() exists only on Python 2)
for category, r2list in mean_r2s.items():
    # take the next unused panel
    axes,ax=ax[0],ax[1:]
    # keep only scores within a quarter of the IQR beyond the quartiles;
    # the distribution AND the Mean/SD annotations use this trimmed list
    q25,q75=np.percentile(r2list,[25,75])
    iqr=q75-q25
    maxi=q75+iqr*.25
    mini=q25-iqr*.25
    r2list=[r2 for r2 in r2list if r2>mini and r2<maxi]
    sns.distplot(r2list,ax=axes)
    # reference line: R2 of the stored predictions in loael_predictions_mean
    df=loael_predictions_mean[[category,category+'_p']]
    df=df[df.notnull().all(axis='columns')]
    df=df[(df!=np.inf).all(axis=1)]
    axes.axvline(x=r2_score(df[category],df[category+'_p']),color='orange',label='Full Dataset')
    axes.set_xlabel('R2 score')
    axes.set_title(category)
    axes.legend(loc='best',fontsize=11)
    axes.annotate('Mean='+str(round(np.mean(r2list),2)),xy=(.03,.85),xycoords='axes fraction')
    axes.annotate('SD='+str(round(np.std(r2list),2)),xy=(.03,.8),xycoords='axes fraction')
# tight_layout was referenced without being called, so it never ran; call it
# once after all panels are drawn, leaving head-room for the suptitle.
fig.tight_layout(rect=[0,0,1,0.96])
# save once, after the last panel, instead of rewriting the file per panel
plt.savefig(FIG_DIR+'loael_validation')
plt.show()

<h2>Figure 9</h2>

In [None]:
def av_sim(ser):
    """Return the mean of the first two entries of ``ser``.

    NOTE(review): presumably ``ser`` is ordered by decreasing similarity so
    that these are the two closest neighbours - confirm against the input file.
    """
    first_two=ser.head(2)
    return np.mean(first_two)
# Per category: for every target with a non-null value in that category,
# av_sim of its 'jaccard' values; the category name is kept as a column so the
# per-category tables can be stacked into one long frame.
av_sims_parts=[]
for category in categories:
    df=loael_neighbors_mean[pd.notnull(loael_neighbors_mean[category])]
    av_sims_cat=df.pivot_table(index='target_sid',values='jaccard',aggfunc=av_sim)
    av_sims_cat['category']=category
    av_sims_parts.append(av_sims_cat)
# Concatenate once: DataFrame.append was removed in pandas 2.0 and re-copied
# the growing frame on every iteration. pd.concat rejects an empty list, so
# fall back to an empty frame when there are no categories.
av_sims=pd.concat(av_sims_parts) if av_sims_parts else pd.DataFrame()

In [None]:
# Box plot of the per-target av_sim values, one box per category.
simbycat_ax=sns.boxplot(x='category',y='jaccard',data=av_sims)
simbycat_ax.set_title('Source analogue similarity by endpoint category')
plt.savefig(FIG_DIR+'simbycat')

<h2>Figure 10</h2>

See the "k,s grid search for LOAELS using mean aggregation" section in the main notebook.

<h2>Figure 11</h2>

See the "k,s grid search for LOAELS using mean aggregation" section in the main notebook.

<h2>Figure 12</h2>

See the "Cluster grid search" section in 121018_meeting.

<h2>Figure 13</h2>

See the "Cluster grid search" section in 121018_meeting.

<h2>Figure 14</h2>

See the "Cluster grid search" section in 121018_meeting.