In [None]:
import pandas as pd
import numpy as np
import pymongo
import sys
import os
from __future__ import print_function
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

# Project root: the current working directory with its last two path
# components removed, plus a trailing '/' so sub-paths can be appended by
# plain string concatenation.
# NOTE(review): the path is split on '/', so this presumably expects
# POSIX-style paths and the notebook being run two levels below the root.
TOP = '/'.join(os.getcwd().split('/')[:-2])+'/'
# Put the project's lib/ directory at the front of sys.path (only once) so
# that project-local modules can be imported.
LIB = TOP+'lib'
if not LIB in sys.path: 
    sys.path.insert(0,LIB)

# Directory the input CSV/pickle files are read from, and directory the
# figures are saved to.
DAT_DIR = TOP + 'data/toxref/'
FIG_DIR = TOP + 'figs/toxref/'

from rax.genrapred import *

In [None]:
# Connect to the GenRA MongoDB database.
# The connection URI (which carries the credentials) is read from the
# environment so that no username/password is stored in the notebook, e.g.
#   export GENRA_MONGO_URI="mongodb://<user>:<password>@<host>/genra_dev_v4"
MONGO_URI=os.environ.get('GENRA_MONGO_URI')
if not MONGO_URI:
    raise RuntimeError('Set the GENRA_MONGO_URI environment variable to the MongoDB connection URI')
mongocon=pymongo.MongoClient(MONGO_URI)
DB=mongocon['genra_dev_v4']
# Collections used below: compound records and ToxRefDB v2 data.
dsstox=DB['compound']
toxref=DB['toxrefdb2']

In [None]:
def plot_worthy(pdobject):
    """Return only the entries of a pandas object that can be plotted.

    Parameters
    ----------
    pdobject : pandas.Series or pandas.DataFrame
        Values to filter.

    Returns
    -------
    pandas.Series, pandas.DataFrame or None
        For a Series: the entries that are neither null nor infinite.
        For a DataFrame: the rows in which no column is null or infinite.
        For any other input type: None.

    Both +inf and -inf are removed; either one makes downstream metrics
    such as r2_score fail.
    """
    infinities=[np.inf,-np.inf]
    if isinstance(pdobject,pd.Series):
        keep=pdobject.notnull() & ~pdobject.isin(infinities)
        return pdobject[keep]
    elif isinstance(pdobject,pd.DataFrame):
        # A row is kept only if every column passes both checks.
        keep=pdobject.notnull().all(axis=1) & ~pdobject.isin(infinities).any(axis=1)
        return pdobject[keep]
    return None

In [None]:
def exact_k_wtavg(df,name,k,s):
    """Jaccard-weighted average of column `name` over exactly k rows.

    Rows are usable when their 'jaccard' value is greater than `s` and
    their `name` value is neither null nor +inf. The first k usable rows
    (in the order given) are averaged, weighted by 'jaccard'.

    Returns np.nan when fewer than k usable rows exist.
    """
    usable=(df['jaccard']>s) & (df[name]!=np.inf) & df[name].notnull()
    top_k=df[usable].iloc[:k]
    if len(top_k)<k:
        return np.nan
    return np.average(top_k[name].tolist(),weights=top_k['jaccard'].tolist())

In [None]:
def wtvar(df,name,k):
    """Combine the first k usable `name` values using squared 'jaccard' weights.

    Rows whose `name` value is null or +inf are ignored. For the first k
    remaining rows the result is sum(w_i**2 * v_i) / (sum(w_i))**2, where
    w_i is the row's 'jaccard' value and v_i its `name` value.

    Returns np.nan when no usable row exists.
    """
    usable=df[name].notnull() & (df[name]!=np.inf)
    top_k=df[usable].iloc[:k]
    if top_k.empty:
        return np.nan
    weights=list(top_k['jaccard'])
    values=list(top_k[name])
    weighted_total=sum([w**2*v for w,v in zip(weights,values)])
    return weighted_total/sum(weights)**2

In [None]:
# Load the input tables from DAT_DIR.
# Record-level LOAEL and LEL tables.
loael_df=pd.read_csv(DAT_DIR+'loael.csv')
lel_df=pd.read_csv(DAT_DIR+'lel.csv')
# Aggregated tables, indexed by dsstox_sid.
loael_agg=pd.read_csv(DAT_DIR+'loaelagg.csv',index_col='dsstox_sid')
lel_agg=pd.read_csv(DAT_DIR+'lelagg.csv',index_col='dsstox_sid')
# Unique chemical identifiers present in each aggregated table.
loael_sids=list(set(loael_agg.index.values))
lel_sids=list(set(lel_agg.index.values))
# Neighbor and prediction tables ('_mrgn' file variants).
loael_neighbors=pd.read_csv(DAT_DIR+'loael_neighbors_mrgn.csv')
loael_predictions=pd.read_csv(DAT_DIR+'loael_predictions_mrgn.csv',index_col=0)
# Mean / standard-deviation variants of the aggregated, neighbor and
# prediction tables (as indicated by the file names).
loael_agg_mean=pd.read_csv(DAT_DIR+'loael_agg_mean_mrgn.csv',index_col='dsstox_sid')
loael_agg_sd=pd.read_csv(DAT_DIR+'loael_agg_sd_mrgn.csv',index_col='dsstox_sid')
loael_neighbors_mean=pd.read_csv(DAT_DIR+'loael_neighbors_mean_mrgn.csv',index_col=0)
loael_neighbors_sd=pd.read_csv(DAT_DIR+'loael_neighbors_sd_mrgn.csv',index_col=0)
loael_predictions_mean=pd.read_csv(DAT_DIR+'loael_predictions_mean_mrgn.csv',index_col=0)

In [None]:
#Table 1
# Number of rows in the LOAEL table for each endpoint category.
loael_df['endpoint_category'].value_counts()

In [None]:
# Total number of rows in the LOAEL table.
len(loael_df)

In [None]:
# Preview the first rows of the LOAEL table.
loael_df.head()

In [None]:
# Spot check: all LOAEL rows for a single chemical (DTXSID0020076).
loael_df[loael_df['dsstox_sid']=='DTXSID0020076']

In [None]:
# Number of rows for each distinct value of the 'pod_unit' column.
loael_df['pod_unit'].value_counts()

In [None]:
# Histogram of the raw 'pod_value' column.
plt.hist(loael_df['pod_value'])
plt.title('Histogram of LOAEL values')
plt.xlabel('LOAEL Value')
# Arrow annotation placed in axes-fraction coordinates, pointing at the
# lower-right corner of the plot.
# NOTE(review): the text '2 values' and the arrow position are hard-coded;
# presumably they mark the sparsely populated last bin - re-check if the data change.
plt.annotate('2 values',xy=(.90,.02),xytext=(.85,.15),xycoords='axes fraction',arrowprops=dict(facecolor='red'))
# No file extension is given, so matplotlib's default savefig format is used.
plt.savefig(FIG_DIR+'pod_histogram')
plt.show()

In [None]:
# Histogram of the 'pod_value_LM' column after plot_worthy() has removed
# entries that cannot be plotted.
plt.hist(plot_worthy(loael_df['pod_value_LM']))
plt.title('Histogram of Log Molar LOAEL values')
plt.xlabel('LOAEL Value (Log Molar)')
# No file extension is given, so matplotlib's default savefig format is used.
plt.savefig(FIG_DIR+'lm_histogram')
plt.show()

In [None]:
from __future__ import division
# Similarity thresholds 0.05, 0.10, ..., 0.95.
ss=[round(twentieth/20,2) for twentieth in range(1,20)]
# For every threshold, count the distinct targets that have at least one
# neighbor row whose Jaccard similarity exceeds it.
gt1_neighbor=[]
for s in ss:
    above_threshold=loael_neighbors_mean['jaccard']>s
    n_targets=loael_neighbors_mean[above_threshold]['target_sid'].nunique()
    gt1_neighbor.append({'s':s,'n':n_targets})
gt1_neighbor=pd.DataFrame(gt1_neighbor)

In [None]:
# Coverage curve: number of distinct targets with at least one neighbor above
# the similarity threshold (column 'n') against the threshold (column 's').
plt.plot(gt1_neighbor['s'],gt1_neighbor['n'])
plt.title('Coverage vs Similarity')
plt.xlabel('Similarity threshold (s)')
plt.ylabel('Dataset Coverage')
# No file extension is given, so matplotlib's default savefig format is used.
plt.savefig(FIG_DIR+'coverage')
plt.show()

In [None]:
# Endpoint categories are the column names of the aggregated LOAEL table.
categories=list(loael_agg.columns.values)

In [None]:
def av_sim(ser):
    """Return the mean of the first two entries of `ser` (fewer if it is shorter)."""
    first_two=ser.iloc[:2]
    return np.mean(first_two)
# For every endpoint category, keep the neighbor rows that have a value for
# that category and compute, per target, av_sim() of their Jaccard similarities.
# The per-category tables are collected and concatenated once:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration.
av_sims_parts=[]
for category in categories:
    df=loael_neighbors_mean[pd.notnull(loael_neighbors_mean[category])]
    av_sims_cat=df.pivot_table(index='target_sid',values='jaccard',aggfunc=av_sim)
    av_sims_cat['category']=category
    av_sims_parts.append(av_sims_cat)
# pd.concat raises on an empty list, so fall back to an empty frame.
av_sims=pd.concat(av_sims_parts) if av_sims_parts else pd.DataFrame()

In [None]:
# Display the combined per-category similarity table.
# NOTE(review): this renders the whole frame; av_sims.head() may be enough.
av_sims

In [None]:
# Box plot of the per-target similarity values, one box per endpoint category.
sns.boxplot(x='category',y='jaccard',data=av_sims)
plt.title('Source analogue similarity by endpoint category')
# No file extension is given, so matplotlib's default savefig format is used.
plt.savefig(FIG_DIR+'simbycat')

<h1>EPA Categories analysis w/o restricting neighbor search to category subspace</h1>

In [None]:
import pickle
# Load the per-EPA-category search spaces.
# NOTE: unpickling can execute arbitrary code - only load search_spaces.pkl
# from a trusted source.
# The file is opened in binary mode (required by pickle on Python 3) and is
# closed as soon as it has been read.
with open(DAT_DIR+'search_spaces.pkl','rb') as search_spaces_file:
    search_spaces=pickle.load(search_spaces_file)
# Use the string form of every key; .items() works on Python 2 and 3.
search_spaces={str(k):v for k,v in search_spaces.items()}

In [None]:
# Preview the first rows of the prediction table used in the plots below.
loael_predictions_mean.head()

In [None]:
# Global matplotlib font size for all figures created after this cell.
plt.rcParams['font.size']=12

In [None]:
from sklearn.metrics import r2_score
# True-vs-predicted scatter plots, one panel for every (EPA category,
# endpoint category) pair that has at least 10 plottable points.
# The panels are collected first so the grid can be sized to the number of
# plots instead of assuming a fixed count of unused axes.
panels=[]
for epa_category,search_space in search_spaces.items():
    epa_category_df=loael_predictions_mean.loc[search_space]
    for category in categories:
        df=plot_worthy(epa_category_df[[category,category+'_p']])
        if len(df)<10:
            continue
        panels.append((epa_category,category,df))
i=len(panels)
ncols=5
nrows=max(1,-(-i//ncols))  # ceiling division; at least one row so subplots() is valid
# 10x10 inches per panel, i.e. (50,50) for a 5x5 grid.
fig,ax=plt.subplots(nrows,ncols,figsize=(10*ncols,10*nrows),squeeze=False)
ax=ax.reshape(-1)
for axes,(epa_category,category,df) in zip(ax,panels):
    axes.scatter(df[category],df[category+'_p'])
    if epa_category=='()':
        axes.set_title('Uncategorized ' + category)
    else:
        # Trim the first two and last three characters of the stringified key
        # (presumably the "('" and "',)" of a one-element tuple - confirm).
        axes.set_title(epa_category[2:-3]+' ' + category)
    axes.set_xlabel('True')
    axes.set_ylabel('Predicted')
    axes.annotate('R2='+str(round(r2_score(df[category],df[category+'_p']),2)),xy=(.02,.94),xycoords='axes fraction')
# Remove whatever axes were not needed.
for unused_axes in ax[i:]:
    fig.delaxes(unused_axes)
plt.savefig(FIG_DIR+'epa_category_scatterplots.png')
plt.show()

<h1>Summary of results for structurally-related groups of environmental chemicals</h1>

In [None]:
# Map every chemical listed in a search space to its EPA category key. A
# chemical listed under several categories keeps the last one iterated.
# .items() is used because dict.iteritems() does not exist on Python 3.
epa_category_dict={chem:epa_category for epa_category,list_of_chems in search_spaces.items() for chem in list_of_chems}
# All chemicals that appear in at least one search space.
epa_category_sids=set(epa_category_dict)

In [None]:
# NOTE(review): perfluoro_df is only assigned in the PFAS cell that follows,
# so on a fresh kernel (Restart & Run All) this cell raises NameError.
# Consider moving it below that cell.
perfluoro_df

In [None]:
#How do perfluoro compounds categorize?
# Read the tab-separated PFAS list and take its DTXSID column.
pfas_list=pd.read_csv(DAT_DIR+'../pfas_list.csv',sep='\t')
pfas_sids=list(pfas_list['DTXSID'])
# Compounds whose dsstox_sid is both in an EPA category search space and on the PFAS list.
perfluoro_sids=[record['dsstox_sid'] for record in dsstox.find({'$and':[{'dsstox_sid':{'$in':list(epa_category_sids)}},{'dsstox_sid':{'$in':pfas_sids}}]})]
# NOTE(review): this expression is neither stored nor the last statement of
# the cell, so its value (the EPA category of each compound) is never shown.
[epa_category_dict[sid] for sid in perfluoro_sids]
perfluoro_df=loael_predictions_mean.loc[perfluoro_sids]
# One True-vs-Predicted scatter panel per endpoint category on a 2x2 grid.
# `i` only advances when a panel is drawn, so the slot of a skipped category
# is reused by the next one.
fig=plt.figure(figsize=(12,12))
i=1
for category in categories:
    ax=plt.subplot(2,2,i)
    df=plot_worthy(perfluoro_df[[category,category+'_p']])
    if df.empty:
        # Nothing to plot for this category: drop the axes just created.
        fig.delaxes(ax)
        continue
    plt.scatter(df[category],df[category+'_p'])
    plt.title(category)
    plt.xlabel('True')
    plt.ylabel('Predicted')
    plt.annotate('R2='+str(round(r2_score(df[category],df[category+'_p']),2)),xy=(.80,.95),xycoords='axes fraction')
    i+=1
plt.suptitle('PFAs')
plt.savefig(FIG_DIR+'pfas.png')

In [None]:
#How do phthalates categorize?
# Compounds in an EPA category search space whose name matches the
# (case-sensitive) regular expression 'phthalate'.
phthalate_sids=[record['dsstox_sid'] for record in dsstox.find({'$and':[{'dsstox_sid':{'$in':list(epa_category_sids)}},{'name':{'$regex':'phthalate'}}]})]
# NOTE(review): this expression is neither stored nor the last statement of
# the cell, so its value (the EPA category of each compound) is never shown.
[epa_category_dict[sid] for sid in phthalate_sids]
phthalate_df=loael_predictions_mean.loc[phthalate_sids]
# One True-vs-Predicted scatter panel per endpoint category on a 2x2 grid.
# `i` only advances when a panel is drawn, so the slot of a skipped category
# is reused by the next one.
fig=plt.figure(figsize=(12,12))
i=1
for category in categories:
    ax=plt.subplot(2,2,i)
    df=plot_worthy(phthalate_df[[category,category+'_p']])
    if df.empty:
        # Nothing to plot for this category: drop the axes just created.
        fig.delaxes(ax)
        continue
    plt.scatter(df[category],df[category+'_p'])
    plt.title(category)
    plt.xlabel('True')
    plt.ylabel('Predicted')
    plt.annotate('R2='+str(round(r2_score(df[category],df[category+'_p']),2)),xy=(.80,.95),xycoords='axes fraction')
    i+=1
plt.suptitle('Phthalates')
plt.savefig(FIG_DIR+'phthalates.png')

In [None]:
#How do conazoles categorize?
# CAS registry numbers used to select the conazole compounds.
conazole_casrns=['60107-31-0','38083-17-9','23593-75-1','94361-06-5','75736-33-3','119446-68-3','70217-36-6','133855-98-8',\
       '71245-23-3','136426-54-5','85509-19-9','76674-21-0','79983-71-4','35554-44-0','58594-72-2','125116-23-6',\
        '88671-89-0','66246-88-6','67747-09-5','60207-90-1','178928-70-6','120983-64-4','107534-96-3','43121-43-3',\
       '55219-65-3','70585-35-2','68694-11-1','131983-72-7','83657-22-1']
# Compounds in an EPA category search space whose casrn is in the list above.
conazole_sids=[record['dsstox_sid'] for record in dsstox.find({'$and':[{'dsstox_sid':{'$in':list(epa_category_sids)}},{'casrn':{'$in':conazole_casrns}}]})]
# NOTE(review): this expression is neither stored nor the last statement of
# the cell, so its value (the EPA category of each compound) is never shown.
[epa_category_dict[sid] for sid in conazole_sids]
conazole_df=loael_predictions_mean.loc[conazole_sids]
# One True-vs-Predicted scatter panel per endpoint category on a 2x2 grid.
# `i` only advances when a panel is drawn, so the slot of a skipped category
# is reused by the next one.
fig=plt.figure(figsize=(12,12))
i=1
for category in categories:
    ax=plt.subplot(2,2,i)
    df=plot_worthy(conazole_df[[category,category+'_p']])
    # Unlike the sibling cells (which only skip empty frames), this one
    # requires at least 3 plottable points.
    if len(df)<3:
        fig.delaxes(ax)
        continue
    plt.scatter(df[category],df[category+'_p'])
    plt.title(category)
    plt.xlabel('True')
    plt.ylabel('Predicted')
    plt.annotate('R2='+str(round(r2_score(df[category],df[category+'_p']),2)),xy=(.80,.95),xycoords='axes fraction')
    i+=1
plt.suptitle('Conazoles')
plt.savefig(FIG_DIR+'conazoles.png')

In [None]:
#Organophosphates
# Names used to select the organophosphate compounds.
# Two missing commas are fixed here: 'Disulfoton Ethion' was a single entry,
# and 'Sulfopros' 'Temephos' were implicitly concatenated into
# 'SulfoprosTemephos', so none of those four names could match.
# NOTE(review): spellings are kept as they were; some (e.g. 'Sulfopros',
# 'Isofenfos') may not match the database names - verify against the
# 'compound' collection.
op_names=['Dichlorvos','Dicrotophos','Naled','Tetrachlorvinphos','Trichlorfon','Ethoprop','Azinphos methyl',
          'Bensulide','Dimethoate','Disulfoton','Ethion','Malathion','Methidathion','Phorate','Phosmet','Sulfopros',
          'Temephos','Terbufos','Fonofos','Oxydemeton-methyl','Profenofos','Chlorethoxyfos','Chlorpyrifos','Coumaphos',
          'Diazinon','Fenitrothion','Fenthion','Methyl chlorpyrifos','Methyl parathion','Parathion','Pirimiphos methyl',
          'Sulfotepp','Tebupirimphos','Fenamiphos','Acephate','Methamidophos','Isofenfos','Propetamphos']
# Compounds in an EPA category search space whose name is in the list above.
op_sids=[record['dsstox_sid'] for record in dsstox.find({'$and':[{'dsstox_sid':{'$in':list(epa_category_sids)}},{'name':{'$in':op_names}}]})]
# NOTE(review): this expression is neither stored nor the last statement of
# the cell, so its value (the EPA category of each compound) is never shown.
[epa_category_dict[sid] for sid in op_sids]
op_df=loael_predictions_mean.loc[op_sids]
# One True-vs-Predicted scatter panel per endpoint category on a 2x2 grid.
# `i` only advances when a panel is drawn, so the slot of a skipped category
# is reused by the next one.
fig=plt.figure(figsize=(12,12))
i=1
for category in categories:
    ax=plt.subplot(2,2,i)
    df=plot_worthy(op_df[[category,category+'_p']])
    if df.empty:
        # Nothing to plot for this category: drop the axes just created.
        fig.delaxes(ax)
        continue
    plt.scatter(df[category],df[category+'_p'])
    plt.title(category)
    plt.xlabel('True')
    plt.ylabel('Predicted')
    plt.annotate('R2='+str(round(r2_score(df[category],df[category+'_p']),2)),xy=(.80,.95),xycoords='axes fraction')
    i+=1
plt.suptitle('Organophosphates')
plt.savefig(FIG_DIR+'organophosphates.png')

In [None]:
# Number of neighbor rows for each target chemical.
loael_neighbors_mean.pivot_table(index='target_sid',values='neighbor_sid',aggfunc=len)

In [None]:
# Spot check: all neighbor rows for a single target chemical (DTXSID0020076).
loael_neighbors_mean[loael_neighbors_mean['target_sid']=='DTXSID0020076']

In [None]:
# List the collections available in the database.
# NOTE(review): newer PyMongo releases replace collection_names() with
# list_collection_names() - confirm against the installed version.
DB.collection_names()

In [None]:
# Inspect one example document from the 'tox_fp' collection.
DB['tox_fp'].find_one()