In [4]:
import pandas as pd
import numpy as np
import pymongo
import sys
import os
from __future__ import print_function
from datetime import datetime
import matplotlib.pyplot as plt

TOP = '/'.join(os.getcwd().split('/')[:-2])+'/'
LIB = TOP+'lib'
if not LIB in sys.path: 
    sys.path.insert(0,LIB)

DAT_DIR = TOP + 'data/toxref/'
FIG_DIR = TOP + 'figs/toxref/'

from rax.genrapred import *

In [5]:
mongocon=pymongo.MongoClient("mongodb://ghelman:ghelman@pb.epa.gov/genra_dev_v4")
DB=mongocon['genra_dev_v4']
dsstox=DB['compound']
toxref=DB['toxrefdb2']
physprop=DB['physprop']

In [6]:
def wtavg(df,name,k,s):
    df=df[df['jaccard']>s]
    df=df[df[name]!=np.inf]
    df=df[df[name].notnull()].iloc[0:k]
    if df.empty:
        return np.nan
    weights=list(df['jaccard'])
    values=list(df[name])
    return np.average(values,weights=weights)

In [7]:
def exact_k_wtavg(df,name,k,s):
    df=df[df['jaccard']>s]
    df=df[df[name]!=np.inf]
    df=df[df[name].notnull()].iloc[0:k]
    if len(df)<k:
        return np.nan
    weights=list(df['jaccard'])
    values=list(df[name])
    return np.average(values,weights=weights)

In [8]:
from __future__ import division
ks=range(1,20)
ss=[round(s/20,2) for s in range(1,20)]

<h1>BMDs</h1>

In [None]:
def bmd_record(document):
    bmds=document['bmds']
    for bmd in bmds:
        bmd['dsstox_sid']=document['dsstox_sid']
    return bmds

In [None]:
bmd_df=pd.DataFrame([bmd for document in toxref.find({'bmds':{'$exists':True}}) for bmd in bmd_record(document)])

In [None]:
bmd_df=bmd_df[bmd_df['recommended']==True]
print(str(len(bmd_df)) +' BMD values')
print(str(len(bmd_df['dsstox_sid'].unique())) + ' unique substances')

In [None]:
with pd.option_context('display.max_columns',None):
    bmd_df.head()

In [None]:
bmd_df['AIC']=pd.to_numeric(bmd_df['AIC']) #Error values are 1.#QNAN whatever that means
#Convert other columns to numeric as needed
#bmd_df=bmd_df[pd.notnull(bmd_df['AIC'])]
#bmd_df=bmd_df[(bmd_df['AIC']!=-999) & (bmd_df['BMD']!=-999) & (bmd_df['BMDU']!=-999) & (bmd_df['BMDL']!=-999)] #Code for fail

In [None]:
bmd_df['bmr_type'].value_counts()

In [None]:
bmd_df['endpoint_category'].value_counts()

In [None]:
bmd_df['model_name'].value_counts()

In [None]:
sids=list(bmd_df['dsstox_sid'].unique())
weights={record['dsstox_sid']:record['mol_weight'] for record in dsstox.find({'dsstox_sid':{'$in':sids}})}
bmd_df['mol_weight']=bmd_df['dsstox_sid'].map(weights)

In [None]:
bmd_df.boxplot('BMD',by='bmr_type')
plt.title('')
plt.ylabel('BMD (log molar)')
plt.show()
bmd_df['mol_weight']=bmd_df['dsstox_sid'].map(weights)
bmd_df['BMD_LM']=-np.log10(bmd_df['BMD']/1000/bmd_df['mol_weight'])
bmd_df.boxplot('BMD_LM',by='bmr_type')
plt.title('')
plt.ylabel('BMD (log molar)')
plt.show()

In [None]:
bmd_df=bmd_df.sort_values('AIC')
#bmd_df=bmd_df.reset_index()
with pd.option_context('display.max_columns',None):
    bmd_df.head()

In [None]:
bmd_agg=bmd_df.pivot_table(index=['dsstox_sid','bmr_type'],columns='endpoint_category',values='BMD_LM',aggfunc='first') #May want to average in the case of multiple models with same AIC
bmd_agg.to_csv(DAT_DIR+'bmdagg.csv')

In [None]:
bmd_agg.head()

In [None]:
# bmd_sids=list(bmd_agg.index.unique(level='dsstox_sid'))
# bmd_neighbors_l=[]
# for sid in bmd_sids:
#     sid_neighbors=searchCollByFP(sid,s0=.05,SID=bmd_sids,DB=DB,col='chemotypes',fpn='chemotypes')
#     if sid_neighbors:
#         for neighbor in sid_neighbors:
#             neighbor['target_sid']=sid
#             neighbor['neighbor_sid']=neighbor.pop('dsstox_sid')
#             neighbor['neighbor_name']=neighbor.pop('name',None)
#         bmd_neighbors_l=bmd_neighbors_l+sid_neighbors

In [12]:
# bmd_neighbors=pd.DataFrame(bmd_neighbors_l)
# bmd_neighbors=bmd_neighbors[bmd_neighbors['target_sid']!=bmd_neighbors['neighbor_sid']]
# bmd_neighbors=bmd_neighbors.merge(bmd_agg.reset_index(1),left_on='neighbor_sid',right_index=True)
# bmd_neighbors=bmd_neighbors.sort_values('jaccard',ascending=False)
# bmd_neighbors.to_csv(DAT_DIR+'bmd_neighbors.csv')
bmd_neighbors=pd.read_csv(DAT_DIR+'bmd_neighbors.csv')
bmd_neighbors.head()

Unnamed: 0.1,Unnamed: 0,jaccard,neighbor_name,neighbor_sid,target_sid,bmr_type,cholinesterase,developmental,reproductive,systemic
0,494,1.0,Dichlorprop-P,DTXSID0034851,DTXSID0020440,bmr,,3.410048,,3.357642
1,14185,1.0,3-Chloroaniline,DTXSID0024761,DTXSID2021810,sd,,,,4.439967
2,31282,1.0,2-Nitrotoluene,DTXSID4025791,DTXSID5023792,bmr,,,,2.767601
3,5830,1.0,Fluazifop-butyl,DTXSID3034612,DTXSID0034855,bmr,,2.986412,2.98905,4.519996
4,5830,1.0,Fluazifop-butyl,DTXSID3034612,DTXSID0034855,rd,,,,4.890079


In [13]:
categories=list(bmd_agg.columns.values)

NameError: name 'bmd_agg' is not defined

In [None]:
predictions={}
k=10
s=.05
for index,group in bmd_neighbors.groupby(['target_sid','bmr_type']):
    predictions[index]={category+'_p':wtavg(group,category,k,s) for category in categories}

In [None]:
bmd_predictions=pd.DataFrame(predictions.values(),index=pd.MultiIndex.from_tuples(predictions.keys(),names=['dsstox_sid','bmr_type']))
bmd_predictions=bmd_predictions.merge(bmd_agg,left_index=True,right_index=True)

In [None]:
from sklearn.metrics import r2_score
i=1
f=plt.figure(figsize=(12,16))
for bmr_type,group in bmd_predictions.reset_index(1).groupby('bmr_type'):
    for category in categories:
        df=group[[category,category+'_p']]
        df=df[df.notnull().all(axis='columns')]
        if df.empty:
            continue
        plt.subplot(4,2,i)
        i+=1
        plt.scatter(df[category],df[category+'_p'])
        plt.title(category+ ' ' + bmr_type+ ' predictions')
        plt.xlabel('True')
        plt.ylabel('Predicted')
        plt.annotate('R2='+str(round(r2_score(df[category],df[category+'_p']),2)),xy=(.03,.90),xycoords='axes fraction')
plt.subplots_adjust(wspace=.4,hspace=.6)
plt.show()

In [None]:
bmd_predictions.head()

In [None]:
bmd_gt9=bmd_neighbors[bmd_neighbors['jaccard']>.9]['target_sid'].unique() #Targets with atleast 1 neighbors >.3
len(bmd_neighbors['target_sid'].unique())
len(bmd_gt9)

In [None]:
def ranker(series):
    diff=[abs(series[category]-series[category+'_p']) for category in categories if not np.isnan(series[category])]
    return sum(diff)/len(diff)

In [None]:
#BMD best results
ranked_bmds=bmd_predictions.copy()
ranked_bmds['rank']=ranked_bmds.apply(ranker,axis='columns')
ranked_bmds=ranked_bmds.loc[bmd_gt9]
ranked_bmds=ranked_bmds.sort_values('rank')
ranked_bmds

In [None]:
#BMD good prediction
sid='DTXSID8023848'
bmr_type='rd'
dsstox.find_one({'dsstox_sid':sid},{'_id':0,'name':1})
row=bmd_predictions.loc[sid,bmr_type]
print('Predictions')
row[[category+'_p' for category in categories]]
print('Measured')
row[categories]
print('mg/kg Predictions')
[{category:convert_back(row[category+'_p'],weights[sid]) for category in categories}]
print('mg/kg Measured')
[{category:convert_back(row[category],weights[sid]) for category in categories}]
bmd_neighbors[(bmd_neighbors['target_sid']==sid) & (bmd_neighbors['bmr_type']==bmr_type) & (pd.notnull(bmd_neighbors['systemic']))].iloc[0:10]['neighbor_sid'].values

In [None]:
#BMD bad prediction
sid='DTXSID4032459'
bmr_type='sd'
dsstox.find_one({'dsstox_sid':sid},{'_id':0,'name':1})
row=bmd_predictions.loc[sid,bmr_type]
print('Predictions')
row[[category+'_p' for category in categories]]
print('Measured')
row[categories]
print('mg/kg Predictions')
[{category:convert_back(row[category+'_p'],weights[sid]) for category in categories}]
print('mg/kg Measured')
[{category:convert_back(row[category],weights[sid]) for category in categories}]
nhood=bmd_neighbors[(bmd_neighbors['target_sid']==sid) & (bmd_neighbors['bmr_type']==bmr_type) & (pd.notnull(bmd_neighbors['systemic']))].iloc[0:10]

<h1>BMD/LOAEL neighborhood comparison</h1>

In [9]:
def compare(diff):
    if abs(diff)==0: return '='
    elif diff<.0: return '<'
    elif diff>.0: return '>'

In [11]:
loael_neighbors=pd.read_csv(DAT_DIR+'loael_neighbors_mrgn.csv')
loael_sim=loael_neighbors.pivot_table(index='target_sid',values='jaccard',aggfunc=lambda x: x[0:2].mean())
bmd_sim=bmd_neighbors.pivot_table(index='target_sid',values='jaccard',aggfunc=lambda x: x[0:2].mean())
sims=loael_sim.merge(bmd_sim,left_index=True,right_index=True)
sims.columns=['loael','bmd']
sims['diff']=sims['loael']-sims['bmd']
sims['loael_comp']=sims['diff'].map(compare)
sims.head()
sims.describe()
sims['loael_comp'].value_counts()

NameError: name 'bmd_neighbors' is not defined

In [None]:
plt.scatter(sims['loael'],sims['bmd'])
plt.title('BMD vs LOAEL neighborhood similarity')
plt.ylabel('BMD')
plt.xlabel('LOAEL')
plt.show()

<h1>BMD Cluster Analysis</h1>

In [None]:
#BMD cluster analysis without restricting neighbors
i=1
r2s=[]
f=plt.figure(figsize=(12,300))
for cluster in clusters: 
    chems=cluster['chems']
    try:
        df=bmd_predictions.loc[chems]
    except:
        continue    
    df=df[['systemic','systemic_p']]
    df=df[df.notnull().all(axis='columns')]
    df=df[(df!=np.inf).all(axis=1)]
    if df.empty:
        continue
    plt.subplot(50,2,i)
    i+=1
    plt.scatter(df['systemic'],df['systemic_p'])
    ax_min=df.values.min()-.1
    ax_max=df.values.max()+.1
    plt.xlim(ax_min,ax_max)
    plt.ylim(ax_min,ax_max)
    plt.title('Cluster ' + str(cluster['cl_id']) + 'systemic BMD Predictions')
    plt.xlabel('True')
    plt.ylabel('Predicted')
    plt.annotate('R2='+str(round(r2_score(df['systemic'],df['systemic_p']),2)),xy=(.8,-.15),xycoords='axes fraction')
    plt.annotate('n='+str(len(df)),xy=(.8,-.2),xycoords='axes fraction')
    r2s.append({'cl_id':cluster['cl_id'],'R2':r2_score(df['systemic'],df['systemic_p']),'size':len(df)})
plt.subplots_adjust(wspace=.5,hspace=.4)
plt.show()

In [None]:
bmd_r2_df=pd.DataFrame(r2s)
bmd_r2_df=bmd_r2_df.sort_values('R2',ascending=False)
bmd_r2_df