In [None]:
import pandas as pd
import numpy as np
import pymongo
import sys
import os
from __future__ import print_function
from datetime import datetime
import matplotlib.pyplot as plt

# Project root: the notebook's working directory with its last three path components removed.
TOP = '/'.join(os.getcwd().split('/')[:-3])+'/'
LIB = TOP+'lib'
# Put the project's lib/ directory at the front of the import path, once only.
if LIB not in sys.path:
    sys.path.insert(0,LIB)

# Data directory and figure directory used by the rest of the notebook.
DAT_DIR, FIG_DIR = TOP + 'data/toxref/', TOP + 'figs/toxref/'

from rax.genrapred import *

In [None]:
# Credentials must not live in the notebook: the full connection URI
# (mongodb://<user>:<password>@<host>/genra_dev_v4) is read from the environment.
mongo_uri=os.environ.get('GENRA_MONGO_URI')
if not mongo_uri:
    raise RuntimeError('Set the GENRA_MONGO_URI environment variable to the MongoDB connection URI for genra_dev_v4')
mongocon=pymongo.MongoClient(mongo_uri)
DB=mongocon['genra_dev_v4']
# Collections read by the rest of the notebook.
dsstox=DB['compound']
toxref=DB['toxrefdb2']

In [None]:
def plot_worthy(pdobject):
    """Drop the entries of a pandas object that cannot be plotted.

    For a Series, null values and +inf are removed. For a DataFrame, every row
    that contains a null or a +inf in any column is removed. Any other type
    yields None. Note that -inf is not filtered.
    """
    if isinstance(pdobject,pd.Series):
        keep=pd.notnull(pdobject) & (pdobject!=np.inf)
        return pdobject[keep]
    if isinstance(pdobject,pd.DataFrame):
        no_nulls=pdobject.notnull().all(axis='columns')
        no_infs=(pdobject!=np.inf).all(axis=1)
        return pdobject[no_nulls & no_infs]
    return None

def wtavg(df,name,k,s):
    """Similarity-weighted mean of column `name` over up to k neighbours.

    Rows are kept when their 'jaccard' value is strictly greater than `s` and
    their `name` value is neither null nor +inf; the first k surviving rows (in
    the order given, so the caller is expected to pre-sort) are averaged with
    'jaccard' as the weight. Returns NaN when no row survives.
    """
    usable=(df['jaccard']>s) & (df[name]!=np.inf) & df[name].notnull()
    nearest=df[usable].iloc[0:k]
    if len(nearest)==0:
        return np.nan
    return np.average(nearest[name].tolist(),weights=nearest['jaccard'].tolist())

def exact_k_wtavg(df,name,k,s):
    """Similarity-weighted mean of column `name` over exactly k neighbours.

    Filtering is the same as in wtavg ('jaccard' > s, `name` neither null nor
    +inf, first k rows in the given order), but NaN is returned unless a full
    set of k usable neighbours is available.
    """
    usable=(df['jaccard']>s) & (df[name]!=np.inf) & df[name].notnull()
    nearest=df[usable].iloc[0:k]
    if len(nearest)<k:
        return np.nan
    return np.average(nearest[name].tolist(),weights=nearest['jaccard'].tolist())

def wtvar(df,name,k):
    """Return sum(w_i**2 * v_i) / (sum(w_i))**2 over the first k usable rows.

    w_i is the row's 'jaccard' value and v_i its `name` value. Rows whose
    `name` value is null or +inf are skipped. Unlike wtavg, no similarity
    threshold is applied. Returns NaN when no row is usable.
    """
    usable=df[name].notnull() & (df[name]!=np.inf)
    top=df[usable].iloc[0:k]
    if top.empty:
        return np.nan
    numerator=sum([w**2*v for w,v in zip(top['jaccard'],top[name])])
    return numerator/sum(list(top['jaccard']))**2

In [None]:
from __future__ import division
# Grid of neighbourhood sizes: k = 1 .. 19.
ks=range(1,20)
# Grid of similarity thresholds: s = 0.05, 0.10, ..., 0.95. Relies on true
# division (the `from __future__ import division` at the top of this cell).
ss=[round(s/20,2) for s in range(1,20)]

In [None]:
def chemical_pod_record(document):
    """Return the document's 'pods' list with each entry tagged with the document's 'dsstox_sid'.

    The entries are updated in place; the returned list is the same object as
    document['pods'].
    """
    records=document['pods']
    for record in records:
        record.update(dsstox_sid=document['dsstox_sid'])
    return records

In [None]:
def study_pod_record(document):
    """Return the document's 'studies' list with each entry tagged with the document's 'dsstox_sid'.

    The entries are updated in place; the returned list is the same object as
    document['studies'].
    """
    records=document['studies']
    for record in records:
        record.update(dsstox_sid=document['dsstox_sid'])
    return records

In [None]:
from sklearn.metrics import r2_score
def r2_plot(pred_df,title='Mean Aggregation Predictions',fname='example_fit_mean'):
    """Scatter true vs. predicted values per endpoint category and annotate R2.

    One panel is drawn for each entry of the module-level `categories` list.
    For a category `c`, the columns `c` (true) and `c + '_p'` (predicted) of
    `pred_df` are plotted after dropping rows with nulls or +inf.

    Parameters
    ----------
    pred_df : DataFrame holding a `c` and a `c + '_p'` column for every category.
    title : figure super-title.
    fname : file name (under FIG_DIR) the figure is saved to. Pass a distinct
        name per call; with the default, every call overwrites the same file.
    """
    # Two columns; at least two rows so the layout is unchanged for <= 4
    # categories, and extra rows are added instead of failing for more.
    n_rows=max(2,(len(categories)+1)//2)
    f=plt.figure(figsize=(12,12))
    f.suptitle(title)
    for i,category in enumerate(categories,1):
        plt.subplot(n_rows,2,i)
        df=pred_df[[category,category+'_p']]
        df=df[df.notnull().all(axis='columns')]
        df=df[(df!=np.inf).all(axis=1)]
        plt.scatter(df[category],df[category+'_p'])
        plt.title(category+ ' study predictions')
        plt.xlabel('True')
        plt.ylabel('Predicted')
        # R2 is undefined for fewer than two points; r2_score raises on empty input.
        if len(df)>=2:
            r2_label='R2='+str(round(r2_score(df[category],df[category+'_p']),2))
        else:
            r2_label='R2=n/a'
        plt.annotate(r2_label,xy=(.03,.9),xycoords='axes fraction')
    plt.subplots_adjust(wspace=.5,hspace=.4)
    plt.savefig(FIG_DIR+fname)
    plt.show()

In [None]:
from sklearn.linear_model import LinearRegression
from matplotlib.ticker import NullFormatter
def simresplot(accuracy_df):
    """Scatter 'systemic_accuracy' against 'av_sim' with marginal histograms and a fitted line.

    `accuracy_df` must contain the columns 'av_sim' and 'systemic_accuracy'.
    Rows whose 'systemic_accuracy' is null or +inf are dropped. The figure is
    saved to FIG_DIR + 'simvsres' and then shown.
    """
    df=accuracy_df.copy()[['av_sim','systemic_accuracy']]
    # Keep only rows whose residual survives plot_worthy (non-null, not +inf).
    # NOTE(review): 'av_sim' is not filtered here; a NaN would reach the fit below - confirm.
    df=df.loc[plot_worthy(df['systemic_accuracy']).index.values]
    x=df['av_sim']
    y=df['systemic_accuracy']

    # Layout rectangles [left, bottom, width, height] in figure coordinates:
    # main scatter, histogram above it, histogram to its right.
    nullfmt=NullFormatter()
    left,width=.1,.65
    bottom, height = .1,.65
    bottom_h = bottom + height +.02
    left_h = left + width + .02
    rect_scatter = [left,bottom,width,height]
    rect_histx = [left,bottom_h,width,.2]
    rect_histy = [left_h,bottom,.2,height]
    # Always draws into figure number 1.
    plt.figure(1, figsize=(8,8))

    axScatter=plt.axes(rect_scatter)
    axHistx = plt.axes(rect_histx)
    axHisty = plt.axes(rect_histy)
    # Hide the tick labels on the axes the histograms share with the scatter.
    axHistx.xaxis.set_major_formatter(nullfmt)
    axHisty.yaxis.set_major_formatter(nullfmt)

    axScatter.scatter(x,y,label="")
    # Design matrix with columns x**0 and x**1: despite the name `order3`,
    # this is a straight-line (degree-1) fit.
    X=np.array([x**i for i in range(0,2)]).T
    order3=LinearRegression()
    order3.fit(X,y)
    # Evaluate the fitted line on 100 evenly spaced points in [0, 1].
    x_space=np.linspace(0,1,100)
    x_dummy=np.array([x_space**i for i in range(0,2)]).T
    axScatter.plot(x_space,order3.predict(x_dummy),color='orange',linestyle='--',linewidth=3, label='fit')
    axScatter.legend(loc='upper left')

    # Marginal distributions, aligned to the scatter's axis limits.
    axHistx.hist(x)
    axHisty.hist(y,orientation='horizontal')
    axHistx.set_xlim(axScatter.get_xlim())
    axHisty.set_ylim(axScatter.get_ylim())

    axHistx.set_title('Systemic residual vs similarity')
    axScatter.set_xlabel('Average similarity across neighborhood')
    axScatter.set_ylabel('Systemic residual')
    plt.savefig(FIG_DIR+'simvsres',bbox_inches='tight')
    plt.show()

In [None]:
# One row per chemical-level POD, flattened from every toxrefdb2 document.
chemical_pods_df=pd.DataFrame([pod for document in toxref.find() for pod in chemical_pod_record(document)])
# A bare expression in the middle of a cell is discarded, so print the count explicitly.
print(str(len(chemical_pods_df)) + ' total chemical level PODs')
chemical_pods_df.head()

In [None]:
# One row per study-level POD, flattened from every toxrefdb2 document.
study_pods_df=pd.DataFrame([study for document in toxref.find() for study in study_pod_record(document)])
# A bare expression in the middle of a cell is discarded, so print the count explicitly.
print(str(len(study_pods_df)) + ' total study level PODs')
study_pods_df.head()

In [None]:
study_pods_df['study_type'].unique()

In [None]:
# Restrict both POD tables to LOAEL records.
chemical_loaels=chemical_pods_df[chemical_pods_df['pod_type']=='loael']
study_loaels=study_pods_df[study_pods_df['pod_type']=='loael']
# Only the last expression of a cell is displayed, so print both counts explicitly.
print(str(len(chemical_loaels)) + ' chemical level LOAELs')
print(str(len(study_loaels)) + ' study level LOAELs')

In [None]:
categories=list(chemical_pods_df['endpoint_category'].unique())
categories.sort()

In [None]:
chemical_loaels.head()

<h1>Chemical level POD munging</h1>

In [None]:
chemical_loaels['pod_unit'].value_counts()

In [None]:
#No obvious way to convert units because we do not have species info
# .copy() gives an independent frame, so the column assignments in later cells
# do not write to a slice of chemical_pods_df (SettingWithCopyWarning).
chemical_loaels=chemical_loaels.loc[chemical_loaels.pod_unit=='mg/kg/day'].copy()

In [None]:
# value_counts() is not the last expression of the cell, so print it to make it visible.
print(chemical_loaels['qualifier'].value_counts())
print('Why = in quotations??')
# Normalise the quoted "'='" qualifier to a plain '='.
chemical_loaels.loc[chemical_loaels.qualifier=="'='",'qualifier']='='
print('Fixed')

In [None]:
# Convert PODs to a negative log10 molar scale.
chemical_sids=list(chemical_loaels['dsstox_sid'].unique())
# Look up the molecular weight of every chemical in the 'compound' collection.
weights={record['dsstox_sid']:record['mol_weight'] for record in dsstox.find({'dsstox_sid':{'$in':chemical_sids}})}
# Chemicals with no record in `weights` get NaN here and therefore NaN in pod_value_LM.
chemical_loaels['mol_weight']=chemical_loaels['dsstox_sid'].map(weights)
# -log10(pod_value / mol_weight / 1000); a pod_value of 0 yields +inf.
# assumes pod_value is in mg/kg/day and mol_weight in g/mol - TODO confirm
chemical_loaels['pod_value_LM']=-np.log10(chemical_loaels['pod_value']/chemical_loaels['mol_weight']/1000)

In [None]:
chemical_loaels.to_csv(DAT_DIR+'chemical_loaels.csv',encoding='utf-8')

In [None]:
len(chemical_loaels)

<h1>Study level POD munging</h1>

In [None]:
study_loaels['admin_route'].value_counts()

In [None]:
# .copy() gives an independent frame, so the in-place unit conversions and column
# assignments in later cells do not write to a slice (SettingWithCopyWarning).
study_loaels=study_loaels[study_loaels['admin_route']=='Oral'].copy()
print('Only looking at oral studies')

In [None]:
study_loaels['pod_unit'].value_counts()

In [None]:
study_loaels.loc[study_loaels.pod_unit=='ppm']['species'].value_counts()

In [None]:
study_loaels.loc[study_loaels.pod_unit=='%']['species'].value_counts()

In [None]:
# Unit conversions: rescale pod_value in place so that it is expressed in mg/kg/day.
from __future__ import division
# ppm -> mg/kg/day using a species-specific multiplier.
# NOTE(review): presumably dietary-intake factors; the source of these constants is not shown - confirm.
study_loaels.loc[(study_loaels['pod_unit']=='ppm') & (study_loaels['species']=='rat'),'pod_value']*=.05
study_loaels.loc[(study_loaels['pod_unit']=='ppm') & (study_loaels['species']=='mouse'),'pod_value']*=.15
study_loaels.loc[(study_loaels['pod_unit'] =='ppm') & (study_loaels['species']=='dog'),'pod_value']*=.075
study_loaels.loc[(study_loaels['pod_unit']=='ppm') & (study_loaels['species']=='rabbit'),'pod_value']*=.03
# % -> ppm (x10000), then the same .15 multiplier as mouse ppm, applied regardless of species.
# NOTE(review): correct only if every '%' record is a mouse study - confirm against the species counts above.
study_loaels.loc[(study_loaels['pod_unit']=='%'),'pod_value']*=10000*.15
# Weekly dose -> daily dose.
study_loaels.loc[(study_loaels['pod_unit']=='mg/kg/wk'),'pod_value']*=(1/7)
# Per-animal dose -> per-kg dose; dividing by .4 presumably assumes a 0.4 kg rat - confirm.
study_loaels.loc[(study_loaels['pod_unit']=='mg/rat/day'),'pod_value']*=(1/.4)
# Relabels EVERY row, including any whose unit or species was not converted above.
# It also makes the cell safe to re-run: no row matches the filters a second time.
study_loaels['pod_unit']='mg/kg/day'

In [None]:
# value_counts() is not the last expression of the cell, so print it to make it visible.
print(study_loaels['qualifier'].value_counts())
print('Why = in quotations??')
# Normalise the quoted "'='" qualifier to a plain '='.
study_loaels.loc[study_loaels.qualifier=="'='",'qualifier']='='
print('Fixed')

In [None]:
# Convert study-level PODs to a negative log10 molar scale.
study_sids=list(study_loaels['dsstox_sid'].unique())
# Look up the molecular weight of every chemical in the 'compound' collection.
weights={record['dsstox_sid']:record['mol_weight'] for record in dsstox.find({'dsstox_sid':{'$in':study_sids}})}
# Chemicals with no record in `weights` get NaN here and therefore NaN in pod_value_LM.
study_loaels['mol_weight']=study_loaels['dsstox_sid'].map(weights)
# -log10(pod_value / mol_weight / 1000); a pod_value of 0 yields +inf.
study_loaels['pod_value_LM']=-np.log10(study_loaels['pod_value']/study_loaels['mol_weight']/1000)

In [None]:
study_loaels.to_csv(DAT_DIR+'study_loaels.csv',encoding='utf-8')

In [None]:
study_loaels.head()

In [None]:
len(chemical_sids)

<h1>EDA</h1>

In [None]:
plt.hist([plot_worthy(chemical_loaels['pod_value']),plot_worthy(study_loaels['pod_value'])],bins=20,histtype='step',label=['chemical','study'])
plt.legend(prop={'size':12})
plt.title('POD value histogram')

In [None]:
plt.hist([plot_worthy(chemical_loaels['pod_value_LM']),plot_worthy(study_loaels['pod_value_LM'])],bins=20,histtype='step',label=['chemical','study'])
plt.legend(prop={'size':12})
plt.title('Log molar POD value histogram')

In [None]:
plot_data=[plot_worthy(chemical_loaels.loc[chemical_loaels.qualifier=='=','pod_value_LM']),plot_worthy(chemical_loaels.loc[chemical_loaels.qualifier=='>','pod_value_LM'])]
plt.hist(plot_data,bins=20,histtype='step',label=['=','>'])
plt.legend(prop={'size':12})
plt.title('Chemical LOAELs by qualifier')

In [None]:
# Log-molar study-level LOAELs split by qualifier ('=' vs '>').
plot_data=[plot_worthy(study_loaels.loc[study_loaels.qualifier=='=','pod_value_LM']),plot_worthy(study_loaels.loc[study_loaels.qualifier=='>','pod_value_LM'])]
plt.hist(plot_data,histtype='step',bins=20,label=['=','>'])
plt.legend(prop={'size':12})
# This cell plots study_loaels; the title previously said "Chemical" (copied from the cell above).
plt.title('Study LOAELs by qualifier')

In [None]:
study_loaels.head()

<h1>Chemical level POD fit</h1>

In [None]:
# For every chemical, collect the neighbours returned by searchCollByFP (called with
# s0=.05 and restricted to chemical_sids), tagging each with the chemical it was found for.
chemical_neighbors_l=[]
for sid in chemical_sids:
    sid_neighbors=searchCollByFP(sid,s0=.05,SID=chemical_sids,DB=DB)
    if not sid_neighbors:
        continue
    for neighbor in sid_neighbors:
        # 'dsstox_sid' of the hit becomes 'neighbor_sid'; the query chemical is 'target_sid'.
        neighbor.update(target_sid=sid,neighbor_sid=neighbor.pop('dsstox_sid'))
    chemical_neighbors_l.extend(sid_neighbors)

In [None]:
chemical_agg_mean=chemical_loaels.pivot_table(index='dsstox_sid',columns='endpoint_category',values='pod_value_LM',aggfunc='mean')

In [None]:
# One row per (target, neighbour) pair.
chemical_neighbors_mean=pd.DataFrame(chemical_neighbors_l)
# A chemical must not act as its own neighbour.
chemical_neighbors_mean=chemical_neighbors_mean[chemical_neighbors_mean['target_sid']!=chemical_neighbors_mean['neighbor_sid']]
# Attach each neighbour's mean log-molar POD per endpoint category. merge defaults to an
# inner join, so neighbours absent from chemical_agg_mean are dropped.
chemical_neighbors_mean=chemical_neighbors_mean.merge(chemical_agg_mean,left_on='neighbor_sid',right_index=True)
# Most similar first: wtavg takes the first k rows of each group, so this order matters.
chemical_neighbors_mean=chemical_neighbors_mean.sort_values('jaccard',ascending=False)

In [None]:
chemical_neighbors_mean.to_csv(DAT_DIR+'chemical_neighbors.csv')

In [None]:
# Predict every endpoint category for every target chemical as the similarity-weighted
# mean over its neighbours (at most k of them, with jaccard > s).
chemical_predictions_mean_dict={}
k=10
s=.05
for sid,group in chemical_neighbors_mean.groupby('target_sid'):
    # Predictions are stored under '<category>_p' to sit next to the true '<category>' column later.
    chemical_predictions_mean_dict[sid]={category+'_p':wtavg(group,category,k,s) for category in categories}

In [None]:
chemical_predictions_mean=pd.DataFrame(chemical_predictions_mean_dict.values(),index=chemical_predictions_mean_dict.keys())
chemical_predictions_mean=chemical_predictions_mean.merge(chemical_agg_mean,right_index=True,left_index=True)
chemical_predictions_mean.to_csv(DAT_DIR+'chemical_predictions.csv')

In [None]:
r2_plot(chemical_predictions_mean)

In [None]:
# NOTE(review): k is assigned here but never used in this cell; the slice below is
# hard-coded to the first 2 rows. Confirm whether iloc[0:k] was intended.
k=10
av_sims={}
for sid,group in chemical_neighbors_mean.groupby('target_sid'):
    # Mean jaccard of the first two rows of the group; these are the two most similar
    # neighbours because chemical_neighbors_mean was sorted by jaccard, descending.
    av_sim=group.iloc[0:2]['jaccard'].mean()
    av_sims[sid]=av_sim
chemical_accuracy=chemical_predictions_mean.copy()
# Absolute residual between the true and predicted 'systemic' values.
chemical_accuracy['systemic_accuracy']=abs(chemical_accuracy['systemic']-chemical_accuracy['systemic_p'])
chemical_accuracy['av_sim']=chemical_accuracy.index.map(av_sims)

In [None]:
simresplot(chemical_accuracy)

<h1>Study level POD fit</h1>

In [None]:
# For every chemical with study-level data, collect the neighbours returned by searchCollByFP
# (called with s0=.05 and restricted to study_sids), tagging each with its query chemical.
study_neighbors_l=[]
for sid in study_sids:
    sid_neighbors=searchCollByFP(sid,s0=.05,SID=study_sids,DB=DB)
    if not sid_neighbors:
        continue
    for neighbor in sid_neighbors:
        # 'dsstox_sid' of the hit becomes 'neighbor_sid'; the query chemical is 'target_sid'.
        neighbor.update(target_sid=sid,neighbor_sid=neighbor.pop('dsstox_sid'))
    study_neighbors_l.extend(sid_neighbors)

In [None]:
study_agg_mean=study_loaels.pivot_table(index='dsstox_sid',columns='endpoint_category',values='pod_value_LM',aggfunc='mean')

In [None]:
study_neighbors_mean=pd.DataFrame(study_neighbors_l)
study_neighbors_mean=study_neighbors_mean[study_neighbors_mean['target_sid']!=study_neighbors_mean['neighbor_sid']]
study_neighbors_mean=study_neighbors_mean.merge(study_agg_mean,left_on='neighbor_sid',right_index=True)
study_neighbors_mean=study_neighbors_mean.sort_values('jaccard',ascending=False)

In [None]:
study_neighbors_mean.to_csv(DAT_DIR+'study_neighbors.csv')

In [None]:
study_predictions_mean_dict={}
k=10
s=.05
for sid,group in study_neighbors_mean.groupby('target_sid'):
    study_predictions_mean_dict[sid]={category+'_p':wtavg(group,category,k,s) for category in categories}

In [None]:
study_predictions_mean=pd.DataFrame(study_predictions_mean_dict.values(),index=study_predictions_mean_dict.keys())
study_predictions_mean=study_predictions_mean.merge(study_agg_mean,right_index=True,left_index=True)
study_predictions_mean.to_csv(DAT_DIR+'study_predictions.csv')

In [None]:
r2_plot(study_predictions_mean)

<h1>Only = qualifier</h1>

<h2>Chemical</h2>

In [None]:
chemical_loaels_equal=chemical_loaels.loc[chemical_loaels.qualifier=='=']
chemical_loaels_equal['dsstox_sid'].nunique()

In [None]:
chemical_sids_equal=list(chemical_loaels_equal['dsstox_sid'].unique())
chemical_neighbors_equal_l=[]
for sid in chemical_sids_equal:
    sid_neighbors=searchCollByFP(sid,s0=.05,SID=chemical_sids_equal,DB=DB)
    if sid_neighbors:
        for neighbor in sid_neighbors:
            neighbor['target_sid']=sid
            neighbor['neighbor_sid']=neighbor.pop('dsstox_sid')
        chemical_neighbors_equal_l=chemical_neighbors_equal_l+sid_neighbors

In [None]:
chemical_agg_mean_equal=chemical_loaels_equal.pivot_table(index='dsstox_sid',columns='endpoint_category',values='pod_value_LM',aggfunc='mean')

In [None]:
chemical_neighbors_mean_equal=pd.DataFrame(chemical_neighbors_equal_l)
chemical_neighbors_mean_equal=chemical_neighbors_mean_equal[chemical_neighbors_mean_equal['target_sid']!=chemical_neighbors_mean_equal['neighbor_sid']]
chemical_neighbors_mean_equal=chemical_neighbors_mean_equal.merge(chemical_agg_mean_equal,left_on='neighbor_sid',right_index=True)
chemical_neighbors_mean_equal=chemical_neighbors_mean_equal.sort_values('jaccard',ascending=False)

In [None]:
chemical_neighbors_mean_equal.to_csv(DAT_DIR+'chemical_neighbors_equal.csv')

In [None]:
chemical_predictions_mean_equal_dict={}
k=10
s=.05
for sid,group in chemical_neighbors_mean_equal.groupby('target_sid'):
    chemical_predictions_mean_equal_dict[sid]={category+'_p':wtavg(group,category,k,s) for category in categories}

In [None]:
chemical_predictions_mean_equal=pd.DataFrame(chemical_predictions_mean_equal_dict.values(),index=chemical_predictions_mean_equal_dict.keys())
chemical_predictions_mean_equal=chemical_predictions_mean_equal.merge(chemical_agg_mean_equal,right_index=True,left_index=True)
chemical_predictions_mean_equal.to_csv(DAT_DIR+'chemical_predictions_equal.csv')

In [None]:
r2_plot(chemical_predictions_mean_equal)

<h2>Study</h2>

In [None]:
study_loaels_equal=study_loaels.loc[study_loaels.qualifier=='=']
study_loaels_equal['dsstox_sid'].nunique()

In [None]:
study_sids_equal=list(study_loaels_equal['dsstox_sid'].unique())
study_neighbors_equal_l=[]
for sid in study_sids_equal:
    sid_neighbors=searchCollByFP(sid,s0=.05,SID=study_sids_equal,DB=DB)
    if sid_neighbors:
        for neighbor in sid_neighbors:
            neighbor['target_sid']=sid
            neighbor['neighbor_sid']=neighbor.pop('dsstox_sid')
        study_neighbors_equal_l=study_neighbors_equal_l+sid_neighbors

In [None]:
study_agg_mean_equal=study_loaels_equal.pivot_table(index='dsstox_sid',columns='endpoint_category',values='pod_value_LM',aggfunc='mean')

In [None]:
study_neighbors_mean_equal=pd.DataFrame(study_neighbors_equal_l)
study_neighbors_mean_equal=study_neighbors_mean_equal[study_neighbors_mean_equal['target_sid']!=study_neighbors_mean_equal['neighbor_sid']]
study_neighbors_mean_equal=study_neighbors_mean_equal.merge(study_agg_mean_equal,left_on='neighbor_sid',right_index=True)
study_neighbors_mean_equal=study_neighbors_mean_equal.sort_values('jaccard',ascending=False)

In [None]:
study_neighbors_mean_equal.to_csv(DAT_DIR+'study_neighbors_equal.csv')

In [None]:
study_predictions_mean_equal_dict={}
k=10
s=.05
for sid,group in study_neighbors_mean_equal.groupby('target_sid'):
    study_predictions_mean_equal_dict[sid]={category+'_p':wtavg(group,category,k,s) for category in categories}

In [None]:
study_predictions_mean_equal=pd.DataFrame(study_predictions_mean_equal_dict.values(),index=study_predictions_mean_equal_dict.keys())
study_predictions_mean_equal=study_predictions_mean_equal.merge(study_agg_mean_equal,right_index=True,left_index=True)
study_predictions_mean_equal.to_csv(DAT_DIR+'study_predictions_equal.csv')

In [None]:
r2_plot(study_predictions_mean_equal)

<h1>Additional Covariates</h1>

<h1>Create stacked df (should have done this all along)</h1>

In [None]:
import pickle as pkl
# Pickle data is binary, so the file must be opened in 'rb' mode (text mode corrupts
# it on Windows and fails outright on Python 3).
# NOTE(review): pickle.load executes arbitrary code; only load files produced by this project.
with open(DAT_DIR+'../../clusters.pkl','rb') as f:
    clusters=pkl.load(f)
# cluster id -> list of member chemicals
cluster_dict={cluster['cl_id']:cluster['chems'] for cluster in clusters}
# chemical -> cluster id (a chemical listed in several clusters keeps the last one iterated)
reverse_cluster_dict={dsstox_sid:clid for clid,list_of_sids in cluster_dict.items() for dsstox_sid in list_of_sids}

In [None]:
pd.DataFrame([{'dsstox_sid':sid,'clid':clid} for sid,clid in reverse_cluster_dict.iteritems()]).to_csv(DAT_DIR+'cluster_membership.csv')

In [None]:
cluster_membership=pd.read_csv(DAT_DIR+'cluster_membership.csv')
cluster_membership.head()

In [None]:
ks_gridsearch_mrgn=pd.read_csv(DAT_DIR+'../toxref_ks_gridsearch_mrgn.csv',index_col=0)
ks_gridsearch_mrgn.head()

In [None]:
exact_ks_gridsearch_mrgn=pd.read_csv(DAT_DIR+'../toxref_exact_ks_gridsearch_mrgn.csv',index_col=0)
exact_ks_gridsearch_mrgn.head()

In [None]:
sids=set(ks_gridsearch_mrgn['dsstox_sid'].unique())
toxref_cluster_dict={clid:set(chems)&sids for clid,chems in cluster_dict.iteritems()}
clusters_gt15={clid:chems for clid,chems in toxref_cluster_dict.iteritems() if len(chems)>15}

In [None]:
from sklearn.metrics import r2_score
# Reshape the wide grid-search table (one true/predicted column pair per category) into a
# long table with 'true', 'predicted' and 'endpoint_category' columns.
category_frames=[]
for category in categories:
    cat_df=ks_gridsearch_mrgn[[category,category+'_p','k','s','dsstox_sid']]
    cat_df=cat_df.rename(columns={category:'true',category+'_p':'predicted'})
    cat_df['endpoint_category']=category
    category_frames.append(cat_df)
# Concatenate once instead of growing a DataFrame with append() inside the loop
# (quadratic copying; DataFrame.append is removed in pandas 2.0).
stacked_df=pd.concat(category_frames) if category_frames else pd.DataFrame()
# Drop rows with nulls or +inf; copy so the column added below is not written to a slice.
stacked_df=plot_worthy(stacked_df).copy()
stacked_df['cluster']=stacked_df['dsstox_sid'].map(reverse_cluster_dict)

In [None]:
stacked_df.head()

In [None]:
from sklearn.metrics import r2_score
# Reshape the wide exact-k grid-search table into a long table with 'true', 'predicted'
# and 'endpoint_category' columns.
exactk_category_frames=[]
for category in categories:
    cat_df=exact_ks_gridsearch_mrgn[[category,category+'_p','k','s','dsstox_sid']]
    cat_df=cat_df.rename(columns={category:'true',category+'_p':'predicted'})
    cat_df['endpoint_category']=category
    exactk_category_frames.append(cat_df)
# Concatenate once instead of growing a DataFrame with append() inside the loop
# (quadratic copying; DataFrame.append is removed in pandas 2.0).
exactk_stacked_df=pd.concat(exactk_category_frames) if exactk_category_frames else pd.DataFrame()
# Drop rows with nulls or +inf; copy so the column added below is not written to a slice.
exactk_stacked_df=plot_worthy(exactk_stacked_df).copy()
exactk_stacked_df['cluster']=exactk_stacked_df['dsstox_sid'].map(reverse_cluster_dict)

In [None]:
exactk_stacked_df.head()

<h1>Cluster grid search</h1>

<h2>Up to k</h2>

In [None]:
# For each cluster with more than 15 chemicals, build a len(ks) x len(ss) grid of R2 values,
# one per (k, s) combination. Combinations with no data stay NaN.
ks_clusters={}
for clid,chems in clusters_gt15.iteritems():
    ks_clusters[clid]=np.full([len(ks),len(ss)],np.nan)
    cluster_df=stacked_df.loc[stacked_df.dsstox_sid.isin(chems)]
    # Grouping is by (k, s) only, so each R2 pools all endpoint categories of the cluster.
    for (k,s),group in cluster_df.groupby(['k','s']):
        k_index=ks.index(k)
        # s is rounded the same way as the entries of ss so the lookup matches exactly.
        s_index=ss.index(round(s,2))
        ks_clusters[clid][k_index,s_index]=r2_score(group['true'],group['predicted'])

In [None]:
from mpl_toolkits.mplot3d import axes3d
X,Y=np.meshgrid(ss,ks)
for clid,cluster_grid_r2 in ks_clusters.iteritems():
    fig,ax=plt.subplots(figsize=(6,4),subplot_kw={'projection':'3d'})
    ax.plot_surface(X,Y,cluster_grid_r2,cmap=plt.cm.coolwarm)
    ax.set_ylabel('Maximum number of neighbors (k)',fontsize=16)
    ax.set_xlabel('Similarity threshold (s)',fontsize=16)
    ax.set_zlabel('R2')
    ax.set_title('Cluster '+ clid )
    ax.text2D(.75,.95,'n='+str(len(clusters_gt15[clid])),transform=ax.transAxes,fontsize=12)

In [None]:
cluster_cat_r2s=[]
for (k,s,endpoint_category,clid),group in stacked_df.groupby(['k','s','endpoint_category','cluster']):
    if len(group)<2:
        continue
    cluster_cat_r2s.append({'k':k,'s':s,'endpoint_category':endpoint_category,'clid':clid,'n':len(group),\
                           'r2':r2_score(group['true'],group['predicted'])})

In [None]:
cluster_cat_r2_df=pd.DataFrame(cluster_cat_r2s)
cluster_cat_r2_df.head()
cluster_cat_r2_df.to_csv(DAT_DIR+'cluster_ks_gridsearch_wo_restriction.csv')

In [None]:
cluster_cat_r2_df[(cluster_cat_r2_df.k==10) & (cluster_cat_r2_df.s==.5)].pivot_table(index='endpoint_category',columns='clid',values='r2')

In [None]:
optimal_ks=[]
for (clid,endpoint_category),group in cluster_cat_r2_df.groupby(['clid','endpoint_category']):
    max_row=group.loc[group['r2'].idxmax()]
    optimal_ks.append({'clid':clid,'endpoint_category':endpoint_category,'k':max_row['k'],'s':max_row['s'],'n':max_row['n'],'r2':max_row['r2']})

In [None]:
pd.DataFrame(optimal_ks).head()
pd.DataFrame(optimal_ks).to_csv(DAT_DIR+'cluster_optimal_ks.csv')

<h2>Exact k</h2>

In [None]:
exactk_cluster_cat_r2s=[]
for (k,s,endpoint_category,clid),group in exactk_stacked_df.groupby(['k','s','endpoint_category','cluster']):
    if len(group)<2:
        continue
    exactk_cluster_cat_r2s.append({'k':k,'s':s,'endpoint_category':endpoint_category,'clid':clid,'n':len(group),\
                           'r2':r2_score(group['true'],group['predicted'])})

In [None]:
exactk_cluster_cat_r2_df=pd.DataFrame(exactk_cluster_cat_r2s)
exactk_cluster_cat_r2_df.head()
exactk_cluster_cat_r2_df.to_csv(DAT_DIR+'exactk_cluster_ks_gridsearch_wo_restriction.csv')

In [None]:
exactk_optimal_ks=[]
for (clid,endpoint_category),group in exactk_cluster_cat_r2_df.groupby(['clid','endpoint_category']):
    max_row=group.loc[group['r2'].idxmax()]
    exactk_optimal_ks.append({'clid':clid,'endpoint_category':endpoint_category,'k':max_row['k'],'s':max_row['s'],'n':max_row['n'],'r2':max_row['r2']})

In [None]:
pd.DataFrame(exactk_optimal_ks).head()
pd.DataFrame(exactk_optimal_ks).to_csv(DAT_DIR+'exactk_cluster_optimal_ks.csv')

In [None]:
# Show every optimum backed by more than 5 chemicals, without row truncation.
with pd.option_context('display.max_rows',None):
    opt_df=pd.DataFrame(exactk_optimal_ks)
    # An expression nested inside a `with` block is never auto-displayed by the
    # notebook, so print it while the display option is still in effect.
    print(opt_df[opt_df['n']>5])

In [None]:
nandata=pd.DataFrame(\
                     [{'clid':clid,'k':k,'s':s,'endpoint_category':endpoint_category,'r2':np.nan}\
                      for clid in exactk_cluster_cat_r2_df['clid'].unique()\
                      for k in ks\
                      for s in ss\
                      for endpoint_category in categories])

In [None]:
exactk_cluster_cat_r2_df_expanded=exactk_cluster_cat_r2_df.merge(nandata,how='right',on=['k','s','endpoint_category','clid'],suffixes=('','_drop'))
del exactk_cluster_cat_r2_df_expanded['r2_drop']

In [None]:
import seaborn as sns
# One k-by-s heatmap of R2 per cluster, saved under FIG_DIR + 'cluster_heatmaps/'.
# NOTE(review): that directory is not created anywhere in this notebook - confirm it exists.
for clid,group in exactk_cluster_cat_r2_df_expanded.groupby('clid'):
    # pivot_table's default aggregation is the mean, so each cell averages R2 over
    # endpoint categories; dropna=False keeps (k, s) combinations that have no data.
    cluster_r2s=group.pivot_table(index='k',columns='s',values='r2',dropna=False)
    # The colour scale is clipped to [-.5, .5].
    ax=sns.heatmap(cluster_r2s,cmap=plt.cm.coolwarm,vmin=-.5,vmax=.5,linewidth=1)
    # Put small k at the bottom of the plot.
    ax.invert_yaxis()
    plt.title('Cluster ' + clid)
    plt.xlabel('s')
    plt.ylabel('k')
    plt.savefig(FIG_DIR+'cluster_heatmaps/cluster'+clid+'.png')
    plt.show()

<h1>Deduped Study Comparison</h1>

In [None]:
def deduped_study_pod_record(document):
    """Return the document's 'deduped_studies' list with each entry tagged with the document's 'dsstox_sid'.

    The entries are updated in place; the returned list is the same object as
    document['deduped_studies'].
    """
    records=document['deduped_studies']
    for record in records:
        record.update(dsstox_sid=document['dsstox_sid'])
    return records

In [None]:
# One row per deduplicated study-level POD, flattened from every toxrefdb2 document.
deduped_study_pods_df=pd.DataFrame([study for document in toxref.find() for study in deduped_study_pod_record(document)])
# A bare expression in the middle of a cell is discarded, so print the count explicitly.
print(str(len(deduped_study_pods_df)) + ' total study level PODs')
deduped_study_pods_df.head()

In [None]:
deduped_study_loaels=deduped_study_pods_df[deduped_study_pods_df['pod_type']=='loael']
str(len(deduped_study_loaels)) + ' deduped study level LOAELs'

In [None]:
deduped_study_loaels['admin_route'].value_counts()

In [None]:
# .copy() gives an independent frame, so the in-place unit conversions and column
# assignments in later cells do not write to a slice (SettingWithCopyWarning).
deduped_study_loaels=deduped_study_loaels[deduped_study_loaels['admin_route']=='Oral'].copy()
print('Only looking at oral studies')

In [None]:
deduped_study_loaels['pod_unit'].value_counts()

In [None]:
deduped_study_loaels.loc[deduped_study_loaels.pod_unit=='ppm']['species'].value_counts()

In [None]:
deduped_study_loaels.loc[deduped_study_loaels.pod_unit=='%']['species'].value_counts()

In [None]:
# Unit conversions: rescale pod_value in place so that it is expressed in mg/kg/day.
from __future__ import division
# ppm -> mg/kg/day using a species-specific multiplier.
# NOTE(review): the source of these constants is not shown - confirm.
deduped_study_loaels.loc[(deduped_study_loaels['pod_unit']=='ppm') & (deduped_study_loaels['species']=='rat'),'pod_value']*=.05
deduped_study_loaels.loc[(deduped_study_loaels['pod_unit']=='ppm') & (deduped_study_loaels['species']=='mouse'),'pod_value']*=.15
deduped_study_loaels.loc[(deduped_study_loaels['pod_unit'] =='ppm') & (deduped_study_loaels['species']=='dog'),'pod_value']*=.075
deduped_study_loaels.loc[(deduped_study_loaels['pod_unit']=='ppm') & (deduped_study_loaels['species']=='rabbit'),'pod_value']*=.03
# % -> ppm (x10000), then the same .15 multiplier as mouse ppm, applied regardless of species.
# NOTE(review): correct only if every '%' record is a mouse study - confirm.
deduped_study_loaels.loc[(deduped_study_loaels['pod_unit']=='%'),'pod_value']*=10000*.15
# Weekly dose -> daily dose.
deduped_study_loaels.loc[(deduped_study_loaels['pod_unit']=='mg/kg/wk'),'pod_value']*=(1/7)
# Per-animal dose -> per-kg dose; dividing by .4 presumably assumes a 0.4 kg rat - confirm.
deduped_study_loaels.loc[(deduped_study_loaels['pod_unit']=='mg/rat/day'),'pod_value']*=(1/.4)
# Relabels EVERY row, including any whose unit or species was not converted above.
deduped_study_loaels['pod_unit']='mg/kg/day'

In [None]:
# value_counts() is not the last expression of the cell, so print it to make it visible.
print(deduped_study_loaels['qualifier'].value_counts())
print('Why = in quotations??')
# Normalise the quoted "'='" qualifier to a plain '='.
deduped_study_loaels.loc[deduped_study_loaels.qualifier=="'='",'qualifier']='='
print('Fixed')

In [None]:
#Need to convert to log molar
deduped_study_sids=list(deduped_study_loaels['dsstox_sid'].unique())
weights={record['dsstox_sid']:record['mol_weight'] for record in dsstox.find({'dsstox_sid':{'$in':deduped_study_sids}})}
deduped_study_loaels['mol_weight']=deduped_study_loaels['dsstox_sid'].map(weights)
deduped_study_loaels['pod_value_LM']=-np.log10(deduped_study_loaels['pod_value']/deduped_study_loaels['mol_weight']/1000)

In [None]:
deduped_study_loaels.to_csv(DAT_DIR+'deduped_study_loaels.csv')

In [None]:
plt.hist([plot_worthy(chemical_loaels['pod_value']),plot_worthy(study_loaels['pod_value']),plot_worthy(deduped_study_loaels['pod_value'])],bins=20,histtype='step',label=['chemical','study','deduped'])
plt.legend(prop={'size':12})
plt.title('POD value histogram')

In [None]:
plt.hist([plot_worthy(chemical_loaels['pod_value_LM']),plot_worthy(study_loaels['pod_value_LM']),plot_worthy(deduped_study_loaels['pod_value_LM'])],bins=20,histtype='step',label=['chemical','study','deduped'])
plt.legend(prop={'size':12})
plt.title('Log molar POD value histogram')

In [None]:
# Neighbour search for the deduplicated study set. Targets and the candidate pool are
# deduped_study_sids (previously study_sids, copied from the non-deduped section), which
# matches how every other section searches within its own chemical list.
deduped_study_neighbors_l=[]
for sid in deduped_study_sids:
    sid_neighbors=searchCollByFP(sid,s0=.05,SID=deduped_study_sids,DB=DB)
    if sid_neighbors:
        for neighbor in sid_neighbors:
            # 'dsstox_sid' of the hit becomes 'neighbor_sid'; the query chemical is 'target_sid'.
            neighbor['target_sid']=sid
            neighbor['neighbor_sid']=neighbor.pop('dsstox_sid')
        # extend() avoids rebuilding the whole list on every iteration.
        deduped_study_neighbors_l.extend(sid_neighbors)

In [None]:
deduped_study_agg_mean=deduped_study_loaels.pivot_table(index='dsstox_sid',columns='endpoint_category',values='pod_value_LM',aggfunc='mean')

In [None]:
deduped_study_neighbors_mean=pd.DataFrame(deduped_study_neighbors_l)
deduped_study_neighbors_mean=deduped_study_neighbors_mean[deduped_study_neighbors_mean['target_sid']!=deduped_study_neighbors_mean['neighbor_sid']]
deduped_study_neighbors_mean=deduped_study_neighbors_mean.merge(deduped_study_agg_mean,left_on='neighbor_sid',right_index=True)
deduped_study_neighbors_mean=deduped_study_neighbors_mean.sort_values('jaccard',ascending=False)

In [None]:
deduped_study_neighbors_mean.to_csv(DAT_DIR+'deduped_study_neighbors.csv')

In [None]:
deduped_study_predictions_mean_dict={}
k=10
s=.05
for sid,group in deduped_study_neighbors_mean.groupby('target_sid'):
    deduped_study_predictions_mean_dict[sid]={category+'_p':wtavg(group,category,k,s) for category in categories}

In [None]:
deduped_study_predictions_mean=pd.DataFrame(deduped_study_predictions_mean_dict.values(),index=deduped_study_predictions_mean_dict.keys())
deduped_study_predictions_mean=deduped_study_predictions_mean.merge(deduped_study_agg_mean,right_index=True,left_index=True)
deduped_study_predictions_mean.to_csv(DAT_DIR+'deduped_study_predictions.csv')

In [None]:
r2_plot(deduped_study_predictions_mean)