In [None]:
import pandas as pd
import numpy as np
import pymongo
import sys
import os
from __future__ import print_function
from datetime import datetime

# Project root: the working directory with its last two path components removed.
cwd_parts = os.getcwd().split('/')
TOP = '/'.join(cwd_parts[:-2]) + '/'

# Put TOP/lib at the front of the module search path (once).
LIB = TOP + 'lib'
if LIB not in sys.path:
    sys.path.insert(0, LIB)

DAT_DIR = TOP + 'data/'
FIG_DIR = TOP + 'figs/'

# Create the data and figure directories if they do not exist yet.
for out_dir in (DAT_DIR, FIG_DIR):
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    
from db.mongo import *

from rax.genrapred import *
import db.etl as etl
from db.fpsim import *

In [None]:
# Credentials must not live in the notebook. Export the full connection string
# before starting Jupyter, e.g.
#   export GENRA_MONGO_URI="mongodb://<user>:<password>@pb.epa.gov/genra_dev_v4"
mongo_uri=os.environ.get('GENRA_MONGO_URI')
if not mongo_uri:
    raise RuntimeError('Set the GENRA_MONGO_URI environment variable to the MongoDB connection string')
mongocon=pymongo.MongoClient(mongo_uri)
DB=mongocon['genra_dev_v4']
# Collections used below: compound records and acute toxicity records.
dsstox=DB['compound']
acute_tox=DB['acutetox']

In [None]:
acute_tox.find_one()

In [None]:
acute_tox.count()
len(acute_tox.distinct('casn'))

In [None]:
acute_tox.find({'dsstox_sid':{'$exists':True}}).count()

In [None]:
df1=pd.DataFrame(list(acute_tox.find({'dsstox_sid':{'$exists':True}},{'_id':0})))

In [None]:
len(df1)

In [None]:
# Fetch the molecular weight of every substance present in df1.
# Intermediate sizes are printed because only the last expression of a cell is echoed.
sids=list(df1['dsstox_sid'].unique())
print(len(sids))
mol_weights=pd.DataFrame(list(dsstox.find({'dsstox_sid':{'$in':sids}},{'_id':0,'dsstox_sid':1,'mol_weight':1})))
print(len(mol_weights))
# Keep one row per dsstox_sid so the later merge cannot multiply rows.
mol_weights=mol_weights.drop_duplicates('dsstox_sid')
len(mol_weights)

In [None]:
df2=df1.merge(mol_weights,on='dsstox_sid')

In [None]:
len(df2)

In [None]:
from __future__ import division
# Negative log10 of LD50_mgkg divided by mol_weight.
# NOTE(review): mg/kg over g/mol gives mmol/kg, not mol/kg - confirm the intended unit.
dose_per_weight=df2['LD50_mgkg']/df2['mol_weight']
df2['LD50_LM']=-np.log10(dose_per_weight)

In [None]:
df=df2
df.head(20)

In [None]:
df1[df1['dsstox_sid']=='DTXSID5024768']

In [None]:
df.to_csv(DAT_DIR+'acute.csv')

<h1>EDA</h1>

In [None]:
len(df)

In [None]:
sid_counts=df['dsstox_sid'].value_counts()
sid_counts

In [None]:
str(round(len(sid_counts[sid_counts>1])/len(sid_counts)*100,1)) + '% of substances have more than 1 measurement'

In [None]:
'sdf unique value: ' + str(list(df['sdf'].unique()))
'LD50_type_sub unique values: ' + str(list(df['LD50_type_sub'].unique()))
str(len(df['dsstox_sid'].unique())) + ' unique sids in ' + str(len(df)) + ' rows'

In [None]:
df['sdf'].value_counts()
df['LD50_type_sub'].value_counts()

In [None]:
import matplotlib.pyplot as plt
from math import log, exp
from scipy import stats

In [None]:
df.boxplot(column='LD50_LOGmgkg',by='LD50_type_sub',figsize=(8,6))
plt.subplots_adjust(top=.9)
plt.show()

In [None]:
ax=df.boxplot(column='LD50_LOGmgkg',by='LD50_type_sub')
plt.subplots_adjust(top=.85)
ax.set_xticklabels([1,2,3])
plt.show()

In [None]:
ld50_mean=df.groupby('dsstox_sid')['LD50_mgkg'].mean()
logld50_mean=ld50_mean.apply(log,args=(10,))
logld50_mean.name='LD50_LOGmgkg'
ld50lm_mean=df.groupby('dsstox_sid')['LD50_LM'].mean()
df_mean=pd.concat([ld50_mean,logld50_mean,ld50lm_mean],axis=1)
df_mean['LD50_preLOGmgkg']=df.groupby('dsstox_sid')['LD50_LOGmgkg'].mean()

In [None]:
df_mean

In [None]:
df_mean['LD50_mgkg'].max()

In [None]:
hist=plt.hist(df_mean['LD50_mgkg'],bins=50)
#plt.xlim([0,10000])
plt.show()

In [None]:
hist=plt.hist(df_mean['LD50_LOGmgkg'],bins=20)
plt.show()

In [None]:
hist=plt.hist(df_mean['LD50_preLOGmgkg'],bins=20)
plt.show()

In [None]:
hist=plt.hist(df_mean['LD50_LM'])
plt.show()

In [None]:
xt,p=stats.boxcox(df_mean['LD50_mgkg'])
hist=plt.hist(xt)
plt.show()

In [None]:
def cox(x,p):
    """Box-Cox transform of a positive value x with exponent p.

    Returns (x**p-1)/p, or log(x) when p is 0: that is the limit of the
    expression as p approaches 0, where the formula itself would divide by zero.
    """
    if p==0:
        return log(x)
    return (x**p-1)/p
df_mean['LD50_cox']=df_mean['LD50_mgkg'].apply(cox,args=(p,))

In [None]:
df_mean.head()

<h1>Analysis</h1>

In [None]:
sids=list(df['dsstox_sid'].unique())

In [None]:
#kn={}
#for sid in sids:
#    kn[sid]=searchCollByFP(sid,s0=.5,SID=sids,DB=DB)

In [None]:
#import pickle
#with open(DAT_DIR+'acute_neighborhoods.pkl','w') as f:
#    pickle.dump(kn,f)

In [None]:
import collections
odictkn={}
for target in kn.keys():
    neighborhood=kn[target]
    odict=collections.OrderedDict()
    if neighborhood is not None:
        for neighbor in neighborhood:
            odict[neighbor['dsstox_sid']]=neighbor['jaccard']
    odictkn[target]=odict

In [None]:
odictkn['DTXSID70207089']

In [None]:
import pickle
with open(DAT_DIR+'acute_neighborhoods.pkl','r') as f:
    kn=pickle.load(f)

In [None]:
# NOTE(review): `odict` is the OrderedDict created in the last iteration of the
# loop that builds odictkn, and that same object is stored in odictkn under the
# last target. This assignment therefore edits that entry, whichever target it
# happens to be - confirm this is intended and not leftover debugging.
odict['DTXSID70207089']=1.0

In [None]:
# Number of targets in the loaded neighborhood dictionary.
len(kn)

In [None]:
knm1={sid:neighborhood[1:] for sid,neighborhood in kn.iteritems() if neighborhood is not None and len(neighborhood)>1} #Self is always first neighbor

In [None]:
n={sid:len(r) for sid,r in knm1.iteritems() if r is not None}

In [None]:
s=pd.Series(n)
'Found neighbors for ' + str(len(s)) + ' of the ' + str(len(df_mean)) + ' chemicals'

In [None]:
c=s.value_counts()
c

In [None]:
c_slice=c.loc[1:10]

In [None]:
plt.scatter(c_slice.index.values,c_slice)
plt.show()

In [None]:
k10={k:r[0:10] for k,r in knm1.iteritems() if r is not None}

In [None]:
ld50_predictions={}
logld50_predictions={}
prelogld50_predictions={}
ld50cox_predictions={}
for sid,neighborhood in k10.iteritems():
    neighborhood=pd.DataFrame(k10[sid])
    neighbor_data=neighborhood.merge(df_mean,left_on='dsstox_sid',right_index=True)
    ld50=np.average(neighbor_data['LD50_mgkg'])
    logld50=np.average(neighbor_data['LD50_LOGmgkg'],weights=neighbor_data['jaccard'])
    prelogld50=np.average(neighbor_data['LD50_preLOGmgkg'],weights=neighbor_data['jaccard'])
    ld50cox=np.average(neighbor_data['LD50_cox'])
    ld50_predictions[sid]=ld50
    logld50_predictions[sid]=logld50
    prelogld50_predictions[sid]=prelogld50
    ld50cox_predictions[sid]=ld50cox

In [None]:
# Unweighted mean of the neighbors' LD50_LM for every target.
ld50lm_predictions={}
for sid in k10:
    neighbor_data=pd.DataFrame(k10[sid]).merge(df_mean,left_on='dsstox_sid',right_index=True)
    ld50lm_predictions[sid]=np.average(neighbor_data['LD50_LM'])

In [None]:
dfr=df_mean.copy()
dfr['LD50_p']=dfr.index.to_series().map(ld50_predictions)
dfr['LD50_LOG_p']=dfr.index.to_series().map(logld50_predictions)
dfr['LD50_preLOG_p']=dfr.index.to_series().map(prelogld50_predictions)
dfr['LD50_cox_p']=dfr.index.to_series().map(ld50cox_predictions)
dfr['LD50_LM_p']=dfr.index.to_series().map(ld50lm_predictions)

In [None]:
dfr=dfr[dfr.notnull().all(axis=1)]

In [None]:
dfr.head()

In [None]:
from sklearn.metrics import r2_score

<h3>No log</h3>

In [None]:
ybar=dfr['LD50_mgkg'].mean()
sst=((dfr['LD50_mgkg']-ybar)**2).sum()
ssr=((dfr['LD50_p']-dfr['LD50_mgkg'])**2).sum()
r2=1-ssr/sst
r2

In [None]:
r2_score(dfr['LD50_mgkg'],dfr['LD50_p'])

<h3>Log before mean</h3>

In [None]:
# R^2 of the log-before-mean predictions, using the r2_score imported from
# sklearn.metrics instead of repeating the formula by hand.
r2=r2_score(dfr['LD50_preLOGmgkg'],dfr['LD50_preLOG_p'])
r2

<h3>Log after mean</h3>

In [None]:
# R^2 of the log-after-mean predictions, using the r2_score imported from
# sklearn.metrics instead of repeating the formula by hand.
r2=r2_score(dfr['LD50_LOGmgkg'],dfr['LD50_LOG_p'])
r2

<h3>Cox</h3>

In [None]:
# R^2 of the Box-Cox predictions, using the r2_score imported from
# sklearn.metrics instead of repeating the formula by hand.
r2=r2_score(dfr['LD50_cox'],dfr['LD50_cox_p'])
r2

<h3>Log Molar</h3>

In [None]:
ybar=dfr['LD50_LM'].mean()
sst=((dfr['LD50_LM']-ybar)**2).sum()
ssr=((dfr['LD50_LM_p']-dfr['LD50_LM'])**2).sum()
r2=1-ssr/sst
r2

In [None]:
plt.scatter(dfr['LD50_LM'],dfr['LD50_LM_p'])
plt.show()

# Median

In [None]:
ld50lm_median_predictions={}
for sid,neighborhood in k10.iteritems():
    neighborhood=pd.DataFrame(k10[sid])
    neighbor_data=neighborhood.merge(df_mean,left_on='dsstox_sid',right_index=True)
    ld50lm=np.median(neighbor_data['LD50_LM'])
    ld50lm_median_predictions[sid]=ld50lm

In [None]:
dfr['LD50_median_p']=dfr.index.to_series().map(ld50lm_median_predictions)

In [None]:
ybar=dfr['LD50_LM'].mean()
sst=((dfr['LD50_LM']-ybar)**2).sum()
ssr=((dfr['LD50_median_p']-dfr['LD50_LM'])**2).sum()
r2=1-ssr/sst
r2

In [None]:
plt.scatter(dfr['LD50_median_p'],dfr['LD50_LM'])
plt.show()

In [None]:
#Median predictions very similar to mean since most substances only find 1-2 neighbors
plt.scatter(dfr['LD50_median_p'],dfr['LD50_LM_p'])
plt.show()

<h1>Regression</h1>

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Regress the predicted LD50_LM on the observed LD50_LM.
# scikit-learn expects X to be 2-D (n_samples, n_features), so the single feature
# is selected with a list of column names, which yields a one-column DataFrame.
model = LinearRegression()
model.fit(dfr[['LD50_LM']],dfr['LD50_LM_p'])