In [None]:
import pandas as pd
import numpy as np
import pymongo
import sys
import os
from __future__ import print_function
from datetime import datetime

# Project root: the directory two levels above the notebook's working
# directory. The value always carries a trailing '/'.
TOP = '/'.join(os.getcwd().split('/')[:-2])+'/'

# Put the project's lib/ directory first on the import path (presumably where
# the `db` package imported below lives).
LIB = TOP+'lib'
if LIB not in sys.path:
    sys.path.insert(0,LIB)

# TOP already ends with '/', so the sub-directory names must not start with
# one (the previous values contained a doubled separator: '<top>//data/').
DAT_DIR = TOP + 'data/'
FIG_DIR = TOP + 'figs/'

# Create every output directory up front, including the 'study-aggregation/'
# figure sub-directory that the plotting cells save into: plt.savefig does not
# create missing directories.
for _out_dir in (DAT_DIR, FIG_DIR, FIG_DIR + 'study-aggregation/'):
    if not os.path.exists(_out_dir):
        os.makedirs(_out_dir)

import db.etl as etl

In [None]:
# The connection string is read from the environment so that credentials are
# never stored in the notebook. GENRA_MONGO_URI is expected to look like
#   mongodb://<user>:<password>@pb.epa.gov/genra_dev_v4
# NOTE(review): the password that used to be embedded here has been exposed in
# version history and should be rotated.
mongo_uri=os.environ.get('GENRA_MONGO_URI')
if not mongo_uri:
    raise RuntimeError(
        "Set the GENRA_MONGO_URI environment variable to the MongoDB "
        "connection string (mongodb://<user>:<password>@pb.epa.gov/genra_dev_v4) "
        "before running this notebook"
    )
mongocon=pymongo.MongoClient(mongo_uri)
DB=mongocon['genra_dev_v4']
# Handles to the two collections referenced by name in this notebook.
dsstox=DB['compounds']
predictions=DB['pred_is_v1']

In [None]:
# Distinct values of the 'study' field across the predictions collection.
study_types = predictions.distinct(
    'study',
)

<h1>EDA</h1>

In [None]:
# Single-stage aggregation: group predictions by (dsstox_sid, study) and
# average the `a_p` field within each group.
group_key = {'dsstox_sid': '$dsstox_sid', 'study_type': '$study'}
group_stage = {
    '$group': {
        '_id': group_key,
        'average_ap': {'$avg': '$a_p'},
    },
}
pipeline = [group_stage]

In [None]:
# Run the unfiltered grouping pipeline; allowDiskUse=True permits the server
# to use temporary disk files for stages that exceed its memory limit.
agg = predictions.aggregate(
    pipeline,
    allowDiskUse=True,
)

In [None]:
# Flatten each aggregation result into a plain record. Groups whose key lacks
# either the dsstox_sid or the study type are skipped.
agg_list = []
for result in agg:
    key = result['_id']
    dsstox_sid = key.get('dsstox_sid')
    study_type = key.get('study_type')
    if dsstox_sid is not None and study_type is not None:
        agg_list.append({
            'dsstox_sid': dsstox_sid,
            'study_type': study_type,
            'average_ap': result['average_ap'],
        })

In [None]:
# One row per (dsstox_sid, study_type) record collected above.
df = pd.DataFrame(agg_list)

In [None]:
# Preview the first five aggregated rows.
df.head(n=5)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Rows of the aggregated frame whose study type is 'CHR'.
is_chr = df['study_type'] == 'CHR'
chr_df = df.loc[is_chr]

In [None]:
# Histogram of the averaged a_p values for the CHR rows.
chr_average_ap = chr_df['average_ap']
plt.hist(chr_average_ap)
plt.xlabel('average a_p')
plt.title('CHR')

In [None]:
# One histogram of the averaged a_p values per study type, saved as
# <FIG_DIR>study-aggregation/<study_type>.png and then shown.
agg_fig_dir = FIG_DIR + 'study-aggregation/'
# savefig does not create missing directories, so make sure the target exists.
if not os.path.exists(agg_fig_dir):
    os.makedirs(agg_fig_dir)

for study_type in df['study_type'].unique():
    # Explicit figure/axes so every study type gets a fresh canvas.
    fig, ax = plt.subplots()
    ax.hist(df.loc[df['study_type'] == study_type, 'average_ap'])
    ax.set_xlabel('average a_p')
    ax.set_title(study_type)
    fig.savefig(agg_fig_dir + study_type + '.png')
    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)

In [None]:
# Two-stage aggregation: keep only predictions with p_val < 0.1 and
# auc > 0.4, then average `a_p` per (dsstox_sid, study) group.
match_stage = {
    '$match': {
        '$and': [
            {'p_val': {'$lt': .1}},
            {'auc': {'$gt': .4}},
        ],
    },
}
group_stage = {
    '$group': {
        '_id': {'dsstox_sid': '$dsstox_sid', 'study_type': '$study'},
        'average_ap': {'$avg': '$a_p'},
    },
}
pipeline1 = [match_stage, group_stage]

In [None]:
# Run the filtered a_p pipeline against the predictions collection.
agg = predictions.aggregate(
    pipeline1,
    allowDiskUse=True,
)

In [None]:
# Collect the filtered aggregation results as plain records, dropping groups
# whose key is missing the dsstox_sid or the study type.
agg_list = []
for result in agg:
    key = result['_id']
    dsstox_sid = key.get('dsstox_sid')
    study_type = key.get('study_type')
    if dsstox_sid is not None and study_type is not None:
        agg_list.append({
            'dsstox_sid': dsstox_sid,
            'study_type': study_type,
            'average_ap': result['average_ap'],
        })

In [None]:
# Frame of the filtered per-(dsstox_sid, study_type) average a_p records.
df_filtered = pd.DataFrame(agg_list)

In [None]:
# One histogram of the filtered average a_p values per study type, saved as
# <FIG_DIR>study-aggregation/<study_type>_filtered.png and then shown.
agg_fig_dir = FIG_DIR + 'study-aggregation/'
# savefig does not create missing directories, so make sure the target exists.
if not os.path.exists(agg_fig_dir):
    os.makedirs(agg_fig_dir)

for study_type in df_filtered['study_type'].unique():
    # Explicit figure/axes so every study type gets a fresh canvas.
    fig, ax = plt.subplots()
    ax.hist(df_filtered.loc[df_filtered['study_type'] == study_type, 'average_ap'])
    ax.set_xlabel('average a_p')
    ax.set_title(study_type)
    fig.savefig(agg_fig_dir + study_type + '_filtered.png')
    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)

In [None]:
# Same filter as the a_p pipeline (p_val < 0.1 and auc > 0.4), but the group
# stage averages the `a_s` field instead.
match_stage = {
    '$match': {
        '$and': [
            {'p_val': {'$lt': .1}},
            {'auc': {'$gt': .4}},
        ],
    },
}
group_stage = {
    '$group': {
        '_id': {'dsstox_sid': '$dsstox_sid', 'study_type': '$study'},
        'average_as': {'$avg': '$a_s'},
    },
}
pipeline2 = [match_stage, group_stage]

In [None]:
# Run the filtered a_s pipeline against the predictions collection.
agg = predictions.aggregate(
    pipeline2,
    allowDiskUse=True,
)

In [None]:
# Collect the a_s aggregation results as plain records, dropping groups whose
# key is missing the dsstox_sid or the study type.
agg_list = []
for result in agg:
    key = result['_id']
    dsstox_sid = key.get('dsstox_sid')
    study_type = key.get('study_type')
    if dsstox_sid is not None and study_type is not None:
        agg_list.append({
            'dsstox_sid': dsstox_sid,
            'study_type': study_type,
            'average_as': result['average_as'],
        })

In [None]:
# Frame of the filtered per-(dsstox_sid, study_type) average a_s records.
df_filtered_as = pd.DataFrame(agg_list)

In [None]:
# One histogram of the filtered average a_s values per study type, saved as
# <FIG_DIR>study-aggregation/<study_type>_filtered_as.png and then shown.
agg_fig_dir = FIG_DIR + 'study-aggregation/'
# savefig does not create missing directories, so make sure the target exists.
if not os.path.exists(agg_fig_dir):
    os.makedirs(agg_fig_dir)

for study_type in df_filtered_as['study_type'].unique():
    # Explicit figure/axes so every study type gets a fresh canvas.
    fig, ax = plt.subplots()
    ax.hist(df_filtered_as.loc[df_filtered_as['study_type'] == study_type, 'average_as'])
    ax.set_xlabel('average a_s')
    ax.set_title(study_type)
    fig.savefig(agg_fig_dir + study_type + '_filtered_as.png')
    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)

<h1>Any call</h1>
If any filtered prediction is positive for a study/organ combination, the call is positive at the study level.

In [None]:
# Filter cut-offs: the same values the $match stages in this notebook apply
# as the upper bound on p_val and the lower bound on auc.
p0 = 0.1
auc0 = 0.4

In [None]:
# Study-level "any call" aggregation:
#   1. keep predictions with p_val < p0 and auc > auc0 (thresholds come from
#      the constants defined in the previous cell rather than duplicated
#      literals, so changing them there actually changes this filter);
#   2. group by (dsstox_sid, study) and take the maximum `a_p` as the call.
pipeline3=[
    {'$match':{'$and':[{'p_val':{'$lt':p0}},{'auc':{'$gt':auc0}}]}},
    {'$group':{
        '_id': {'dsstox_sid':'$dsstox_sid','study_type':'$study'},
        'call':{'$max':'$a_p'}
    }}
]

In [None]:
# Run the study-level call pipeline against the predictions collection.
agg = predictions.aggregate(
    pipeline3,
    allowDiskUse=True,
)

In [None]:
# Collect one call record per (dsstox_sid, study_type) group, dropping groups
# whose key is missing either component.
call_list = []
for result in agg:
    key = result['_id']
    dsstox_sid = key.get('dsstox_sid')
    study_type = key.get('study_type')
    if dsstox_sid is not None and study_type is not None:
        call_list.append({
            'dsstox_sid': dsstox_sid,
            'study_type': study_type,
            'call': result['call'],
        })

In [None]:
# Persist the study-level calls to the 'study_calls_1' collection.
# insert_many rejects an empty document list, so skip the write when the
# aggregation produced no calls instead of failing the cell.
# NOTE(review): re-running this cell inserts the same calls again; clear or
# version the collection first if duplicates matter.
# NOTE: pymongo adds an '_id' key to each inserted dict in place, so call_list
# (and any frame built from it afterwards) carries that extra field.
if call_list:
    DB['study_calls_1'].insert_many(call_list)

In [None]:
# Tabular view of the study-level call records.
call_df = pd.DataFrame(data=call_list)

In [None]:
# Preview the first five study-level calls.
call_df.head(n=5)

In [None]:
# Distinct call values present in the frame.
call_df.loc[:, 'call'].unique()

In [None]:
# Spot check: does any DNT row belong to DTXSID00110012?
is_dnt = call_df['study_type'] == 'DNT'
is_target = call_df['dsstox_sid'] == 'DTXSID00110012'
(is_dnt & is_target).any()