In [None]:
import pandas as pd
import numpy as np
import pymongo
import sys
import os
from __future__ import print_function
from datetime import datetime

# Project root: the directory two levels above the notebook's working directory
# (trailing slash kept so plain string concatenation below keeps working).
TOP = '/'.join(os.getcwd().split('/')[:-2]) + '/'

# Make the project's own packages importable.
LIB = TOP + 'lib'
if LIB not in sys.path:
    sys.path.insert(0, LIB)

# Output locations for data extracts and figures.
DAT_DIR = TOP + '/data/'
FIG_DIR = TOP + '/figs/'

# Create the output directories on first use.
for _out_dir in (DAT_DIR, FIG_DIR):
    if not os.path.exists(_out_dir):
        os.mkdir(_out_dir)
    
from db.mongo import *

from rax.genrapred import *
import db.etl as etl

In [None]:
# Connection to the GenRA development database.
# NOTE(review): the fallback URI embeds a username/password in the notebook.
# Set the GENRA_MONGO_URI environment variable to supply the connection string
# without editing (or committing) credentials; the literal below is kept only
# so existing setups keep working unchanged.
MONGO_URI = os.environ.get('GENRA_MONGO_URI',
                           "mongodb://ghelman:ghelman@pb.epa.gov/genra_dev_v4")
mongocon=pymongo.MongoClient(MONGO_URI)
DB=mongocon['genra_dev_v4']
dsstox=DB['compounds']      # compound records
predictions=DB['pred_GH']   # prediction records

In [None]:
def filteredSearchCollByFP(sid,phys_threshold=0,fpn='mrgn',
                   SID=None,s0=0.0,
                   i1=0,i2=None,dbg=False,
                   max_hits=10,sel_by=None):
    """Find nearest neighbours of a compound, filtered by property similarity.

    The target document is read from ``DB['chm_fp']`` (``DB`` is the
    module-level database handle).  Every other document that has a
    ``phys_fp`` field is scored server-side with two measures:

    * ``jaccard``  -- |A & B| / (n_A + n_B - |A & B|) computed from the
      ``<fpn>.ds`` bit lists and ``<fpn>.n`` bit counts.
    * ``phys_sim`` -- sum(a_i*b_i) / (sum(a_i^2) + sum(b_i^2) - sum(a_i*b_i))
      computed from the ``phys_fp`` vectors.

    Candidates with ``jaccard > s0`` are sorted by descending jaccard and cut
    to ``max_hits``; only then is ``phys_sim > phys_threshold`` applied, so
    fewer than ``max_hits`` rows may be returned.

    Parameters
    ----------
    sid : str
        ``dsstox_sid`` of the target compound.
    phys_threshold : float
        Exclusive lower bound on ``phys_sim``.
    fpn : str
        Name of the fingerprint sub-document used for the jaccard score.
    s0 : float
        Exclusive lower bound on ``jaccard``.
    max_hits : int
        Number of top-jaccard candidates kept before the phys filter.
    sel_by : str or None
        If given, candidates are restricted to the ``dsstox_sid`` values found
        in the collection that ``getColFPMap(sel_by)`` names.
    SID, i1, i2, dbg
        Accepted for signature compatibility; not used by this function.

    Returns
    -------
    list of dict
        One dict per neighbour with ``dsstox_sid``, ``jaccard`` and
        ``phys_sim``.
    None
        If the target is not in ``chm_fp`` or lacks ``fpn`` / ``phys_fp``.
    str
        ``sid`` itself if the aggregation raised (failure marker kept for
        existing callers).
    """
    # A single lookup provides both the structural and the property fingerprint.
    target = DB['chm_fp'].find_one({'dsstox_sid':sid})
    if not target: return None
    Q = target.get(fpn)
    target_phys = target.get('phys_fp')
    # Without both fingerprints no similarity can be computed for this target.
    if Q is None or target_phys is None: return None
    target_phys_ss = sum([p*p for p in target_phys])

    jaccard_expr = {
        '$let': {
            'vars': {'olap': {'$size': {'$setIntersection': ['$%s.ds'%fpn, Q['ds']]}}},
            'in': {'$divide': ['$$olap',
                               {'$subtract': [{'$add': [Q['n'], '$%s.n'%fpn]}, '$$olap']}]}
        }
    }

    neighbor_ss_expr = {
        '$sum': {'$map': {'input': '$phys_fp',
                          'as': 'property',
                          'in': {'$pow': ['$$property', 2]}}}
    }

    # Iterate over every component of the target vector so the dot product
    # covers the same dimensions as target_phys_ss (previously fixed at four).
    interaction_expr = {
        '$sum': {'$map': {'input': list(range(len(target_phys))),
                          'as': 'index',
                          'in': {'$multiply': [{'$arrayElemAt': ['$phys_fp', '$$index']},
                                               {'$arrayElemAt': ['$target_phys', '$$index']}]}}}
    }

    phys_sim_expr = {
        '$let': {
            'vars': {'neighbor_ss': neighbor_ss_expr,
                     'interaction_term': interaction_expr},
            'in': {'$divide': ['$$interaction_term',
                               {'$subtract': [{'$add': ['$$neighbor_ss', target_phys_ss]},
                                              '$$interaction_term']}]}
        }
    }

    Agg = [
        {'$match': {'phys_fp': {'$exists': True}}},
        {'$match': {'dsstox_sid': {'$ne': sid}}},          # never return the target itself
        {'$project': {'jaccard': jaccard_expr,
                      '_id': 0,
                      'dsstox_sid': 1,
                      'phys_fp': 1}},
        # Embed the target vector in each document so the next stage can index it.
        {'$project': {'target_phys': target_phys,
                      '_id': 0,
                      'dsstox_sid': 1,
                      'phys_fp': 1,
                      'jaccard': 1}},
        {'$project': {'phys_sim': phys_sim_expr,
                      '_id': 0,
                      'dsstox_sid': 1,
                      'jaccard': 1}},
        {'$match': {'jaccard': {'$gt': s0}}},
        {'$sort': {'jaccard': -1}},
        {'$limit': max_hits},
        {'$match': {'phys_sim': {'$gt': phys_threshold}}},  # Filter step (after the top-k cut)
    ]

    if sel_by:
        # Only consider candidates that have a record in the selected collection.
        col, _ = getColFPMap(sel_by)
        SID_h = DB[col].find({'dsstox_sid':{'$exists':1}}).distinct('dsstox_sid')
        Agg = [{'$match':{'dsstox_sid':{'$in':SID_h}}}]+Agg

    try:
        return list(DB['chm_fp'].aggregate(Agg))
    except Exception:
        # Best-effort: report the failing sid instead of aborting a batch run.
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        return sid

In [None]:
# Smoke test: neighbours of one compound, restricted to compounds with a
# 'toxp_txrf' record and a property similarity above 0.8.
df = pd.DataFrame(
    filteredSearchCollByFP('DTXSID3020205', phys_threshold=.8, sel_by='toxp_txrf')
)

In [None]:
# Display the neighbour table built in the previous cell.
df

In [None]:
def wtavg(asim_df): #dataframe with column for activities followed by column for similarity
    """Similarity-weighted average of an activity column.

    Parameters
    ----------
    asim_df : pandas.DataFrame
        Frame with a ``'jaccard'`` column (weights) and one activity column;
        the first non-``'jaccard'`` column is used as the activity.

    Returns
    -------
    float
        ``sum(activity * jaccard) / sum(jaccard)`` over the rows that have no
        missing value, or NaN when no complete row remains.
    """
    asim_df=asim_df.dropna(how='any')
    if asim_df.empty:
        # Nothing to average; say so explicitly instead of dividing 0 by 0.
        return np.nan
    sim=asim_df['jaccard']
    # Positional selection; `.ix` was removed from pandas.
    act=asim_df.drop('jaccard',axis=1).iloc[:,0]
    return np.sum(act.values*sim.values)/np.sum(sim.values)

In [None]:
def predict(a_s,a_t,t0):
    """Classify a similarity-weighted score against the known activity.

    Parameters
    ----------
    a_s : float
        Predicted score; counted as positive when ``a_s >= t0``.
    a_t : number or None
        Known activity of the target.  ``> 0`` means active, ``== 0`` means
        inactive; anything else (None, NaN, negative) is treated as unknown.
    t0 : float
        Decision threshold applied to ``a_s``.

    Returns
    -------
    str or None
        ``'TP'``/``'FN'`` when the target is active, ``'FP'``/``'TN'`` when it
        is inactive, ``'Pos'``/``'Neg'`` when the truth is unknown, and None
        when ``a_s`` is NaN (no prediction could be made).
    """
    # NaN compares false to everything, which previously left `pred` unbound.
    if a_s != a_s:
        return None

    positive = a_s >= t0
    # Explicit None check: ordering None against a number raises on Python 3.
    known = a_t is not None

    if known and a_t > 0:
        return 'TP' if positive else 'FN'
    if known and a_t == 0:
        return 'FP' if positive else 'TN'
    return 'Pos' if positive else 'Neg'

In [None]:
# Spot check: call getFP (provided by one of the star imports above) for a
# single compound against the development database and display the result.
getFP(['DTXSID80386278'],DB=DB)

In [None]:
def runFilteredGenRA(sid,phys_threshold=0,k0=10,s0=.1,Y=None,sel_by='toxp_txrf',t0=.5):
    """Predict every endpoint for one compound from its filtered neighbours.

    Neighbours come from ``filteredSearchCollByFP``; for each endpoint column
    of their 'toxp_txrf' fingerprint the jaccard-weighted average activity is
    computed with ``wtavg`` and classified with ``predict``.

    Parameters
    ----------
    sid : str
        ``dsstox_sid`` of the target compound.
    phys_threshold : float
        Passed through to ``filteredSearchCollByFP``.
    k0 : int
        Maximum number of neighbours (``max_hits`` of the search).
    s0 : float
        Minimum jaccard similarity (exclusive) for a neighbour.
    Y : optional
        Forwarded to ``getFP`` as ``FP`` when fetching neighbour fingerprints.
    sel_by : str
        Forwarded to ``filteredSearchCollByFP``.
    t0 : float
        Score threshold handed to ``predict`` (default 0.5, the value that was
        previously hard-coded).

    Returns
    -------
    list of dict
        One record per endpoint with keys ``a_s``, ``out``, ``k0``, ``s0``,
        ``n_tot``, ``pred``, ``n_pos``, ``n_neg``, ``dsstox_sid`` and
        ``phys_threshold``.  Empty list when no usable neighbours were found.
    """
    Hits = filteredSearchCollByFP(sid=sid,s0=s0,max_hits=k0,phys_threshold=phys_threshold,sel_by=sel_by)
    # The search returns None (target unusable), [] (no neighbours) or the bare
    # sid string when its aggregation failed; none of these can be tabulated.
    if not Hits or not isinstance(Hits, list):
        return []

    NN  = pd.DataFrame(Hits)
    SID0 = list(NN.dsstox_sid)
    # Get fingerprints
    print(SID0)

    Y_pos = getFP(SID0,DB=DB,fp='toxp_txrf',FP=Y)
    Y_neg = getFP(SID0,DB=DB,fp='toxn_txrf',FP=Y)
    # Where the 'toxn_txrf' fingerprint is set, record activity 0.
    Y_pos[Y_neg==1]=0
    Y_fp = Y_pos.copy()

    # Remember the endpoint columns before the merge adds neighbour columns.
    endpoints=Y_fp.columns
    Y_fp=Y_fp.merge(NN,left_index=True,right_on='dsstox_sid')

    # Known activities of the target itself: 'toxp_txrf' entries stay as
    # returned, 'toxn_txrf' entries equal to 1 are recoded to 0.
    Yp=getFP([sid],DB=DB,fp='toxp_txrf')
    Yn=getFP([sid],DB=DB,fp='toxn_txrf')
    Yn[Yn==1]=0
    Y_target=pd.concat([Yp,Yn],axis=1)
    print(Y_target)
    Yt=Y_target.loc[sid]

    Res = []
    for y in endpoints:
        a_s = wtavg(Y_fp[[y,'jaccard']])
        Yi = Y_fp[y]
        Yi = Yi[Yi.notnull()]
        # None when the target has no recorded value for this endpoint.
        a_t=Yt.get(y,None)
        pred = predict(a_s,a_t,t0)
        R = {'a_s':a_s,'out':y,'k0':k0,'s0':s0,'n_tot':len(SID0),'pred':pred,
             'n_pos':(Yi==1).sum(),'n_neg':(Yi==0).sum(),'dsstox_sid':sid,'phys_threshold':phys_threshold}

        Res.append(R)

    #DB['pred_filter'].insert_many(Res)

    return Res

In [None]:
# Trial run for a single compound; phys_threshold=0.0 keeps only neighbours
# whose property similarity is strictly greater than zero.
res=runFilteredGenRA('DTXSID80386278',phys_threshold=0.0)

In [None]:
# Tabulate the per-endpoint records returned by runFilteredGenRA.
# NOTE: this rebinds `df`, which earlier held the neighbour table.
df=pd.DataFrame(res)
df

<h1>iPyParallel</h1>

Before running the cells below, start the cluster by running
`ipcluster start --profile=pb_parallel --n=10`
in the root project directory.

In [None]:
# Connect to the ipyparallel cluster running under the 'pb_parallel' profile
# and prepare each engine with the modules the pushed functions rely on.
import ipyparallel as PP

RC=PP.Client(profile='pb_parallel')
# Engine ids; not the last expression of the cell, so nothing is displayed.
RC.ids
# View over all engines of the client.
d_view=RC[:]
# Lines prefixed with %px are executed on the engines, not in this kernel.
# The engines add <their cwd>/lib to sys.path and import via the `lib.` package
# prefix, so they must have been started from the project root.
%px import sys
%px import os
%px sys.path.insert(0,os.getcwd()+'/lib')
%px import pymongo
%px from lib.db.mongo import *
%px from lib.db.getfp import *
In [None]:
# Give every engine its own MongoDB connection and a global `DB`, which the
# pushed functions read as a module-level name.
# NOTE(review): credentials are hard-coded in the URI — consider supplying
# them through the engines' environment or a config file instead.
%px mongocon=pymongo.MongoClient("mongodb://ghelman:ghelman@pb.epa.gov/genra_dev_v4")
%px DB=mongocon['genra_dev_v4']

In [None]:
# Ship the functions defined in this notebook to every engine's namespace so
# the mapped task can resolve them by name.
global_env = dict(
    filteredSearchCollByFP=filteredSearchCollByFP,
    wtavg=wtavg,
    predict=predict,
    runFilteredGenRA=runFilteredGenRA,
)
d_view.push(global_env)

In [None]:
def run50(sid):
    """Run runFilteredGenRA for one compound with phys_threshold fixed at 0.5."""
    # Plain parameter list: `lambda(sid):` used tuple-parameter syntax that
    # only Python 2 accepts.
    return runFilteredGenRA(sid, phys_threshold=.5)

In [None]:
# Work list: every compound that has no record in 'pred_filter' yet.
pred_l = set(DB['pred_filter'].distinct('dsstox_sid'))
compound_l = set(
    doc['dsstox_sid']
    for doc in DB['compounds'].find({}, {'dsstox_sid': 1})
)
l = list(compound_l.difference(pred_l))

In [None]:
# Apply run50 to every pending sid across the engines and wait for all
# results before the cell returns.
d_view.map_sync(run50,l)

In [None]:
# Same computation as run50, executed in the local kernel for one compound.
runFilteredGenRA('DTXSID80386278',phys_threshold=.5)

In [None]:
# Inspect what getColFPMap returns for the 'toxn_txrf' fingerprint name;
# filteredSearchCollByFP unpacks this result as a pair whose first item is
# used as a collection name.
getColFPMap('toxn_txrf')