In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('../data/files/test_cohen_dedup.csv')

nb_studies = len(df)
H = np.zeros((nb_studies, nb_studies))

cdnos = list(set(df.cdno))
for i in range(nb_studies) :
    H[i, df[df['cdno'] == df['cdno'][i]].index] = 1

In [2]:
H[np.arange(nb_studies), np.arange(nb_studies)] = 0

In [3]:
H

array([[ 0.,  1.,  1., ...,  0.,  0.,  0.],
       [ 1.,  0.,  1., ...,  0.,  0.,  0.],
       [ 1.,  1.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  1.,  1.],
       [ 0.,  0.,  0., ...,  1.,  0.,  1.],
       [ 0.,  0.,  0., ...,  1.,  1.,  0.]])

In [21]:
from keras.models import load_model
model_pop = load_model('../store/weights/population_FriOct614:53:522017/0/0-loss.h5')
model_inter = load_model('../store/weights/intervention_FriOct614:53:522017/0/0-loss.h5')
model_outcome = load_model('../store/weights/outcome_FriOct614:53:522017/0/0-loss.h5')

In [24]:
models = {'pop' : model_pop, 'int' : model_inter, 'out' : model_outcome}

In [25]:
pools = {input : model.get_layer('pool') for input, model in models.items()}

In [26]:
inputs = {input : pool.inputs for input, pool in pools.items()}
import keras.backend as K
inputs = {input : x + [K.learning_phase()] for input, x in inputs.items()}

outputs = {input : pool.get_output_at(0) for input, pool in pools.items()}
embed_abstract = {input : K.function(inputs[input], [outputs[input]]) for input in pools.keys()}

In [27]:
embed_abstract

{'int': <keras.backend.tensorflow_backend.Function at 0x7fab7d3e8050>,
 'out': <keras.backend.tensorflow_backend.Function at 0x7fac8fb47cd0>,
 'pop': <keras.backend.tensorflow_backend.Function at 0x7fab7d3e8090>}

In [28]:
import cPickle
import sys
sys.path.insert(0, '../../preprocess')
import vectorizer
cohenvec = cPickle.load(open('../data/vectorizers/cohendata_dedup.p', 'rb'))

In [29]:
X_cohen = cohenvec.X

In [30]:
embedds_dic = {input : embed_abstract[input]([X_cohen, 0])[0] for input in embed_abstract.keys()}

{'int': array([[ 0.29339853,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.02252054,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        ..., 
        [ 0.19832364,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]], dtype=float32),
 'out': array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32),
 'pop': array([[ 0.        ,  0.        , 

In [31]:
from sklearn.preprocessing import normalize
embedds_dic = {input : normalize(x, 'l2') for input, x in embedds_dic.items()}

In [32]:
embedds = np.concatenate(embedds_dic.values(), axis=1)

In [33]:
embedds.shape

(1767, 1800)

In [34]:
scores = np.dot(embedds, embedds.T)

In [43]:
scores[np.arange(nb_studies), np.arange(nb_studies)] = -1000

In [44]:
K = 10
args = scores.argsort(axis=1)[:, -K:]

In [45]:
S = np.zeros((nb_studies, K))
for i in range(nb_studies) :
    for j in range(K) :
        S[i, j] = H[i, args[i, j]]

In [46]:
np.mean(S.sum(axis=1)/K)

0.76745897000565921

In [47]:
from sklearn.metrics import roc_auc_score

In [48]:
aucs = [0] * nb_studies

In [49]:
for i in range(nb_studies) :
    aucs[i] = roc_auc_score(H[i], scores[i])

In [50]:
aucs

[0.93099691867160517,
 0.92228205048398637,
 0.87236608671293847,
 0.94546982476890029,
 0.92251548445329767,
 0.90572380092751092,
 0.92978306203118677,
 0.9265772355193127,
 0.80604749603162262,
 0.93575897164555388,
 0.83549908182638744,
 0.73530144106570394,
 0.92715303931028048,
 0.86953375455196247,
 0.88708798904416564,
 0.93346353761399348,
 0.83624607052818334,
 0.93290329608764666,
 0.91727878240841609,
 0.43907373400977312,
 0.87855986803199604,
 0.93903482834822116,
 0.92429736375237315,
 0.81270036415699209,
 0.94659808895390463,
 0.92263998257026358,
 0.93750194528307762,
 0.86600890161536315,
 0.92519997510037655,
 0.92597030719910356,
 0.93547106975006999,
 0.93347131874630396,
 0.9284602695384232,
 0.93542438295620778,
 0.89193563447352864,
 0.92677954495938242,
 0.93006318279436029,
 0.92527778642348035,
 0.92355037505057735,
 0.89747580067851473,
 0.93238196022285158,
 0.93896479815742784,
 0.92406392978306207,
 0.91115503128015185,
 0.90738896324193097,
 0.821251828

In [51]:
np.mean(aucs)

0.88053148753378252

In [59]:
rocs = {}
for cd in cdnos :
    idxs = df[df.cdno == cd].index
    rocs[cd] = np.mean(np.array(aucs)[idxs])

In [60]:
rocs

{'ACEInhibitors_processed': 0.89394103972351469,
 'ADHD_processed': 0.91737582971585607,
 'Antihistamines_processed': 0.89169717084871547,
 'AtypicalAntipsychotics_processed': 0.95623107968148602,
 'BetaBlockers_processed': 0.77803066614246008,
 'CalciumChannelBlockers_processed': 0.73933886439553631,
 'Estrogens_processed': 0.97426321213303446,
 'NSAIDS_processed': 0.89315792909508862,
 'Opiods_processed': 0.90378811416950311,
 'OralHypoglycemics_processed': 0.96544405776927111,
 'ProtonPumpInhibitors_processed': 0.84449454248098899,
 'SkeletalMuscleRelaxants_processed': 0.73691777797403502,
 'Statins_processed': 0.86638610216270218,
 'Triptans_processed': 0.95609768860971733}