In [3]:
from keras.models  import load_model
import keras.backend as K

import cPickle
import sys
sys.path.insert(0, '../../preprocess')
import vectorizer

import pandas as pd
import numpy as np

from gcnn import GCNN
aspects = ['population', 'intervention', 'outcome']

In [2]:
vec = cPickle.load(open('../data/vectorizers/allfields_with_embedding_5000.p', 'rb'))

In [3]:
index = vec.index
C = {'inputs' : ['abstract', 'population', 'intervention', 'outcome']}
for input in C['inputs'] :
    input_range = index[input]
    C[input] = vec.X[input_range[0]:input_range[1]]

In [5]:
model = load_model('../store/weights/single_setup/WedNov1500:02:112017/loss.h5', 
                   custom_objects={'contrastive_loss' : lambda a,b : a, 'GCNN' : GCNN})
#results_folder = '../store/results/single_message_ThuOct1919:10:492017/'

ValueError: None values not supported.

In [5]:
model_c1 = {}
for aspect in aspects :
    model_c1[aspect] = K.function(model.get_layer('pool_'+aspect[0].upper()).inputs, 
                          [model.get_layer('pool_'+aspect[0].upper()).layers[2].output])

In [6]:
filters_pico = {}
sample_length = 5000
sample = np.random.choice(C['abstract'].shape[0], sample_length, replace=False)
for input in C['inputs'] :
    for aspect in aspects :
        filters_pico[(aspect, input)] = model_c1[aspect]([C[input][sample]])[0]

In [7]:
(filters_pico[('population', 'intervention')].sum(axis=1).sum(axis=0) > 0).sum()/300.0

0.87666666666666671

In [8]:
nb_act = {k:(v > 0).sum()/sample_length for k,v in filters_pico.items()}
for key in sorted(nb_act.keys()) :
    print key,nb_act[key]

('intervention', 'abstract') 15
('intervention', 'intervention') 8
('intervention', 'outcome') 1
('intervention', 'population') 2
('outcome', 'abstract') 13
('outcome', 'intervention') 1
('outcome', 'outcome') 5
('outcome', 'population') 2
('population', 'abstract') 12
('population', 'intervention') 1
('population', 'outcome') 1
('population', 'population') 7


In [9]:
activated_words = {}
for aspect in aspects :
    filters = filters_pico[(aspect, 'abstract')]
    word_activations = np.zeros((vec.vocab_size, filters.shape[2]))
    for i in range(filters.shape[2]) :
        filter_i = filters[:,:,i]
        word2filter = np.dstack((C['abstract'][sample], filter_i))
        filter_w2f = word2filter[word2filter[:,:,1] > 0.0]
        for word in filter_w2f :
            word_activations[int(word[0]), i] = word[1]
    activated_words[aspect] = word_activations

In [10]:
top_words_df = pd.DataFrame()
top_words_size = 40
for aspect in aspects :
    popwords = activated_words[aspect].sum(axis=1)
    sorted_idx = np.argsort(popwords)
    top_words = sorted_idx[-top_words_size:]
    top_words = pd.Series([v for k, v in vec.idx2word.items() if k in top_words])
    top_words_df[aspect] = top_words
top_words_df

Unnamed: 0,population,intervention,outcome
0,african,atorvastatin,adhesion
1,african-american,atropine,arteriosus
2,antiretroviral,bleomycin,auc
3,ards,cimetidine,bacteriological
4,asian,cpm,bradycardia
5,bph,epa,cgi-i
6,bulimia,fractions,colonization
7,cirrhotic,glutamine,cure
8,fertilization,hctz,diarrhea
9,fibromyalgia,hes,erythema


<b>Evaluating Filters on Cohen Data</b>

In [11]:
df = pd.read_csv('../data/files/test_cohen_dedup.csv')
cohenvec = cPickle.load(open('../data/vectorizers/cohendata_dedup_5000.p', 'rb'))
cohenvec = cPickle.load(open('../data/vectorizers/decision_aids_vec_5000.p', 'rb'))

In [12]:
filter_aspect = {}
for aspect in aspects :
    filter_aspect[aspect] = model_c1[aspect]([cohenvec.X])[0]

In [13]:
filter_aspect

{'intervention': array([[[ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         ..., 
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.]],
 
        [[ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         ..., 
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.]],
 
        [[ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         ..., 
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.]],
 
        ..., 
        [[ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0

In [14]:
activated_words_cohen = {}
for aspect in aspects :
    filters = filter_aspect[aspect]
    word_activations = np.zeros((cohenvec.vocab_size, filters.shape[2]))
    for i in range(filters.shape[2]) :
        filter_i = filters[:,:,i]
        word2filter = np.dstack((cohenvec.X, filter_i))
        filter_w2f = word2filter[word2filter[:,:,1] > 0.0]
        for word in filter_w2f :
            word_activations[int(word[0]), i] = word[1]
    activated_words_cohen[aspect] = word_activations

In [15]:
top_words_df_cohen = pd.DataFrame()
top_words_size = 40
for aspect in aspects :
    popwords = activated_words_cohen[aspect].sum(axis=1)
    sorted_idx = np.argsort(popwords)
    top_words = sorted_idx[-top_words_size:]
    top_words = pd.Series([v for k, v in cohenvec.idx2word.items() if k in top_words])
    top_words_df_cohen[aspect] = top_words
top_words_df_cohen

Unnamed: 0,population,intervention,outcome
0,accuracy,adjuvant,adherence
1,against,aspirin,antigen
2,allowing,behaviors,anxiety
3,anomalies,booklet,atrial
4,antigen,chemotherapy,attacks
5,behaviors,computer,attitudes
6,canada,consultations,behavior
7,cancer,counseling,behaviors
8,cancers,decisions,burden
9,cesarean,discussion,cesarean


In [None]:
summed = {}
for aspect in aspects :
    summed[aspect] = filter_aspect[aspect].sum(axis=-1)

In [None]:
X_words = map(lambda s : [cohenvec.idx2word[w] for w in s], cohenvec.X)

In [None]:
zipped = zip(X_words, summed['population'], summed['intervention'], summed['outcome'])
total_vals = []
for x,p,i,o in zipped :
    total_vals.append(zip(x,p,i,o))
    
for abst in total_vals :
    for sent in abst :
        if sent[0] != '[0]' :
            if sent[1] > 0 and sent[2] > 0 and sent[3] > 0:
                print "\033[1;33;48m "+sent[0],
            elif sent[1] > 0 :
                print "\033[1;31;48m "+sent[0], 
            elif sent[2] > 0 :
                print "\033[1;32;48m "+sent[0],
            elif sent[3] > 0 :
                print "\033[1;34;48m "+sent[0],
            else :
                print "\033[0;30;48m "+sent[0],
        
    print ''

In [None]:
get_ipython().system('cp "Filter Viz Single.ipynb" '+results_folder)