In [1]:
from keras.models  import load_model
import keras.backend as K

import cPickle
import sys
sys.path.insert(0, '../../preprocess')
import vectorizer

import pandas as pd
import numpy as np

# Aspect names iterated by the cells below; they key the per-aspect model
# dictionaries and also appear among the input field names in C['inputs'].
aspects = ['population', 'intervention', 'outcome']

Using TensorFlow backend.


In [2]:
# Unpickle the vectorizer object used by the following cells (vec.index, vec.X,
# vec.vocab_size and vec.idx2word are read from it).
# The with-block closes the file handle as soon as unpickling finishes; the
# bare open(...) call left it open until garbage collection.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
with open('../data/vectorizers/allfields_with_embedding_5000.p', 'rb') as vec_file:
    vec = cPickle.load(vec_file)

In [3]:
# Split the vectorized matrix vec.X into one slice per input field.
# index[field] is indexed as a (start, stop) pair of offsets into vec.X.
# The loop variable is named `field` so the builtin `input` is not shadowed
# in the notebook namespace.
index = vec.index
C = {'inputs' : ['abstract', 'population', 'intervention', 'outcome']}
for field in C['inputs'] :
    field_range = index[field]
    start, stop = field_range[0], field_range[1]
    C[field] = vec.X[start:stop]

In [14]:
# Load one saved model per aspect from the 'MonOct1619:05:002017' run and keep
# them both in a dict keyed by aspect name and under individual names.
# NOTE(review): the next cell rebinds these same names from a different
# checkpoint directory, so whichever cell ran last determines `model`.
model = {
    'population' : load_model('../store/weights/separate/population_setup/MonOct1619:05:002017/loss.h5'),
    'intervention' : load_model('../store/weights/separate/intervention_setup/MonOct1619:05:002017/loss.h5'),
    'outcome' : load_model('../store/weights/separate/outcome_setup/MonOct1619:05:002017/loss.h5'),
}
model_population = model['population']
model_intervention = model['intervention']
model_outcome = model['outcome']

In [21]:
# Load one saved model per aspect from the 'MonOct1621:08:272017' run; the
# models are exposed both through the `model` dict (keyed by aspect name) and
# through the individual model_<aspect> names.
model = {
    'population' : load_model('../store/weights/separate/population_setup/MonOct1621:08:272017/loss.h5'),
    'intervention' : load_model('../store/weights/separate/intervention_setup/MonOct1621:08:272017/loss.h5'),
    'outcome' : load_model('../store/weights/separate/outcome_setup/MonOct1621:08:272017/loss.h5'),
}
model_population = model['population']
model_intervention = model['intervention']
model_outcome = model['outcome']

In [22]:
# For every aspect, compile a backend function that maps the inputs of the
# model's 'pool' layer to the output of that layer's sub-layer at index 2.
model_c1 = {}
for aspect in aspects :
    pool = model[aspect].get_layer('pool')
    model_c1[aspect] = K.function(pool.inputs, [pool.layers[2].output])

In [16]:
# Evaluate each aspect's filter function on a random subset of rows from every
# input field; results are stored under the key (aspect, field).
SAMPLE_SEED = 0  # fixed seed so the same rows are drawn on every run
filters_pico = {}
n_rows = C['abstract'].shape[0]
# Cap the sample size at the number of available rows: choice(replace=False)
# raises ValueError when asked for more items than the population holds.
sample_length = min(5000, n_rows)
sample = np.random.RandomState(SAMPLE_SEED).choice(n_rows, sample_length, replace=False)
# The same row indices are applied to every field.
# NOTE(review): assumes each field slice has as many rows as 'abstract' - confirm.
for field in C['inputs'] :
    field_rows = C[field][sample]
    for aspect in aspects :
        filters_pico[(aspect, field)] = model_c1[aspect]([field_rows])[0]

In [7]:
# For each (aspect, input) pair, print the number of positive filter
# activations divided by the sample size. `//` spells out the floor division
# that `/` performs on integers under Python 2, so the printed values stay
# whole numbers.
for key in sorted(filters_pico) :
    positive_total = (filters_pico[key] > 0).sum()
    print('%s %s' % (key, positive_total // sample_length))

('intervention', 'abstract') 112
('intervention', 'intervention') 74
('intervention', 'outcome') 5
('intervention', 'population') 10
('outcome', 'abstract') 78
('outcome', 'intervention') 4
('outcome', 'outcome') 39
('outcome', 'population') 7
('population', 'abstract') 69
('population', 'intervention') 6
('population', 'outcome') 6
('population', 'population') 48


In [8]:
# Build, per aspect, a (vocab_size x n_filters) table holding a positive
# activation value of each filter for each word id seen in the sampled abstracts.
activated_words = {}
abstract_tokens = C['abstract'][sample]
for aspect in aspects :
    aspect_filters = filters_pico[(aspect, 'abstract')]
    n_filters = aspect_filters.shape[2]
    word_activations = np.zeros((vec.vocab_size, n_filters))
    for i in range(n_filters) :
        # Pair every token id with the activation of filter i at that position.
        token_activation = np.dstack((abstract_tokens, aspect_filters[:, :, i]))
        # Keep only positions where the filter fired (activation > 0).
        fired = token_activation[token_activation[:, :, 1] > 0.0]
        # Plain assignment: a word that fires several times keeps the value of
        # its last occurrence (no max or sum is taken).
        for word_id, activation in fired :
            word_activations[int(word_id), i] = activation
    activated_words[aspect] = word_activations

In [9]:
# For each aspect, list the words whose activation summed over all filters is
# among the top_words_size highest.
top_words_df = pd.DataFrame()
top_words_size = 40
for aspect in aspects :
    word_scores = activated_words[aspect].sum(axis=1)
    # argsort is ascending, so the trailing entries are the highest-scoring word
    # ids. A set gives constant-time membership tests instead of scanning the
    # index array once per vocabulary entry.
    top_ids = set(np.argsort(word_scores)[-top_words_size:].tolist())
    # NOTE: words are emitted in vec.idx2word iteration order, not ranked by score;
    # ids missing from idx2word are silently skipped.
    top_words_df[aspect] = pd.Series([word for idx, word in vec.idx2word.items() if idx in top_ids])
top_words_df

Unnamed: 0,population,intervention,outcome
0,als,artesunate,adhesion
1,alzheimer,atenolol,anthropometric
2,ami,behaviors,attachment
3,ards,bleomycin,attitude
4,bph,booklet,attitudes
5,british,campaign,auc
6,bronchiolitis,cefotaxime,bacteriological
7,burns,cognitive-behavioral,biochemical
8,cd,cpm,bmd
9,cf,epa,cgi-i


<b>Evaluating Filters on Cohen Data</b>

In [17]:
# Read the Cohen test CSV and unpickle the vectorizer whose X matrix is fed to
# the filter functions in the following cells.
df = pd.read_csv('../data/files/test_cohen_dedup.csv')
# Alternative vectorizer file, kept for reference:
#cohenvec = cPickle.load(open('../data/vectorizers/cohendata_dedup_5000.p', 'rb'))
# The with-block closes the pickle file once loading finishes; the bare
# open(...) call never closed it.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
with open('../data/vectorizers/decision_aids_vec_5000.p', 'rb') as cohenvec_file:
    cohenvec = cPickle.load(cohenvec_file)

In [23]:
# Run each aspect's filter function on the whole cohenvec.X matrix and keep the
# first (only requested) output per aspect.
filter_aspect = {}
for aspect in aspects :
    aspect_fn = model_c1[aspect]
    outputs = aspect_fn([cohenvec.X])
    filter_aspect[aspect] = outputs[0]

In [24]:
# Build, per aspect, a (vocab_size x n_filters) table holding a positive
# activation value of each filter for each word id present in cohenvec.X.
activated_words_cohen = {}
for aspect in aspects :
    aspect_filters = filter_aspect[aspect]
    n_filters = aspect_filters.shape[2]
    word_activations = np.zeros((cohenvec.vocab_size, n_filters))
    for i in range(n_filters) :
        # Stack token ids and filter-i activations so each position is an (id, value) pair.
        token_activation = np.dstack((cohenvec.X, aspect_filters[:, :, i]))
        fired_mask = token_activation[:, :, 1] > 0.0
        # Plain assignment: a word that fires several times keeps the value of
        # its last occurrence (no max or sum is taken).
        for word_id, activation in token_activation[fired_mask] :
            word_activations[int(word_id), i] = activation
    activated_words_cohen[aspect] = word_activations

In [25]:
# For each aspect, list the Cohen-data words whose activation summed over all
# filters is among the top_words_size highest.
top_words_df_cohen = pd.DataFrame()
top_words_size = 40
for aspect in aspects :
    word_scores = activated_words_cohen[aspect].sum(axis=1)
    # argsort is ascending, so the trailing entries are the highest-scoring word
    # ids. A set gives constant-time membership tests instead of scanning the
    # index array once per vocabulary entry.
    top_ids = set(np.argsort(word_scores)[-top_words_size:].tolist())
    # NOTE: words are emitted in cohenvec.idx2word iteration order, not ranked by
    # score; ids missing from idx2word are silently skipped.
    top_words_df_cohen[aspect] = pd.Series([word for idx, word in cohenvec.idx2word.items() if idx in top_ids])
top_words_df_cohen

Unnamed: 0,population,intervention,outcome
0,adults,adjuvant,acceptability
1,advanced,aspirin,acceptance
2,aged,assist,accuracy
3,anomalies,behaviors,anxiety
4,behaviors,booklet,attitudes
5,canada,computer,behavior
6,cancer,counseling,behaviors
7,cancers,decisions,beliefs
8,cesarean,discussion,burden
9,colorectal,education,detection
