In [None]:
from keras.models  import load_model
import keras.backend as K

import cPickle
import sys
sys.path.insert(0, '../../preprocess')
import vectorizer

from sklearn.preprocessing import normalize
import pandas as pd
import numpy as np

from gcnn import GCNN
# Single-letter aspect codes ('P', 'I', 'O'). They are appended to 'pool_' to
# name the model layers read below, and they key every per-aspect dict.
aspect_names = ('population', 'intervention', 'outcome')
aspects = [name[:1].upper() for name in aspect_names]

In [None]:
# Load the pickled vectorizer; later cells read `vec.index['abstract']` and
# `vec.X` from it. The `with` block closes the file handle deterministically.
# NOTE(review): unpickling can execute arbitrary code -- only load pickles that
# come from a trusted source.
with open('../data/vectorizers/allfields_with_embedding_5000.p', 'rb') as vec_file:
    vec = cPickle.load(vec_file)

In [None]:
# Decision-aid records. Later cells group the rows by the 'IM_population'
# column and use the resulting row index to select rows of the embeddings.
da = pd.read_csv('../data/files/decision_aids_filter.csv')
# Population label -> abbreviation. The keys are the values compared against
# da['IM_population'] below; the abbreviations themselves are not referenced
# anywhere else in the visible notebook.
im_map = {'breast cancer': 'BCT', 
          'healthy women at risk of breast cancer': 'BCS', 
          'type II diabetes': 'D',
          'menopausal women': 'MW',
          'pregnant women, previous C section': 'PWC',
          'pregnant women': 'PW',
          'healthy people, at risk colon cancer': 'CCS',
          'prostate cancer': 'PCT',
          'healthy men, contemplating risk of prostate cancer': 'PCS',
          'AF': 'AF',
          'healthy women at genetic risk of breast cancer': 'BCG'}

In [None]:
# Load the pickled decision-aid vectorizer; `davec.X` is fed to the per-aspect
# pooling functions below. The file is opened in binary mode (pickles are
# binary data) and closed as soon as loading finishes.
# NOTE(review): unpickling can execute arbitrary code -- only load pickles that
# come from a trusted source.
with open('../data/vectorizers/decision_aids_vec_5000.p', 'rb') as davec_file:
    davec = cPickle.load(davec_file)

In [None]:
# Restore the trained network from disk. A stand-in that just returns its first
# argument is registered under the name 'contrastive_loss' (presumably the
# custom loss the checkpoint was saved with), since this notebook only runs
# the network forward and never trains it.
file_model = '../store/weights/single_setup/WedNov1500:02:112017/loss.h5'
placeholder_losses = {'contrastive_loss': lambda y_true, y_pred: y_true}
model = load_model(file_model, custom_objects=placeholder_losses)

In [None]:
def _pooling_function(layer_name):
    """Return a backend function that evaluates the named layer of `model`.

    The returned callable takes the layer's own inputs followed by the Keras
    learning-phase flag, and yields a one-element list holding the layer's
    first output.
    """
    layer = model.get_layer(layer_name)
    return K.function(layer.inputs + [K.learning_phase()], [layer.outputs[0]])


# One callable per aspect code, bound to the layer named 'pool_<code>'.
model_aspect = {}
for aspect in aspects:
    model_aspect[aspect] = _pooling_function('pool_' + aspect)

In [None]:
# Pooled aspect embeddings of the decision aids: raw, and L2-normalised as
# float64.
pool_da, norm_pool_da = {}, {}

for aspect in aspects:
    # The trailing 0 is the learning-phase flag the pooling function expects.
    pooled = model_aspect[aspect]([davec.X, 0])[0]
    pool_da[aspect] = pooled
    norm_pool_da[aspect] = normalize(np.float64(pooled), norm='l2')

In [None]:
# Pooled aspect embeddings of the abstracts, evaluated in fixed-size batches.
BATCH_SIZE = 1000

irange = vec.index['abstract']
X_abs = vec.X[irange[0]:irange[1]]
pool_cdsr = {}
norm_pool_cdsr = {}

# shape[0] works for dense and sparse matrices alike (len() fails on sparse).
nb_abstracts = X_abs.shape[0]

for aspect in aspects :
    # Stepping by start offset covers every row, including a final batch that
    # is shorter than BATCH_SIZE, and also works when there are fewer than
    # BATCH_SIZE rows in total.
    batches = [model_aspect[aspect]([X_abs[start:start + BATCH_SIZE, :], 0])[0]
               for start in range(0, nb_abstracts, BATCH_SIZE)]
    pool_cdsr[aspect] = np.concatenate(batches)
    norm_pool_cdsr[aspect] = normalize(np.float64(pool_cdsr[aspect]), 'l2')

In [None]:
# Feature covariance of the L2-normalised pooled embeddings (abstracts stacked
# on top of decision aids) for a single aspect. The aspect is named explicitly
# rather than taken from whatever value a previous cell's loop variable was
# left holding; the last aspect is the one that loop ends on.
cov_aspect = aspects[-1]
norm_pool = np.concatenate([norm_pool_cdsr[cov_aspect], norm_pool_da[cov_aspect]], axis=0)
# rowvar=False: columns are the variables (features), rows the observations.
cov = np.cov(norm_pool, rowvar=False)

In [None]:
# Print, per aspect, how many entries of the normalised decision-aid matrix
# are NaN. The call form of print gives the same output on Python 2 and 3.
for aspect in aspects :
    print(np.isnan(norm_pool_da[aspect]).sum())

In [None]:
# Eigen-decomposition of the covariance matrix with the symmetric/Hermitian
# solver; the returned (eigenvalues, eigenvectors) pair is the cell's output.
import scipy.linalg as LA
LA.eigh(cov)

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# For each aspect, fit a 2-component PCA on the normalised abstract and
# decision-aid embeddings stacked row-wise, then project the decision aids
# alone onto those two components.
# (TSNE is imported for an alternative 2-D projection that is not enabled.)
pca_models = {}
pcas = {}
for aspect in aspects:
    stacked = np.concatenate([norm_pool_cdsr[aspect], norm_pool_da[aspect]], axis=0)
    pca = PCA(n_components=2, svd_solver='arpack', tol=0.0001).fit(stacked)
    pca_models[aspect] = pca
    pcas[aspect] = pca.transform(norm_pool_da[aspect])

In [None]:
import matplotlib.pyplot as plt

# Row indices of the decision aids belonging to each population label. These
# are the same for every aspect, so they are looked up once.
group_rows = [(population, da[da['IM_population'] == population].index)
              for population in im_map.keys()]

# One scatter plot per aspect, one colour/legend entry per population.
for aspect in aspects:
    projected = pcas[aspect]
    for population, rows in group_rows:
        plt.scatter(projected[rows, 0], projected[rows, 1], label=population)
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.title(aspect)
    plt.show()

In [None]:
# Relevance matrix: H[i, j] is 1 when studies i and j (i != j) have the same
# 'IM_population' value, and 0 otherwise.
nb_studies = len(da)
populations = da['IM_population']
H = np.zeros((nb_studies, nb_studies))
for i in range(nb_studies):
    same_population = da[populations == populations[i]].index
    H[i, same_population] = 1
np.fill_diagonal(H, 0)

# Pairwise dot products of the normalised decision-aid embeddings, with each
# study's similarity to itself overwritten by -1000.
scores = {}
for aspect in aspects:
    embedded = norm_pool_da[aspect]
    similarity = np.dot(embedded, embedded.T)
    np.fill_diagonal(similarity, -1000)
    scores[aspect] = similarity

In [None]:
from sklearn.metrics import roc_auc_score

# For every aspect, compute one ROC AUC per study (how well its similarity
# scores rank same-population studies above the rest), then average the AUCs
# within each population and over all studies ('mean').
series = {}
for aspect in aspects :
    aucs = np.full(nb_studies, np.nan)
    for i in range(nb_studies) :
        # Leave the query study itself out of the ranking. Kept in, it acts as
        # a negative with the lowest possible score, which adds trivially
        # correct (positive, negative) pairs and inflates the AUC.
        others = np.arange(nb_studies) != i
        relevant = H[i][others]
        # AUC is only defined when both classes are present; otherwise the
        # entry stays NaN and is skipped by the nan-aware means below.
        if 0 < relevant.sum() < relevant.size :
            aucs[i] = roc_auc_score(relevant, scores[aspect][i][others])
    rocs = {}
    for key in im_map :
        idxs = da[da['IM_population'] == key].index
        rocs[key] = np.nanmean(aucs[idxs])
    rocs['mean'] = np.nanmean(aucs)
    series[aspect] = pd.Series(rocs, name=aspect)
# Build the rows in the order of `aspects` so the table layout is deterministic
# instead of following dict iteration order.
pd.DataFrame([series[aspect] for aspect in aspects])

In [None]:
#get_ipython().system('cp "Decision Aids Single.ipynb" '+result_folder)