In [1]:
from __future__ import division
from __future__ import print_function
from operator import itemgetter
from itertools import combinations
import time
import os
import tensorflow as tf
import numpy as np
import networkx as nx
import scipy.sparse as sp
from sklearn import metrics

In [2]:
print(tf.__version__)

1.8.0


In [3]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [4]:
from decagon.deep.optimizer import DecagonOptimizer
from decagon.deep.model import DecagonModel
from decagon.deep.minibatch import EdgeMinibatchIterator
from decagon.utility import rank_metrics, preprocessing
from data.utils_mod import *

In [5]:
# Train on GPU
os.environ["CUDA_DEVICE_ORDER"] = 'PCI_BUS_ID'
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

# Functions

In [6]:
def get_accuracy_scores(edges_pos, edges_neg, edge_type):
    feed_dict.update({placeholders['dropout']: 0})
    feed_dict.update({placeholders['batch_edge_type_idx']: minibatch.edge_type2idx[edge_type]})
    feed_dict.update({placeholders['batch_row_edge_type']: edge_type[0]})
    feed_dict.update({placeholders['batch_col_edge_type']: edge_type[1]})
    rec = sess.run(opt.predictions, feed_dict=feed_dict)

    def sigmoid(x):
        return 1. / (1 + np.exp(-x))

    # Predict on test set of edges
    preds = []
    actual = []
    predicted = []
    edge_ind = 0
    for u, v in edges_pos[edge_type[:2]][edge_type[2]]:
        score = sigmoid(rec[u, v])
        preds.append(score)
        assert adj_mats_orig[edge_type[:2]][edge_type[2]][u,v] == 1, 'Problem 1'

        actual.append(edge_ind)
        predicted.append((score, edge_ind))
        edge_ind += 1

    preds_neg = []
    for u, v in edges_neg[edge_type[:2]][edge_type[2]]:
        score = sigmoid(rec[u, v])
        preds_neg.append(score)
        assert adj_mats_orig[edge_type[:2]][edge_type[2]][u,v] == 0, 'Problem 0'

        predicted.append((score, edge_ind))
        edge_ind += 1

    preds_all = np.hstack([preds, preds_neg])
    preds_all = np.nan_to_num(preds_all)
    labels_all = np.hstack([np.ones(len(preds)), np.zeros(len(preds_neg))])
    predicted = list(zip(*sorted(predicted, reverse=True, key=itemgetter(0))))[1]

    roc_sc = metrics.roc_auc_score(labels_all, preds_all)
    aupr_sc = metrics.average_precision_score(labels_all, preds_all)
    apk_sc = rank_metrics.apk(actual, predicted, k=50)

    return roc_sc, aupr_sc, apk_sc


def construct_placeholders(edge_types):
    placeholders = {
        'batch': tf.placeholder(tf.int32, name='batch'),
        'batch_edge_type_idx': tf.placeholder(tf.int32, shape=(), name='batch_edge_type_idx'),
        'batch_row_edge_type': tf.placeholder(tf.int32, shape=(), name='batch_row_edge_type'),
        'batch_col_edge_type': tf.placeholder(tf.int32, shape=(), name='batch_col_edge_type'),
        'degrees': tf.placeholder(tf.int32),
        'dropout': tf.placeholder_with_default(0., shape=()),
    }
    placeholders.update({
        'adj_mats_%d,%d,%d' % (i, j, k): tf.sparse_placeholder(tf.float32)
        for i, j in edge_types for k in range(edge_types[i,j])})
    placeholders.update({
        'feat_%d' % i: tf.sparse_placeholder(tf.float32)
        for i, _ in edge_types})
    return placeholders

# Load and preprocess data 

In [7]:
# Loading Gene data (PPI)
ppi, gene2idx = load_ppi(fname='data/modif_data/ppi_mini.csv')
ppi_adj = nx.adjacency_matrix(ppi)
ppi_degrees = np.array(ppi_adj.sum(axis=0)).squeeze() 
n_genes = ppi.number_of_nodes() # Number of genes (nodes)
# Loading individual side effects
stitch2se, semono2name, semono2idx = load_mono_se(fname='data/modif_data/mono_mini.csv')
n_semono = len(semono2name)
print('Number of individual side effects: ', n_semono)
# Loading Target data (DTI)
stitch2proteins = load_targets(fname='data/modif_data/target_mini.csv')
# Loading Drug data (DDI)
combo2stitch, combo2se, se2name, drug2idx = load_combo_se(fname='data/modif_data/combo_mini.csv')
# Loading Side effect data (features)
#stitch2se, se2name = load_mono_se(fname='polypharmacy/mono_mini.csv')
n_drugs = len(drug2idx)
print('Number of drugs: ', n_drugs)

Reading: data/modif_data/ppi_mini.csv
Edges: 6609
Nodes: 4499
Reading: data/modif_data/mono_mini.csv
Number of individual side effects:  564
Reading: data/modif_data/target_mini.csv
Reading: data/modif_data/combo_mini.csv
Drug combinations: 22866 Side effects: 6
Drug-drug interactions: 27746
Number of drugs:  590


In [8]:
# Drug-traget adjacency matrix
dti_adj = np.zeros([n_genes,n_drugs],dtype=int)
for drug in drug2idx.keys():
    for gene in stitch2proteins[drug]:
        if gene==set():
            continue
        else:
            idp = gene2idx[str(gene)]
            idd = drug2idx[drug]
            dti_adj[idp,idd] = 1  

In [9]:
dti_adj = sp.csr_matrix(dti_adj)

In [10]:
# DDi adjacency matrix
ddi_adj_list = []
for se in se2name.keys():
    mat = np.zeros([n_drugs,n_drugs],dtype=int)
    for pair in combo2se.keys():
        if se in combo2se[pair]:
            d1,d2 = combo2stitch[pair]
            mat[drug2idx[d1],drug2idx[d2]] = 1
    ddi_adj_list.append(sp.csr_matrix(mat))    
ddi_degrees_list = [np.array(drug_adj.sum(axis=0)).squeeze() for drug_adj in ddi_adj_list]

adj_mats_orig = {
    (0, 0): [ppi_adj, ppi_adj.transpose(copy=True)],
    (0, 1): [dti_adj],
    (1, 0): [dti_adj.transpose(copy=True)],
    (1, 1): ddi_adj_list + [x.transpose(copy=True) for x in ddi_adj_list],
}
degrees = {
    0: [ppi_degrees, ppi_degrees],
    1: ddi_degrees_list + ddi_degrees_list, 
}

In [11]:
# featureless (genes)
gene_feat = sp.identity(n_genes)
gene_nonzero_feat, gene_num_feat = gene_feat.shape
gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo())
# features (drugs)
oh_feat = np.zeros([n_drugs,n_semono], dtype=int)
for drug in drug2idx.keys():
    for se in stitch2se[drug]:
        did = drug2idx[drug]
        seid = semono2idx[se]
        oh_feat[did,seid] = 1
drug_feat = sp.csr_matrix(oh_feat)
drug_nonzero_feat = n_semono
drug_num_feat = n_semono
drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())

In [12]:
# data representation
num_feat = {
    0: gene_num_feat,
    1: drug_num_feat,
}
nonzero_feat = {
    0: gene_nonzero_feat,
    1: drug_nonzero_feat,
}
feat = {
    0: gene_feat,
    1: drug_feat,
}
# Dictionary with the shape of all the matrices of the dictionary adj_mats_orig
edge_type2dim = {k: [adj.shape for adj in adjs] for k, adjs in adj_mats_orig.items()}
edge_type2decoder = {
    (0, 0): 'bilinear',
    (0, 1): 'bilinear',
    (1, 0): 'bilinear',
    (1, 1): 'dedicom',
}
#Dictionary with the number of matrices for each entry of adj_mats_orig
edge_types = {k: len(v) for k, v in adj_mats_orig.items()}
num_edge_types = sum(edge_types.values())
print("Edge types:", "%d" % num_edge_types)

Edge types: 16


In [13]:
print(num_feat)
print(nonzero_feat)

{0: 4434, 1: 550}
{0: 4434, 1: 550}


## Settings and placeholders

In [14]:
val_test_size = 0.05
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_integer('neg_sample_size', 1, 'Negative sample size.')
flags.DEFINE_float('learning_rate', 0.001, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 50, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 64, 'Number of units in hidden layer 1.')
flags.DEFINE_integer('hidden2', 32, 'Number of units in hidden layer 2.')
flags.DEFINE_float('weight_decay', 0, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_float('dropout', 0.1, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('max_margin', 0.1, 'Max margin parameter in hinge loss')
flags.DEFINE_integer('batch_size', 512, 'minibatch size.')
flags.DEFINE_boolean('bias', True, 'Bias term.')
# Important -- Do not evaluate/print validation performance every iteration as it can take
# substantial amount of time
PRINT_PROGRESS_EVERY = 150


In [15]:
print("Defining placeholders")
placeholders = construct_placeholders(edge_types)

Defining placeholders


In [16]:
# MACHETAZO!! Soluciona el bug de Jupyter con tensorflow que proporciona un flag -f
tf.app.flags.DEFINE_string('f', '', 'kernel')

## Create minibatch iterator, model and optimizer

In [17]:
print("Create minibatch iterator")
minibatch = EdgeMinibatchIterator(
    adj_mats=adj_mats_orig,
    feat=feat,
    edge_types=edge_types,
    batch_size=FLAGS.batch_size,
    val_test_size=val_test_size
)

Create minibatch iterator
Minibatch edge type: (0, 1, 0)
Constructing test edges= 0000/0801
Constructing val edges= 0000/0801
Train edges= 14424
Val edges= 0801
Test edges= 0801
Minibatch edge type: (1, 0, 0)
Constructing test edges= 0000/0801


  rowdegree_mat_inv = sp.diags(np.nan_to_num(np.power(rowsum, -0.5)).flatten())
  coldegree_mat_inv = sp.diags(np.nan_to_num(np.power(colsum, -0.5)).flatten())


Constructing val edges= 0000/0801
Train edges= 14424
Val edges= 0801
Test edges= 0801
Minibatch edge type: (0, 0, 0)
Constructing test edges= 0000/0647
Constructing val edges= 0000/0647
Train edges= 11664
Val edges= 0647
Test edges= 0647
Minibatch edge type: (0, 0, 1)
Constructing test edges= 0000/0647
Constructing val edges= 0000/0647
Train edges= 11664
Val edges= 0647
Test edges= 0647
Minibatch edge type: (1, 1, 0)
Constructing test edges= 0000/0022
Constructing val edges= 0000/0022
Train edges= 0402
Val edges= 0022
Test edges= 0022
Minibatch edge type: (1, 1, 1)
Constructing test edges= 0000/0189
Constructing val edges= 0000/0189
Train edges= 3421
Val edges= 0189
Test edges= 0189
Minibatch edge type: (1, 1, 2)
Constructing test edges= 0000/0124
Constructing val edges= 0000/0124
Train edges= 2234
Val edges= 0124
Test edges= 0124
Minibatch edge type: (1, 1, 3)
Constructing test edges= 0000/0147
Constructing val edges= 0000/0147
Train edges= 2646
Val edges= 0147
Test edges= 0147
Miniba

In [18]:
print("Create model")
model = DecagonModel(
    placeholders=placeholders,
    num_feat=num_feat,
    nonzero_feat=nonzero_feat,
    edge_types=edge_types,
    decoders=edge_type2decoder,
)

Create model
Instructions for updating:
dim is deprecated, use axis instead


In [19]:
print("Create optimizer")
with tf.name_scope('optimizer'):
    opt = DecagonOptimizer(
        embeddings=model.embeddings,
        latent_inters=model.latent_inters,
        latent_varies=model.latent_varies,
        degrees=degrees,
        edge_types=edge_types,
        edge_type2dim=edge_type2dim,
        placeholders=placeholders,
        batch_size=FLAGS.batch_size,
        margin=FLAGS.max_margin
    )

Create optimizer


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [20]:
print("Initialize session")
sess = tf.Session()
sess.run(tf.global_variables_initializer())
feed_dict = {}

Initialize session


# Train model

In [22]:
print("Train model")
for epoch in range(FLAGS.epochs):

    minibatch.shuffle()
    itr = 0
    while not minibatch.end():
        # Construct feed dictionary
        feed_dict = minibatch.next_minibatch_feed_dict(placeholders=placeholders)
        feed_dict = minibatch.update_feed_dict(
            feed_dict=feed_dict,
            dropout=FLAGS.dropout,
            placeholders=placeholders)

        t = time.time()

        # Training step: run single weight update
        outs = sess.run([opt.opt_op, opt.cost, opt.batch_edge_type_idx], feed_dict=feed_dict)
        train_cost = outs[1]
        batch_edge_type = outs[2]

        if itr % PRINT_PROGRESS_EVERY == 0:
            val_auc, val_auprc, val_apk = get_accuracy_scores(
                minibatch.val_edges, minibatch.val_edges_false,
                minibatch.idx2edge_type[minibatch.current_edge_type_idx])

            print("Epoch:", "%04d" % (epoch + 1), "Iter:", "%04d" % (itr + 1), "Edge:", "%04d" % batch_edge_type,
                  "train_loss=", "{:.5f}".format(train_cost),
                  "val_roc=", "{:.5f}".format(val_auc), "val_auprc=", "{:.5f}".format(val_auprc),
                  "val_apk=", "{:.5f}".format(val_apk), "time=", "{:.5f}".format(time.time() - t))

        itr += 1
#sess.close()
print("Optimization finished!")
print("Total time: ", time.time()-t)

Train model
Epoch: 0001 Iter: 0001 Edge: 0002 train_loss= 436.42828 val_roc= 0.89174 val_auprc= 0.86522 val_apk= 0.90518 time= 0.12993
Epoch: 0001 Iter: 0151 Edge: 0001 train_loss= 74.67413 val_roc= 0.95012 val_auprc= 0.95428 val_apk= 0.93168 time= 0.27760
Epoch: 0001 Iter: 0301 Edge: 0002 train_loss= 373.37909 val_roc= 0.91530 val_auprc= 0.88832 val_apk= 0.89002 time= 0.14255
Epoch: 0002 Iter: 0001 Edge: 0002 train_loss= 358.67566 val_roc= 0.91818 val_auprc= 0.89357 val_apk= 0.87376 time= 0.14029
Epoch: 0002 Iter: 0151 Edge: 0001 train_loss= 78.46939 val_roc= 0.95775 val_auprc= 0.96091 val_apk= 0.95208 time= 0.26353
Epoch: 0002 Iter: 0301 Edge: 0002 train_loss= 335.62213 val_roc= 0.92790 val_auprc= 0.89283 val_apk= 0.80133 time= 0.14011
Epoch: 0003 Iter: 0001 Edge: 0002 train_loss= 301.38019 val_roc= 0.93459 val_auprc= 0.90072 val_apk= 0.80147 time= 0.14022
Epoch: 0003 Iter: 0151 Edge: 0001 train_loss= 73.81236 val_roc= 0.94972 val_auprc= 0.96076 val_apk= 1.00000 time= 0.27485
Epoch: 

Epoch: 0023 Iter: 0151 Edge: 0001 train_loss= 67.88560 val_roc= 0.95078 val_auprc= 0.95202 val_apk= 0.96925 time= 0.29250
Epoch: 0023 Iter: 0301 Edge: 0002 train_loss= 104.52620 val_roc= 0.98825 val_auprc= 0.98318 val_apk= 1.00000 time= 0.15135
Epoch: 0024 Iter: 0001 Edge: 0002 train_loss= 110.70338 val_roc= 0.98840 val_auprc= 0.98429 val_apk= 1.00000 time= 0.15908
Epoch: 0024 Iter: 0151 Edge: 0001 train_loss= 68.77203 val_roc= 0.94908 val_auprc= 0.94994 val_apk= 1.00000 time= 0.29045
Epoch: 0024 Iter: 0301 Edge: 0002 train_loss= 101.16736 val_roc= 0.98863 val_auprc= 0.98556 val_apk= 1.00000 time= 0.14915
Epoch: 0025 Iter: 0001 Edge: 0002 train_loss= 87.86604 val_roc= 0.99003 val_auprc= 0.98580 val_apk= 0.97559 time= 0.15303
Epoch: 0025 Iter: 0151 Edge: 0001 train_loss= 67.94505 val_roc= 0.94498 val_auprc= 0.94857 val_apk= 1.00000 time= 0.29947
Epoch: 0025 Iter: 0301 Edge: 0002 train_loss= 117.31165 val_roc= 0.98741 val_auprc= 0.98040 val_apk= 0.97702 time= 0.14469
Epoch: 0026 Iter: 00

Epoch: 0046 Iter: 0001 Edge: 0002 train_loss= 81.00751 val_roc= 0.99114 val_auprc= 0.98172 val_apk= 0.92151 time= 0.14630
Epoch: 0046 Iter: 0151 Edge: 0001 train_loss= 53.61271 val_roc= 0.95398 val_auprc= 0.95034 val_apk= 1.00000 time= 0.29137
Epoch: 0046 Iter: 0301 Edge: 0002 train_loss= 81.70570 val_roc= 0.98968 val_auprc= 0.98337 val_apk= 0.97919 time= 0.19254
Epoch: 0047 Iter: 0001 Edge: 0002 train_loss= 101.42620 val_roc= 0.99058 val_auprc= 0.98403 val_apk= 0.97351 time= 0.14587
Epoch: 0047 Iter: 0151 Edge: 0001 train_loss= 63.82941 val_roc= 0.95561 val_auprc= 0.95382 val_apk= 1.00000 time= 0.26633
Epoch: 0047 Iter: 0301 Edge: 0002 train_loss= 67.93335 val_roc= 0.99019 val_auprc= 0.98311 val_apk= 0.97457 time= 0.14241
Epoch: 0048 Iter: 0001 Edge: 0002 train_loss= 80.76517 val_roc= 0.99144 val_auprc= 0.98495 val_apk= 1.00000 time= 0.14227
Epoch: 0048 Iter: 0151 Edge: 0001 train_loss= 55.94589 val_roc= 0.95366 val_auprc= 0.95088 val_apk= 1.00000 time= 0.28040
Epoch: 0048 Iter: 0301 

In [None]:
t/3600