# DECAGON Training
Test notebook for training a DECAGON model from previously computed data structures.

## Python 2

In [1]:
from __future__ import division
from __future__ import print_function
from operator import itemgetter
from itertools import combinations, chain
import time
import datetime
import os
import tensorflow as tf
import numpy as np
import networkx as nx
import scipy.sparse as sp
from sklearn import metrics
import pandas as pd
import psutil
import pickle
from decagon.deep.optimizer import DecagonOptimizer
from decagon.deep.model import DecagonModel
from decagon.deep.minibatch import EdgeMinibatchIterator
from decagon.utility import rank_metrics, preprocessing

In [2]:
# Train on GPU
#os.environ["CUDA_DEVICE_ORDER"] = 'PCI_BUS_ID'
#os.environ["CUDA_VISIBLE_DEVICES"] = '0'
#config = tf.ConfigProto()
#config.gpu_options.allow_growth = True

In [3]:
# Timing and memory bookkeeping starts here.
# `start` is the wall-clock reference (seconds) for every duration reported later;
# `ps` is a psutil handle on this very process, queried for memory use at the end.
start = time.time()
pid = os.getpid()
ps = psutil.Process(pid)

# Import Data from previous computations

In [4]:
# Path to the pickled DECAGON data structure computed beforehand.
# In the script version of this notebook the path is passed as a parameter.
# NOTE: later cells split this path on '_' and use the third token as the dataset
# name, so underscores in the directory part of the path matter.
in_file = './data/data_structures/DECAGON/DECAGON_toy_genes_500_drugs_400_se_4'

In [5]:
# Tokenise the input path on underscores. The tokens select the optional dataset
# variants below and are reused by later cells to build file names.
words = in_file.split('_')
# Each variant is switched on by the presence of its marker token.
DSE = 'DSE' in words
BDM = 'BDM' in words
DOCK = 'docking' in words
# 'docking' takes precedence: 'binding' only counts when 'docking' is absent.
BIND = (not DOCK) and ('binding' in words)
# File-name fragment for the active variant (empty when neither marker is present).
d_text = '_docking' if DOCK else ('_binding' if BIND else '')

In [6]:
# Read the pickled dictionary of data structures.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open(in_file, 'rb') as f:
    DS = pickle.load(f)

# Publish every entry of the dictionary as a notebook-level variable named after
# its key, so the following cells can refer to e.g. `edge_types` directly.
for key in DS:
    globals()[key] = DS[key]
    print(key, "Imported successfully")

edge2name Imported successfully
se_mono_name2idx Imported successfully
gene2idx Imported successfully
nonzero_feat Imported successfully
edge_type2dim Imported successfully
adj_mats_orig Imported successfully
edge_type2decoder Imported successfully
se_combo_name2idx Imported successfully
drug2idx Imported successfully
degrees Imported successfully
edge_types Imported successfully
num_edge_types Imported successfully
num_feat Imported successfully
feat Imported successfully


In [23]:
# Inspect the edge_types mapping loaded from the pickle; its values are used as
# per-key relation counts when the placeholders are built.
edge_types

{(0, 0): 1, (0, 1): 1, (1, 0): 1, (1, 1): 4}

In [7]:
# Entity counts, taken from the sizes of the index maps loaded above.
n_genes, n_drugs, n_se_combo, n_se_mono = (
    len(index_map)
    for index_map in (gene2idx, drug2idx, se_combo_name2idx, se_mono_name2idx)
)
print(n_genes, n_drugs, n_se_combo, n_se_mono, DSE)

500 400 4 600 False


# Functions

In [8]:
def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The textbook formula overflows in ``exp(-x)`` for large negative ``x``.
    This version only ever exponentiates ``-|x|``, which lies in ``(0, 1]``,
    and picks the algebraically equivalent branch for each sign of ``x``.

    Parameters:
        x: scalar or array-like of real numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an array with the shape of
        ``x``, holding the element-wise sigmoid.
    """
    x = np.asarray(x)
    # exp(-|x|) cannot overflow; at worst it underflows to 0.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x). Both are the same function.
    result = np.where(x >= 0, 1. / (1 + decay), decay / (1 + decay))
    # np.where yields a 0-d array for scalar input; [()] unwraps it to a scalar
    # and leaves real arrays untouched.
    return result[()]

In [9]:
def get_accuracy_scores(edges_pos, edges_neg, edge_type, noise=False):
    """Evaluate the model on one edge type for a given set of labelled edges.

    The prediction op is run once for ``edge_type`` with dropout disabled; the
    score of every listed edge is passed through a sigmoid and compared against
    the label implied by the list it came from.

    Parameters:
        edges_pos: iterable of ``(u, v)`` index pairs labelled as positive.
        edges_neg: iterable of ``(u, v)`` index pairs labelled as negative.
        edge_type: triple ``(i, j, k)``; ``(i, j)`` selects the row/column types
            and ``k`` the relation within ``adj_mats_orig[i, j]``.
        noise: when true, the sanity assertions against ``adj_mats_orig`` are
            skipped.

    Returns:
        Tuple ``(AUROC, AUPRC, accuracy)``.

    Raises:
        AssertionError: if ``noise`` is false and a positive edge is missing
            from (or a negative edge is present in) ``adj_mats_orig``.

    Note:
        Uses the notebook-level ``feed_dict``, ``placeholders``, ``minibatch``,
        ``sess``, ``opt`` and ``adj_mats_orig``, and updates ``feed_dict`` in place.
    """
    # Point the shared feed dict at the requested edge type, with dropout off.
    feed_dict[placeholders['dropout']] = 0
    feed_dict[placeholders['batch_edge_type_idx']] = minibatch.edge_type2idx[edge_type]
    feed_dict[placeholders['batch_row_edge_type']] = edge_type[0]
    feed_dict[placeholders['batch_col_edge_type']] = edge_type[1]
    rec = sess.run(opt.predictions, feed_dict=feed_dict)

    def _score(edges, expect_present):
        """Sigmoid scores for `edges`, optionally checked against adj_mats_orig."""
        scores = []
        for u, v in edges:
            scores.append(sigmoid(rec[u, v]))
            if noise:
                continue
            entry = adj_mats_orig[edge_type[:2]][edge_type[2]][u, v]
            if expect_present:
                assert entry > 0, 'Problem 1'
            else:
                assert entry == 0, 'Problem 0'
        return scores

    pos_scores = _score(edges_pos, True)
    neg_scores = _score(edges_neg, False)

    # Positives first, then negatives; labels follow the same layout.
    preds_all = np.nan_to_num(np.hstack([pos_scores, neg_scores]))
    labels_all = np.array([1.] * len(pos_scores) + [0.] * len(neg_scores))

    return (
        metrics.roc_auc_score(labels_all, preds_all),
        metrics.average_precision_score(labels_all, preds_all),
        metrics.accuracy_score(labels_all, np.round(preds_all)),
    )

In [10]:
def construct_placeholders(edge_types):
    """Create the TensorFlow placeholders the model and optimizer are fed through.

    Parameters:
        edge_types: mapping ``(i, j) -> n``; ``n`` is used as the number of
            adjacency placeholders created for the pair ``(i, j)``.

    Returns:
        Dict of placeholders keyed by name: the batch descriptors
        (``'batch'``, ``'batch_edge_type_idx'``, ``'batch_row_edge_type'``,
        ``'batch_col_edge_type'``), ``'degrees'``, ``'dropout'`` (defaults to
        0), one sparse ``'adj_mats_i,j,k'`` per pair and ``k < n``, and one
        sparse ``'feat_i'`` per first index ``i`` occurring in ``edge_types``.
    """
    placeholders = {}

    # Scalar / batch-level inputs.
    placeholders['batch'] = tf.placeholder(tf.int32, name='batch')
    for scalar_name in ('batch_edge_type_idx', 'batch_row_edge_type', 'batch_col_edge_type'):
        placeholders[scalar_name] = tf.placeholder(tf.int32, shape=(), name=scalar_name)
    placeholders['degrees'] = tf.placeholder(tf.int32)
    placeholders['dropout'] = tf.placeholder_with_default(0., shape=())

    # One sparse adjacency input per (row type, column type, relation index).
    for i, j in edge_types:
        for k in range(edge_types[i, j]):
            placeholders['adj_mats_%d,%d,%d' % (i, j, k)] = tf.sparse_placeholder(tf.float32)

    # One sparse feature input per row type. A placeholder is created for every
    # key of edge_types, so a repeated row type simply overwrites its entry.
    for i, _ in edge_types:
        placeholders['feat_%d' % i] = tf.sparse_placeholder(tf.float32)

    return placeholders

## Settings and placeholders

In [11]:
# Experiment settings. `noise` and `val_test_size` are used below to pick the
# matching minibatch file and to tag the results file name; `noise` additionally
# decides whether the test evaluation skips its adjacency sanity checks.
# presumably val_test_size is the held-out fraction used when the minibatch
# file was generated — confirm against the script that builds it.
noise = 0.05
val_test_size = 0.15
# Model/training hyper-parameters are registered as TensorFlow command-line flags
# and read back through FLAGS.<name>.
# NOTE: this cell registers flags on a shared flag object; check whether it can
# be re-run in the same kernel without a duplicate-flag error.
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_integer('neg_sample_size', 1, 'Negative sample size.')
flags.DEFINE_float('learning_rate', 0.001, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 10, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 64, 'Number of units in hidden layer 1.')
flags.DEFINE_integer('hidden2', 32, 'Number of units in hidden layer 2.')
flags.DEFINE_float('weight_decay', 0, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_float('dropout', 0.1, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('max_margin', 0.1, 'Max margin parameter in hinge loss')
flags.DEFINE_integer('batch_size', 128, 'minibatch size.')
flags.DEFINE_boolean('bias', True, 'Bias term.')

In [12]:
print("Defining placeholders")
# Build the dictionary of TensorFlow placeholders for the loaded edge types.
placeholders = construct_placeholders(edge_types)

Defining placeholders


In [13]:
# HACK: works around the Jupyter/TensorFlow issue in which the kernel supplies
# a -f flag; registering a dummy 'f' flag makes the flag parser accept it.
flags.DEFINE_string('f', '', 'kernel')

## Load minibatch

In [14]:
# File-name suffix recording the noise level (empty when noise is zero).
noise_str = ('_noise_' + str(noise)) if bool(noise) else ''
# The minibatch file name encodes the dataset name, entity counts, batch size,
# validation size and noise level, so it must match the current settings.
mb_file = ''.join([
    'data/data_structures/MINIBATCH/MINIBATCH_', words[2], d_text,
    '_genes_', str(n_genes),
    '_drugs_', str(n_drugs),
    '_se_', str(n_se_combo),
    '_batchsize_', str(FLAGS.batch_size),
    '_valsize_', str(val_test_size),
    noise_str,
])
print(mb_file)

data/data_structures/MINIBATCH/MINIBATCH_toy_genes_500_drugs_400_se_4_batchsize_128_valsize_0.15_noise_0.05


In [15]:
# Load the pre-built minibatch object (presumably an EdgeMinibatchIterator) from disk.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open(mb_file, 'rb') as f:
    minibatch = pickle.load(f)

## Create model and optimizer

In [16]:
print("Create model")
# Build the Decagon model on top of the placeholders, using the feature and
# edge-type metadata loaded from the input pickle.
model = DecagonModel(
    placeholders=placeholders,
    num_feat=num_feat,
    nonzero_feat=nonzero_feat,
    edge_types=edge_types,
    decoders=edge_type2decoder,
)

Create model
Instructions for updating:
dim is deprecated, use axis instead


In [17]:
print("Create optimizer")
# The optimizer is built from the model's embeddings and decoder parameters.
# It exposes the ops used later: opt.opt_op, opt.cost, opt.batch_edge_type_idx
# and opt.predictions.
with tf.name_scope('optimizer'):
    opt = DecagonOptimizer(
        embeddings=model.embeddings,
        latent_inters=model.latent_inters,
        latent_varies=model.latent_varies,
        degrees=degrees,
        edge_types=edge_types,
        edge_type2dim=edge_type2dim,
        placeholders=placeholders,
        batch_size=FLAGS.batch_size,
        margin=FLAGS.max_margin
    )

Create optimizer


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [18]:
print("Initialize session")
# Default session (the GPU configuration cell above is commented out).
sess = tf.Session()
sess.run(tf.global_variables_initializer())
# Notebook-level feed dict: the training loop rebinds it for every minibatch and
# get_accuracy_scores updates it in place.
feed_dict = {}
# Seconds elapsed between `start` and this point. This is a duration, not a timestamp.
pre_train_time = time.time()-start

Initialize session


# Train model

In [19]:
# Results file name: dataset name and variant markers followed by the entity
# counts and the hyper-parameters of this run.
out_file = ''.join([
    'results_training/TRAIN_', words[2], d_text,
    ('_DSE_' + str(n_se_mono)) if DSE else '',
    '_BDM' if BDM else '',
    '_genes_', str(n_genes),
    '_drugs_', str(n_drugs),
    '_se_', str(n_se_combo),
    '_epochs_', str(FLAGS.epochs),
    '_h1_', str(FLAGS.hidden1),
    '_h2_', str(FLAGS.hidden2),
    '_lr_', str(FLAGS.learning_rate),
    '_dropout_', str(FLAGS.dropout),
    '_valsize_', str(val_test_size),
    noise_str,
])
#out_file = 'results_training/sandboxish'
print(out_file)
# Everything that gets pickled to out_file is collected in this dictionary.
output_data = {}

results_training/TRAIN_toy_genes_500_drugs_400_se_4_epochs_10_h1_64_h2_32_lr_0.001_dropout_0.1_valsize_0.15_noise_0.05


In [21]:
# Metric structures initialization: for every epoch and edge type, the three
# values returned by get_accuracy_scores (AUROC, AUPRC, accuracy).
val_metrics = np.zeros([FLAGS.epochs,num_edge_types,3])
train_metrics = np.zeros([FLAGS.epochs,num_edge_types,3])
# Start training
print("Train model")
for epoch in range(FLAGS.epochs):
    t = time.time()
    # presumably reshuffles the training edges and resets the iterator so that
    # end() is False again — confirm in EdgeMinibatchIterator.
    minibatch.shuffle()
    itr = 0
    while not minibatch.end():
        # Construct feed dictionary (rebinds the notebook-level `feed_dict`,
        # which get_accuracy_scores reuses after the loop)
        feed_dict = minibatch.next_minibatch_feed_dict(placeholders=placeholders)
        feed_dict = minibatch.update_feed_dict(
            feed_dict=feed_dict,
            dropout=FLAGS.dropout,
            placeholders=placeholders)
        # Training step: run single weight update
        # NOTE: `outs` is not read anywhere else in this cell.
        outs = sess.run([opt.opt_op, opt.cost, opt.batch_edge_type_idx], feed_dict=feed_dict)
        # Progress message every 1000 iterations (prints the zero-based counter).
        if (itr+1)%1000==0:print('Iteration',itr)
        itr += 1
    # Train & validation metrics per edge type, computed once per epoch on the
    # full train and validation edge sets
    print('======================================================================================================================')
    print("Epoch", "%04d" % (epoch + 1),'finished!')
    print("Time=", "{:.5f}".format(time.time()-t))
    for r in range(num_edge_types):
        # Map the flat edge-type index back to (row type, column type, relation).
        i,j,k = minibatch.idx2edge_type[r]
        print('Metrics for ', edge2name[i,j][k])
        # `noise` is left at its default (False) here, so get_accuracy_scores
        # checks every edge against adj_mats_orig.
        train_metrics[epoch,r,:] = get_accuracy_scores(
            minibatch.train_edges[i,j][k], minibatch.train_edges_false[i,j][k],(i,j,k))
        val_metrics[epoch,r,:] = get_accuracy_scores(
            minibatch.val_edges[i,j][k], minibatch.val_edges_false[i,j][k],(i,j,k))
        print("AUROC:Train=", "{:.4f}".format(train_metrics[epoch,r,0])
              ,"Validation=", "{:.4f}".format(val_metrics[epoch,r,0])
              ,"AUPRC:Train=", "{:.4f}".format(train_metrics[epoch,r,1])
              ,"Validation=", "{:.4f}".format(val_metrics[epoch,r,1])
              ,"Accuracy:Train=", "{:.4f}".format(train_metrics[epoch,r,2])
              ,"Validation=", "{:.4f}".format(val_metrics[epoch,r,2]))
    # Checkpoint the metrics after every epoch; the file is overwritten each
    # time, and 'epoch' records how many epochs it covers.
    output_data['val_metrics'] = val_metrics
    output_data['train_metrics'] = train_metrics
    output_data['epoch'] = epoch + 1
    with open(out_file,'wb') as f:
        pickle.dump(output_data, f, protocol=2)

Train model
Epoch 0001 finished!
Time= 8.94718
Metrics for  DTI
AUROC:Train= 0.8414 Validation= 0.6525 AUPRC:Train= 0.8781 Validation= 0.6466 Accuracy:Train= 0.8182 Validation= 0.5849
Metrics for  TDI
AUROC:Train= 0.8141 Validation= 0.6226 AUPRC:Train= 0.8750 Validation= 0.6716 Accuracy:Train= 0.8083 Validation= 0.5943
Metrics for  PPI
AUROC:Train= 0.9715 Validation= 0.7584 AUPRC:Train= 0.9638 Validation= 0.7651 Accuracy:Train= 0.9188 Validation= 0.7065
Metrics for  0
AUROC:Train= 0.5014 Validation= 0.4979 AUPRC:Train= 0.5015 Validation= 0.4979 Accuracy:Train= 0.5009 Validation= 0.4984
Metrics for  1
AUROC:Train= 0.5079 Validation= 0.4829 AUPRC:Train= 0.5056 Validation= 0.4791 Accuracy:Train= 0.5036 Validation= 0.4927
Metrics for  2
AUROC:Train= 0.5202 Validation= 0.5161 AUPRC:Train= 0.5231 Validation= 0.5088 Accuracy:Train= 0.5115 Validation= 0.5056
Metrics for  3
AUROC:Train= 0.5616 Validation= 0.5017 AUPRC:Train= 0.5404 Validation= 0.5314 Accuracy:Train= 0.5674 Validation= 0.5175
Ep

AUROC:Train= 0.5239 Validation= 0.5063 AUPRC:Train= 0.5192 Validation= 0.5069 Accuracy:Train= 0.5165 Validation= 0.5034
Metrics for  1
AUROC:Train= 0.5437 Validation= 0.5172 AUPRC:Train= 0.5310 Validation= 0.5125 Accuracy:Train= 0.5303 Validation= 0.5152
Metrics for  2
AUROC:Train= 0.6210 Validation= 0.5498 AUPRC:Train= 0.5973 Validation= 0.5331 Accuracy:Train= 0.5872 Validation= 0.5112
Metrics for  3
AUROC:Train= 0.7459 Validation= 0.5029 AUPRC:Train= 0.7244 Validation= 0.5114 Accuracy:Train= 0.6685 Validation= 0.4737
Epoch 0009 finished!
Time= 6.51362
Metrics for  DTI
AUROC:Train= 0.9896 Validation= 0.8117 AUPRC:Train= 0.9893 Validation= 0.8004 Accuracy:Train= 0.9466 Validation= 0.6226
Metrics for  TDI
AUROC:Train= 0.9951 Validation= 0.8067 AUPRC:Train= 0.9949 Validation= 0.8361 Accuracy:Train= 0.9545 Validation= 0.6792
Metrics for  PPI
AUROC:Train= 0.9973 Validation= 0.8503 AUPRC:Train= 0.9968 Validation= 0.8988 Accuracy:Train= 0.9803 Validation= 0.8424
Metrics for  0
AUROC:Train= 0

In [None]:
# End of training: score every edge type on the test edges, then store the
# metrics together with timing, memory use and the index maps.
print("Optimization finished!")
# One (AUROC, AUPRC, accuracy) row per edge type.
test_metrics = np.zeros([num_edge_types,3])
for et in range(num_edge_types):
    # Map the flat edge-type index back to (row type, column type, relation).
    i,j,k = minibatch.idx2edge_type[et]
    # With a non-zero noise level the adjacency sanity checks inside
    # get_accuracy_scores are skipped (presumably because the noisy edge sets
    # no longer agree with adj_mats_orig — confirm against the minibatch builder).
    test_metrics[et,:] = get_accuracy_scores(
        minibatch.test_edges[i,j][k], minibatch.test_edges_false[i,j][k], (i,j,k),
        noise=bool(noise))
    print("Edge type=", edge2name[i,j][k])
    print("Edge type:", "%04d" % et, "Test AUROC score", "{:.5f}".format(test_metrics[et,0]))
    print("Edge type:", "%04d" % et, "Test AUPRC score", "{:.5f}".format(test_metrics[et,1]))
    print("Edge type:", "%04d" % et, "Test Accuracy score", "{:.5f}".format(test_metrics[et,2]))
    print()
output_data['test_metrics'] = test_metrics
# Memory footprint of this process at the end of the run.
memUse = ps.memory_info()
print('Virtual memory:', memUse.vms*1e-09,'Gb')
print('RSS Memory:', memUse.rss*1e-09,'Gb')
# pre_train_time is a duration (seconds from `start` to session initialisation),
# not a timestamp. The time spent training and evaluating is therefore the total
# elapsed time minus that set-up duration.
train_time = time.time() - start - pre_train_time
output_data['pre_train_time'] = pre_train_time
output_data['train_time'] = train_time
output_data['edge2name'] = edge2name
output_data['drug2idx'] = drug2idx
output_data['gene2idx'] = gene2idx
output_data['vms'] = memUse.vms
output_data['rss'] = memUse.rss
# Final write: overwrites the per-epoch checkpoint with the complete results.
with open(out_file,'wb') as f:
    pickle.dump(output_data, f, protocol=2)
print('Total time:', datetime.timedelta(seconds=time.time()-start))

In [25]:
# Dump the raw prediction output of the trained model for every edge type.
# opt_dic[i, j][k] holds whatever opt.predictions evaluates to for relation k
# of the pair (i, j).
opt_dic = {edge_type: [None]*n for edge_type, n in edge_types.items()}
for i, j in edge_types:
    for k in range(edge_types[i,j]):
        et = minibatch.edge_type2idx[i,j,k]
        # Reuse the notebook-level feed dict, redirected to this edge type with
        # dropout switched off.
        feed_dict[placeholders['dropout']] = 0
        feed_dict[placeholders['batch_edge_type_idx']] = et
        feed_dict[placeholders['batch_row_edge_type']] = i
        feed_dict[placeholders['batch_col_edge_type']] = j
        opt_dic[i,j][k] = sess.run(opt.predictions, feed_dict=feed_dict)

with open('data/data_structures/intento_optimizer', 'wb') as f:
    pickle.dump(opt_dic, f, protocol=2)