In [1]:
#@title Imports  { form-width: "30%" }
from __future__ import division
from __future__ import print_function
from operator import itemgetter
from itertools import combinations
import time
import os

import tensorflow as tf
from tensorflow.python.summary.writer.writer import FileWriter
import numpy as np
import networkx as nx
import scipy.sparse as sp
from sklearn import metrics
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle

from decagon.deep.optimizer import DecagonOptimizer
from decagon.deep.model import DecagonModel
from decagon.deep.minibatch import EdgeMinibatchIterator
from decagon.utility import rank_metrics, preprocessing

# config = tf.ConfigProto()
# config.gpu_options.allow_growth = True

np.random.seed(0)
%load_ext autoreload
%autoreload 2
%load_ext tensorboard.notebook

In [2]:
def get_accuracy_scores(edges_pos, edges_neg, edge_type):
    """Score held-out edges of one edge type and return ranking metrics.

    Relies on notebook-level globals: ``feed_dict`` (mutated in place),
    ``placeholders``, ``minibatch``, ``sess``, ``opt`` and ``adj_mats_orig``.

    Args:
        edges_pos: container indexed as ``edges_pos[(i, j)][k]`` yielding
            ``(u, v)`` pairs that must be set to 1 in ``adj_mats_orig``.
        edges_neg: same layout, yielding pairs that must be 0 in
            ``adj_mats_orig``.
        edge_type: triple ``(i, j, k)``; ``i``/``j`` are fed as the row/column
            edge type and ``k`` selects the relation within that pair.

    Returns:
        ``(roc_sc, aupr_sc, apk_sc)``: ``metrics.roc_auc_score``,
        ``metrics.average_precision_score`` and ``rank_metrics.apk`` with
        ``k=50``.

    Raises:
        AssertionError: with message 'Problem 1' / 'Problem 0' when a positive /
            negative pair disagrees with ``adj_mats_orig``.
    """
    # Evaluation runs without dropout and is pinned to the requested edge type.
    feed_dict[placeholders['dropout']] = 0
    feed_dict[placeholders['batch_edge_type_idx']] = minibatch.edge_type2idx[edge_type]
    feed_dict[placeholders['batch_row_edge_type']] = edge_type[0]
    feed_dict[placeholders['batch_col_edge_type']] = edge_type[1]
    rec = sess.run(opt.predictions, feed_dict=feed_dict)

    def _logistic(z):
        return 1. / (1 + np.exp(-z))

    node_pair, rel_idx = edge_type[:2], edge_type[2]

    # `scored` holds (score, running index) for every edge; `actual` keeps the
    # running indices that belong to positive edges.
    pos_scores, actual, scored = [], [], []
    for u, v in edges_pos[node_pair][rel_idx]:
        prob = _logistic(rec[u, v])
        pos_scores.append(prob)
        assert adj_mats_orig[node_pair][rel_idx][u, v] == 1, 'Problem 1'
        actual.append(len(scored))
        scored.append((prob, len(scored)))

    neg_scores = []
    for u, v in edges_neg[node_pair][rel_idx]:
        prob = _logistic(rec[u, v])
        neg_scores.append(prob)
        assert adj_mats_orig[node_pair][rel_idx][u, v] == 0, 'Problem 0'
        scored.append((prob, len(scored)))

    # NaN scores are zeroed for the sklearn metrics only; the ranking below
    # still uses the raw scores.
    preds_all = np.nan_to_num(np.hstack([pos_scores, neg_scores]))
    labels_all = np.hstack([np.ones(len(pos_scores)), np.zeros(len(neg_scores))])

    # Edge indices ordered by descending score.
    ranking = sorted(scored, key=itemgetter(0), reverse=True)
    predicted = list(zip(*ranking))[1]

    roc_sc = metrics.roc_auc_score(labels_all, preds_all)
    aupr_sc = metrics.average_precision_score(labels_all, preds_all)
    apk_sc = rank_metrics.apk(actual, predicted, k=50)

    return roc_sc, aupr_sc, apk_sc


def construct_placeholders(edge_types):
    """Create the TensorFlow placeholders keyed by name.

    Args:
        edge_types: mapping ``(i, j) -> number of relations`` between node
            types ``i`` and ``j``.

    Returns:
        Dict with the fixed entries 'batch', 'batch_edge_type_idx',
        'batch_row_edge_type', 'batch_col_edge_type', 'degrees' and 'dropout',
        one sparse float32 placeholder 'adj_mats_<i>,<j>,<k>' per relation, and
        one sparse float32 placeholder 'feat_<i>' per row node type.
    """
    placeholders = {}
    placeholders['batch'] = tf.placeholder(tf.int64, name='batch')
    # The three scalar selectors share dtype and shape; only the name differs.
    for scalar_name in ('batch_edge_type_idx', 'batch_row_edge_type', 'batch_col_edge_type'):
        placeholders[scalar_name] = tf.placeholder(tf.int64, shape=(), name=scalar_name)
    placeholders['degrees'] = tf.placeholder(tf.int64)
    placeholders['dropout'] = tf.placeholder_with_default(0., shape=())

    for row_type, col_type in edge_types:
        for rel in range(edge_types[row_type, col_type]):
            key = 'adj_mats_%d,%d,%d' % (row_type, col_type, rel)
            placeholders[key] = tf.sparse_placeholder(tf.float32)

    # One placeholder is created per (row_type, _) key, so a row type shared by
    # several edge types is created repeatedly and the last one is kept.
    for row_type, _ in edge_types:
        placeholders['feat_%d' % row_type] = tf.sparse_placeholder(tf.float32)
    return placeholders

In [3]:
# Passed to EdgeMinibatchIterator as `val_test_size`
# (presumably the fraction of edges held out for validation and for test -- confirm in decagon).
val_test_size = 0.05
# Prefix for every input file; it is joined by plain string concatenation,
# so the trailing slash is required.
data_folder = "./data/Full/"

In [4]:
# Map a dense 0-based index to every distinct organization id, in first-seen order.
# NOTE(review): pickle.load can execute arbitrary code; only load this file from a trusted source.
orgIds_map = {}
_seen_org_ids = set()
with open(data_folder + "organizationIds.pickle", "rb") as _orgf:
    _index = 0
    for _id in pickle.load(_orgf):
        # The dict is keyed by the running index, so `_id in orgIds_map` would compare
        # ids against indices; duplicates are tracked in a dedicated set of ids instead.
        if _id not in _seen_org_ids:
            _seen_org_ids.add(_id)
            orgIds_map[_index] = _id
            _index += 1
len(orgIds_map)

649079

In [5]:
# Load company-company edges grouped by relation type.
# Each row is read as "<company_1>,<company_2>,<relation>[,...]" (plain comma split, no quoting).
#   companies : set of every company id seen in either column
#   CCI       : relation -> list of (company_1, company_2) pairs, in file order
companies, CCI = set(), {}

with open(data_folder + "Organization_Organization.csv", encoding="utf8") as _oof:
    for _line in _oof:  # stream the file rather than materialising it with readlines()
        _cce = _line.strip().split(",")
        if _cce == [""]:
            continue  # tolerate blank lines, e.g. a trailing newline at end of file
        _c1, _c2, _e = _cce[0], _cce[1], _cce[2]
        companies.add(_c1)
        companies.add(_c2)
        CCI.setdefault(_e, []).append((_c1, _c2))

n_companies = len(companies)
n_compcomp_rel_types = len(CCI)
# Number the companies in sorted order. Iteration order of a set of strings changes
# between interpreter runs (hash randomisation), which would make the node indices,
# and everything derived from them, differ from one kernel restart to the next.
map_companies = {_p: _i for _i, _p in enumerate(sorted(companies))}
n_companies

205178

In [6]:
# Load person-company edges grouped by relation type.
# Column 0 is looked up in `map_companies`, column 1 is the person id, column 2 the relation.
# Rows whose company is unknown to `map_companies` are dropped.
#   persons : set of person ids attached to a known company
#   CPI     : relation -> list of (company, person) pairs, in file order
persons, CPI = set(), {}

# Decode with the same encoding as Organization_Organization.csv so that company ids
# read here compare equal to the keys of `map_companies`.
with open(data_folder + "Person_Organization.csv", encoding="utf8") as _opf:
    for _line in _opf:  # stream the file rather than materialising it with readlines()
        _cpe = _line.strip().split(",")
        if _cpe == [""]:
            continue  # tolerate blank lines, e.g. a trailing newline at end of file
        _c, _p, _e = _cpe[0], _cpe[1], _cpe[2]
        if _c in map_companies:
            persons.add(_p)
            CPI.setdefault(_e, []).append((_c, _p))

n_persons = len(persons)
# Sorted numbering keeps person indices identical across kernel restarts
# (set iteration order of strings is not stable between interpreter runs).
map_persons = {_p: _i for _i, _p in enumerate(sorted(persons))}
n_persons

355642

In [7]:
# One (n_companies x n_companies) adjacency matrix per company-company relation,
# in the iteration order of CCI.
comp_comp_adj_list = []
for _e in tqdm(CCI):
    _mat = np.array([[map_companies[_u], map_companies[_v]] for _u, _v in CCI[_e]])
    _data = np.ones(len(CCI[_e]))
    _adj = sp.csr_matrix((_data, (_mat[:, 0], _mat[:, 1])), shape=(n_companies, n_companies))
    # csr_matrix adds up repeated (row, col) pairs, so an edge listed twice in the CSV
    # would be stored as 2. Reset stored entries to 1 to keep the matrix binary, which
    # is what the `== 1` check in get_accuracy_scores expects.
    _adj.data[:] = 1.0
    comp_comp_adj_list.append(_adj)
# Column sums of each relation matrix, flattened to 1-D (one value per company).
comp_degrees_list = [np.array(_adj.sum(axis=0)).squeeze() for _adj in comp_comp_adj_list]

100%|██████████| 4/4 [00:01<00:00,  2.51it/s]


In [8]:
# One (n_persons x n_companies) adjacency matrix per person-company relation,
# in the iteration order of CPI. CPI stores (company, person); rows are persons.
pers_comp_adj_list = []
for _e in tqdm(CPI):
    _mat = np.array([[map_persons[_v], map_companies[_u]] for _u, _v in CPI[_e]])
    _data = np.ones(len(CPI[_e]))
    _adj = sp.csr_matrix((_data, (_mat[:, 0], _mat[:, 1])), shape=(n_persons, n_companies))
    # csr_matrix adds up repeated (row, col) pairs; reset stored entries to 1 so a
    # duplicated CSV row cannot produce a weight of 2.
    _adj.data[:] = 1.0
    pers_comp_adj_list.append(_adj)
# Row sums: one degree per person (length n_persons). These vectors are stored under
# node type 0 (persons) in `degrees`; summing over axis=0 would instead give one value
# per company (length n_companies).
# NOTE(review): assumes decagon expects degrees[node_type][k] to have one entry per node
# of that type -- confirm against DecagonOptimizer.
pers_comp_degrees_list = [np.array(_adj.sum(axis=1)).squeeze() for _adj in pers_comp_adj_list]

100%|██████████| 4/4 [00:03<00:00,  1.17it/s]


In [9]:
# Graph inputs for Decagon.
# adj_mats_orig: (row node type, column node type) -> list of adjacency matrices,
# one per relation. Node type 0 = persons, 1 = companies.
adj_mats_orig = {
#     (0, 0): [pers_adj, pers_adj.transpose(copy=True)],
    (0, 1): pers_comp_adj_list,
    # company -> person is the transpose of every person -> company matrix
    (1, 0): [x.transpose(copy=True) for x in pers_comp_adj_list],
    # each company-company relation appears twice: as loaded, then transposed
    (1, 1): comp_comp_adj_list + [x.transpose(copy=True) for x in comp_comp_adj_list],
#     (1, 2): [comp_bankr_adj],
#     (2, 1): [bankr_comp_adj]
}
# degrees: node type -> list of degree vectors computed in the cells above.
# NOTE(review): the transposed company-company matrices reuse the degree vectors of the
# untransposed ones; that is only equivalent when each relation is symmetric -- confirm.
degrees = {
    0: pers_comp_degrees_list,
    1: comp_degrees_list + comp_degrees_list,
#     2: [np.array([np.sum(bankr_comp_adj)]), np.array([np.sum(bankr_comp_adj)])]
}

In [10]:
# Persons have no side features: use an identity matrix (one-hot row per person).
pers_feat = sp.identity(n_persons)
# For an n x n identity, shape[0] equals its number of non-zero entries and
# shape[1] is the feature dimension.
pers_nonzero_feat, pers_num_feat = pers_feat.shape
# Convert to the tuple form produced by decagon's preprocessing.sparse_to_tuple.
pers_feat = preprocessing.sparse_to_tuple(pers_feat.tocoo())

# Companies are treated the same way: identity features.
comp_feat = sp.identity(n_companies)
comp_nonzero_feat, comp_num_feat = comp_feat.shape
comp_feat = preprocessing.sparse_to_tuple(comp_feat.tocoo())

# Disabled: identity features for a third node type (not defined in this notebook).
# banrp_feat = sp.identity(n_bankruptcy)
# banrp_nonzero_feat, banrp_num_feat = banrp_feat.shape
# banrp_feat = preprocessing.sparse_to_tuple(banrp_feat.tocoo())

In [11]:
# Per-node-type feature descriptors (0 = persons, 1 = companies).
# num_feat: feature dimension of each node type.
num_feat = {
    0: pers_num_feat,
    1: comp_num_feat,
#     2: banrp_num_feat
}
# nonzero_feat: number of non-zero entries in each feature matrix.
nonzero_feat = {
    0: pers_nonzero_feat,
    1: comp_nonzero_feat,
#     2: banrp_nonzero_feat
}
# feat: the feature matrices in sparse tuple form.
feat = {
    0: pers_feat,
    1: comp_feat,
#     2: banrp_feat
}

# Shape of every adjacency matrix, grouped by (row type, column type).
edge_type2dim = {k: [adj.shape for adj in adjs] for k, adjs in adj_mats_orig.items()}
# Decoder name handed to DecagonModel for each (row type, column type) pair.
edge_type2decoder = {
#     (0, 0): 'bilinear',
    (0, 1): 'bilinear',
    (1, 0): 'bilinear',
    (1, 1): 'dedicom',
#     (1, 2): 'bilinear',
#     (2, 1): 'bilinear'
}

# Number of relations per (row type, column type) pair, and their total.
edge_types = {k: len(v) for k, v in adj_mats_orig.items()}
num_edge_types = sum(edge_types.values())
print("Edge types:", "%d" % num_edge_types)

Edge types: 16


In [12]:
###########################################################
#
# Settings and placeholders
#
###########################################################

# Hyper-parameters are registered as TensorFlow flags and read back through FLAGS.
# The cells in this notebook read FLAGS.epochs, FLAGS.dropout, FLAGS.batch_size and
# FLAGS.max_margin; the remaining flags are presumably read inside decagon -- TODO confirm.
# NOTE(review): flags are global to the process, so re-running this cell in the same
# kernel presumably fails with a duplicate-flag error -- restart the kernel to change them.
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_integer('neg_sample_size', 1, 'Negative sample size.')
flags.DEFINE_float('learning_rate', 0.001, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 1, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 64, 'Number of units in hidden layer 1.')
flags.DEFINE_integer('hidden2', 32, 'Number of units in hidden layer 2.')
flags.DEFINE_float('weight_decay', 0, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_float('dropout', 0.1, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('max_margin', 0.1, 'Max margin parameter in hinge loss')
flags.DEFINE_integer('batch_size', 512, 'minibatch size.')
flags.DEFINE_boolean('bias', True, 'Bias term.')
# Important -- Do not evaluate/print validation performance every iteration as it can take
# substantial amount of time
# Validation metrics are computed and printed every PRINT_PROGRESS_EVERY training iterations.
PRINT_PROGRESS_EVERY = 150

In [13]:
# Build the placeholder dict shared by the minibatch iterator, model and optimizer.
print("Defining placeholders")
placeholders = construct_placeholders(edge_types)
# Registers a dummy string flag named 'f'.
# NOTE(review): presumably this absorbs the `-f <connection file>` argument Jupyter passes
# to the kernel so that flag parsing does not reject it -- confirm.
tf.app.flags.DEFINE_string('f', '', 'kernel')
# Last expression: display the placeholder dict as the cell output.
placeholders

Defining placeholders


{'batch': <tf.Tensor 'batch:0' shape=<unknown> dtype=int64>,
 'batch_edge_type_idx': <tf.Tensor 'batch_edge_type_idx:0' shape=() dtype=int64>,
 'batch_row_edge_type': <tf.Tensor 'batch_row_edge_type:0' shape=() dtype=int64>,
 'batch_col_edge_type': <tf.Tensor 'batch_col_edge_type:0' shape=() dtype=int64>,
 'degrees': <tf.Tensor 'Placeholder:0' shape=<unknown> dtype=int64>,
 'dropout': <tf.Tensor 'PlaceholderWithDefault:0' shape=() dtype=float32>,
 'adj_mats_0,1,0': <tensorflow.python.framework.sparse_tensor.SparseTensor at 0x246df85d630>,
 'adj_mats_0,1,1': <tensorflow.python.framework.sparse_tensor.SparseTensor at 0x246d1d1cd68>,
 'adj_mats_0,1,2': <tensorflow.python.framework.sparse_tensor.SparseTensor at 0x246dfa14cc0>,
 'adj_mats_0,1,3': <tensorflow.python.framework.sparse_tensor.SparseTensor at 0x246dfa4e828>,
 'adj_mats_1,0,0': <tensorflow.python.framework.sparse_tensor.SparseTensor at 0x246dfa4ea90>,
 'adj_mats_1,0,1': <tensorflow.python.framework.sparse_tensor.SparseTensor at 0

In [14]:
# Create minibatch iterator
# The iterator receives the full adjacency matrices and `val_test_size`; the training
# and evaluation cells later read its val_edges / test_edges (and *_false) attributes.

print("Create minibatch iterator")
minibatch = EdgeMinibatchIterator(
    adj_mats=adj_mats_orig,
    feat=feat,
    edge_types=edge_types,
    batch_size=FLAGS.batch_size,
    val_test_size=val_test_size
)

Create minibatch iterator
Minibatch edge type: (0, 1, 0)
Constructing test edges= 0000/14438
Constructing test edges= 1000/14438
Constructing test edges= 2000/14438
Constructing test edges= 3000/14438
Constructing test edges= 4000/14438
Constructing test edges= 5000/14438
Constructing test edges= 6000/14438
Constructing test edges= 7000/14438
Constructing test edges= 8000/14438
Constructing test edges= 9000/14438
Constructing test edges= 10000/14438
Constructing test edges= 11000/14438
Constructing test edges= 12000/14438
Constructing test edges= 13000/14438
Constructing test edges= 14000/14438
Constructing val edges= 0000/14438
Constructing val edges= 1000/14438
Constructing val edges= 2000/14438
Constructing val edges= 3000/14438
Constructing val edges= 4000/14438
Constructing val edges= 5000/14438
Constructing val edges= 6000/14438
Constructing val edges= 7000/14438
Constructing val edges= 8000/14438
Constructing val edges= 9000/14438
Constructing val edges= 10000/14438
Constructing

In [15]:
# create model
# Builds the Decagon graph on top of the shared placeholders; `decoders` selects the
# decoder name per (row type, column type) pair from edge_type2decoder.

print("Create model")
model = DecagonModel(
    placeholders=placeholders,
    num_feat=num_feat,
    nonzero_feat=nonzero_feat,
    edge_types=edge_types,
    decoders=edge_type2decoder,
)

Create model
Instructions for updating:
dim is deprecated, use axis instead


In [None]:
# create optimizer
# The optimizer is built from the model's embeddings and decoder parameters; it exposes
# opt_op, cost, predictions and batch_edge_type_idx, which the later cells run.

print("Create optimizer")
with tf.name_scope('optimizer'):
    opt = DecagonOptimizer(
        embeddings=model.embeddings,
        latent_inters=model.latent_inters,
        latent_varies=model.latent_varies,
        degrees=degrees,
        edge_types=edge_types,
        edge_type2dim=edge_type2dim,
        placeholders=placeholders,
        batch_size=FLAGS.batch_size,
        margin=FLAGS.max_margin
    )

print("Initialize session")
# The session is kept open as a notebook global; get_accuracy_scores and the training
# loop both use it.
sess = tf.Session()
# FileWriter("output", sess.graph)
sess.run(tf.global_variables_initializer())
# Global feed dict: replaced on every training step and updated in place by
# get_accuracy_scores, so evaluation reuses the inputs of the latest batch.
feed_dict = {}



Create optimizer


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Initialize session


In [65]:
###########################################################
#
# Train model
#
###########################################################

# Training: one pass over the shuffled minibatches per epoch. Validation metrics for the
# edge type of the current batch are reported every PRINT_PROGRESS_EVERY iterations.
print("Train model")
for epoch in range(FLAGS.epochs):

    minibatch.shuffle()
    itr = 0
    while not minibatch.end():

        # Inputs of the next batch, with the training dropout rate applied on top.
        feed_dict = minibatch.update_feed_dict(
            feed_dict=minibatch.next_minibatch_feed_dict(placeholders=placeholders),
            dropout=FLAGS.dropout,
            placeholders=placeholders)

        t = time.time()

        # One weight update; also fetch the loss and the edge type index of this batch.
        outs = sess.run([opt.opt_op, opt.cost, opt.batch_edge_type_idx], feed_dict=feed_dict)
        train_cost, batch_edge_type = outs[1], outs[2]

        if not itr % PRINT_PROGRESS_EVERY:
            val_auc, val_auprc, val_apk = get_accuracy_scores(
                minibatch.val_edges, minibatch.val_edges_false,
                minibatch.idx2edge_type[minibatch.current_edge_type_idx])

            # The reported time spans the training step and the validation pass.
            print(" ".join([
                "Epoch:", "%04d" % (epoch + 1),
                "Iter:", "%04d" % (itr + 1),
                "Edge:", "%04d" % batch_edge_type,
                "train_loss=", "{:.5f}".format(train_cost),
                "val_roc=", "{:.5f}".format(val_auc),
                "val_auprc=", "{:.5f}".format(val_auprc),
                "val_apk=", "{:.5f}".format(val_apk),
                "time=", "{:.5f}".format(time.time() - t),
            ]))

        itr += 1

print("Optimization finished!")

# Final report: test metrics for every edge type index.
for et in range(num_edge_types):
    roc_score, auprc_score, apk_score = get_accuracy_scores(
        minibatch.test_edges, minibatch.test_edges_false, minibatch.idx2edge_type[et])
    print("Edge type=", "[%02d, %02d, %02d]" % minibatch.idx2edge_type[et])
    for _label, _value in (("Test AUROC score", roc_score),
                           ("Test AUPRC score", auprc_score),
                           ("Test AP@k score", apk_score)):
        print("Edge type:", "%04d" % et, _label, "{:.5f}".format(_value))
    print()
