In [31]:
from __future__ import division
from __future__ import print_function
from operator import itemgetter
from itertools import combinations
import time
import os

import tensorflow.compat.v1 as tf

import numpy as np
import networkx as nx
import scipy.sparse as sp
from sklearn import metrics

from decagon.deep.optimizer import DecagonOptimizer
from decagon.deep.model import DecagonModel
from decagon.deep.minibatch import EdgeMinibatchIterator
from decagon.utility import rank_metrics, preprocessing

tf.disable_eager_execution()

########## scipy: version 1.9로 바꿔야함

#깔아야하는 packages
#networkx
#scikit-learn
#tensorflow

# Train on CPU (hide GPU) due to memory constraints
os.environ['CUDA_VISIBLE_DEVICES'] = ""

# Train on GPU
# os.environ["CUDA_DEVICE_ORDER"] = 'PCI_BUS_ID'
# os.environ["CUDA_VISIBLE_DEVICES"] = '0'
# config = tf.ConfigProto()
# config.gpu_options.allow_growth = True

np.random.seed(0)

In [32]:

###########################################################
#
# Functions
#
###########################################################


def get_accuracy_scores(edges_pos, edges_neg, edge_type):
    feed_dict.update({placeholders['dropout']: 0})
    feed_dict.update({placeholders['batch_edge_type_idx']: minibatch.edge_type2idx[edge_type]})
    feed_dict.update({placeholders['batch_row_edge_type']: edge_type[0]})
    feed_dict.update({placeholders['batch_col_edge_type']: edge_type[1]})
    rec = sess.run(opt.predictions, feed_dict=feed_dict)

    def sigmoid(x):
        return 1. / (1 + np.exp(-x))

    # Predict on test set of edges
    preds = []
    actual = []
    predicted = []
    edge_ind = 0
    for u, v in edges_pos[edge_type[:2]][edge_type[2]]:
        score = sigmoid(rec[u, v])
        preds.append(score)
        assert adj_mats_orig[edge_type[:2]][edge_type[2]][u,v] == 1, 'Problem 1'

        actual.append(edge_ind)
        predicted.append((score, edge_ind))
        edge_ind += 1

    preds_neg = []
    for u, v in edges_neg[edge_type[:2]][edge_type[2]]:
        score = sigmoid(rec[u, v])
        preds_neg.append(score)
        assert adj_mats_orig[edge_type[:2]][edge_type[2]][u,v] == 0, 'Problem 0'

        predicted.append((score, edge_ind))
        edge_ind += 1

    preds_all = np.hstack([preds, preds_neg])
    preds_all = np.nan_to_num(preds_all)
    labels_all = np.hstack([np.ones(len(preds)), np.zeros(len(preds_neg))])
    predicted = list(zip(*sorted(predicted, reverse=True, key=itemgetter(0))))[1]

    roc_sc = metrics.roc_auc_score(labels_all, preds_all)
    aupr_sc = metrics.average_precision_score(labels_all, preds_all)
    apk_sc = rank_metrics.apk(actual, predicted, k=50)

    return roc_sc, aupr_sc, apk_sc


def construct_placeholders(edge_types):
    placeholders = {
        'batch': tf.placeholder(tf.int32, name='batch'),
        'batch_edge_type_idx': tf.placeholder(tf.int32, shape=(), name='batch_edge_type_idx'),
        'batch_row_edge_type': tf.placeholder(tf.int32, shape=(), name='batch_row_edge_type'),
        'batch_col_edge_type': tf.placeholder(tf.int32, shape=(), name='batch_col_edge_type'),
        'degrees': tf.placeholder(tf.int32),
        'dropout': tf.placeholder_with_default(0., shape=()),
    }
    placeholders.update({
        'adj_mats_%d,%d,%d' % (i, j, k): tf.sparse_placeholder(tf.float32)
        for i, j in edge_types for k in range(edge_types[i,j])})
    placeholders.update({
        'feat_%d' % i: tf.sparse_placeholder(tf.float32)
        for i, _ in edge_types})
    return placeholders

In [33]:

###########################################################
#
# Load and preprocess data (This is a dummy toy example!)
#
###########################################################

####
# The following code uses artificially generated and very small networks.
# Expect less than excellent performance as these random networks do not have any interesting structure.
# The purpose of main.py is to show how to use the code!
#
# All preprocessed datasets used in the drug combination study are at: http://snap.stanford.edu/decagon:
# (1) Download datasets from http://snap.stanford.edu/decagon to your local machine.
# (2) Replace dummy toy datasets used here with the actual datasets you just downloaded.
# (3) Train & test the model.
####

val_test_size = 0.05

In [34]:
print(type(gene_adj))
print(type(gene_drug_adj))
print(type(drug_drug_adj_list))
print("transposed gene_adj")
print(gene_adj.transpose(copy=True))

<class 'scipy.sparse._csr.csr_matrix'>
<class 'scipy.sparse._csr.csr_matrix'>
<class 'list'>
transposed gene_adj
  (118460, 114785)	1
  (3607, 114785)	1
  (8338, 114785)	1
  (11266, 114785)	1
  (11340, 114785)	1
  (23172, 114785)	1
  (221656, 114785)	1
  (221937, 114785)	1
  (342371, 114786)	1
  (9662, 114786)	1
  (55200, 114786)	1
  (137695, 114787)	1
  (1901, 114787)	1
  (2775, 114787)	1
  (2781, 114787)	1
  (3184, 114787)	1
  (4914, 114787)	1
  (5793, 114787)	1
  (7448, 114787)	1
  (8089, 114787)	1
  (10217, 114787)	1
  (23528, 114787)	1
  (29785, 114787)	1
  (51150, 114787)	1
  (51343, 114787)	1
  (54825, 114787)	1
  (57559, 114787)	1
  (84548, 114787)	1
  (285613, 114787)	1


In [35]:
print([gene_adj, gene_adj.transpose(copy=True)])
print(type(gene_adj))
print(gene_adj[0,3])
print(gene_adj[0].shape)
print(gene_adj[0][0])
print(type(drug_drug_adj_list))

[<110255169x110255169 sparse matrix of type '<class 'numpy.int8'>'
	with 29 stored elements in Compressed Sparse Row format>, <110255169x110255169 sparse matrix of type '<class 'numpy.int8'>'
	with 29 stored elements in Compressed Sparse Column format>]
<class 'scipy.sparse._csr.csr_matrix'>
0
(1, 110255169)

<class 'list'>


In [36]:
import csv

# for line in reader:
    

In [37]:
import pandas as pd



In [38]:
# protein-protein

df = pd.read_csv('/mnt/nas2/seogyeong/bio-decagon-ppi.csv', sep=',', header=0)
print(type(df.values))
print(df.values.T[0])
gene_1max = np.max( df.values.T[0] )
gene_2max = np.max( df.values.T[1] )
gene_max = gene_1max
if gene_max < gene_2max:
    gene_max = gene_2max
print(gene_max)

gene_adj = sp.csr_matrix((gene_max, gene_max), dtype=np.int8)

a =  0

f = open("/mnt/nas2/seogyeong/bio-decagon-ppi.csv")
reader = csv.reader(f)
count = 0

print(gene_adj)
print(gene_adj.shape)

for line in reader:
    #print(line)
    count = count + 1
    if a==0:
        a=1
        continue
    if count % 25 == 0:
        print("count: ",count, ", ", line)
    gene_adj[int(line[0]), int(line[1])] = 1
print("hey")
print(gene_adj)
gene_degrees = np.array(gene_adj.sum(axis=0)).squeeze()
print(gene_degrees)
# print(df)

<class 'numpy.ndarray'>
[114787 114787 114787 ...   5635   5636   8480]
110255169

(110255169, 110255169)


  self._set_intXint(row, col, x.flat[0])


count:  25 ,  ['114785', '221937']
count:  50 ,  ['114784', '4690']
count:  75 ,  ['114781', '491']
count:  100 ,  ['114781', '8454']
count:  125 ,  ['114781', '10755']
count:  150 ,  ['114789', '4035']
count:  175 ,  ['28996', '25942']
count:  200 ,  ['28996', '4204']
count:  225 ,  ['28996', '7090']
count:  250 ,  ['28996', '6497']
count:  275 ,  ['28996', '4999']
count:  300 ,  ['28992', '11198']
count:  325 ,  ['28992', '7520']
count:  350 ,  ['28999', '120']
count:  375 ,  ['28998', '6128']
count:  400 ,  ['28998', '54948']
count:  425 ,  ['28998', '51250']
count:  450 ,  ['28998', '6164']
count:  475 ,  ['28998', '4358']
count:  500 ,  ['28998', '79590']
count:  525 ,  ['28998', '51023']
count:  550 ,  ['28998', '60558']
count:  575 ,  ['28998', '51649']
count:  600 ,  ['28998', '7189']
count:  625 ,  ['28998', '6235']
count:  650 ,  ['28998', '51264']
count:  675 ,  ['55349', '64577']
count:  700 ,  ['55349', '2588']
count:  725 ,  ['55349', '416']
count:  750 ,  ['55293', '9563

KeyboardInterrupt: 

In [None]:
#protein-protein check
print("1")
print(gene_gene_adj)
print("2")

close(f)

In [None]:

# gene_drug_adj = sp.csr_matrix((10 * np.random.randn(n_genes, n_drugs) > 15).astype(int))
# drug_gene_adj = gene_drug_adj.transpose(copy=True)

f = open("/mnt/nas2/seogyeong/bio-decagon-targets-all.csv")
reader = csv.reader(f)

drug_max = 0
a =  0
for line in reader:
    print(int(line[0][3:]))
    if a==0:
        a=1
        continue
    if( drug_max < int(line[0][3:])) drug_max = int(line[0][3:])
print(drug_max)
close(f)

f = open("/mnt/nas2/seogyeong/bio-decagon-targets-all.csv")
reader = csv.reader(f)
gene_drug_adj = sp.csr_matrix((gene_max, drug_max), dtype=np.int8)

a =  0
for line in reader:
    print(line)
    if a==0:
        a=1
        continue
    gene_drug_adj[int(line[1]), int(line[0][3:])] = 1
print("hey")
print(gene_drug_adj)
drug_gene_adj = gene_drug_adj.transpose(copy=True)

close(f)





In [None]:
#drug_gene check
print("1")
print(gene_drug_adj)
print("2")

In [None]:
# drug_drug check


#drug_drug_adj_list = []
#tmp = np.dot(drug_gene_adj, gene_drug_adj)
#for i in range(n_drugdrug_rel_types):
#    mat = np.zeros((n_drugs, n_drugs))
#    for d1, d2 in combinations(list(range(n_drugs)), 2):
#        if tmp[d1, d2] == i + 4:
#            mat[d1, d2] = mat[d2, d1] = 1.
#    drug_drug_adj_list.append(sp.csr_matrix(mat))
#    print("hmm")
#    print(drug_drug_adj_list)
# drug_degrees_list = [np.array(drug_adj.sum(axis=0)).squeeze() for drug_adj in drug_drug_adj_list]





drug_drug_adj_list = []

# category
df = pd.read_csv('/mnt/nas2/seogyeong/bio-decagon-effectcategories.csv', sep=',', header=0)
print(type(df.values))
print(df.values.T[0])
category = df.values.T[0]




for cat in category:
    gene_drug_adj = sp.csr_matrix((drug_max, drug_max), dtype=np.int8)
    f = open("/mnt/nas2/seogyeong/bio-decagon-effectcategories.csv")
    reader = csv.reader(f)
    
    for line in reader:
        if line[2] == cat:
            gene_drug_adj[int(line[0][3:]), int(line[1][3:]) ] = 1
            gene_drug_adj[int(line[1][3:]), int(line[0][3:]) ] = 1
    drug_drug_adj_list.append(gene_drug_adj)
    close(f)
drug_degrees_list = [np.array(drug_adj.sum(axis=0)).squeeze() for drug_adj in drug_drug_adj_list]


In [None]:
# data representation
adj_mats_orig = {
    (0, 0): [gene_adj, gene_adj.transpose(copy=True)],
    (0, 1): [gene_drug_adj],
    (1, 0): [drug_gene_adj],
    (1, 1): drug_drug_adj_list + [x.transpose(copy=True) for x in drug_drug_adj_list],
}

In [None]:

# featureless (genes)
gene_feat = sp.identity(gene_max)
gene_nonzero_feat, gene_num_feat = gene_feat.shape
gene_feat = preprocessing.sparse_to_tuple(gene_feat.tocoo())

# features (drugs)
drug_feat = sp.identity(drug_max)
drug_nonzero_feat, drug_num_feat = drug_feat.shape
drug_feat = preprocessing.sparse_to_tuple(drug_feat.tocoo())

# data representation
num_feat = {
    0: gene_num_feat,
    1: drug_num_feat,
}
nonzero_feat = {
    0: gene_nonzero_feat,
    1: drug_nonzero_feat,
}
feat = {
    0: gene_feat,
    1: drug_feat,
}

edge_type2dim = {k: [adj.shape for adj in adjs] for k, adjs in adj_mats_orig.items()}
edge_type2decoder = {
    (0, 0): 'bilinear',
    (0, 1): 'bilinear',
    (1, 0): 'bilinear',
    (1, 1): 'dedicom',
}
print(adj_mats_orig)
edge_types = {k: len(v) for k, v in adj_mats_orig.items()}
print(edge_types)
num_edge_types = sum(edge_types.values())
print("Edge types:", "%d" % num_edge_types)


In [None]:

###########################################################
#
# Settings and placeholders
#
###########################################################

flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_integer('neg_sample_size', 1, 'Negative sample size.')
flags.DEFINE_float('learning_rate', 0.001, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 50, 'Number of epochs to train.')
flags.DEFINE_integer('hidden1', 64, 'Number of units in hidden layer 1.')
flags.DEFINE_integer('hidden2', 32, 'Number of units in hidden layer 2.')
flags.DEFINE_float('weight_decay', 0, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_float('dropout', 0.1, 'Dropout rate (1 - keep probability).')
flags.DEFINE_float('max_margin', 0.1, 'Max margin parameter in hinge loss')
flags.DEFINE_integer('batch_size', 512, 'minibatch size.')
flags.DEFINE_boolean('bias', True, 'Bias term.')
# Important -- Do not evaluate/print validation performance every iteration as it can take
# substantial amount of time
PRINT_PROGRESS_EVERY = 150

print("Defining placeholders")
placeholders = construct_placeholders(edge_types)
print(placeholders)


In [None]:

###########################################################
#
# Create minibatch iterator, model and optimizer
#
###########################################################

print("Create minibatch iterator")
minibatch = EdgeMinibatchIterator(
    adj_mats=adj_mats_orig,
    feat=feat,
    edge_types=edge_types,
    batch_size=FLAGS.batch_size,
    val_test_size=val_test_size
)

print("Create model")
model = DecagonModel(
    placeholders=placeholders,
    num_feat=num_feat,
    nonzero_feat=nonzero_feat,
    edge_types=edge_types,
    decoders=edge_type2decoder,
)

print("Create optimizer")
with tf.name_scope('optimizer'):
    opt = DecagonOptimizer(
        embeddings=model.embeddings,
        latent_inters=model.latent_inters,
        latent_varies=model.latent_varies,
        degrees=degrees,
        edge_types=edge_types,
        edge_type2dim=edge_type2dim,
        placeholders=placeholders,
        batch_size=FLAGS.batch_size,
        margin=FLAGS.max_margin
    )

print("Initialize session")
sess = tf.Session()
sess.run(tf.global_variables_initializer())
feed_dict = {}


In [None]:

###########################################################
#
# Train model
#
###########################################################

print("Train model")
for epoch in range(FLAGS.epochs):

    minibatch.shuffle()
    itr = 0
    while not minibatch.end():
        # Construct feed dictionary
        feed_dict = minibatch.next_minibatch_feed_dict(placeholders=placeholders)
        feed_dict = minibatch.update_feed_dict(
            feed_dict=feed_dict,
            dropout=FLAGS.dropout,
            placeholders=placeholders)

        t = time.time()

        # Training step: run single weight update
        outs = sess.run([opt.opt_op, opt.cost, opt.batch_edge_type_idx], feed_dict=feed_dict)
        train_cost = outs[1]
        batch_edge_type = outs[2]

        if itr % PRINT_PROGRESS_EVERY == 0:
            val_auc, val_auprc, val_apk = get_accuracy_scores(
                minibatch.val_edges, minibatch.val_edges_false,
                minibatch.idx2edge_type[minibatch.current_edge_type_idx])

            print("Epoch:", "%04d" % (epoch + 1), "Iter:", "%04d" % (itr + 1), "Edge:", "%04d" % batch_edge_type,
                  "train_loss=", "{:.5f}".format(train_cost),
                  "val_roc=", "{:.5f}".format(val_auc), "val_auprc=", "{:.5f}".format(val_auprc),
                  "val_apk=", "{:.5f}".format(val_apk), "time=", "{:.5f}".format(time.time() - t))

        itr += 1

print("Optimization finished!")

for et in range(num_edge_types):
    roc_score, auprc_score, apk_score = get_accuracy_scores(
        minibatch.test_edges, minibatch.test_edges_false, minibatch.idx2edge_type[et])
    print("Edge type=", "[%02d, %02d, %02d]" % minibatch.idx2edge_type[et])
    print("Edge type:", "%04d" % et, "Test AUROC score", "{:.5f}".format(roc_score))
    print("Edge type:", "%04d" % et, "Test AUPRC score", "{:.5f}".format(auprc_score))
    print("Edge type:", "%04d" % et, "Test AP@k score", "{:.5f}".format(apk_score))
    print()
