In [25]:
from comet_ml import Experiment

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from random import choice
import urllib.request  # the lib that handles the url stuff
import time

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import zero_one_loss
from itertools import *


url = "https://raw.githubusercontent.com/gracexwho/drug-drug-interactions/master/ChCh-Miner_durgbank-chem-chem.tsv"
url_data = urllib.request.urlopen(url) 

G = nx.read_edgelist(url_data)

print(G.number_of_nodes())
print(G.number_of_edges())


# Create an experiment
experiment = Experiment(api_key="yeThLw8MLFuaMF3cVW1b9IsIt",
                        project_name="Node2Vec", workspace="gracexwho")

# Report any information you need by:

################# CONTROL ##################

hyper_params = {"learning_rate": 0.03, "epochs": 20, "num_walks": 100, "walk_length": 10, "window_size": 3}
experiment.log_parameters(hyper_params)

num_train = 1200
num_val = 100

################# CONTROL ##################

1514
48514


COMET INFO: ----------------------------
COMET INFO: Comet.ml Experiment Summary:
COMET INFO:   Data:
COMET INFO:     url: https://www.comet.ml/gracexwho/node2vec/1339ef6c37c343cf89d1772301c5d244
COMET INFO:   Metrics:
COMET INFO:                          loss: 1.386304259300232
COMET INFO:         sys.gpu.0.free_memory: 4216324096
COMET INFO:     sys.gpu.0.gpu_utilization: 0
COMET INFO:        sys.gpu.0.total_memory: 4294967296
COMET INFO:         sys.gpu.0.used_memory: 78643200
COMET INFO: ----------------------------
COMET INFO: old comet version (1.0.55) detected. current: 2.0.1 please update your comet lib with command: `pip install --no-cache-dir --upgrade comet_ml`
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/gracexwho/node2vec/a7767292f5de4cfd8e8d9fd6c9c118c6



In [22]:
# Generate random walks
pairs = []

for i in range(hyper_params['num_walks']):
    current = choice(list(G.nodes()))
    walk = [current]
    y = []
    
    for w in range(hyper_params['walk_length']):
        # walk to an adjacent node
        # error: some adjacent nodes are NOT IN the training set
        c = list(G.adj[current])
        current = choice(c)
        walk.append(current)
    
    # take permutations as closely related within the window size
    y = [permutations(walk[i : i+hyper_params['window_size']], 2) for i in range(len(walk)-hyper_params['window_size'])]
    z = []
    for l in y:
        z.extend(list(l))
    pairs.extend(z)

# remove duplicates
pairs = list(dict.fromkeys(pairs))
#print(pairs)


Pairs = [(id1, id2) …]

-Define ID map (pre-processing step)
-Before training loop, do

Pairs = (get_id(pair[0]) , get_id(pair[1] for pair in pairs)

Id_map = {}
Def get_id(id_string):
	If id_string is in id_map:
		Return id_map(id_string)
	Else:
		Id = len(id_map)
		id_map(id_string) = Id
		Return Id


For epoch in range(num_epochs):
Shuffle(pairs)
Index = 0
Batch_size = 64
While index+batch_size < length(pairs):
	Batch = pairs[index:index+batch_size]      // index[min(index+batch_size, len(pairs))
	Index += batch_size

Def process_batch(batch):
	Left_ids = LongTensor([pair[0] for pair in batch])
	Right_ids = LongTensor([pair[1] for pair in batch])
	Neg_ids = LongTensor([np.randint(0, maxnodeid) for _ in batch)

	Left_embeds = embedding(left_ids)
	 tensor batch size x embedding dimension
	Right_embeds =
	Neg_embeds =...

	Pos_score = left_embed x right_embed then summed across axis=0
	Neg_score = left_embed x neg_embed

	Loss = get_loss(pos_score, neg_score)

-one-hot encoding
-random walk permutation
	-center node and then do pairing two left and two right

In [18]:
# Now creating the mapping
# map nodes -> R^d
# decoder in training

class Encoder(nn.Module):
  # should return VECTORS for each node
    def __init__(self):
        super(Encoder, self).__init__()
        
        #self.dropout = nn.Dropout(p=0.2)
        # one layer, return embeds(node_ids) which is a long tensor
        #learnrate might be too big if doesn't decrease
        self.embed = nn.Embedding(G.number_of_nodes(), 256)

    
    def forward(self, x):
        # take the node name as input and 
        x = self.embed(x)
        return x

# embeds.weight = embeds.weight/np.sqrt(mbed_dim)
  # Loss function can't be a float, it should be a tensor
  # and also DON'T unwrap a tensor at any point, that gets rid of grad
  # Keep it in tensor operators: maybe change node_dict into a tensor

    
def LossFunction(u, v, n):
  # Now this takes in 3 MATRICES
# Sum over node pairs: -log(sigmoid(dot prod)) - sum over n in N (log (1-sigma(dot prod zii zn)))
# N = randomly sample
  
    similar = Decoder(u, v)
    diff = Decoder(u, n)
    
    loss = -np.log(torch.mean(nn.Sigmoid(similar))) - np.log(torch.mean(1 - nn.Sigmoid(diff)))  
    
    return loss
  
    
# calculate inner product between 2 matrices
def Decoder(a, b):
    c = []
    for row_a in a:
        for row_b in b:
            c.append(torch.dot(row_a, row_b))
    
    return torch.cuda.FloatTensor(c)

In [26]:
import random


id_map = {}


def get_id(id_string):
    if id_string in id_map.keys():
        return id_map[id_string]
    else:
        ID = len(id_map)
        id_map[id_string] = ID
        return ID
    
        
def process_batch(batch):
    left_ids = torch.LongTensor([pair[0] for pair in batch])
    right_ids = torch.LongTensor([pair[1] for pair in batch])
    neg_ids = torch.LongTensor([np.random.randint(0, G.number_of_nodes()) for _ in batch])
    
    #print(left_ids)
    left_embeds = model(left_ids)
    right_embeds = model(right_ids)
    neg_embeds = model(neg_ids)
    
    pos_score = torch.mm(torch.t(left_embeds), right_embeds)
    neg_score = torch.mm(torch.t(left_embeds), neg_embeds)
    
    loss = get_loss(pos_score, neg_score)
    return loss
    
                          
def get_loss(pos, neg):
    m = nn.Sigmoid()
    loss = -torch.mean(torch.log(m(pos))) - torch.mean(torch.log(1 - m(neg)))
    return loss



model = Encoder()
model.embed.weight.data = (model.embed.weight.data/np.sqrt(256))

optimizer = optim.SGD(model.parameters(), lr=hyper_params['learning_rate'])

epochs = hyper_params['epochs']

pairs = [(get_id(pair[0]) , get_id(pair[1])) for pair in pairs]

for e in range(50):
    random.shuffle(pairs)
    train_loss = 0
    batch_size = 64
    batch = []
    index=0
    
    while index+batch_size < len(pairs):
        batch = pairs[index:min(index+batch_size, len(pairs))]
        index += batch_size
        
        optimizer.zero_grad()
        loss = process_batch(batch)
        #print(batch)
        train_loss += loss
        loss.backward()        #retain_graph=True
        optimizer.step()
        
    print(train_loss)
        
    
    

tensor(70.7134, grad_fn=<AddBackward0>)
tensor(70.7134, grad_fn=<AddBackward0>)
tensor(70.7128, grad_fn=<AddBackward0>)
tensor(70.7138, grad_fn=<AddBackward0>)
tensor(70.7137, grad_fn=<AddBackward0>)
tensor(70.7144, grad_fn=<AddBackward0>)
tensor(70.7134, grad_fn=<AddBackward0>)
tensor(70.7131, grad_fn=<AddBackward0>)
tensor(70.7136, grad_fn=<AddBackward0>)
tensor(70.7129, grad_fn=<AddBackward0>)
tensor(70.7136, grad_fn=<AddBackward0>)
tensor(70.7132, grad_fn=<AddBackward0>)
tensor(70.7138, grad_fn=<AddBackward0>)
tensor(70.7126, grad_fn=<AddBackward0>)
tensor(70.7133, grad_fn=<AddBackward0>)
tensor(70.7136, grad_fn=<AddBackward0>)
tensor(70.7128, grad_fn=<AddBackward0>)
tensor(70.7127, grad_fn=<AddBackward0>)
tensor(70.7132, grad_fn=<AddBackward0>)
tensor(70.7132, grad_fn=<AddBackward0>)
tensor(70.7133, grad_fn=<AddBackward0>)
tensor(70.7130, grad_fn=<AddBackward0>)
tensor(70.7130, grad_fn=<AddBackward0>)
tensor(70.7120, grad_fn=<AddBackward0>)
tensor(70.7134, grad_fn=<AddBackward0>)


## OLD CODE

In [5]:
# Generate Training/Validation Set

start_time = time.time()


# generate the pairs FIRST THEN split the pairs into validation set and training set

training_nodes = torch.zeros([1, num_train], dtype=torch.float)
validation_nodes = torch.zeros([1, num_val], dtype=torch.float)
node_dict = {}
# encode Nodes as numbers

index = 0
for node in list(G.nodes()):
    node_dict[index] = node
    index += 1

training = []
validation = []

graph_nodes = list(range(len(node_dict)))

# SHOULD SEPARATE VALIDATION NODES FROM TRAINING NODES
# choose from a list of nodes
    
for i in range(num_train):
    c = choice(graph_nodes)
    training_nodes[0][i] = c
    graph_nodes.remove(c)
    training.append(node_dict[c])
    
    if i < num_val:
        validation_nodes[0][i] = choice(graph_nodes)
        validation.append(node_dict[c])

y_true = []
#for x in validation_nodes[0]:
#    node_x = node_dict[x.item()]
#    for y in validation_nodes[0]:
#        node_y = node_dict[y.item()]
#        if (node_x, node_y) in list(G.edges()):
#            y_true.append(1)
#        else:
#            y_true.append(0)
        

print(training_nodes.shape)
print(validation_nodes.shape)

print("--- %s minutes ---" % ((time.time() - start_time)//60))

torch.Size([1, 1200])
torch.Size([1, 100])
--- 0.0 minutes ---


In [3]:
#pairs_train = pairs[0:num_train]
#pairs_val = pairs[num_train:num_train+num_val]

#print(len(pairs_train))
#print(len(pairs_val))

1200
100


In [41]:
a = torch.randn(4, 4)
b = torch.randn(4, 4)

c = Decoder(a, b)
print(a)
print(b)
print(c)



tensor([[ 0.1395, -1.7461, -0.9168,  1.2227],
        [ 0.8206,  0.7132,  0.1555,  0.7413],
        [ 0.8960,  0.8743, -0.6854, -0.6067],
        [-1.4658, -0.9448, -0.3169,  0.8085]])
tensor([[-0.0531, -1.3703,  1.6524,  1.8320],
        [-0.4788,  1.2953,  0.4166, -0.7193],
        [-0.4933, -2.2002, -1.4886,  0.2413],
        [ 0.6649,  0.5782, -0.9959,  1.1086]])
tensor([ 3.1104, -3.5900,  5.4327,  1.3517,  0.5941,  0.0625, -2.0264,  1.6249,
        -3.4897,  0.8544, -1.4916,  1.1113,  2.3300, -1.2357,  3.4686, -0.3089],
       device='cuda:0')


In [None]:
# Training Model

start_time = time.time()

model = Encoder(G.number_of_nodes())
model.embed.weight.data = (model.embed.weight.data/np.sqrt(256))
model.cuda()

optimizer = optim.SGD(model.parameters(), lr=hyper_params['learning_rate'])

epochs = hyper_params['epochs']

 # have to somehow get vectors representing each node
 # pass in a SINGLE NODE to encoder and get back a vector
 # Comet.ml

encoded = {}


for e in range(epochs):
    train_loss = 0
    running_loss = 0
    
    model.train()
    
    
    # generate a matrix for say 10 (u,v) pairs in Pairs and then use torch tensor operations to calculate loss
    # you're generating a matrix of NODE EMBEDDINGS for 10 u then 10 v
    
    u_matrix = torch.zeros(1, 256)
    v_matrix = torch.zeros(1, 256)
    n_matrix = torch.zeros(1, 256)
    
    for loop in range(100):
    
        for index in range(64):
        # should I pop the pair chosen to make sure it isn't chosen again for now?
            (u,v) = choice(pairs)
            n = choice(training_nodes[0])
            node_u = model(torch.cuda.LongTensor([[u]])) 
            node_v = model(torch.cuda.LongTensor([[v]]))
            node_n = model(torch.cuda.LongTensor([[n]]))
            torch.cat((u_matrix, node_u), dim=0)
            torch.cat((v_matrix, node_v), dim=0)
            torch.cat((v_matrix, node_n), dim=0)
    
        u_matrix = u_matrix[1:]
        v_matrix = v_matrix[1:]
        n_matrix = n_matrix[1:]
        
        optimizer.zero_grad()
        train_loss = LossFunction(u_matrix, v_matrix, n_matrix)
        running_loss += train_loss.item()
        train_loss.backward()        #retain_graph=True
        optimizer.step()
            
            

    print(f"Training loss: {running_loss}")
    
    
    model.eval()
    y_score = []
    index = []
    
    for x in validation_nodes[0]:
        for y in validation_nodes[0]:
            a = model(torch.cuda.LongTensor([[x]])) 
            b = model(torch.cuda.LongTensor([[y]]))  
            result = Decoder(a, b)
            result = result.cpu()
            result = result.detach().numpy()
            y_score.append(result)
            index.append(similarity[(node_dict[u.item()], node_dict[v.item()])])
            
            
    

print(model.embed.weight.data)

print("--- %s minutes ---" % ((time.time() - start_time)//60))
    









In [59]:
#x = [2, 3, 4, 5, 6]
#y = map(lambda v : v * 5, x)
b = [[1, 2], [3, 4], [5, 6]]

b = lambda c: [a for b in c for a in b]
print(b)

#print(list(y))
#a = ['A', 'B', 'C', 'D']
#print(a[1 : 3])
#print(list(permutations(a, 2)))

<function <lambda> at 0x0000024724E15AE8>


In [80]:
a = [1, 2, 3, 4, 5]
print(choice(a - 5))

TypeError: unsupported operand type(s) for -: 'list' and 'int'

## Rejected Code from node2vec-link

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import zero_one_loss


start_time = time.time()




class LogisticRegression(torch.nn.Module):
     
    def __init__(self):
        super(LogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(1, 1)
    def forward(self, x):
        y_pred = F.sigmoid(self.linear(x))
        return y_pred
    
    

#embedding = nn.Embedding.from_pretrained(weight)


# index out of range: maybe you should get_id for all of the nodes originally

#left = [pair[0] for pair in train_edges]
#right = [pair[1] for pair in train_edges]
#left = [get_id(l) for l in left]
#right = [get_id(r) for r in right]
    
#left_ids = torch.cat([weight[ids] for ids in left], -1)
#right_ids = torch.cat([weight[ids] for ids in right], -1)
    
#dot_prod = torch.dot(left_ids, right_ids)
#element_mul = left_ids * right_ids
#print(dot_prod)
#print(element_mul.shape)

# need to implement negatives as well
# change y_label and dp into TENSORS


torch.cuda.empty_cache()

train_edges = list(T.edges())
val_edges = list(V.edges())

y_label = torch.ones((len(train_edges), 1))
neg = torch.zeros((num_neg, 1))
y_label = torch.cat((y_label, neg))
y_label = torch.squeeze(y_label)


negs = random.sample(list(nx.non_edges(T)), num_neg)
train_edges = train_edges + negs


link_model = LogisticRegression()
criterion = torch.nn.BCELoss(size_average=True)
optimizer_link = torch.optim.SGD(link_model.parameters(), lr = 0.1) 
weight = model.embed.weight.data
link_model.to('cuda')


left = [pair[0] for pair in train_edges]
right = [pair[1] for pair in train_edges]
left = [get_id(l) for l in left]
right = [get_id(r) for r in right]
    
    
left_ids = [weight[ids] for ids in left]
lf = torch.stack(left_ids)
right_ids = [weight[ids] for ids in right]
rg = torch.stack(right_ids)
    

    
dot_prod = torch.bmm(lf.view(num_train, 1, 256), rg.view(num_train, 256, 1)) 
dot_prod = torch.squeeze(dot_prod)
dot_prod = dot_prod.cuda()
    
dotproduct = dot_prod.cpu()
dotproduct = dotproduct.numpy()
y_label_roc = y_label.numpy()
    
element_mul = lf * rg
    
for e in range(20):

    score = roc_auc_score(y_label_roc, dotproduct)
    print(score)
    pred_y = link_model.linear.weight.data.cpu().numpy()
    score2 = roc_auc_score(y_label_roc, pred_y()
    print(score2)
           
    y_pred = link_model(dot_prod)
    loss = criterion(y_pred, y_label)
    optimizer.zero_grad() 
    loss.backward() 
    optimizer.step() 
    print(loss)
        
        
    #dp.append(dot_prod)
        
    #dp = torch.t(torch.FloatTensor(dp))
    #print(dp.shape)
    

# or instead do AUC_score somehow...just generate a whole list of y_pred and then do AUC 
    

        
print("--- %s minutes ---" % ((time.time() - start_time)//60))
    



In [None]:
################# unbatched ################
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import zero_one_loss


start_time = time.time()




class LogisticRegression(torch.nn.Module):
     
    def __init__(self):
        super(LogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(1, 1)
    def forward(self, x):
        y_pred = torch.sigmoid(self.linear(x))
        return y_pred



dp = []
link_model = LogisticRegression()
alpha = 0.9

for e in range(20):

    link_loss = 0
    
    for u, v in train_edges:
        y_label = torch.FloatTensor([0])
        u_embed = weight[get_id(u)]
        v_embed = weight[get_id(v)]
        
        dot_prod = torch.dot(u_embed, v_embed)
        element_mul = u_embed * v_embed
        dp.append(dot_prod.item())
        
        dot_prod = torch.FloatTensor([dot_prod.item()])

        
        if (u,v) in list(G.edges()):
            y_label = torch.FloatTensor([1])
            
        y_pred = link_model(dot_prod)
        loss = criterion(y_pred, y_label)
        link_loss = alpha * link_loss + (1-alpha)*loss
        optimizer.zero_grad() 
        loss.backward() 
        optimizer.step() 
    
    print(link_loss/(len(train_edges)+num_neg))
    
score = roc_auc_score(y_label_roc, np.array(dp))
print(score)
pred_y = link_model.linear.weight.data.numpy()
score2 = roc_auc_score(y_label_roc, pred_y)
print(score2)

print("--- %s minutes ---" % ((time.time() - start_time)//60))

In [5]:
test1 = [1, 2, 3]
test2 = [3, 4, 5]
test3 = (test1, test2)

for t1 in test3:
    print(t1)

[1, 2, 3]
[3, 4, 5]
