In [5]:
from utils import random_walk, generate_walks, read_train_val_graph 
from utils import apply_word2vec_on_features, create_and_normalize_adjacency

In [6]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score
import pandas as pd
import numpy as np
from time import time
from gensim.models import Word2Vec
import networkx as nx

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [7]:
def save_subgraph_in_file(nbr_nodes, source_path='../input_data/edgelist.txt', destination_path='../input_data/small_edgelist.txt'):
    G = nx.read_edgelist(source_path, delimiter=',', create_using=nx.Graph(), nodetype=int)
    G = G.subgraph(range(size))
    nx.write_edgelist(G, path=destination_path, delimiter=',')
    print(G.number_of_nodes(), 'nodes,', G.number_of_edges(), 'edges Graph saved in', destination_path)
    return
save_subgraph_in_file(nbr_nodes=138499)


20000 nodes, 156145 edges Graph saved in ../input_data/small_edgelist.txt


In [52]:
class GNN(nn.Module):
    def __init__(self, n_feat, n_hidden, n_class, dropout):
        super(GNN, self).__init__()
        self.fc1 = nn.Linear(n_feat, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(n_hidden, n_hidden)
        self.fc4 = nn.Linear(n_hidden, n_class)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x_in, adj, pairs):
        
        h1 = self.fc1(x_in)
        z1 = self.relu(torch.mm(adj, h1))
        z1 = self.dropout(z1)

        h2 = self.fc2(z1)
        z2 = self.relu(torch.mm(adj, h2))

        x = z2[pairs[0,:],:] - z2[pairs[1,:],:] # embedded features (z2) of node 0 - embedded features of node 1
        # could we add a new dimension to pairs to specify if same author(s)? and then what could we do?
        x = self.relu(self.fc3(x))
        x = self.dropout(x)
        x = self.fc4(x)

        return F.log_softmax(x, dim=1)

In [57]:
l = list(['a', 'b', 'c'])
l = list(map(lambda x: x.replace('a', 'Ishan'), l))
l


['Ishan', 'b', 'c']

In [9]:
def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)

In [34]:
path = '../input_data/edgelist.txt'

G_train, train_edges, val_edges, y_val, nodes = read_train_val_graph(val_ratio=0.1, path=path)
walks = generate_walks(G=G_train, num_walks=10, walk_length=15)
features_np = apply_word2vec_on_features(features=walks, nodes=nodes)
adj, indices = create_and_normalize_adjacency(G_train)

Number of nodes of total set: 138499
Number of edges of total set: 1091955
Number of edges of training set: 983161
Returned G_train, train_edges, val_edges, y_val and nodes objects
Loaded from edgelist.txt and with a training validation split ratio = 0.1
Start generating walks....
Random walks generated in in 82s!
Start applying Word2Vec...
Word2vec model trained on features in 4 min!
(138499, 128) features numpy array created in 4 min!


  adj = nx.adjacency_matrix(G) # Obtains the adjacency matrix of the training graph


Created a normalized adjancency matrix of shape (138499, 138499)
Created indices (2, 2104821) with the positions of non zeros in adj matrix


In [35]:
np.shape(y_val), np.shape(G_train), np.shape(train_edges), np.shape(val_edges)

((217588,), (138499,), (983161, 2), (217588, 2))

In [36]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
np.shape(y_val), np.shape(G_train), np.shape(train_edges), np.shape(val_edges)

In [None]:
features_np.shape

In [None]:
G_train.number_of_edges()

In [37]:
# Create class labels
y = np.zeros(2*indices.shape[1])
y[:indices.shape[1]] = 1 # Concatenated ones for edges indices and later in the model we add zeros for random indices.


In [38]:
# Transforms the numpy matrices/vectors to torch tensors.
features = torch.FloatTensor(features_np).to(device)
y = torch.LongTensor(y).to(device)
if type(adj) != torch.Tensor:
    adj = sparse_mx_to_torch_sparse_tensor(adj).to(device)
indices = torch.LongTensor(indices).to(device)

In [49]:
epochs = 1
n_hidden = 128
dropout_rate = 0.2
n_class = 2
n_features = features.shape[1]

# Creates the model and specifies the optimizer
model = GNN(n_features, n_hidden, n_class, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [40]:
print(indices.shape)
#print(rand_indices.shape)
#print(pairs.shape)
print(y.shape)

torch.Size([2, 2104821])
torch.Size([4209642])


In [41]:
print(len(y)/2)

2104821.0


In [50]:
import time

# Train model
model.train()
start_time = time.time()
for epoch in range(epochs):
    t = time.time()
    optimizer.zero_grad()
    rand_indices = torch.randint(0, features.size(0), (indices.size(0),indices.size(1)), device=adj.device)# We take random indices each time we run an epoch
    pairs = torch.cat((indices, rand_indices), dim=1) # Concatenate the edges indices and random indices.   
    output = model(features, adj, pairs) # we run the model that gives the output.
    loss_train = F.nll_loss(output, y) # we are using nll_loss as loss to optimize, we store it in loss_train. We compare to y which is stable and contains the tag ones and zeros.
    #print(type(loss_train), '\n', loss_train.shape)
    acc_train = accuracy_score(torch.argmax(output, dim=1).detach().cpu().numpy(), y.cpu().numpy())# just to show it in the out put message of the training
    loss_train.backward() # The back propagation ? --> Computes the gradient of current tensor w.r.t. graph leaves
    optimizer.step() # Performs a single optimization step (parameter update).
    
    if epoch % 5 == 0:
        print('Epoch: {:03d}'.format(epoch+1),
              'loss_train: {:.4f}'.format(loss_train.item()),
              'acc_train: {:.4f}'.format(acc_train.item()),
              'time: {:.4f} s'.format(time.time() - t),
             'total_time: {} min'.format(round((time.time() - start_time)/60)))

print("Optimization Finished in {} min!".format(round((time.time() - start_time)/60)))
print()

Epoch: 001 loss_train: 0.6933 acc_train: 0.4999 time: 20.7952 s total_time: 0 min
Optimization Finished in 0 min!



In [51]:
# Evaluating the model
model.eval()
eval_pairs = np.array(np.transpose(val_edges))
print(eval_pairs.shape)
eval_pairs = torch.LongTensor(eval_pairs).to(device)
print(eval_pairs.shape)
eval_output = model(features, adj, eval_pairs)
print(eval_output.shape)
y_pred = torch.exp(eval_output)
y_pred = y_pred.detach().cpu().numpy()

y_pred_true = list()
for element in y_pred:
    y_pred_true.append(element[1])
    
print('Log loss:', log_loss(y_val, y_pred_true))

(2, 217588)
torch.Size([2, 217588])
torch.Size([217588, 2])
Log loss: 0.6937010884284973


In [None]:
#print(output.shape)
print(features.shape, adj.shape, eval_pairs.shape)

In [None]:
eval_pairs.shape

In [None]:
z2[pairs[1,:]]

In [None]:
adj = nx.adjacency_matrix(G_train) # Obtains the adjacency matrix of the training graph
adj = normalize_adjacency(adj) # Normalizes the adjacency matrix only by adding ones to diag
indices = np.array(adj.nonzero()) # Gets the positions of non zeros of adj into indices

In [None]:
# features initializaed with word2vec embedding of the random walks


In [None]:
# Create class labels
y = np.zeros(4*G_train.number_of_edges())
y[:2*G_train.number_of_edges()] = 1 # Concatenated ones for edges indices and later in the model we add zeros for random indices.
train_edges[0]

In [None]:
t = time()
X_train = np.zeros((4*m, 2*features_np.shape[1]))

for i, edge in enumerate(train_edges):
    X_train[i] = np.concatenate((features_np[train_edges[i][0]], features_np[train_edges[i][1]]), axis=0)
    X_train[m+i] = np.concatenate((features_np[train_edges[i][0]], features_np[train_edges[i][1]]), axis=0)
    X_train[2*m+i] = np.concatenate((features_np[randint(0,n-1)], features_np[randint(0,n-1)]), axis=0)
    X_train[3*m+i] = np.concatenate((features_np[randint(0,n-1)], features_np[randint(0,n-1)]), axis=0)
    
X_val = np.zeros((len(val_edges), 2*features_np.shape[1]))
for i, edge in enumerate(val_edges):
    X_val[i] = np.concatenate((features_np[val_edges[i][0]], features_np[val_edges[i][1]]), axis=0)
    
print('X_train and X_val created in {} s!'.format(round(time()-t)))

In [None]:
t = time()
# Use logistic regression to predict if two nodes are linked by an edge
clf = LogisticRegression()
clf.fit(X_train, y)
y_pred = clf.predict_proba(X_val)
y_pred = y_pred[:,1]
print('Logistic regression performed in {} s!'.format(round(time()-t)))

In [None]:
print('Log loss:', log_loss(y_val, y_pred))

In [None]:
from datetime import datetime

# Read test data. Each sample is a pair of nodes
test_edges = list()
with open('../test_data/test.txt', 'r') as f:
    for line in f:
        t = line.split(',')
        test_edges.append((node_to_idx[int(t[0])], node_to_idx[int(t[1])]))

X_test = np.zeros((len(test_edges), 2*features_np.shape[1]))
for i, edge in enumerate(test_edges):
    X_test[i] = np.concatenate((features_np[test_edges[i][0]], features_np[test_edges[i][1]]), axis=0)

t = time()
# Use logistic regression to predict if two nodes are linked by an edge
y_pred = clf.predict_proba(X_test)
y_pred = y_pred[:,1]
print('Logistic regression performed in {} s!'.format(round(time()-t)))


today = datetime.today().strftime('%Y-%m-%d')
random_nb = randint(0, 100000)

pd.DataFrame(y_pred, columns={'predicted'}).to_csv(
"../submissions_files/{}-submission-{}.csv".format(today, random_nb), header=True, index=True, index_label='id'
)
    
    
# # Testing
# node_pairs = np.array(np.transpose(node_pairs))
# pairs = torch.LongTensor(node_pairs).to(device)
# output = model(features, adj, pairs)
# y_pred = torch.exp(output)
# y_pred = y_pred.detach().cpu().numpy()

# y_pred_true = list()
# for element in y_pred:
#     y_pred_true.append(element[1])
    

    
# today = datetime.today().strftime('%Y-%m-%d')
# random_nb = randint(0, 100000)

# pd.DataFrame(y_pred_true, columns={'predicted'}).to_csv(
# "../submissions_files/{}-submission-{}.csv".format(today, random_nb), header=True, index=True, index_label='id'
# )