In [1]:
from utils import random_walk, generate_walks, read_train_val_graph, sparse_mx_to_torch_sparse_tensor
from utils import apply_word2vec_on_features, create_and_normalize_adjacency, save_subgraph_in_file

In [2]:
from random import random
from random import randint

In [3]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score
import pandas as pd
import numpy as np
from time import time
from gensim.models import Word2Vec
import networkx as nx

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [27]:
def read_train_val_graph(path='../input_data/edgelist.txt', val_ratio=0.1, seed=None):
    """Read an edge list and split it into a training graph plus a labelled validation set.

    Every edge is held out with probability ``val_ratio`` (only when both of its
    endpoint labels are smaller than the node count). Held-out edges are removed
    from the graph and become the positive validation pairs; the same number of
    sampled non-edges become the negative validation pairs.

    Parameters
    ----------
    path : str
        Comma-separated edge list with integer node labels.
    val_ratio : float
        Probability of moving an edge to the validation set.
    seed : int or None, optional
        When given, the split and the negative sampling draw from a private
        ``random.Random(seed)`` and are reproducible. ``None`` (default) keeps
        drawing from the global ``random`` state.

    Returns
    -------
    G_train : networkx.Graph
        The graph with the held-out edges removed (all nodes are kept).
    train_edges : list of tuple
        Edges remaining in ``G_train``.
    val_edges : list of tuple
        Validation pairs expressed as node labels (not positions in ``nodes``):
        first the held-out edges, then the sampled negatives as ``(min, max)``.
    y_val : list of int
        ``1`` for each held-out edge followed by ``0`` for each negative pair.
    nodes : list
        Node labels in the graph's iteration order.
    node_to_idx : dict
        Maps a node label to its position in ``nodes``.

    Raises
    ------
    RuntimeError
        If no valid negative pair can be found after many attempts
        (e.g. the graph is too small or too dense).
    """
    # Local module import: at notebook level the name `random` is bound to the
    # function random.random, not to the module.
    import random as _random

    # The module itself exposes random()/randint() backed by the global state,
    # so it can stand in for a Random instance when no seed is requested.
    rng = _random.Random(seed) if seed is not None else _random

    G = nx.read_edgelist(path, delimiter=',', create_using=nx.Graph(), nodetype=int)
    nodes = list(G.nodes())
    n = G.number_of_nodes()
    m = G.number_of_edges()
    edges = list(G.edges())

    print('Number of nodes:', n, 'number of edges:', m,'in All the set')

    node_to_idx = {node: i for i, node in enumerate(nodes)}

    # Hold out a random subset of edges. The label < n restriction keeps only
    # endpoints that are in range when working with a subgraph.
    val_edges = [
        edge for edge in edges
        if rng.random() < val_ratio and edge[0] < n and edge[1] < n
    ]

    # We remove the val edges from the graph G (G_train is the same object as G).
    G_train = G
    G_train.remove_edges_from(val_edges)

    n = G_train.number_of_nodes()
    m = G_train.number_of_edges()
    train_edges = list(G_train.edges())

    print('Number of nodes:', n, 'number of edges:', m, 'in the Training set')
    print('len(nodes)', len(nodes))

    n_val_edges = len(val_edges)
    y_val = [1] * n_val_edges

    # Held-out edges are no longer in G_train, so remember them separately in
    # (min, max) form to keep them out of the negative samples.
    held_out = {(min(u, v), max(u, v)) for u, v in val_edges}

    max_tries = 10000  # per negative pair; guards against an endless rejection loop

    print('Creating random val_edges...')
    for _ in range(n_val_edges):
        for _attempt in range(max_tries):
            n1 = nodes[rng.randint(0, n-1)]
            n2 = nodes[rng.randint(0, n-1)]
            (n1, n2) = (min(n1, n2), max(n1, n2))
            if n2 >= n:
                # Same range rule as for the positives (label must be < n).
                continue
            if n1 == n2:
                # A self-pair is not a meaningful negative example.
                continue
            if G_train.has_edge(n1, n2) or (n1, n2) in held_out:
                # A real edge must not be labelled 0.
                continue
            break
        else:
            raise RuntimeError(
                'Could not sample a negative validation pair after {} attempts'.format(max_tries)
            )
        val_edges.append((n1, n2))

    y_val.extend([0] * n_val_edges)

    print('Returned G_train, train_edges, val_edges, y_val, nodes and node_to_idx objects')
    print('Loaded from', path[path.rfind('/')+1:], 'and with a training validation split ratio =', val_ratio)

    return G_train, train_edges, val_edges, y_val, nodes, node_to_idx

In [9]:
# Largest node label that appears as the first endpoint of a validation pair.
np.asarray(val_edges)[:, 0].max()

938

In [28]:
# Load the edge list and derive the model inputs with the helper functions:
# train/validation split -> walks -> node features -> adjacency and edge indices.
path = '../input_data/edgelist.txt'

(G_train, train_edges, val_edges,
 y_val, nodes, node_to_idx) = read_train_val_graph(path=path, val_ratio=0.1)
walks = generate_walks(G=G_train, walk_length=15, num_walks=10)
features_np = apply_word2vec_on_features(nodes=nodes, features=walks)
adj, indices = create_and_normalize_adjacency(G_train)

Number of nodes: 138499 number of edges: 1091955 in All the set
Number of nodes: 138499 number of edges: 983215 in the Training set
len(nodes) 138499
Creating random val_edges...
Returned G_train, train_edges, val_edges, y_val, nodes and node_to_idx objects
Loaded from edgelist.txt and with a training validation split ratio = 0.1
Start generating walks....
Random walks generated in in 70s!
Start applying Word2Vec...
Word2vec model trained on features in 3 min!
(138499, 128) features numpy array created in 3 min!


  adj = nx.adjacency_matrix(G) # Obtains the adjacency matrix of the training graph


Created a normalized adjancency matrix of shape (138499, 138499)
Created indices (2, 2104929) with the positions of non zeros in adj matrix


In [29]:
# Class labels for the training pairs: one block of ones (one per column of
# `indices`, i.e. the real edges) followed by an equally long block of zeros
# for the random pairs that are appended during training.
y = np.repeat([1.0, 0.0], indices.shape[1])


In [30]:
# Transforms the numpy matrices/vectors to torch tensors on `device`.
# torch.as_tensor accepts arrays as well as tensors, so every line of this cell
# can be re-run after a first conversion (previously only `adj` was guarded).
features = torch.as_tensor(features_np, dtype=torch.float32, device=device)
y = torch.as_tensor(y, dtype=torch.long, device=device)
if not isinstance(adj, torch.Tensor):
    # Still the sparse matrix returned by create_and_normalize_adjacency: convert it once.
    adj = sparse_mx_to_torch_sparse_tensor(adj).to(device)
indices = torch.as_tensor(indices, dtype=torch.long, device=device)

In [31]:
class GNN(nn.Module):
    """Two rounds of adjacency-weighted propagation followed by an MLP over node pairs.

    ``forward`` embeds every node, gathers the embeddings of both endpoints of
    each requested pair, concatenates them and returns per-pair log-probabilities
    over ``n_class`` classes.
    """

    def __init__(self, n_feat, n_hidden, n_class, dropout):
        """Create the layers.

        n_feat   -- width of the input node features
        n_hidden -- width of every hidden layer
        n_class  -- number of output classes
        dropout  -- dropout probability
        """
        super().__init__()
        # Attribute names and creation order are part of the contract
        # (state-dict keys, seeded initialisation), so they are left as they were.
        self.fc1 = nn.Linear(n_feat, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(n_hidden, n_hidden)
        self.double_fc3 = nn.Linear(2 * n_hidden, n_hidden)
        self.fc4 = nn.Linear(n_hidden, n_class)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def _propagate(self, layer, node_feats, adj):
        """Apply ``layer`` to the node features, mix them with ``adj`` and pass through ReLU."""
        return self.relu(torch.mm(adj, layer(node_feats)))

    def forward(self, x_in, adj, pairs):
        """Return log-softmax scores of shape (number of pairs, n_class).

        x_in  -- node feature matrix, one row per node
        adj   -- matrix multiplied with the node features in each propagation round
        pairs -- index tensor; row 0 holds the first node of each pair, row 1 the second
        """
        # Node encoder: dropout is applied after the first round only.
        hidden = self.dropout(self._propagate(self.fc1, x_in, adj))
        node_emb = self._propagate(self.fc2, hidden, adj)

        # Pair representation: both endpoint embeddings side by side.
        # (An element-wise difference was tried as an alternative; an open idea is
        # to add extra pair-level information such as shared authors.)
        first, second = node_emb[pairs[0]], node_emb[pairs[1]]
        pair_feats = torch.cat((first, second), dim=1)

        # Pair classifier.
        pair_feats = self.relu(self.double_fc3(pair_feats))
        pair_feats = self.dropout(self.relu(self.fc3(pair_feats)))
        logits = self.fc4(pair_feats)

        return F.log_softmax(logits, dim=1)

In [38]:
# Training hyper-parameters.
epochs, n_hidden, n_class = 200, 128, 2
dropout_rate = 0.2
n_features = features.shape[1]  # input width = number of feature columns

# Build the network on `device` and let Adam optimise all of its parameters.
model = GNN(n_feat=n_features, n_hidden=n_hidden, n_class=n_class, dropout=dropout_rate).to(device)
optimizer = optim.Adam(params=model.parameters(), lr=0.01)

In [39]:
import time  # NOTE: rebinds `time` to the module (the imports cell bound it to the function time.time)

# Train model: full-batch training; every epoch scores all edges in `indices`
# (label 1) against an equal number of freshly drawn random node pairs (label 0).
model.train()
start_time = time.time()
for epoch in range(epochs):
    t = time.time()
    optimizer.zero_grad()
    # New random pairs each epoch, same shape as `indices`, on the same device as `adj`.
    rand_indices = torch.randint(0, features.size(0), (indices.size(0), indices.size(1)), device=adj.device)
    pairs = torch.cat((indices, rand_indices), dim=1)  # real edges first, random pairs second (matches `y`)
    output = model(features, adj, pairs)  # log-probabilities, one row per pair
    loss_train = F.nll_loss(output, y)
    loss_train.backward()  # gradients of the loss w.r.t. the model parameters
    optimizer.step()  # one parameter update

    if epoch % 5 == 0:
        # Accuracy is only reported, never optimised: compute it on the device and only
        # for logged epochs. This avoids copying every prediction to the CPU each epoch
        # and yields a plain Python float whatever the installed scikit-learn returns.
        acc_train = (output.detach().argmax(dim=1) == y).float().mean().item()
        print('Epoch: {:03d}'.format(epoch+1),
              'loss_train: {:.4f}'.format(loss_train.item()),
              'acc_train: {:.4f}'.format(acc_train),
              'time: {:.4f} s'.format(time.time() - t),
              'total_time: {} min'.format(round((time.time() - start_time)/60)))

print("Optimization Finished in {} min!".format(round((time.time() - start_time)/60)))
print()

Epoch: 001 loss_train: 0.6937 acc_train: 0.5000 time: 36.8019 s total_time: 1 min
Epoch: 006 loss_train: 0.6661 acc_train: 0.6584 time: 26.8333 s total_time: 3 min
Epoch: 011 loss_train: 0.5805 acc_train: 0.6776 time: 26.8028 s total_time: 5 min
Epoch: 016 loss_train: 0.4668 acc_train: 0.7837 time: 25.6495 s total_time: 8 min
Epoch: 021 loss_train: 0.4163 acc_train: 0.8175 time: 26.5373 s total_time: 10 min
Epoch: 026 loss_train: 0.3964 acc_train: 0.8271 time: 27.8933 s total_time: 12 min
Epoch: 031 loss_train: 0.3858 acc_train: 0.8342 time: 28.1627 s total_time: 14 min
Epoch: 036 loss_train: 0.3716 acc_train: 0.8395 time: 28.2425 s total_time: 17 min
Epoch: 041 loss_train: 0.3572 acc_train: 0.8439 time: 24.3255 s total_time: 19 min
Epoch: 046 loss_train: 0.3482 acc_train: 0.8488 time: 24.2366 s total_time: 21 min
Epoch: 051 loss_train: 0.3369 acc_train: 0.8538 time: 24.5765 s total_time: 23 min
Epoch: 056 loss_train: 0.3244 acc_train: 0.8617 time: 24.9516 s total_time: 25 min
Epoch: 0

In [5]:
# Evaluating the model
model.eval()

# val_edges holds node labels, while the model gathers rows by position.
# Translate labels to positions with node_to_idx, exactly as the test-set cell does
# (presumably row i of `features`/`adj` belongs to nodes[i] -- confirm in utils).
eval_pairs = np.array(
    [[node_to_idx[u] for u, _ in val_edges],
     [node_to_idx[v] for _, v in val_edges]],
    dtype=np.int64,
)
eval_pairs = torch.LongTensor(eval_pairs).to(device)

# Inference only: skip autograd bookkeeping to save memory.
with torch.no_grad():
    eval_output = model(features, adj, eval_pairs)

# The model returns log-probabilities; exp() turns them back into probabilities.
y_pred = torch.exp(eval_output)
y_pred = y_pred.detach().cpu().numpy()

# Probability assigned to class 1 ("edge") for every validation pair.
y_pred_true = list(y_pred[:, 1])

print('Log loss:', log_loss(y_val, y_pred_true))

NameError: name 'model' is not defined

In [42]:
from datetime import datetime
import pandas as pd

# Read test data. Each sample is a pair of nodes, stored as positions via node_to_idx.
node_pairs = list()
with open('../test_data/test.txt', 'r') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at the end of the file)
        fields = line.split(',')
        node_pairs.append((node_to_idx[int(fields[0])], node_to_idx[int(fields[1])]))

# Testing
model.eval()
node_pairs = np.array(np.transpose(node_pairs))  # shape (2, number of pairs), as forward() expects
pairs = torch.LongTensor(node_pairs).to(device)

# Inference only: skip autograd bookkeeping to save memory.
with torch.no_grad():
    output = model(features, adj, pairs)

# The model returns log-probabilities; exp() turns them back into probabilities.
y_pred = torch.exp(output)
y_pred = y_pred.detach().cpu().numpy()

# Probability assigned to class 1 ("edge") for every test pair.
y_pred_true = list(y_pred[:, 1])

# The file name carries the date plus a random number so earlier submissions are not overwritten.
today = datetime.today().strftime('%Y-%m-%d')
random_nb = randint(0, 1000)

# `columns` must be an ordered collection: a set is rejected by recent pandas versions.
pd.DataFrame(y_pred_true, columns=['predicted']).to_csv(
"../submissions_files/{}-submission-{}.csv".format(today, random_nb), header=True, index=True, index_label='id'
)