In [45]:
import csv
import networkx as nx
import numpy as np
from random import randint
from random import random
import pandas as pd
from unidecode import unidecode
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer


from scipy.sparse import identity, diags, csr_matrix
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from sklearn.metrics import log_loss, accuracy_score


device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [46]:
def text_to_list(text):
    return unidecode(text).split(',')

In [47]:
def new_normalize_adjacency(A):
    n = A.shape[0] 
    A = A + identity(n)
    return A

In [48]:
def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)

In [49]:
class MessagePassing(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(MessagePassing, self).__init__()
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, x, adj):
        
        x = self.cf(x)
        out = torch.mm(adj, x)
        
        return out

In [66]:
class GNN(nn.Module):
    def __init__(self, n_feat, n_hidden, n_class, dropout):
        super(GNN, self).__init__()
#         self.fc11 = nn.Linear(n_feat, round(n_feat/10))
#         self.fc12 = nn.Linear(round(n_feat/10), round(n_feat/100))
#         self.fc13 = nn.Linear(round(n_feat/100), round(n_feat/200))
#         self.fc14 = nn.Linear(round(n_feat/200), n_hidden)
        self.fc11 = nn.Linear(n_feat, n_hidden)
        self.fc12 = nn.Linear(n_hidden, n_hidden)
        self.fc13 = nn.Linear(n_hidden, n_hidden)
        self.fc14 = nn.Linear(n_hidden, n_hidden)
                              
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(n_hidden, n_hidden)
        self.fc4 = nn.Linear(n_hidden, n_class)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x_in, adj, pairs):
        
        # Authors embedding using 4 levels MLP to densify representation from sparse 147950 features to n_hidden embedded features
        h11 = self.fc11(x_in)
        z11 = self.relu(torch.mm(adj, h11)) # remove the multiplication with adj each time ?
#         h12 = self.fc12(z11)
#         z12 = self.relu(torch.mm(adj, h12))
#         h13 = self.fc13(z12)
#         z13 = self.relu(torch.mm(adj, h13))
#         h14 = self.fc13(z13)
#         z14 = self.relu(torch.mm(adj, h14))
        
        z1 = self.dropout(z11)
        #print('h1.shape=', z1.shape, ' adj.shape=', adj.shape)
        h2 = self.fc2(z1)
        
        z2 = self.relu(torch.mm(adj, h2))
        
        x = z2[pairs[0,:],:] - z2[pairs[1,:],:]
        x = self.relu(self.fc3(x))
        x = self.dropout(x)
        x = self.fc4(x)

        return F.log_softmax(x, dim=1)

In [51]:
G = nx.read_edgelist('../input_data/edgelist.txt', delimiter=',', create_using=nx.Graph(), nodetype=int)
nodes = list(G.nodes())
n = G.number_of_nodes()
m = G.number_of_edges()
edges = list(G.edges())

val_edges = list()
G_train = G

for edge in edges:
    if random() < 0.1:
        val_edges.append(edge)

# We remove the val edges from the graph G
for edge in val_edges:
    G_train.remove_edge(edge[0], edge[1])

n = G_train.number_of_nodes()
m = G_train.number_of_edges()
train_edges = list(G_train.edges())
    
print('Number of nodes of training set:', n)
print('Number of edges of training set:', m)

y_val = [1]*len(val_edges)

n_val_edges = len(val_edges)

# Create random pairs of nodes
for i in range(n_val_edges):
    n1 = nodes[randint(0, n-1)]
    n2 = nodes[randint(0, n-1)]
    (n1, n2) = (min(n1, n2), max(n1, n2))
    val_edges.append((n1, n2))
    
# Remove from val_edges edges that exist in both train and val

for edge in list(set(val_edges) & set(train_edges)):
    val_edges.remove(edge)
    
n_val_edges = len(val_edges) - len(y_val) #because we removed from val_edges edges that exist in both
y_val.extend([0]*n_val_edges)

Number of nodes of training set: 138499
Number of edges of training set: 982771


In [52]:
# features initializaed with a sparce representation of authors of the papers

authors = pd.read_csv('../input_data/authors.txt', sep = '|', header=None)
authors = authors.rename(columns={0: "paper_id", 2: "authors"})
authors['authors'] = authors['authors'].apply(text_to_list)
mlb = MultiLabelBinarizer()
df = pd.DataFrame(mlb.fit_transform(authors['authors']),columns=mlb.classes_, index=authors.index)
features_np = df.values

features_np = np.random.randn(G_train.number_of_nodes(), 32) # Generates node features randomly
print(type(features_np))
features_np.shape

<class 'numpy.ndarray'>


(138499, 32)

In [53]:
# from gensim.models import Word2Vec

# model = Word2Vec(vector_size=1000, window=5, min_count=0, sg=1, workers=8)
# model.build_vocab(features_np)
# model.train(features_np, total_examples=model.corpus_count, epochs=5) 
# model.wv['32098']

In [54]:
adj = nx.adjacency_matrix(G_train)# Obtains the adjacency matrix of the training graph
print(type(adj))
print(adj.shape)
adj = new_normalize_adjacency(adj) # Normalizes the adjacency matrix only by adding ones to diag
print(type(adj), '\n')

indices = np.array(adj.nonzero())

# Create class labels
y = np.zeros(2*len(indices[0]))
y[:len(indices[0])] = 1 # Concatenated ones for edges indices and zeros for random indices.

# Transforms the numpy matrices/vectors to torch tensors.
print(type(features_np))
#features_np = csr_matrix(features_np)
#print(type(features_np))
#features = sparse_mx_to_torch_sparse_tensor(features_np).to(device)
#print(type(features_np))
features = torch.FloatTensor(features_np).to(device)
y = torch.LongTensor(y).to(device)
adj = sparse_mx_to_torch_sparse_tensor(adj).to(device)
indices = torch.LongTensor(indices).to(device)


  adj = nx.adjacency_matrix(G_train)# Obtains the adjacency matrix of the training graph


<class 'scipy.sparse._csr.csr_matrix'>
(138499, 138499)
<class 'scipy.sparse._csr.csr_matrix'> 

<class 'numpy.ndarray'>


In [55]:
# Hyperparameters
epochs = 20
n_hidden = 128
dropout_rate = 0.2
n_class = 2
n_features = features.shape[1]

# Creates the model and specifies the optimizer
model = GNN(n_features, n_hidden, n_class, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [56]:
print(features.shape)
print(adj.shape)

torch.Size([138499, 32])
torch.Size([138499, 138499])


In [67]:
import time

# Train model
model.train()
start_time = time.time()
for epoch in range(epochs):
    t = time.time()
    optimizer.zero_grad()
    rand_indices = torch.randint(0, features.shape[0], (indices.shape[0],indices.shape[1]), device=adj.device)# We take random indices each time we run an epoch
    pairs = torch.cat((indices, rand_indices), dim=1) # Concatenate the edges indices and random indices.   
    
    output = model(features, adj, pairs) # we run the model that gives the output.
    loss_train = F.nll_loss(output, y) # we are using nll_loss as loss to optimize, we store it in loss_train. We compare to y which is stable and contains the tag ones and zeros.
    #print(type(loss_train), '\n', loss_train.shape)
    acc_train = accuracy_score(torch.argmax(output, dim=1).detach().cpu().numpy(), y.cpu().numpy())# just to show it in the out put message of the training
    loss_train.backward() # The back propagation ? --> Computes the gradient of current tensor w.r.t. graph leaves
    optimizer.step() # Performs a single optimization step (parameter update).
    
    if epoch % 5 == 0:
        print('Epoch: {:03d}'.format(epoch+1),
              'loss_train: {:.4f}'.format(loss_train.item()),
              'acc_train: {:.4f}'.format(acc_train.item()),
              'time: {:.4f}s'.format(time.time() - t),
             'total_time: {}min'.format(round((time.time() - start_time)/60)))

print("Optimization Finished in {} min!".format(round((time.time() - start_time)/60)))
print()

h1.shape= torch.Size([138499, 128])  adj.shape= torch.Size([138499, 138499])
Epoch: 001 loss_train: 9610.7598 acc_train: 0.4737 time: 27.4452s total_time: 0min
h1.shape= torch.Size([138499, 128])  adj.shape= torch.Size([138499, 138499])
h1.shape= torch.Size([138499, 128])  adj.shape= torch.Size([138499, 138499])
h1.shape= torch.Size([138499, 128])  adj.shape= torch.Size([138499, 138499])
h1.shape= torch.Size([138499, 128])  adj.shape= torch.Size([138499, 138499])
h1.shape= torch.Size([138499, 128])  adj.shape= torch.Size([138499, 138499])
Epoch: 006 loss_train: 824.4444 acc_train: 0.4812 time: 20.7298s total_time: 2min
h1.shape= torch.Size([138499, 128])  adj.shape= torch.Size([138499, 138499])
h1.shape= torch.Size([138499, 128])  adj.shape= torch.Size([138499, 138499])
h1.shape= torch.Size([138499, 128])  adj.shape= torch.Size([138499, 138499])
h1.shape= torch.Size([138499, 128])  adj.shape= torch.Size([138499, 138499])
h1.shape= torch.Size([138499, 128])  adj.shape= torch.Size([13849

In [68]:
# Validation on val subset then calculate loss between prediction (y_pred) and valid y (y_val)
node_pairs = np.array(np.transpose(val_edges))
pairs = torch.LongTensor(node_pairs).to(device)
pred_output = model(features, adj, pairs)
y_pred = torch.exp(pred_output)
y_pred = y_pred.detach().cpu().numpy()
print('Log loss:', log_loss(y_val, y_pred))

h1.shape= torch.Size([138499, 128])  adj.shape= torch.Size([138499, 138499])
Log loss: 0.8243089599514166


In [59]:
print(len(pairs[0]), len(indices[0]), adj.shape, len(y))

218359 2104041 torch.Size([138499, 138499]) 4208082


In [60]:
len(pairs[0])

218359

In [61]:
#y_val
y_pred

array([[0.0e+00, 1.0e+00],
       [0.0e+00, 1.0e+00],
       [0.0e+00, 1.0e+00],
       ...,
       [0.0e+00, 1.0e+00],
       [3.9e-44, 1.0e+00],
       [0.0e+00, 1.0e+00]], dtype=float32)

In [62]:
# features initializaed randomly because not yet ready
features_np = np.random.randn(G_train.number_of_nodes(), 32) # Generates node features randomly
features_np.shape


(138499, 32)

In [63]:
print(indices.shape)
print(min(features_np[0]))

torch.Size([2, 2104041])
-1.9374463378232334


In [64]:
G_train.number_of_edges()*2 # = 1965036 

1965542