In [167]:
import csv
import networkx as nx
import numpy as np
from random import randint
from random import random
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss, accuracy_score
from random import choice
from gensim.models import Word2Vec
import keras

from scipy.sparse import identity, diags

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [168]:
def normalize_adjacency(A):
    n = A.shape[0]
    A = A + identity(n)
    degs = A.dot(np.ones(n))
    inv_degs = np.power(degs, -1)
    D_inv = diags(inv_degs)
    A_hat = D_inv.dot(A)
    return A_hat

def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    print(type(sparse_mx))
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)

def random_walk(G, node, walk_length):
    walk = [node]
  
    for i in range(walk_length-1):
        neibor_nodes = list(G.neighbors(walk[-1]))
        if len(neibor_nodes) > 0:
            next_node = choice(neibor_nodes)
            walk.append(next_node)
    walk = [str(node) for node in walk] # in case the nodes are in string format, we don't need to cast into string, but if the nodes are in numeric or integer, we need this line to cast into string
    return walk

def generate_walks(G, num_walks, walk_length):
  # Runs "num_walks" random walks from each node, and returns a list of all random walk
    walks = list()  
    for i in range(num_walks):
        for node in G.nodes():
            walk = random_walk(G, node, walk_length)
            walks.append(walk)
        #print('walks : ', walks)
    return walks

class GNN(nn.Module):
    def __init__(self, n_feat, n_hidden, n_class, dropout):
        super(GNN, self).__init__()
        self.fc1 = nn.Linear(n_feat, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(n_hidden, n_hidden)
        self.fc4 = nn.Linear(n_hidden, n_class)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x_in, adj, pairs):
        
        h1 = self.fc1(x_in)
        z1 = self.relu(torch.mm(adj, h1))
        z1 = self.dropout(z1)

        h2 = self.fc2(z1)
        z2 = self.relu(torch.mm(adj, h2))
        
        x = z2[pairs[0,:],:] - z2[pairs[1,:],:]
        x = self.relu(self.fc3(x))
        x = self.dropout(x)
        x = self.fc4(x)

        return F.log_softmax(x, dim=1)

In [169]:
G = nx.read_edgelist('../input_data/edgelist.txt', delimiter=',', create_using=nx.Graph(), nodetype=int)
nodes = list(G.nodes())
n = G.number_of_nodes()
m = G.number_of_edges()
edges = list(G.edges())

print('Number of nodes of total set:', n)
print('Number of edges of total set:', m)

node_to_idx = dict()
for i, node in enumerate(nodes):
    node_to_idx[node] = i

val_edges = list()
G_train = G

for edge in edges:
    if random() < 0.1:
        val_edges.append(edge)

# We remove the val edges from the graph G
for edge in val_edges:
    G_train.remove_edge(edge[0], edge[1])

n = G_train.number_of_nodes()
m = G_train.number_of_edges()
train_edges = list(G_train.edges())
    
print('Number of nodes of training set:', n)
print('Number of edges of training set:', m)

y_val = [1]*len(val_edges)

n_val_edges = len(val_edges)

# Create random pairs of nodes
for i in range(n_val_edges):
    n1 = nodes[randint(0, n-1)]
    n2 = nodes[randint(0, n-1)]
    (n1, n2) = (min(n1, n2), max(n1, n2))
    val_edges.append((n1, n2))
    
# Remove from val_edges edges that exist in both train and val

for edge in list(set(val_edges) & set(train_edges)):
    val_edges.remove(edge)
    
n_val_edges = len(val_edges) - len(y_val) #because we removed from val_edges edges that exist in both
y_val.extend([0]*n_val_edges)

Number of nodes of total set: 138499
Number of edges of total set: 1091955
Number of nodes of training set: 138499
Number of edges of training set: 982328


### Is it fine to create walks only from the G_train?

In [170]:
walks = generate_walks(G=G_train, num_walks=10, walk_length=15)

In [171]:
model = Word2Vec(vector_size=128, window=5, min_count=0, sg=1, workers=8)
model.build_vocab(walks)
model.train(walks, total_examples=model.corpus_count, epochs=5) 
model.wv['32098']

array([-0.39484984, -0.46734977,  0.26839542, -0.26394665,  0.19405559,
       -0.6791535 ,  0.8497028 ,  0.22450827, -0.8088502 , -0.13068233,
        0.6681871 , -0.2724623 , -0.40058526, -0.5342294 , -0.14299789,
        0.532179  , -0.3065716 , -0.08885144,  0.22567934, -1.0684742 ,
        0.44301444,  0.31018835, -0.22123612,  0.09345564,  0.24089716,
       -0.47015134, -0.20757306, -0.27669477, -0.09022675, -0.44280678,
        0.07159458, -0.52626   ,  0.9142592 , -0.46609974,  0.07174298,
       -0.2335478 , -0.03954452, -0.81300277,  0.07897256, -0.5355212 ,
        0.01435002,  0.00409464, -0.25541407, -0.24264942,  0.27601284,
       -0.22861034, -0.2940636 ,  0.5951595 , -0.12762967, -0.4103339 ,
        0.26178753,  0.8978525 , -0.18115993,  0.02725913, -0.5648883 ,
        0.21274579,  0.22102082, -0.6636808 ,  0.13150162,  0.8125767 ,
       -0.6556572 ,  0.25510177, -0.26354107, -0.11755897, -0.0264527 ,
        0.36873448, -0.24239857,  0.44601217,  0.24678412, -0.08

In [172]:
adj = nx.adjacency_matrix(G_train) # Obtains the adjacency matrix of the training graph
indices = np.array(adj.nonzero()) # Gets the positions of non zeros of adj into indices
adj = normalize_adjacency(adj) # Normalizes the adjacency matrix only by adding ones to diag

  adj = nx.adjacency_matrix(G_train) # Obtains the adjacency matrix of the training graph


In [173]:
# features initializaed randomly because not yet ready
features_np = []

for node in G_train.nodes():
    features_np.append(model.wv[str(node)])

features_np = np.array(features_np)

In [174]:
# Create class labels
y = np.zeros(4*G_train.number_of_edges())
y[:2*G_train.number_of_edges()] = 1 # Concatenated ones for edges indices and zeros for random indices.

# Transforms the numpy matrices/vectors to torch tensors.
features = torch.FloatTensor(features_np).to(device)
y = torch.LongTensor(y).to(device)
print(type(adj))
adj = sparse_mx_to_torch_sparse_tensor(adj).to(device)
print(type(adj))
indices = torch.LongTensor(indices).to(device)



<class 'scipy.sparse._csr.csr_matrix'>
<class 'scipy.sparse._csr.csr_matrix'>
<class 'torch.Tensor'>


In [175]:
# Hyperparameters
epochs = 40
n_hidden = 128
dropout_rate = 0.2
n_class = 2
n_features = features.shape[1]

# Creates the model and specifies the optimizer
model = GNN(n_features, n_hidden, n_class, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [176]:
import time

# Train model
model.train()
start_time = time.time()
for epoch in range(epochs):
    t = time.time()
    optimizer.zero_grad()
    rand_indices = torch.randint(0, features.size(0), (indices.size(0),indices.size(1)), device=adj.device)# We take random indices each time we run an epoch
    pairs = torch.cat((indices, rand_indices), dim=1) # Concatenate the edges indices and random indices.   
    output = model(features, adj, pairs) # we run the model that gives the output.
    loss_train = F.nll_loss(output, y) # we are using nll_loss as loss to optimize, we store it in loss_train. We compare to y which is stable and contains the tag ones and zeros.
    #print(type(loss_train), '\n', loss_train.shape)
    acc_train = accuracy_score(torch.argmax(output, dim=1).detach().cpu().numpy(), y.cpu().numpy())# just to show it in the out put message of the training
    loss_train.backward() # The back propagation ? --> Computes the gradient of current tensor w.r.t. graph leaves
    optimizer.step() # Performs a single optimization step (parameter update).
    
    if epoch % 5 == 0:
        print('Epoch: {:03d}'.format(epoch+1),
              'loss_train: {:.4f}'.format(loss_train.item()),
              'acc_train: {:.4f}'.format(acc_train.item()),
              'time: {:.4f}s'.format(time.time() - t),
             'total_time: {}min'.format(round((time.time() - start_time)/60)))

print("Optimization Finished in {} min!".format(round((time.time() - start_time)/60)))
print()

Epoch: 001 loss_train: 22.6118 acc_train: 0.4894 time: 35.8183s total_time: 1min
Epoch: 006 loss_train: 8.9000 acc_train: 0.5023 time: 24.3090s total_time: 2min
Epoch: 011 loss_train: 3.8519 acc_train: 0.5182 time: 14.4293s total_time: 4min
Epoch: 016 loss_train: 1.6082 acc_train: 0.5588 time: 20.0659s total_time: 6min
Epoch: 021 loss_train: 0.8859 acc_train: 0.5880 time: 19.3813s total_time: 7min
Epoch: 026 loss_train: 0.6487 acc_train: 0.6367 time: 19.1522s total_time: 9min
Epoch: 031 loss_train: 0.6059 acc_train: 0.6787 time: 12.8576s total_time: 10min
Epoch: 036 loss_train: 0.6023 acc_train: 0.6791 time: 15.9874s total_time: 12min
Optimization Finished in 13 min!



In [177]:
node_pairs = np.array(np.transpose(val_edges))
pairs = torch.LongTensor(node_pairs).to(device)
output = model(features, adj, pairs)
y_pred_val = torch.exp(output)
y_pred_val = y_pred_val.detach().cpu().numpy()

In [178]:
y_pred_val[:, 1]

array([0.38938335, 0.3865336 , 0.38906032, ..., 0.37584653, 0.3831838 ,
       0.33335236], dtype=float32)

### Why is it too big compared to loss obtained in Kaggle?! And that the log loss > 1 even if values are between 0 and 1?

In [179]:
print('Log loss:', log_loss(y_val, y_pred_val[:,1]))

Log loss: nan


  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


In [180]:
from datetime import datetime
import pandas as pd

today = datetime.today().strftime('%Y-%m-%d')
random_nb = randint(0, 100000)

# Read test data. Each sample is a pair of nodes
node_pairs = list()
with open('../test_data/test.txt', 'r') as f:
    for line in f:
        t = line.split(',')
        node_pairs.append((node_to_idx[int(t[0])], node_to_idx[int(t[1])]))

# Testing
model.eval()
node_pairs = np.array(np.transpose(node_pairs))
pairs = torch.LongTensor(node_pairs).to(device)
output = model(features, adj, pairs)
y_pred = torch.exp(output)
y_pred = y_pred.detach().cpu().numpy()

y_pred_true = list()
for element in y_pred:
    y_pred_true.append(element[1])
    

pd.DataFrame(y_pred_true, columns={'predicted'}).to_csv(
"../submissions_files/{}-submission-{}.csv".format(today, random_nb), header=True, index=True, index_label='id'
)

CNN with Sigmoid to predict if they connected based on the abstract.

A good way to aggregate the text is to calculate the average (mean)

Another approach is to use directly a GNN. Take the abstract, compute the embeddng of the words. Take the mean of the node.
Then you can take the features.

For thr GN, we have only the 

pairs is a tensor, contains a pair of nodes that contains all the positive samples and some of the negative samples. y: half of them are equal to one, and half of them are connected. rand_indices are random pairs that are considered as not connected.

As we have a non directed. We can take twice every edge (2*m instead of 2*m for y. Or we can take the edges only once.

One vector for the abstract using the word2vec embedding or any other similar approach.

Or we can directly use a CNN.We can take a CNN and feed pais of abstracts in the CNN, the CNN will produce one vector for the first abstract and one vector for the second. We can combine these two vectors.

Then we can use an MLP to produce a vector, and then we can concatenate the two vectors from CNN and MLP.

There is a pretrained word embedding (Google provided a pretrained embedding).

Embedding of each word. Then the CNN will provide one vector for the abstract.

Each abstract has a different number of words. the representation of the CNN will have a fixed size of the embedding vector.


