In [1]:
from utils import random_walk, generate_walks, read_train_val_graph, sparse_mx_to_torch_sparse_tensor
from utils import apply_word2vec_on_features, create_and_normalize_adjacency, save_subgraph_in_file, train_model

In [2]:
from random import random
from random import randint
from datetime import datetime
import torch

In [3]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score
import pandas as pd
import numpy as np
from time import time
from gensim.models import Word2Vec
import networkx as nx

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Ask the utils helper for a 5000-node subgraph; per its log output it writes small_edgelist.txt.
# NOTE(review): no visible cell reads small_edgelist.txt -- the next cell loads the full edgelist.txt.
save_subgraph_in_file(nbr_nodes=5000)

5000 nodes, 25591 edges Graph extracted from edgelist.txt
4964 nodes, 25591 edges Graph saved in small_edgelist.txt
4999


In [5]:
# Full edge list of the graph.
path = '../input_data/edgelist.txt'

# Split the edges with a validation ratio of 0.1; the helper also returns the node list and
# the node -> index mapping that later cells use to translate raw ids.
G, G_train, train_edges, val_edges, val_indices, y_val, nodes, node_to_idx = read_train_val_graph(val_ratio=0.1, path=path)
# Random walks on the training graph only (num_walks=10, walk_length=15).
# Presumably 10 walks are started from every node -- confirm in utils.generate_walks.
walks = generate_walks(G=G_train, num_walks=10, walk_length=15)
# Word2Vec over the walks -> one 128-dimensional row per node (logged shape: (138499, 128)).
features_np = apply_word2vec_on_features(features=walks, nodes=nodes, vector_size=128)
# Normalized adjacency of the training graph, plus a 2-row array holding the positions of its
# non-zero entries (logged shape: (2, 2103149)); those positions serve as positive pairs.
adj, indices = create_and_normalize_adjacency(G_train)

Number of nodes: 138499 number of edges: 1091955 in All the set
Number of nodes: 138499 number of edges: 982325 in the Training set
len(nodes) 138499
Creating random val_edges...
Returned G_train, train_edges, val_edges, y_val, nodes and node_to_idx objects
Loaded from edgelist.txt and with a training validation split ratio = 0.1
Start generating walks....
Random walks generated in in 56s!
Start applying Word2Vec...
Word2vec model trained on features in 3 min!
(138499, 128) features numpy array created in 3 min!


  adj = nx.adjacency_matrix(G) # Obtains the adjacency matrix of the training graph


Created a normalized adjancency matrix of shape (138499, 138499)
Created indices (2, 2103149) with the positions of non zeros in adj matrix


In [6]:
# Sanity check of the split: count the validation pairs that are edges of the full graph,
# those that are not, and those that are still present in the training graph.
edges_full = G.edges()
counter = sum(1 for val_edge in val_edges if val_edge in edges_full)
non_counter = len(val_edges) - counter
print('val_edges in G', counter)
print('val_edges not in G', non_counter)

edges_train = G_train.edges()
counter = sum(1 for val_edge in val_edges if val_edge in edges_train)
print('val_edges in G_train', counter)

val_edges in G 109639
val_edges not in G 109621
val_edges in G_train 8


In [7]:
# Largest node index found in the second position of the validation pairs. The recorded
# value (138498) is the logged node count minus one, i.e. it fits the feature matrix rows.
max(np.transpose(val_edges)[1])

138498

In [8]:
# Create class labels: one label per column of the (positive + random) pair matrix.
n_positive = indices.shape[1]
# First half is 1 for the pairs in `indices`; second half is 0 for the random pairs that
# the training loop concatenates after them.
y = np.concatenate((np.ones(n_positive), np.zeros(n_positive)))


In [9]:
# Transforms the numpy matrices/vectors to torch tensors.
# torch.as_tensor accepts numpy arrays as well as existing tensors, so this cell can be
# re-run safely: the legacy torch.LongTensor(...) constructor can fail when handed a tensor
# that already lives on another device.
# Note: on CPU, as_tensor may share memory with the source numpy array instead of copying.
features = torch.as_tensor(features_np, dtype=torch.float32, device=device)
y = torch.as_tensor(y, dtype=torch.long, device=device)
# The scipy sparse matrix is converted only once; an already converted tensor is kept as is.
if not isinstance(adj, torch.Tensor):
    adj = sparse_mx_to_torch_sparse_tensor(adj).to(device)
indices = torch.as_tensor(indices, dtype=torch.long, device=device)

In [10]:
from unidecode import unidecode

def text_to_list(text):
    """Transliterate ``text`` with ``unidecode`` and split the result on commas."""
    ascii_text = unidecode(text)
    return ascii_text.split(',')

def intersection(lst1, lst2):
    """Count the items of ``lst1`` that also occur in ``lst2``.

    Duplicates in ``lst1`` are counted once per occurrence.

    Returns:
        tuple: ``(count, flag)`` where ``flag`` is 1 when ``count`` is positive, else 0.
    """
    n_shared = sum(1 for value in lst1 if value in lst2)
    return n_shared, int(n_shared > 0)

#def columns_intersection(df):
    

def add_authors_to_pairs(pairs, authors):
    """Attach author-overlap features to a set of paper pairs.

    Args:
        pairs: array-like or torch.Tensor whose row 0 holds the first paper id of every
            pair and row 1 the second one (one column per pair). Tensors are moved to the
            CPU before the conversion to numpy.
        authors: pandas.DataFrame with the columns ``paper_id`` and ``authors``. Each
            ``authors`` cell is passed to ``intersection`` -- presumably a list of author
            names; confirm against the code that builds the frame.

    Returns:
        torch.LongTensor with four rows -- ``paper_1``, ``paper_2``, ``is_common_author``,
        ``nb_common_author`` -- and one column per pair (CPU tensor).
    """
    # np.array() cannot read a CUDA tensor directly, so bring tensors back to the host first.
    if isinstance(pairs, torch.Tensor):
        pairs = pairs.detach().cpu().numpy()
    np_pairs = np.array(pairs)

    def _authors_or_empty(cell):
        # The left merges below leave NaN (a float) or None for papers that are missing
        # from `authors`; treat those as "no known authors" instead of crashing.
        if cell is None or isinstance(cell, float):
            return []
        return cell

    pairs_df = pd.DataFrame(np.transpose(np_pairs)).rename(columns={0: "paper_1", 1: "paper_2"})
    pairs_df = pairs_df.merge(authors, left_on='paper_1', right_on='paper_id', how='left').rename(columns={'authors': "authors_1"})
    pairs_df = pairs_df.merge(authors, left_on='paper_2', right_on='paper_id', how='left').rename(columns={'authors': "authors_2"})
    # The second merge renames the two clashing key columns to paper_id_x / paper_id_y.
    pairs_df = pairs_df.drop(['paper_id_x', 'paper_id_y'], axis=1)

    # Evaluate the overlap once per pair and reuse it for both feature columns.
    overlaps = [
        intersection(_authors_or_empty(first), _authors_or_empty(second))
        for first, second in zip(pairs_df['authors_1'], pairs_df['authors_2'])
    ]
    pairs_df['nb_common_author'] = [count for count, _ in overlaps]
    pairs_df['is_common_author'] = [flag for _, flag in overlaps]

    pairs_tensor = torch.LongTensor(np.transpose(pairs_df[["paper_1", "paper_2", 'is_common_author', 'nb_common_author']].values.tolist()))

    return pairs_tensor



In [11]:
class GNN(nn.Module):
    """Two propagation layers over ``adj`` followed by a small MLP that scores node pairs.

    ``forward`` returns log-probabilities over ``n_class`` classes, one row per column of
    ``pairs``.
    """

    def __init__(self, n_feat, n_hidden, n_class, dropout):
        super().__init__()
        # Layers are created in this exact order so that parameter initialisation and the
        # state_dict keys match checkpoints saved from this class.
        self.fc1 = nn.Linear(n_feat, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(n_hidden, n_hidden)
        # Not called by forward(); kept because it is part of the state_dict. It takes
        # 2 * n_hidden inputs, i.e. the size of two concatenated node embeddings.
        self.double_fc3 = nn.Linear(2 * n_hidden, n_hidden)
        self.fc4 = nn.Linear(n_hidden, n_class)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def _propagate(self, layer, h, adj):
        """Apply ``layer``, multiply by ``adj``, then ReLU and dropout."""
        aggregated = torch.mm(adj, layer(h))
        return self.dropout(self.relu(aggregated))

    def forward(self, x_in, adj, pairs):
        """Score the node pairs given column-wise in rows 0 and 1 of ``pairs``."""
        node_emb = self._propagate(self.fc1, x_in, adj)
        node_emb = self._propagate(self.fc2, node_emb, adj)

        # A pair is represented by the difference of its two node embeddings.
        first, second = pairs[0], pairs[1]
        pair_repr = node_emb[first] - node_emb[second]

        hidden = self.dropout(self.relu(self.fc3(pair_repr)))
        logits = self.fc4(hidden)
        return F.log_softmax(logits, dim=1)

In [12]:
# Hyper-parameters of the run.
epochs = 200
n_hidden = 128
dropout_rate = 0.2
# Two classes: the labels built earlier are 1 for the positive pairs and 0 for the random ones.
n_class = 2
# Input width = number of columns of the node feature matrix.
n_features = features.shape[1]

# Creates the model and specifies the optimizer
model = GNN(n_features, n_hidden, n_class, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [13]:
# Shows the signature of the train_model imported from utils.
# NOTE(review): the next cell defines a local train_model with the same signature, which
# shadows the imported one from then on.
help(train_model)

Help on function train_model in module utils:

train_model(model, optimizer, features, adj, indices, y, val_indices, y_val, epochs)



In [14]:
def train_model(model, optimizer, features, adj, indices, y, val_indices, y_val, epochs):
    """Train ``model`` full-batch on node pairs and validate after every epoch.

    Every epoch scores the pairs in ``indices`` together with an equally sized set of freshly
    drawn uniformly random node pairs, and minimises the NLL loss against ``y``.

    Args:
        model: module called as ``model(features, adj, pairs)`` that returns log-probabilities.
        optimizer: optimizer bound to ``model``'s parameters.
        features: node feature tensor; its first dimension bounds the random node indices.
        adj: adjacency tensor handed to the model; random pairs are created on its device.
        indices: 2-row LongTensor of positive pairs, one pair per column.
        y: LongTensor of labels, one per column of ``cat(indices, random pairs)``.
        val_indices: 2-row LongTensor of validation pairs.
        y_val: validation labels (sequence, numpy array or tensor), converted to a long tensor.
        epochs: number of epochs to run.

    Returns:
        The same ``model`` object (in eval mode once at least one epoch has run).

    Side effects:
        Prints metrics every 5th epoch and writes a checkpoint to ``../submissions_files/``
        every 50th epoch (epoch index 0, 50, 100, ...).
    """
    start_time = time()
    today = datetime.today().strftime('%Y-%m-%d')
    # Convert the validation labels once, on the device of the model inputs, instead of
    # re-wrapping them on every epoch through a notebook-level global.
    y_val = torch.as_tensor(y_val, dtype=torch.long, device=features.device)

    for epoch in range(epochs):
        t = time()
        # Back to training mode at the start of every epoch: the validation pass below
        # switches to eval mode, which would otherwise disable dropout for all later epochs.
        model.train()
        optimizer.zero_grad()
        # Fresh negative samples every epoch: uniformly random node pairs, same shape as `indices`.
        rand_indices = torch.randint(0, features.size(0), (indices.size(0), indices.size(1)), device=adj.device)
        pairs = torch.cat((indices, rand_indices), dim=1)  # positives first, then the random pairs
        output = model(features, adj, pairs)
        loss_train = F.nll_loss(output, y)  # `y` is fixed: ones for the positives, zeros for the random pairs
        # Accuracy is computed on-device; it is only used for the progress message.
        acc_train = (torch.argmax(output, dim=1) == y).float().mean().item()
        loss_train.backward()
        optimizer.step()

        # Validation: no dropout and no autograd graph.
        model.eval()
        with torch.no_grad():
            output = model(features, adj, val_indices)
            loss_val = F.nll_loss(output, y_val)
            acc_val = (torch.argmax(output, dim=1) == y_val).float().mean().item()

        if epoch % 5 == 0:
            print('Epoch: {:03d}'.format(epoch+1),
                  'loss_train: {:.4f}'.format(loss_train.item()),
                  'loss_val: {:.4f}'.format(loss_val.item()),
                  'acc_train: {:.4f}'.format(acc_train),
                  'acc_val: {:.4f}'.format(acc_val),
                  'time: {:.4f} s'.format(time() - t),
                 'total_time: {} min'.format(round((time() - start_time)/60)))
        # Checkpoint on the current epoch index. The previous test used the total `epochs`,
        # which saved either on every epoch or never.
        if epoch % 50 == 0:
            model_path = "../submissions_files/{}-model-{}epochs.pt".format(today, epoch)
            torch.save(model.state_dict(), model_path)


    print("Optimization Finished in {} min!".format(round((time() - start_time)/60)))
    return model

In [15]:
# Shape of the positive-pair index tensor: 2 rows, one column per pair.
indices.shape

torch.Size([2, 2103149])

In [16]:
#y_val = torch.FloatTensor(y_val).to(device)
# Train with the notebook-local train_model, then store the final weights under a file name
# made of the date, the epoch budget and a random integer suffix.
# NOTE(review): the recorded run ends with KeyboardInterrupt, so the lines after the
# train_model call did not execute in that run.
trained_model = train_model(model, optimizer, features, adj, indices, y, val_indices, y_val, epochs)
model_nb = randint(0, 1000)
today = datetime.today().strftime('%Y-%m-%d')
model_path = "../submissions_files/{}-model-{}epochs-{}.pt".format(today, epochs, model_nb)
torch.save(trained_model.state_dict(), model_path)
print('Model saved in', model_path)

Epoch: 001 loss_train: 0.6946 loss_val: 0.6816 acc_train: 0.5000 acc_val: 0.6919 time: 26.8777 s total_time: 0 min
Epoch: 006 loss_train: 0.5066 loss_val: 0.5251 acc_train: 0.6755 acc_val: 0.7525 time: 8.2545 s total_time: 1 min
Epoch: 011 loss_train: 0.3882 loss_val: 0.4235 acc_train: 0.8688 acc_val: 0.8475 time: 8.0930 s total_time: 2 min
Epoch: 016 loss_train: 0.2847 loss_val: 0.3515 acc_train: 0.9128 acc_val: 0.8800 time: 8.8492 s total_time: 3 min
Epoch: 021 loss_train: 0.2052 loss_val: 0.3135 acc_train: 0.9297 acc_val: 0.8966 time: 8.6157 s total_time: 3 min
Epoch: 026 loss_train: 0.1580 loss_val: 0.3046 acc_train: 0.9415 acc_val: 0.9067 time: 8.4362 s total_time: 4 min
Epoch: 031 loss_train: 0.1360 loss_val: 0.3018 acc_train: 0.9474 acc_val: 0.9118 time: 8.9058 s total_time: 5 min
Epoch: 036 loss_train: 0.1283 loss_val: 0.3155 acc_train: 0.9503 acc_val: 0.9131 time: 8.5398 s total_time: 5 min
Epoch: 041 loss_train: 0.1231 loss_val: 0.3201 acc_train: 0.9526 acc_val: 0.9153 time: 

KeyboardInterrupt: 

In [23]:
# Restore the weights from one of the checkpoints written during training.
# NOTE(review): the file name embeds a run date and an epoch number -- adjust it to a file that exists.
PATH = "../submissions_files/2023-03-23-model-51epochs.pt"
# map_location remaps the stored tensors onto the current device, so a checkpoint saved on
# a GPU machine also loads on a CPU-only one.
model.load_state_dict(torch.load(PATH, map_location=device))
model.eval()

GNN(
  (fc1): Linear(in_features=128, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=128, bias=True)
  (double_fc3): Linear(in_features=256, out_features=128, bias=True)
  (fc4): Linear(in_features=128, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (relu): ReLU()
)

In [19]:
# Re-compute the training loss of the current model on the positive pairs plus a fresh set
# of random pairs. Nothing is back-propagated here, so the autograd graph is skipped.
# NOTE(review): the value depends on whether the model is in train or eval mode at this
# point, i.e. on which cells ran before this one.
with torch.no_grad():
    rand_indices = torch.randint(0, features.size(0), (indices.size(0),indices.size(1)), device=adj.device)
    pairs = torch.cat((indices, rand_indices), dim=1) # Concatenate the edges indices and random indices.
    output = model(features, adj, pairs) # we run the model that gives the output.
    loss_train = F.nll_loss(output, y)
print(loss_train.item())

0.12730279564857483


In [None]:
# Validation loss / accuracy of the current model on the held-out pairs.
model.eval()
# as_tensor works whether y_val is still a list/array or already a tensor, so the cell can be re-run.
y_val = torch.as_tensor(y_val, dtype=torch.long, device=device)
with torch.no_grad():
    output = model(features, adj, val_indices)
    loss_val = F.nll_loss(output, y_val)
acc_val = accuracy_score(torch.argmax(output, dim=1).detach().cpu().numpy(), y_val.cpu().numpy())
print(loss_val.item())

In [20]:
# Evaluating the model
print(model.eval())
eval_pairs = np.array(np.transpose(val_edges))
eval_pairs = torch.LongTensor(eval_pairs).to(device)
# Inference only: skip building the autograd graph.
with torch.no_grad():
    eval_output = model(features, adj, eval_pairs)
# The model returns log-probabilities; exp() turns them into class probabilities.
y_pred = torch.exp(eval_output)
y_pred = y_pred.detach().cpu().numpy()

# Probability of class 1.
y_val_pred_true = y_pred[:, 1]

# log_loss needs host data; y_val may already have been turned into a device tensor by another cell.
y_val_np = y_val.detach().cpu().numpy() if isinstance(y_val, torch.Tensor) else np.asarray(y_val)
# NOTE(review): the recorded value (2.55) is far from the loss_val printed during training
# (about 0.3) -- confirm that val_edges and val_indices describe the same pairs in the same
# order as y_val.
print('Log loss:', log_loss(y_val_np, y_val_pred_true))

#y_val = torch.tensor(y_val).to(device)
#y_val_pred_true = torch.tensor(y_val_pred_true).to(device)
#print(y_val, y_val_pred_true)

GNN(
  (fc1): Linear(in_features=128, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=128, bias=True)
  (double_fc3): Linear(in_features=256, out_features=128, bias=True)
  (fc4): Linear(in_features=128, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (relu): ReLU()
)
Log loss: 2.554288012771918


In [24]:
from datetime import datetime
import pandas as pd



# Read test data. Each sample is a pair of nodes
node_pairs = list()
with open('../test_data/test.txt', 'r') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at the end of the file)
        fields = line.split(',')
        # Translate raw node ids into the row indices used by `features` / `adj`.
        node_pairs.append((node_to_idx[int(fields[0])], node_to_idx[int(fields[1])]))

# Testing
model.eval()
node_pairs = np.array(np.transpose(node_pairs))
pairs = torch.LongTensor(node_pairs).to(device)
# Inference only: skip building the autograd graph.
with torch.no_grad():
    output = model(features, adj, pairs)
# The model returns log-probabilities; exp() turns them into class probabilities.
y_pred = torch.exp(output)
y_pred = y_pred.detach().cpu().numpy()

# Probability of class 1 for every test pair.
y_pred_true = y_pred[:, 1]

today = datetime.today().strftime('%Y-%m-%d')
random_nb = randint(0, 1000)

# `columns` must be an ordered list-like: recent pandas versions reject a set here.
pd.DataFrame(y_pred_true, columns=['predicted']).to_csv(
"../submissions_files/{}-submission-{}-{}.csv".format(today, 2, random_nb), header=True, index=True, index_label='id'
)

In [None]:
# `break` outside a loop is a SyntaxError. Presumably placed here on purpose to halt
# "Run All" before the scratch cells below -- confirm.
break

In [None]:
# Bring the validation labels back to the host as a numpy array for the sklearn / pandas cells below.
# Guarded so the cell also works when y_val was never converted to a tensor, or when it is run twice.
if isinstance(y_val, torch.Tensor):
    y_val = y_val.detach().cpu().numpy()
y_val

In [None]:
# Write the predictions next to the validation labels for manual inspection.
# NOTE(review): needs y_pred to have exactly one row per entry of y_val; y_pred is rebound
# by several cells (validation evaluation, test inference), so confirm which one ran last.
df = pd.DataFrame(y_pred)
df['y_val'] = list(y_val)
df.to_csv('../submissions_files/comparison_file.csv', sep=';')

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error of the labels against column 1 and against column 0 of y_pred.
mean_absolute_error(y_val, y_pred[:, 1]), mean_absolute_error(y_val, y_pred[:, 0])

In [None]:
# Log loss of the labels against column 1 and against column 0 of y_pred.
log_loss(y_val, y_pred[:, 1]), log_loss(y_val, y_pred[:, 0])

In [None]:
# Toy example of the row gathering used in GNN.forward: indexing a (4, 3) tensor with a
# length-10 index vector returns the 10 selected rows, in the order given by the indices.
test_features = np.array([[5, 3, 0], [2, 1, 0], [4, 0, 1], [0, 3, 2]])
test_features = torch.LongTensor(test_features).to(device)
test_pairs = [[0, 0, 1, 2, 3, 2, 0, 1, 2, 3], [1, 2, 0, 0, 2, 3, 0, 1, 2, 3]]
test_pairs = torch.LongTensor(test_pairs).to(device)
print(test_features.shape, test_pairs.shape)
test_features[test_pairs[0]]

In [None]:
# Compare the shapes of the prediction array, the validation labels and the test scores.
y_pred.shape, np.shape(y_val), np.shape(y_pred_true)

In [None]:
#print(output.shape)
# Shapes of the three inputs passed to the model in the evaluation cell.
print(features.shape, adj.shape, eval_pairs.shape)

In [None]:
# NOTE(review): in the visible notebook z2 is only assigned inside GNN.forward, so this cell
# raises NameError on a fresh kernel unless z2 was created by a cell that no longer exists.
z2[pairs[1,:]]

In [None]:
# `break` outside a loop is a SyntaxError. Presumably a deliberate stop marker for
# "Run All" -- confirm.
break

In [None]:
# `break` outside a loop is a SyntaxError. Presumably a deliberate stop marker for
# "Run All" -- confirm.
break

In [None]:
# Create class labels for the logistic-regression baseline: 4 rows per training edge.
# NOTE(review): this rebinds y, which the GNN cells use as a torch tensor.
y = np.zeros(4*G_train.number_of_edges())
y[:2*G_train.number_of_edges()] = 1 # First half of the rows is labelled 1, the second half stays 0.
# Display the first training edge.
train_edges[0]

In [None]:
# Build the baseline design matrices: every row is the concatenation of the Word2Vec
# vectors of the two endpoints of a pair.
# NOTE(review): m and n are not defined anywhere in the visible notebook. From their use,
# m is presumably the number of training edges and n the number of nodes -- confirm, and
# define them in this cell so it runs on a fresh kernel.
t = time()
X_train = np.zeros((4*m, 2*features_np.shape[1]))

for i, edge in enumerate(train_edges):
    X_train[i] = np.concatenate((features_np[train_edges[i][0]], features_np[train_edges[i][1]]), axis=0)
    # NOTE(review): this row is identical to row i (same endpoint order); if the reversed
    # pair was intended, the two indices should be swapped -- confirm.
    X_train[m+i] = np.concatenate((features_np[train_edges[i][0]], features_np[train_edges[i][1]]), axis=0)
    # Two random node pairs per training edge fill the second half of the matrix.
    X_train[2*m+i] = np.concatenate((features_np[randint(0,n-1)], features_np[randint(0,n-1)]), axis=0)
    X_train[3*m+i] = np.concatenate((features_np[randint(0,n-1)], features_np[randint(0,n-1)]), axis=0)
    
X_val = np.zeros((len(val_edges), 2*features_np.shape[1]))
for i, edge in enumerate(val_edges):
    X_val[i] = np.concatenate((features_np[val_edges[i][0]], features_np[val_edges[i][1]]), axis=0)
    
print('X_train and X_val created in {} s!'.format(round(time()-t)))

In [None]:
t = time()
# Use logistic regression to predict if two nodes are linked by an edge
clf = LogisticRegression()
clf.fit(X_train, y)
# Keep only the second probability column for the validation pairs.
# NOTE(review): this rebinds y_pred, which the GNN cells above also use.
y_pred = clf.predict_proba(X_val)
y_pred = y_pred[:,1]
print('Logistic regression performed in {} s!'.format(round(time()-t)))

In [None]:
# Validation log loss of the scores currently stored in y_pred.
print('Log loss:', log_loss(y_val, y_pred))

In [None]:
from datetime import datetime

# Read test data. Each sample is a pair of nodes
test_edges = list()
with open('../test_data/test.txt', 'r') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing newline at the end of the file)
        fields = line.split(',')
        # Translate raw node ids into row indices of features_np.
        test_edges.append((node_to_idx[int(fields[0])], node_to_idx[int(fields[1])]))

# One row per test pair: the feature vectors of both endpoints, concatenated.
X_test = np.zeros((len(test_edges), 2*features_np.shape[1]))
for i, (first, second) in enumerate(test_edges):
    X_test[i] = np.concatenate((features_np[first], features_np[second]), axis=0)

t = time()
# Use logistic regression to predict if two nodes are linked by an edge
y_pred = clf.predict_proba(X_test)
y_pred = y_pred[:,1]
print('Logistic regression performed in {} s!'.format(round(time()-t)))


today = datetime.today().strftime('%Y-%m-%d')
random_nb = randint(0, 100000)

# `columns` must be an ordered list-like: recent pandas versions reject a set here.
pd.DataFrame(y_pred, columns=['predicted']).to_csv(
"../submissions_files/{}-submission-{}.csv".format(today, random_nb), header=True, index=True, index_label='id'
)
    
    
# # Testing
# node_pairs = np.array(np.transpose(node_pairs))
# pairs = torch.LongTensor(node_pairs).to(device)
# output = model(features, adj, pairs)
# y_pred = torch.exp(output)
# y_pred = y_pred.detach().cpu().numpy()

# y_pred_true = list()
# for element in y_pred:
#     y_pred_true.append(element[1])
    

    
# today = datetime.today().strftime('%Y-%m-%d')
# random_nb = randint(0, 100000)

# pd.DataFrame(y_pred_true, columns={'predicted'}).to_csv(
# "../submissions_files/{}-submission-{}.csv".format(today, random_nb), header=True, index=True, index_label='id'
# )