In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

from utils import random_walk, generate_walks, read_train_val_graph, sparse_mx_to_torch_sparse_tensor
from utils import apply_word2vec_on_features, create_and_normalize_adjacency, save_subgraph_in_file, train_model

In [15]:
path = '../input_data/edgelist.txt'

G, G_train, train_edges, val_edges, val_indices, y_val, nodes, node_to_idx = read_train_val_graph(val_ratio=0.1, path=path)

Number of nodes: 138499 number of edges: 1091955 in All the set
Number of nodes: 138499 number of edges: 982859 in the Training set
len(nodes) 138499
Creating random val_edges...
Returned G_train, train_edges, val_edges, y_val, nodes and node_to_idx objects
Loaded from edgelist.txt and with a training validation split ratio = 0.1


In [5]:
class GNN(nn.Module):
    def __init__(self, n_feat, n_hidden, n_class, dropout):
        super(GNN, self).__init__()
        self.fc1 = nn.Linear(n_feat, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(n_hidden+2, n_hidden)
        self.double_fc3 = nn.Linear((2*n_hidden), n_hidden)
        self.fc4 = nn.Linear(n_hidden, n_class)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        
        

    def forward(self, x_in, adj, pairs):
        
        h1 = self.fc1(x_in)
        z1 = self.relu(torch.mm(adj, h1))
        z1 = self.dropout(z1)

        h2 = self.fc2(z1)
        z2 = self.relu(torch.mm(adj, h2))

        x = z2[pairs[0]] - z2[pairs[1]] # embedded features (z2) of node 0 - embedded features of node 1
        
        x_auth = 2*pairs[2].reshape([len(pairs[2]), 1])
        x_nb_auth = 2*pairs[3].reshape([len(pairs[3]), 1]) 
        
        x = torch.cat((x, x_auth, x_nb_auth), dim=1) #pairs[3] : number of same authors or 1 if same author
        
        #x1 = z2[pairs[0]]
        #x2 = z2[pairs[1]]
        # could we add a new dimension to pairs to specify if same author(s)? and then what could we do?
              
        #x = torch.cat((x1, x2), dim=1)        
        #x = self.relu(self.double_fc3(x))        
        x = self.relu(self.fc3(x))
        x = self.dropout(x)
        x = self.fc4(x)

        return F.log_softmax(x, dim=1)

In [9]:
epochs = 200
n_hidden = 128
dropout_rate = 0.2
n_class = 2
n_features = 128

# Creates the model and specifies the optimizer
model = GNN(n_features, n_hidden, n_class, dropout_rate).to(device)

In [11]:
PATH = "../submissions_files/2023-03-23-model-200epochs.pt"
model.load_state_dict(torch.load(PATH))
model.eval()

GNN(
  (fc1): Linear(in_features=128, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=128, bias=True)
  (fc3): Linear(in_features=130, out_features=128, bias=True)
  (double_fc3): Linear(in_features=256, out_features=128, bias=True)
  (fc4): Linear(in_features=128, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (relu): ReLU()
)

In [18]:
from datetime import datetime
import pandas as pd



# Read test data. Each sample is a pair of nodes
node_pairs = list()
with open('../test_data/test.txt', 'r') as f:
    for line in f:
        t = line.split(',')
        node_pairs.append((node_to_idx[int(t[0])], node_to_idx[int(t[1])]))

# Testing
model.eval()
node_pairs = np.array(np.transpose(node_pairs))
pairs = torch.LongTensor(node_pairs).to(device)
output = model(features, adj, pairs)
y_pred = torch.exp(output)
y_pred = y_pred.detach().cpu().numpy()

y_pred_true = list()
for element in y_pred:
    y_pred_true.append(element[1])
    

    
today = datetime.today().strftime('%Y-%m-%d')
random_nb = randint(0, 1000)

pd.DataFrame(y_pred_true, columns={'predicted'}).to_csv(
"../submissions_files/{}-submission-{}-{}.csv".format(today, model_nb, random_nb), header=True, index=True, index_label='id'
)

NameError: name 'features' is not defined