In [None]:
!pip install unidecode

import numpy as np
import random
from random import randint
from datetime import datetime
from sklearn.metrics import log_loss, accuracy_score
import networkx as nx
import torch
import torch.optim as optim
import torch.nn.functional as F
import os
from random import choice
from urllib.request import urlopen
import gzip
import pickle
from tqdm.notebook import tqdm
import requests
import io
from scipy import sparse
import matplotlib.pyplot as plt



# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [1]:
from time import time
import torch.nn as nn

from read_data import read_train_val_graph
from data_processing import create_and_normalize_adjacency


In [2]:
# Load the complete graph together with its train/validation split and the
# node <-> index mapping, timing the whole step.
t = time()
(
    G,
    G_train,
    train_edges,
    val_edges,
    val_indices,
    y_val,
    nodes,
    node_to_idx,
) = read_train_val_graph()

print(
    'Graph loaded and seperated, val indices generated and node to index mapping returned in {:.0f} s'
    .format(time() - t)
)

Number of nodes: 138499 number of edges: 1091955 in the Complete set
Number of nodes: 138499 number of edges: 982856 in the Training set
len(nodes) 138499
Returned G_train, train_edges, val_edges, y_val, nodes and node_to_idx objects
Loaded from edgelist.txt and with a training validation split ratio = 0.1
Graph loaded and seperated, val indices generated and node to index mapping returned in 10 s


In [3]:
# Build the adjacency structure of the training graph with the helper imported
# from `data_processing`. Per the helper's recorded output, `adj` is a normalized
# adjacency matrix and `indices` holds the positions of its non-zero entries.
adj, indices = create_and_normalize_adjacency(G_train)


  adj = nx.adjacency_matrix(G) # Obtains the adjacency matrix of the training graph


Created a normalized adjancency matrix of shape (138499, 138499)
Created indices (2, 2104211) with the positions of non zeros in adj matrix


In [None]:
class GNN(nn.Module):
    """Three-layer message-passing network that classifies node pairs.

    Per-node inputs (`x_in`, an embedding of `abstract`, an embedding of
    `auth`) are concatenated, propagated three times through `adj`, and the
    difference between the two endpoint representations of every pair is fed
    to a small MLP that outputs log-probabilities over `n_class` classes.

    Args:
        n_text: number of columns of the `abstract` input of `forward`.
        n_text_auth: number of columns of the `abst_auth` input; only used to
            size the (currently unused) `norm` layer.
        n_auth: number of columns of the `auth` input.
        n_feat: number of columns of the `x_in` input.
        n_hidden: width of the hidden layers.
        n_class: number of output classes.
        sub_class: width of the penultimate layer.
        dropout: dropout probability shared by all dropout applications.
    """

    def __init__(self, n_text, n_text_auth, n_auth, n_feat, n_hidden, n_class, sub_class, dropout):
        super().__init__()
        # `abstract_emb` is applied to `abstract` in forward(), so its input width
        # must be n_text. It was previously sized with n_text_auth, which only
        # worked when both widths happened to be equal.
        self.abstract_emb = nn.Linear(n_text, n_hidden)
        self.auth_emb = nn.Linear(n_auth, n_hidden)
        self.fc1 = nn.Linear(n_feat + 2 * n_hidden, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.fc21 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(2 * n_hidden, n_hidden)
        self.fc4 = nn.Linear(n_hidden, sub_class)
        self.fc5 = nn.Linear(sub_class, n_class)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        # `sigmoid` and `norm` are not used by forward(); they are kept so that the
        # module's attributes and state_dict keys stay the same as before.
        self.sigmoid = nn.Sigmoid()
        self.norm = nn.LayerNorm(n_auth + n_text + n_feat + n_text_auth)

    def forward(self, x_in, abstract, abst_auth, auth, adj, pairs):
        """Return log-probabilities for every node pair.

        Args:
            x_in: per-node features with `n_feat` columns.
            abstract: per-node text features with `n_text` columns.
            abst_auth: accepted for interface compatibility; not used.
            auth: per-node author features with `n_auth` columns.
            adj: matrix multiplied with the node states through `torch.spmm`
                (assumed to be a sparse N x N adjacency matrix - TODO confirm).
            pairs: index tensor whose row 0 / row 1 hold the two endpoints of
                each pair to classify.

        Returns:
            Tensor of shape (number of pairs, n_class) with log-softmax scores.
        """
        # Embed the two auxiliary inputs and append them to the node features.
        text_h = self.dropout(self.relu(self.abstract_emb(abstract)))
        auth_h = self.dropout(self.relu(self.auth_emb(auth)))
        node_in = torch.cat((x_in, text_h, auth_h), dim=1)

        # Three propagation steps: linear map, neighbourhood aggregation, ReLU, dropout.
        h1 = self.fc1(node_in)
        z = self.dropout(self.relu(torch.spmm(adj, h1)))
        z = self.dropout(self.relu(torch.spmm(adj, self.fc2(z))))
        z = self.dropout(self.relu(torch.spmm(adj, self.fc21(z))))

        # Skip connection: keep the pre-propagation projection next to the final state.
        node_repr = torch.cat((z, h1), dim=1)

        # A pair is represented by the difference of its endpoint representations.
        x = node_repr[pairs[0]] - node_repr[pairs[1]]

        x = self.dropout(self.relu(self.fc3(x)))
        x = self.dropout(self.relu(self.fc4(x)))
        return F.log_softmax(self.fc5(x), dim=1)




In [None]:
def _pair_accuracy(log_probs, labels):
    """Fraction of rows of `log_probs` whose arg-max equals `labels` (nan if empty)."""
    total = labels.numel()
    if total == 0:
        return float('nan')
    return (log_probs.argmax(dim=1) == labels).sum().item() / total


def train_model(model, learning_rate, abstract, text_auth, auth, features, adj, indices, val_edges, y_val, epochs, run_number, window = 10):
    """Train `model` full-batch with Adam and evaluate it after every epoch.

    Training pairs are the columns of `indices` (labelled 1) followed by the
    same number of uniformly random node pairs (labelled 0). Validation pairs
    are the columns of `val_edges` followed by the same number of random
    pairs, so `y_val` must hold one label per resulting pair, i.e.
    `2 * val_edges.shape[1]` labels.

    Args:
        model: module called as `model(features, abstract, text_auth, auth, adj, pairs)`
            and returning log-probabilities.
        learning_rate: Adam learning rate (constant; no scheduler is used).
        abstract, text_auth, auth, features, adj: tensors forwarded to `model`.
            Helper tensors are created on `features.device`.
        indices: LongTensor of shape (2, n_train) with the positive training pairs.
        val_edges: LongTensor of shape (2, n_val) with validation pairs.
        y_val: LongTensor with the validation labels.
        epochs: number of full-batch epochs.
        run_number: unused; kept for interface compatibility.
        window: unused; kept for interface compatibility.

    Returns:
        (model, list_loss_val, list_loss_train, list_epochs)

    Raises:
        ValueError: if `y_val` does not contain one label per validation pair.
    """
    start_time = time()
    dev = features.device

    print('Initializing the optimizer with learning rate:', learning_rate)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Best effort: nothing in this function writes to ./outputs, so a failure to
    # create it is reported instead of aborting the run.
    try:
        os.makedirs('./outputs', exist_ok=True)
    except OSError as exc:
        print('Could not create ./outputs:', exc)

    list_loss_val = []
    list_loss_train = []
    list_epochs = []

    # Class labels: ones for the real edges, zeros for the random pairs appended below.
    n_pos = indices.shape[1]
    y = torch.cat((torch.ones(n_pos, dtype=torch.long),
                   torch.zeros(n_pos, dtype=torch.long))).to(dev)

    # Negative training pairs: random node pairs (these are only rarely real edges).
    rand_indices = torch.randint(0, features.shape[0], size=(indices.shape[0], indices.shape[1])).to(dev)
    pairs = torch.cat((indices, rand_indices), dim=1)

    # Negative validation pairs, built the same way.
    rand_indices = torch.randint(0, features.shape[0], size=(val_edges.shape[0], val_edges.shape[1])).to(dev)
    val_indices = torch.cat((val_edges, rand_indices), dim=1)

    # Fail before training instead of after the first (expensive) training step.
    if y_val.shape[0] != val_indices.shape[1]:
        raise ValueError(
            'y_val has {} labels but {} validation pairs were built '
            '(val_edges plus an equal number of random pairs)'.format(
                y_val.shape[0], val_indices.shape[1]))

    print('Start training...')

    for epoch in range(epochs):
        t = time()

        model.train()
        optimizer.zero_grad()
        output = model(features, abstract, text_auth, auth, adj, pairs)
        loss_train = F.nll_loss(output, y)
        acc_train = _pair_accuracy(output.detach(), y)  # reporting only
        loss_train.backward()
        optimizer.step()

        # Evaluation needs no gradients; skipping the autograd graph saves memory.
        model.eval()
        with torch.no_grad():
            output = model(features, abstract, text_auth, auth, adj, val_indices)
            loss_val = F.nll_loss(output, y_val)
            acc_val = _pair_accuracy(output, y_val)

        list_loss_val.append(loss_val.item())
        list_loss_train.append(loss_train.item())
        list_epochs.append(epoch)

        if epoch % 5 == 0:
            print('Epoch: {:03d}'.format(epoch+1),
                  'loss_train: {:.4f}'.format(loss_train.item()),
                  'loss_val: {:.4f}'.format(loss_val.item()),
                  'acc_train: {:.4f}'.format(acc_train),
                  'acc_val: {:.4f}'.format(acc_val),
                  'time: {} s'.format(int(round(time()) - round(t))),
                  'total_time: {} min'.format(round((time() - start_time)/60)))

    print("Optimization Finished in {} min!".format(round((time() - start_time)/60)))
    return model, list_loss_val, list_loss_train, list_epochs



In [None]:
def prepare_data_to_train(features, authors, adj, auth_matrix, indices, val_indices, y_val):
    """Convert the training inputs to torch tensors placed on the global `device`.

    `features` becomes a FloatTensor; `indices`, `val_indices` and `y_val`
    become LongTensors; `adj` and `auth_matrix` go through
    `sparse_mx_to_torch_sparse_tensor` (defined outside this cell).
    `authors` is accepted but not used here.

    Returns:
        (features, adj, auth_matrix, indices, val_indices, y_val) as tensors.
    """
    print('Preparing the data for training...')
    started = time()

    # Labels and dense node features.
    y_val_t = torch.LongTensor(y_val).to(device)
    features_t = torch.FloatTensor(features).to(device)

    # Edge index arrays.
    indices_t = torch.LongTensor(indices).to(device)
    val_indices_t = torch.LongTensor(val_indices).to(device)

    # Sparse matrices.
    adj_t = sparse_mx_to_torch_sparse_tensor(adj).to(device)
    auth_matrix_t = sparse_mx_to_torch_sparse_tensor(auth_matrix).to(device)

    elapsed_min = (time() - started) / 60
    print('Data converted into torch tensors and authors added to indices in {:.0f} min'.format(elapsed_min))

    return features_t, adj_t, auth_matrix_t, indices_t, val_indices_t, y_val_t

In [None]:
# Move the graph inputs onto `device` as torch tensors.
(
    features_torch,
    adj_torch,
    auth_torch,
    indices_torch,
    val_indices_torch,
    y_val_torch,
) = prepare_data_to_train(
    features=walks_wv,
    authors=authors,
    adj=adj,
    auth_matrix=auth_matrix,
    indices=indices,
    val_indices=val_indices,
    y_val=y_val,
)

# Dense per-node matrices used as extra model inputs.
tfidf_matrix_torch = torch.FloatTensor(tfidf_reduced).to(device)
authors_reduced_torch = torch.FloatTensor(authors_reduced).to(device)
bert_abstract_torch = torch.FloatTensor(bert_abstract_embedding).to(device)

In [None]:
torch.cuda.empty_cache()

# --- Model hyper-parameters ---
n_hidden = 64
dropout_rate = 0.2
sub_class = 16
n_class = 2

# --- Inputs and their widths, read off the prepared tensors ---
text_embedding = bert_abstract_torch
text_auth_emb = tfidf_matrix_torch
n_text = text_embedding.shape[1]
n_text_auth = text_auth_emb.shape[1]
n_auth = authors_reduced_torch.shape[1]
n_features = features_torch.shape[1]

model = GNN(
    n_text=n_text,
    n_text_auth=n_text_auth,
    n_auth=n_auth,
    n_feat=n_features,
    n_hidden=n_hidden,
    n_class=n_class,
    sub_class=sub_class,
    dropout=dropout_rate,
).to(device)

# --- Training settings ---
epochs = 220
run_number = randint(0, 1000)
learning_rate = 0.01

trained_model, list_loss_val, list_loss_train, list_epochs = train_model(
    model=model,
    learning_rate=learning_rate,
    abstract=text_embedding,
    text_auth=text_auth_emb,
    auth=authors_reduced_torch,
    features=features_torch,
    adj=adj_torch,
    indices=indices_torch,
    val_edges=val_indices_torch,
    y_val=y_val_torch,
    epochs=epochs,
    run_number=run_number,
)
