# Packages and functions

In [9]:
!pip install unidecode

import numpy as np
from random import random
from random import randint
from datetime import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score
import pandas as pd
import numpy as np
from time import time
from gensim.models import Word2Vec
import networkx as nx
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import os
from random import choice
from scipy.sparse import identity, diags
from unidecode import unidecode
from urllib.request import urlopen
import gzip
import pickle
from gensim.parsing.preprocessing import remove_stopwords
import re
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from tqdm.notebook import tqdm



device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
#device = torch.device("cpu")

zsh:1: command not found: pip


In [2]:
def save_subgraph_in_file(nbr_nodes, source_path='../input_data/edgelist.txt', destination_path='../input_data/small_edgelist.txt'):
    G = nx.read_edgelist(source_path, delimiter=',', create_using=nx.Graph(), nodetype=int)
    G = G.subgraph(range(nbr_nodes))
    nx.write_edgelist(G, path=destination_path, delimiter=',')
    print(G.number_of_nodes(), 'nodes,', G.number_of_edges(), 'edges Graph extracted from', source_path[source_path.rfind('/')+1:])
    G = nx.read_edgelist(destination_path, delimiter=',', create_using=nx.Graph(), nodetype=int)
    print(G.number_of_nodes(), 'nodes,', G.number_of_edges(), 'edges Graph saved in', destination_path[destination_path.rfind('/')+1:])
    print(max(G.nodes))
    return


def read_train_val_graph(path='../input_data/edgelist.txt', val_ratio=0.1):
    #gets the data from the file on the distant server
    G = nx.read_edgelist(urlopen('https://www.lix.polytechnique.fr/~nikolentzos/files/aai/challenge/edgelist.txt'), delimiter=',', create_using=nx.Graph(), nodetype=int)
    nodes = list(G.nodes())
    n = G.number_of_nodes()
    m = G.number_of_edges()
    edges = list(G.edges())

    print('Number of nodes:', n, 'number of edges:', m,'in the Complete set')

    node_to_idx = dict()
    for i, node in enumerate(nodes):
        node_to_idx[node] = i

    val_edges = list()
    G_train = G.copy()

    for edge in edges:
        if random() < val_ratio and edge[0] < n and edge[1] < n:
            val_edges.append(edge)
            G_train.remove_edge(edge[0], edge[1]) # We remove the val edges from the graph G

   
    #for edge in val_edges:
        

    n = G_train.number_of_nodes()
    m = G_train.number_of_edges()
    train_edges = list(G_train.edges())

    print('Number of nodes:', n, 'number of edges:', m, 'in the Training set')
    print('len(nodes)', len(nodes))

    y_val = [1]*len(val_edges)

    n_val_edges = len(val_edges)
    
    print('Creating random val_edges...')
    for i in range(n_val_edges):
        n1 = nodes[randint(0, n-1)]
        n2 = nodes[randint(0, n-1)]
        (n1, n2) = (min(n1, n2), max(n1, n2))
        while n2 >= n: #or (n1, n2) in train_edges:
            if (n1, n2) in train_edges:
                print((n1, n2), 'in train_edges:')
            n1 = nodes[randint(0, n-1)]
            n2 = nodes[randint(0, n-1)]
            (n1, n2) = (min(n1, n2), max(n1, n2))
        val_edges.append((n1, n2))

    y_val.extend([0]*(n_val_edges))
    
    ### From Giannis /!\
    val_indices = np.zeros((2,len(val_edges)))
    for i,edge in enumerate(val_edges):
        val_indices[0,i] = node_to_idx[edge[0]]
        val_indices[1,i] = node_to_idx[edge[1]]
    
    print('Returned G_train, train_edges, val_edges, y_val, nodes and node_to_idx objects')
    print('Loaded from', path[path.rfind('/')+1:], 'and with a training validation split ratio =', val_ratio)
    
    
    
    return G, G_train, train_edges, val_edges, val_indices, y_val, nodes, node_to_idx

def random_walk(G, node, walk_length):
    walk = [node]
  
    for i in range(walk_length-1):
        neibor_nodes = list(G.neighbors(walk[-1]))
        if len(neibor_nodes) > 0:
            next_node = choice(neibor_nodes)
            walk.append(next_node)
    walk = [node for node in walk] # in case the nodes are in string format, we don't need to cast into string, but if the nodes are in numeric or integer, we need this line to cast into string
    return walk


def generate_walks(G, num_walks, walk_length):
  # Runs "num_walks" random walks from each node, and returns a list of all random walk
    t = time()
    print('Start generating walks....')
    walks = list()  
    for i in range(num_walks):
        for node in G.nodes():
            walk = random_walk(G, node, walk_length)
            walks.append(walk)
        #print('walks : ', walks)
    print('Random walks generated in in {}s!'.format(round(time()-t)))
    return walks

def apply_word2vec_on_features(features, nodes, vector_size=128, window=5, min_count=0, sg=1, workers=8):
    t = time()
    print('Start applying Word2Vec...')
    wv_model = Word2Vec(vector_size=vector_size, window=window, min_count=min_count, sg=sg, workers=workers)
    wv_model.build_vocab(features)
    wv_model.train(features, total_examples=wv_model.corpus_count, epochs=5) 
    print('Word2vec model trained on features in {} min!'.format(round((time()-t)/60)))
    features_np = []
    for node in nodes:
        features_np.append(wv_model.wv[node])

    features_np = np.array(features_np)
    print(features_np.shape, 'features numpy array created in {} min!'.format(round((time()-t)/60)))
    return features_np



def normalize_adjacency(A):
    n = A.shape[0]
    A = A + identity(n)
    degs = A.dot(np.ones(n))
    inv_degs = np.power(degs, -1)
    D_inv = diags(inv_degs)
    A_hat = D_inv.dot(A)
    return A_hat

def create_and_normalize_adjacency(G):
    adj = nx.adjacency_matrix(G) # Obtains the adjacency matrix of the training graph
    adj = normalize_adjacency(adj)
    print('Created a normalized adjancency matrix of shape', adj.shape)
    indices = np.array(adj.nonzero()) # Gets the positions of non zeros of adj into indices
    print('Created indices', indices.shape, 'with the positions of non zeros in adj matrix')
    return adj, indices

def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)


def text_to_list(text):
    return unidecode(text).split(',')

def intersection(lst1, lst2): # a function that returns the number of common items of two lists and 1 or 0 if there are common. This function will be used in add_authors_to_pairs to add this features to the pairs.
    lst3 = [value for value in lst1 if value in lst2]
    is_common = 1 if len(lst3)>0 else 0
    return len(lst3)+1, is_common+1


def add_authors_to_pairs (pairs, authors):
    authors = pd.DataFrame(authors)
    try: 
        pairs = pairs.detach().cpu().numpy()
    except:
        pass
        

    pairs_df = pd.DataFrame(np.transpose(pairs)).rename(columns={0: "paper_1", 1: "paper_2"})
    pairs_df = pairs_df.merge(authors, left_on='paper_1', right_on='paper_id', how='left').rename(columns={'authors': "authors_1"})
    pairs_df = pairs_df.merge(authors, left_on='paper_2', right_on='paper_id', how='left').rename(columns={'authors': "authors_2"})
    pairs_df.drop(['paper_id_x', 'paper_id_y'], axis=1, inplace=True)

    pairs_df['nb_common_author'] = pairs_df.apply(lambda row: intersection(row['authors_1'], row['authors_2'])[0], axis=1)
    pairs_df['is_common_author'] = pairs_df.apply(lambda row: intersection(row['authors_1'], row['authors_2'])[1], axis=1)

    pairs_tensor = torch.LongTensor(np.transpose(pairs_df[["paper_1", "paper_2", 'is_common_author', 'nb_common_author']].values.tolist())).to(device)
    
    return pairs_tensor


In [3]:
def read_and_clean_abstracts (nodes, sample_length=-1, abstracts_path = 'https://www.lix.polytechnique.fr/~nikolentzos/files/aai/challenge/abstracts.txt'):
    t = time()
    abstracts = dict()
    abstracts_list = list()
    f = urlopen(abstracts_path)
    
    for i, line in tqdm(enumerate(f)):
        if i == sample_length:
            break
        if i in nodes:
            node, abstract = str(line).lower().split('|--|')
            abstract = remove_stopwords(abstract)
            #abstract = re.sub(r"[,.;@#?!&$()-]", " ", abstract)
            abstract = re.sub(r"[^a-zA-Z0-9\s]", "", abstract)
            #abstract = re.sub(r"\\", " ", abstract)
            abstract = remove_stopwords(abstract)

            for word in abstract.split()[:-1]:
                #abstract = abstract.replace(word, stemmer.stem(word))
                abstract = abstract.replace(word, lemmatizer.lemmatize(lemmatizer.lemmatize(lemmatizer.lemmatize(lemmatizer.lemmatize(lemmatizer.lemmatize(lemmatizer.lemmatize(word), pos='s'), pos='a'), pos='n'), pos='v'), pos='r'))
            
            node = re.sub("[^0-9]", "", node)
            if i != int(node):
                print('i and node not the same', i, node)
            abstracts[int(node)] = abstract
            abstracts_list.append(abstract)
        
    print('Text loaded and cleaned in {:.0f} min'.format((time()-t)/60))
    return abstracts

def doc_counter (documents, word): #a function that return the number of documents containing a word
    counter = 0
    for i in documents:
        if word in documents[i]:
            counter += 1
    return counter



In [2]:
class Vocabulary:

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {}
        self.word_occurrence = {}
        self.word2node = {}
        self.words_list = []
        self.sentences_list = []
        self.sentences_list_words = []
        self.num_words = 0
        self.num_sentences = 0
        self.longest_sentence = 0

    def add_word(self, word, node):
        if word not in self.word2index:
            # First entry of word into vocabulary
            self.words_list.append(word)
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
            self.word_occurrence[word] = 1
            self.word2node[word] = [node]
        else:
            # Word exists; increase word count
            self.word2count[word] += 1
            self.word_occurrence[word] += 1
            if node not in self.word2node[word]:
                self.word2node[word].append(node)
            # self.num_words += 1
            
    def add_sentence(self, sentence, node):
        sentence_len = 0
        for word in sentence.split()[:-1]:
            sentence_len += 1
            self.add_word(word, node)
        if sentence_len > self.longest_sentence:
            # This is the longest sentence
            self.longest_sentence = sentence_len
        # Count the number of sentences
        self.num_sentences += 1
        self.sentences_list.append(sentence)
        self.sentences_list_words.append(sentence.split()[:-1])

    def to_word(self, index):
        return self.index2word[index]

    def to_index(self, word):
        return self.word2index[word]

    def words(self):
        return self.words_list



In [7]:
# with open('vocab.pkl', 'wb') as f:
#     pickle.load(voc, f)
    
if os.path.isfile('vocab.pkl'):
    with open('vocab.pkl', 'rb') as f:
        voc = pickle.load(f)
    print('Successfully loaded vocab object from vocab.pkl')
else:
    print('File does not exist.')

Successfully loaded vocab object from vocab.pkl


In [29]:
empty_abstracts = 0
long_abstracts = []
very_long_abstracts = []
huge_abstracts = []
for i, sentence in tqdm(enumerate(voc.sentences_list_words)):
    if len(sentence) == 0:
        empty_abstracts += 1
    if len(sentence) > 128:
        long_abstracts.append(i)
    if len(sentence) > 256:
        very_long_abstracts.append(i)
    if len(sentence) > 512:
        huge_abstracts.append(i)
print(empty_abstracts, len(long_abstracts), len(very_long_abstracts), len(huge_abstracts))

0it [00:00, ?it/s]

7249 11217 95 12


In [34]:
voc.num_words

11280217

# Load graph and authors data from sources

In [7]:
G, G_train, train_edges, val_edges, val_indices, y_val, nodes, node_to_idx = read_train_val_graph(val_ratio=0.1)


Number of nodes: 138499 number of edges: 1091955 in the Complete set
Number of nodes: 138499 number of edges: 982779 in the Training set
len(nodes) 138499
Creating random val_edges...
Returned G_train, train_edges, val_edges, y_val, nodes and node_to_idx objects
Loaded from edgelist.txt and with a training validation split ratio = 0.1


In [171]:
adj, indices = create_and_normalize_adjacency(G_train)


  adj = nx.adjacency_matrix(G) # Obtains the adjacency matrix of the training graph


Created a normalized adjancency matrix of shape (138499, 138499)
Created indices (2, 2104057) with the positions of non zeros in adj matrix


In [108]:
authors = pd.read_csv(urlopen('https://www.lix.polytechnique.fr/~nikolentzos/files/aai/challenge/authors.txt'), sep = '|', header=None)
authors = authors.rename(columns={0: "paper_id", 2: "authors"})
authors['authors'] = authors['authors'].apply(text_to_list)
authors = authors[["paper_id", "authors"]]
authors = authors[authors['paper_id'] <= max(G.nodes())]
authors.head()

Unnamed: 0,paper_id,authors
0,0,"[James H. Niblock, Jian-Xun Peng, Karen R. McM..."
1,1,"[Jian-Xun Peng, Kang Li, De-Shuang Huang]"
2,2,[J. Heikkila]
3,3,"[L. Teslic, B. Hartmann, O. Nelles, I. Skrjanc]"
4,4,"[Long Zhang, Kang Li, Er-Wei Bai, George W. Ir..."


# Create TF-IDF matrix

In [56]:


# n = -1 #length of the sample to develop and test the pipeline (-1 or negative values to take all the dataset)

#takes 4 minutes to process all the abstracts
abstracts = read_and_clean_abstracts(nodes, sample_length=n)  #149s #194s
abstracts_dict_list_words = {i: abstracts[i].split()[:-1] for i in nodes}
abstracts_list_sentences = [list(item)[1][:-3] for item in abstracts.items()]

#we create a vacabulary of words and sentences (abstracts)
t = time()
voc = Vocabulary('abstracts') 
for node in tqdm(nodes):
    voc.add_sentence(abstracts[node], node)

print('Vocab built in {:.0f} min'.format((time()-t)/60))
print('Vocab size is:', voc.num_words)

138499it [06:41, 345.37it/s]


Text loaded and cleaned in 7 min


100%|███████████████████████████████████| 138499/138499 [28:52<00:00, 79.93it/s]

Vocab built in 29 min
Vocab size is: 11280217





In [83]:
words_multiple = {key:value for (key, value) in voc.word2node.items() if len(value) >= 2}

len(words_multiple)


62907

In [95]:
#Now we will compute a logarithmic tf-idf matrix
from sklearn.feature_extraction.text import TfidfVectorizer
t = time()

# Create a TfidfVectorizer object with logarithmic tf
# And as we are intrested in links between words, we will take only words that occured at least in two abstracts
words_multiple = {key:value for (key, value) in voc.word2node.items() if len(value) >= 2}
vectorizer = TfidfVectorizer(vocabulary=list(words_multiple.keys()), sublinear_tf=True)


# Fit the vectorizer to the sentences and transform them into a TF-IDF matrix

tfidf_matrix = vectorizer.fit_transform(abstracts_list_sentences)

# Print the TF-IDF matrix
print('tf-idf matrix generated in {:.0f} sec'.format(time()-t))
print('tf-idf shape:', tfidf_matrix.shape)



tf-idf matrix generated in 7 sec
tf-idf shape: (138499, 62907)


In [130]:
from scipy import sparse

sparse.save_npz("tfidf_matrix.npz", tfidf_matrix)
your_matrix_back = sparse.load_npz("tfidf_matrix.npz")


In [341]:
tfidf_matrix.shape

(138499, 62907)

In [170]:
tfidf_matrix = sparse.load_npz("tfidf_matrix.npz")


# BART

In [200]:
import torch
from transformers import BartTokenizer, BartModel

# Load the BART model and tokenizer
model = BartModel.from_pretrained('facebook/bart-large')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')



2023-04-28 19:44:39.933220: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Downloading:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

In [241]:
abstracts_bart_embeddings = []

# Define a function to generate embeddings for text
def get_bart_embeddings(text, model):
    # Tokenize the input text
    encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

    # Generate embeddings using the BART model
    with torch.no_grad():
        model_output = model(**encoded_input)
        embeddings = model_output.last_hidden_state.mean(dim=1).squeeze()

    return embeddings

In [306]:
for i in tqdm(range(len(abstracts_bart_embeddings), len(voc.sentences_list))):
    abstract = voc.sentences_list[i]
    abstracts_bart_embeddings.append(get_bart_embeddings(abstract, model))


  0%|          | 0/115749 [00:00<?, ?it/s]

In [305]:
saved_abstracts_bart_embeddings = abstracts_bart_embeddings

In [309]:
len(saved_abstracts_bart_embeddings), len(abstracts_bart_embeddings), len(voc.sentences_list)

(138499, 138499, 138499)

In [310]:
abstracts_bart_embeddings = torch.stack(abstracts_bart_embeddings)
abstracts_bart_embeddings.shape

torch.Size([138499, 1024])

In [324]:
abstracts_bart_embeddings.to(device)

tensor([[ 1.9488e-01,  1.0046e+00, -5.2291e-01,  ..., -2.5941e-01,
          5.3308e-02,  6.6539e-02],
        [ 1.5396e-01,  1.2314e+00, -7.1842e-01,  ...,  6.7827e-02,
         -6.3357e-02, -8.9790e-02],
        [ 2.2133e-01,  9.1838e-01, -3.4919e-01,  ..., -7.9309e-02,
          1.4963e-01,  6.1544e-01],
        ...,
        [ 1.6774e-01,  1.1953e+00, -1.5907e-01,  ...,  1.8041e-01,
          5.2979e-01, -1.7008e-01],
        [ 6.9840e-02,  7.6416e-01, -6.6232e-05,  ..., -1.8154e-01,
          4.7777e-01, -3.7687e-01],
        [ 2.7550e-04,  1.8085e+00, -7.2895e-01,  ...,  3.4084e-01,
         -7.1908e-02,  3.1836e-01]])

In [314]:
torch.save(abstracts_bart_embeddings, 'bart_embeddings.pt')


In [339]:
bart_vocab = tokenizer.get_vocab()

print(len(bart_vocab.keys()))

50265


# Read processed data

In [315]:
t = time()
from io import BytesIO

walks_url = 'https://storage.googleapis.com/link_prediction_processed_data/walks_wv.npy'
with urlopen(walks_url) as url:
    data = url.read()

# Create a seekable file-like object from the data
fileobj = BytesIO(data)

# Load the data from the file object
walks_wv = np.load(fileobj)
print('wv walks loaded from GCP in {:.0f} sec'.format(time()-t))
walks_wv.shape

wv walks loaded from GCP in 8 sec


(138499, 64)

In [343]:
t = time()
# abstract_url = 'https://storage.googleapis.com/link_prediction_processed_data/embedded_abstracts_dict_192array.pkl.gz'

# with urlopen(abstract_url) as response:
#     compressed_data = response.read()

with open('embedded_abstracts_dict_192array.pkl.gz', 'rb') as f:
    compressed_data = f.read()


# Decompress the data
words_embedding_192 = pickle.loads(gzip.decompress(compressed_data))
print('File loaded and decompressed in {:.0f} min'.format((time()-t)/60))
print('len(my_dict):', len(words_embedding_192))


File loaded and decompressed in 5 min
len(my_dict): 138499


In [None]:
# Assume words_embedding_192 is the dictionary of arrays
max_length = 1024
embedding_size = 192

# Create a matrix of zeros to hold the padded embeddings
padded_embeddings = np.zeros((len(words_embedding_192), max_length, embedding_size))

# Loop over each abstract and pad/truncate its embedding
for i, embedding in tqdm(enumerate(words_embedding_192.values())):
    length = min(len(embedding), max_length)
    try: padded_embeddings[i, :length, :] = embedding[:length, :]
    except: pass

# Convert the matrix of padded embeddings to a PyTorch tensor
tensor_embeddings = torch.from_numpy(padded_embeddings).float()

0it [00:00, ?it/s]

In [None]:
# transforming dict of embedding abstracts words into a list of embedding abstracts
from torch.nn.utils.rnn import pad_sequence
t = time()
my_list = list(my_dict.values())
tensor_list = [torch.tensor(arr) for arr in my_list]
print('transformed the dict of lists into a list of tensors in {:.0f} min'.format((time()-t)/60))



In [None]:
# max and mean pooling of words of the abstracts

t = time()

# mean pooling of words
mean_abstract_embedding = []
for key in my_dict.keys():
    if len(my_dict[key]) > 0:
        mean_abstract_embedding.append(np.mean(my_dict[key], axis=0))
    else:
        mean_abstract_embedding.append(np.zeros(my_dict[0].shape[1]))
mean_abstract_embedding = np.array(mean_abstract_embedding)

# max pooling
max_abstract_embedding = []
for key in my_dict.keys():
    if len(my_dict[key]) > 0:
        max_abstract_embedding.append(np.max(my_dict[key], axis=0))
    else:
        max_abstract_embedding.append(np.zeros(my_dict[0].shape[1]))
max_abstract_embedding = np.array(max_abstract_embedding)

print('max and mean pooling performed in {:.0f} sec'.format((time()-t)))

# Model

In [319]:
class GNN(nn.Module):
    def __init__(self, n_feat, n_hidden, n_class, sub_class, dropout):
        super(GNN, self).__init__()
        #self.dense = nn.Linear(vocab_size, 1000)
        self.dense2 = nn.Linear(1024, n_feat)
        self.fc1 = nn.Linear(n_feat, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(n_hidden, n_hidden)
        self.double_fc3 = nn.Linear((3*n_hidden), n_hidden)
        self.fc4 = nn.Linear(n_hidden, sub_class)
        self.fc5 = nn.Linear(sub_class, n_class)        
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        
        

    def forward(self, x_in, bart_abstract, adj, pairs):
        #print('tfidf_matrix', tfidf_matrix.shape)
        
        #print('h_abstr', h_abstr.shape)
        #x_in = torch.cat((x_in, h_abstr), dim=1)
        #print('x_in', x_in.shape)
        
              
        h1 = self.fc1(x_in)
        z1 = self.relu(torch.spmm(adj, h1)) # sparce matrix multiplication
        z1 = self.dropout(z1)
        del(x_in, h1)

        h2 = self.fc2(z1)
        z2 = self.relu(torch.spmm(adj, h2))
        z2 = self.dropout(z2)
        del(h2, z1, adj)

        #third round of GNN is not used
        #h3 = self.fc2(z2)
        #z3 = self.relu(torch.spmm(adj, h3))
        #z3 = self.dropout(z3)
        #print('z2', z2.shape)
        #del(h3, z3, adj)

        x = z2[pairs[0]] * z2[pairs[1]] # embedded features (z2) of node 0 - embedded features of node 1
        #print('x', x.shape)
        x = pairs[3][:, None] * x
        #print('x', x.shape)
        #x1 = z2[pairs[0]]
        #x2 = z2[pairs[1]]
        
        #x = torch.cat((x, x1, x2), dim=1)
        #del(x1, x2)
        
        #h_abstr = self.dense(tfidf_matrix)
        h_abstr = self.dense2(bart_abstract)
        y = torch.cat((h_abstr[pairs[0]],h_abstr[pairs[1]]), dim=1)
        
        #print('h_abstr', h_abstr.shape)
        x = torch.cat((x, y), dim=1)
        del(h_abstr)
        del(pairs)

        x = self.relu(self.double_fc3(x))
        x = self.relu(self.fc3(x))
        x = self.dropout(x)
        
        x = self.relu(self.fc4(x))
        x = self.dropout(x)
        
        x = self.fc5(x)

        return F.log_softmax(x, dim=1)




In [None]:
class GNN(nn.Module):
    def __init__(self, n_feat, n_hidden, n_class, sub_class, dropout):
        super(GNN, self).__init__()
        self.fc1 = nn.Linear(n_feat, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(n_hidden, n_hidden)
        self.fc4 = nn.Linear(n_hidden, sub_class)
        self.fc5 = nn.Linear(sub_class, n_class)        
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        
        

    def forward(self, x_in, adj, pairs):
        
              
        h1 = self.fc1(x_in)
        z1 = self.relu(torch.spmm(adj, h1)) # sparce matrix multiplication
        z1 = self.dropout(z1)
        del(x_in, h1)

        h2 = self.fc2(z1)
        z2 = self.relu(torch.spmm(adj, h2))
        z2 = self.dropout(z2)
        del(h2, z1, adj)

        x = z2[pairs[0]] * z2[pairs[1]] # embedded features (z2) of node 0 - embedded features of node 1


        x = self.relu(self.fc3(x))
        x = self.dropout(x)
        
        x = self.relu(self.fc4(x))
        x = self.dropout(x)
        
        x = self.fc5(x)

        return F.log_softmax(x, dim=1)

In [335]:
def prepare_data_to_train (features, authors, adj, indices, val_indices, y_val):
    
    print('Preparing the data for training...')
    
    t = time()
    
    y_val = torch.LongTensor(y_val).to(device)

    # Create class labels
    y = np.zeros(2*indices.shape[1])
    y[:indices.shape[1]] = 1 # Concatenated ones for edges indices and later in the model we add zeros for random indices.
    y = torch.LongTensor(y).to(device)
    
    features = torch.FloatTensor(features).to(device)
    
    indices = torch.LongTensor(indices).to(device)
    adj = sparse_mx_to_torch_sparse_tensor(adj).to(device)
    #tfidf_matrix = sparse_mx_to_torch_sparse_tensor(tfidf_matrix).to(device)
    
    # the function add_authors_to_pairs converts into torch tensors and sends to Device    
    val_indices = add_authors_to_pairs(val_indices, authors) #we add the authors to val_pairs
    indices = add_authors_to_pairs(indices, authors) #we add the authors to indices    
    rand_indices = np.random.randint(0, features.shape[0], (indices.shape[0],indices.shape[1]))# We take random indices each time we run an epoch
    rand_indices = add_authors_to_pairs(rand_indices, authors)

    pairs = torch.cat((indices, rand_indices), dim=1) # Concatenate the edges indices and random indices. 
    del(authors, indices, rand_indices)
    
    print('Data converted into torch tensors and authors added to indices in {:.0f} min'.format((time()-t)/60))

    return features, adj, pairs, y, val_indices, y_val


    
def early_stopping(loss_train, list_loss_train, loss_val, list_loss_val, window=10, tolerance=0.01):
    if (len(list_loss_val) == window and loss_val > (sum(list_loss_val)/len(list_loss_val)) and loss_train + tolerance < loss_val) or (len(list_loss_train) == window and loss_train > (sum(list_loss_train)/len(list_loss_train))):
        print('train: {:.5f} val: {:.5f} mean val: {:.5f}'.format(loss_train, loss_val, (sum(list_loss_val)/len(list_loss_val))))
        return True
    return False
    
def early_stopping(list_loss_val, list_loss_train, tolerance=0.1):
    if list_loss_val[-1] > list_loss_train[-1] + tolerance and len(list_loss_val) > 15:
        return True
    for element in list_loss_val[:-1]:
        if list_loss_val[-1] < element:
            return False
    for element in list_loss_val[:-2]:
        if list_loss_val[-2] < element:
            return False
    for element in list_loss_val[:-3]:
        if list_loss_val[-3] < element:
            return False
    for element in list_loss_val[:-4]:
        if list_loss_val[-4] < element:
            return False
    for element in list_loss_val[:-5]:
        if list_loss_val[-5] < element:
            return False
    return True
    
def train_model(model, learning_rate, bart_abstract, features, adj, pairs, y, val_indices, y_val, epochs, run_number):
    # Train model
    start_time = time()
    
    print('Initializing the optimizer with learning rate:', learning_rate)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate) #optimizer with halving learning rate in training
    try: os.mkdir('./outputs')
    except: pass

    today = datetime.today().strftime('%Y-%m-%d-%H:%M')
    list_loss_val = []
    list_loss_train = []
    window = 20
    
    halving_lr = 0 # counter of the number of halving lr
    print('Start training...')
    for epoch in range(epochs):
        t = time()
        optimizer.zero_grad()
        
        model.train()

        output = model(features, bart_abstract, adj, pairs).to(device) # we run the model that gives the output.
        loss_train = F.nll_loss(output, y) # we are using nll_loss as loss to optimize, we store it in loss_train. We compare to y which is stable and contains the tag ones and zeros.
        acc_train = accuracy_score(torch.argmax(output, dim=1).detach().cpu().numpy(), y.cpu().numpy())# just to show it in the out put message of the training
        loss_train.backward() # The back propagation ? --> Computes the gradient of current tensor w.r.t. graph leaves
        optimizer.step() # Performs a single optimization step (parameter update).
        
        model.eval()
        output = model(features, bart_abstract, adj, val_indices).to(device)
        #y_val = torch.LongTensor(y_val).to(device)
        loss_val = F.nll_loss(output, y_val)
        list_loss_val.append(loss_val.item())
        list_loss_train.append(loss_train.item())
        acc_val = accuracy_score(torch.argmax(output, dim=1).detach().cpu().numpy(), y_val.cpu().numpy())
        
        if epoch % 5 == 0:
            print('Epoch: {:03d}'.format(epoch+1),
                  'loss_train: {:.4f}'.format(loss_train.item()),
                  'loss_val: {:.4f}'.format(loss_val.item()),
                  'acc_train: {:.4f}'.format(acc_train.item()),
                  'acc_val: {:.4f}'.format(acc_val.item()),
                  'time: {} s'.format(int(round(time()) - round(t))),
                 'total_time: {} min'.format(round((time() - start_time)/60)))
            
            if epoch % 50 == 0:
                model_path = "outputs/{}-model-{}epochs-{}.pt".format(today, epoch, run_number)
                torch.save(model.state_dict(), model_path)
            
            early = False
            if epoch > 20:
                early = early_stopping(list_loss_train[-window:], list_loss_val[-window:])        
            if early:
                halving_lr += 1
                if halving_lr > 4:
                    break
                list_loss_val=[]
                list_loss_train=[]
                learning_rate = learning_rate/10
                optimizer = optim.Adam(model.parameters(), lr=learning_rate)
                print('Deviding the learning rate by 10. New learning rate: {}'.format(learning_rate))



    print("Optimization Finished in {} min!".format(round((time() - start_time)/60)))
    return model



# Train the model

In [321]:
n_hidden = 64
dropout_rate = 0.2
sub_class = 8
n_class = 2
features = walks_wv#max_abstract_embedding #walks_wv
n_features = features.shape[1]
vocab_size = tfidf_matrix.shape[1]

# Creates the model
model = GNN(n_features, n_hidden, n_class, sub_class, dropout_rate).to(device)


In [325]:
features_torch, adj_torch, pairs_torch, y_torch, val_indices_torch, y_val_torch = prepare_data_to_train(features, authors, adj, indices, val_indices, y_val)


Preparing the data for training...
Data converted into torch tensors and authors added to indices in 2 min


In [336]:
# with dense2 1024 of BART
torch.cuda.empty_cache()


epochs = 200
run_number = randint(0, 1000)
learning_rate = 0.01


trained_model = train_model(model, learning_rate, abstracts_bart_embeddings, features_torch, adj_torch, pairs_torch, y_torch, val_indices_torch, y_val_torch, epochs, run_number)


Initializing the optimizer with learning rate: 0.01
Start training...
Epoch: 001 loss_train: 0.8050 loss_val: 0.7155 acc_train: 0.5016 acc_val: 0.5000 time: 21 s total_time: 0 min
Epoch: 006 loss_train: 0.6976 loss_val: 0.6874 acc_train: 0.5187 acc_val: 0.5864 time: 17 s total_time: 2 min
Epoch: 011 loss_train: 0.6887 loss_val: 0.6802 acc_train: 0.5332 acc_val: 0.6108 time: 28 s total_time: 4 min
Epoch: 016 loss_train: 0.7082 loss_val: 0.7137 acc_train: 0.5156 acc_val: 0.5000 time: 24 s total_time: 6 min
Epoch: 021 loss_train: 0.6749 loss_val: 0.6747 acc_train: 0.5676 acc_val: 0.5871 time: 32 s total_time: 8 min
Epoch: 026 loss_train: 0.6438 loss_val: 0.6458 acc_train: 0.6162 acc_val: 0.6327 time: 25 s total_time: 10 min
Epoch: 031 loss_train: 0.6211 loss_val: 0.6102 acc_train: 0.6453 acc_val: 0.6798 time: 23 s total_time: 11 min
Epoch: 036 loss_train: 0.5965 loss_val: 0.5760 acc_train: 0.6703 acc_val: 0.7183 time: 21 s total_time: 13 min
Epoch: 041 loss_train: 0.5608 loss_val: 0.5261 

In [194]:
# Without dense 1000
torch.cuda.empty_cache()


epochs = 200
run_number = randint(0, 1000)


trained_model = train_model(model, 0.01, tfidf_matrix_torch, features_torch, adj_torch, pairs_torch, y_torch, val_indices_torch, y_val_torch, epochs, run_number)


Initializing the optimizer with learning rate: 0.01
Start training...
Epoch: 001 loss_train: 0.7205 loss_val: 0.7085 acc_train: 0.5000 acc_val: 0.5000 time: 25 s total_time: 0 min
Epoch: 006 loss_train: 0.6909 loss_val: 0.6988 acc_train: 0.5182 acc_val: 0.5000 time: 20 s total_time: 2 min
Epoch: 011 loss_train: 0.6843 loss_val: 0.6764 acc_train: 0.5874 acc_val: 0.6041 time: 18 s total_time: 4 min
Epoch: 016 loss_train: 0.6125 loss_val: 0.6026 acc_train: 0.6735 acc_val: 0.6763 time: 19 s total_time: 5 min
Epoch: 021 loss_train: 0.5949 loss_val: 0.5804 acc_train: 0.6832 acc_val: 0.6994 time: 17 s total_time: 7 min
Epoch: 026 loss_train: 0.5624 loss_val: 0.5546 acc_train: 0.7285 acc_val: 0.7277 time: 19 s total_time: 8 min
Epoch: 031 loss_train: 0.5239 loss_val: 0.5169 acc_train: 0.7717 acc_val: 0.7671 time: 18 s total_time: 10 min
Epoch: 036 loss_train: 0.4932 loss_val: 0.4884 acc_train: 0.7939 acc_val: 0.7946 time: 18 s total_time: 12 min
Epoch: 041 loss_train: 0.4685 loss_val: 0.4661 a

# Generate test file

In [None]:
from datetime import datetime
import re

test_path = 'https://www.lix.polytechnique.fr/~nikolentzos/files/aai/challenge/test.txt'
node_pairs = list()
f = urlopen(test_path)

for line in f:
    t = str(line).split(',')
    t[0] = int(re.sub("[^0-9]", "", t[0]))
    t[1] = int(re.sub("[^0-9]", "", t[1]))
    node_pairs.append((node_to_idx[int(t[0])], node_to_idx[int(t[1])]))

node_pairs = np.transpose(node_pairs)
node_pairs = add_authors_to_pairs(node_pairs, authors)
#node_pairs = torch.LongTensor(node_pairs).to(device)

adj_torch = sparse_mx_to_torch_sparse_tensor(adj).to(device)
features_torch = torch.FloatTensor(features).to(device)

test_output = model(features_torch, adj_torch, node_pairs)
y_pred = torch.exp(test_output)
y_pred = y_pred.detach().cpu().numpy()

y_pred_true = list()
for element in y_pred:
    y_pred_true.append(element[1])

today = datetime.today().strftime('%Y-%m-%d')
random_nb = randint(0, 1000)
model_nb = 1

pd.DataFrame(y_pred_true, columns=['predicted']).to_csv(
"{}-submission-{}-{}.csv".format(today, model_nb, random_nb), header=True, index=True, index_label='id'
)

In [None]:
features.shape

#Draft

In [None]:
# Testing
model.eval()
node_pairs = np.array(np.transpose(node_pairs))
node_pairs = torch.LongTensor(node_pairs).to(device)

test_output = model(features, adj, node_pairs)
y_pred = torch.exp(test_output)
y_pred = y_pred.detach().cpu().numpy()

y_pred_true = list()
for element in y_pred:
    y_pred_true.append(element[1])
    

    
today = datetime.today().strftime('%Y-%m-%d')
random_nb = randint(0, 1000)

pd.DataFrame(y_pred_true, columns={'predicted'}).to_csv(
"{}-submission-{}-{}.csv".format(today, model_nb, random_nb), header=True, index=True, index_label='id'
)

In [None]:
#### New script with batches

def early_stopping(loss_train, list_loss_train, loss_val, list_loss_val, 
                   tolerance=0.01, patience=15):
    list_loss_val = list(list_loss_val)[-patience:]
    list_loss_train = list(list_loss_train)[-patience:]
    if (len(list_loss_val) == patience and loss_val > (sum(list_loss_val)/len(list_loss_val)) and loss_train + tolerance < loss_val) or (len(list_loss_train) == patience and loss_train > (sum(list_loss_train)/len(list_loss_train))):
        #print('train: {:.5f} val: {:.5f} mean val: {:.5f}'.format(loss_train, loss_val, (sum(list_loss_val)/len(list_loss_val))))
        return True
    return False
    

    
def train_model(model, learning_rate, features, adj, indices_mc, y, val_indices, 
                y_val, epochs, batch_size, wv_walk_size, 
                tolerence = 0.01, patience = 15, run_number=randint(0, 1000)):
    # Train model
    start_time = time()
    print('Initializing the optimizer with learning rate:', learning_rate)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate) #optimizer with halving learning rate in training
    try: os.mkdir('./outputs')
    except: pass
    print('Preparing the data for training...')        

    today = datetime.today().strftime('%Y-%m-%d-%H:%M')
    list_loss_val = []
    list_loss_train = []

    
    halving_lr = 0 # counter of the number of halving lr
    print('Start training...')
    for epoch in range(epochs):
        t = time()

        # we create the rand indices corresponding to non edges (their y = 0)
        # we could apply a condition on epoch to run rand_indices (for speed purposes)
        rand_indices = np.random.randint(0, len(indices_mc), size=(indices_mc.shape[0], indices_mc.shape[1]))
        rand_indices = add_authors_to_pairs(rand_indices, authors)
        pairs = np.concatenate((indices_mc, rand_indices), axis=1)
        pairs = torch.LongTensor(pairs).to(device)

        permutation = torch.randperm(pairs.size()[1])
        
        # batches
        for i in range(0, pairs.size()[1], batch_size):
            optimizer.zero_grad()

            elts_indices = permutation[i:i+batch_size]
            batch_pairs = pairs[:, elts_indices]
            batch_y = y[elts_indices]

        
            model.train()

            output = model(features, adj, batch_pairs, wv_walk_size).to(device) # we run the model that gives the output.
            loss_train = F.nll_loss(output, batch_y) # we are using nll_loss as loss to optimize, we store it in loss_train. We compare to y which is stable and contains the tag ones and zeros.
            acc_train = accuracy_score(torch.argmax(output, dim=1).detach().cpu().numpy(), batch_y.cpu().numpy())# just to show it in the out put message of the training
            loss_train.backward() # The back propagation ? --> Computes the gradient of current tensor w.r.t. graph leaves
            optimizer.step() # Performs a single optimization step (parameter update).
        
        model.eval()
        output = model(features, adj, val_indices, wv_walk_size).to(device)
        #y_val = torch.LongTensor(y_val).to(device)
        loss_val = F.nll_loss(output, y_val)
        list_loss_val.append(loss_val.item())
        list_loss_train.append(loss_train.item())
        acc_val = accuracy_score(torch.argmax(output, dim=1).detach().cpu().numpy(), y_val.cpu().numpy())
        
        if epoch % 5 == 0:
            print('Epoch: {:03d}'.format(epoch+1),
                  'loss_train: {:.4f}'.format(loss_train.item()),
                  'loss_val: {:.4f}'.format(loss_val.item()),
                  'acc_train: {:.4f}'.format(acc_train.item()),
                  'acc_val: {:.4f}'.format(acc_val.item()),
                  'time: {} s'.format(int(round(time()) - round(t))),
                 'total_time: {} min'.format(round((time() - start_time)/60)))
        if epoch % 50 == 0:
            model_path = "outputs/{}-model-{}epochs-{}.pt".format(today, epoch, run_number)
            torch.save(model.state_dict(), model_path)
        
        if int(loss_val.item()) > 5:
            break
            
        early = early_stopping(loss_train.item(), list_loss_train, loss_val.item(), list_loss_val, patience=15)        
        if early:
            halving_lr += 1
            if halving_lr > 5:
                break
            list_loss_val=[]
            learning_rate = learning_rate/10
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)
            print('Deviding the learning rate by 2. New learning rate: {:.6f}'.format(learning_rate))



    print("Optimization Finished in {} min!".format(round((time() - start_time)/60)))
    return model



In [None]:
epochs = 1000

trained_model = train_model(model, 0.01, features, authors, adj, indices, y, torch.tensor(val_indices).to(device), torch.tensor(y_val).to(device), epochs)
