# Packages and functions

In [2]:
!pip install unidecode

import numpy as np
import random
from random import randint
from datetime import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score
import pandas as pd
import numpy as np
from time import time
from gensim.models import Word2Vec
import networkx as nx
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import os
from random import choice
from scipy.sparse import identity, diags
from unidecode import unidecode
from urllib.request import urlopen
import gzip
import pickle
from gensim.parsing.preprocessing import remove_stopwords
import re
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from tqdm.notebook import tqdm
import requests
import io
from scipy import sparse
import matplotlib.pyplot as plt



# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu")



In [3]:
def save_subgraph_in_file(nbr_nodes, source_path='../input_data/edgelist.txt', destination_path='../input_data/small_edgelist.txt'):
    """Write the subgraph induced by nodes ``range(nbr_nodes)`` to a new edge-list file.

    The source edge list is read as an undirected graph with integer node
    labels, restricted to the nodes in ``range(nbr_nodes)``, and written to
    ``destination_path``. The written file is then read back and its size and
    largest node label are printed as a sanity check.

    Args:
        nbr_nodes: upper bound (exclusive) of the node labels to keep.
        source_path: comma-delimited edge list to read.
        destination_path: where the reduced edge list is written.

    Returns:
        None.
    """
    def _file_name(file_path):
        # Everything after the last '/' (the whole string when there is none).
        return file_path[file_path.rfind('/')+1:]

    full_graph = nx.read_edgelist(source_path, delimiter=',', create_using=nx.Graph(), nodetype=int)
    small_graph = full_graph.subgraph(range(nbr_nodes))
    nx.write_edgelist(small_graph, path=destination_path, delimiter=',')
    print(small_graph.number_of_nodes(), 'nodes,', small_graph.number_of_edges(), 'edges Graph extracted from', _file_name(source_path))

    # Reload what was just written to confirm the file round-trips.
    reloaded = nx.read_edgelist(destination_path, delimiter=',', create_using=nx.Graph(), nodetype=int)
    print(reloaded.number_of_nodes(), 'nodes,', reloaded.number_of_edges(), 'edges Graph saved in', _file_name(destination_path))
    print(max(reloaded.nodes))
    return


def read_train_val_graph(path='../input_data/edgelist.txt', shuffle=True, val_ratio=0.1):
    """Download the edge list, optionally relabel nodes, and build a train/validation edge split.

    Args:
        path: only used in the final log message. NOTE(review): the graph is
            always downloaded from the hard-coded URL below, never read from
            ``path``.
        shuffle: when True, the edge order is shuffled and node labels are
            remapped through a random permutation.
        val_ratio: probability with which each edge is moved to the validation set.

    Returns:
        A 10-tuple ``(G, G_train, train_edges, val_edges, val_indices, y_val,
        nodes, node_to_idx, permutation, mapping_permutation)``.
        ``val_edges`` holds the held-out edges followed by the same number of
        randomly drawn node pairs; ``y_val`` holds the matching 1/0 labels;
        ``val_indices`` is a float ``(2, len(val_edges))`` array of
        ``node_to_idx`` positions.
    """
    #gets the data from the file on the distant server
    G = nx.read_edgelist(urlopen('https://www.lix.polytechnique.fr/~nikolentzos/files/aai/challenge/edgelist.txt'), delimiter=',', create_using=nx.Graph(), nodetype=int)
    nodes = list(G.nodes())
    edges = list(G.edges())
    n = G.number_of_nodes()
    m = G.number_of_edges()
    
    # Identity permutation/mapping, returned unchanged when shuffle is False.
    permutation = np.array(range(n))
    mapping_permutation = dict(zip(range(n), range(n)))
    if shuffle:
        # shuffle the order of the edges without changing the labels
        random.shuffle(edges)
        # NOTE(review): rebuilding the graph from edges only would drop any
        # isolated node of the downloaded graph — confirm there are none.
        G = nx.Graph()
        G.add_edges_from(edges)
        permutation = np.random.permutation(n)
        print(type(permutation))
        # create a mapping from old nodes labels to new nodes labels
        mapping_permutation = dict(zip(range(n), permutation))

        # shuffle G node labels according to the permutation
        G = nx.relabel_nodes(G, mapping_permutation)    
        
        edges = list(G.edges())
        nodes = list(G.nodes())


    print('Number of nodes:', n, 'number of edges:', m,'in the Complete set')

    # Position of every node label in the `nodes` list.
    node_to_idx = dict()
    for i, node in enumerate(nodes):
        node_to_idx[node] = i

    val_edges = list()
    G_train = G.copy()

    # Each edge goes to the validation set with probability val_ratio.
    for edge in edges:
        if random.random() < val_ratio and edge[0] < n and edge[1] < n:
            val_edges.append(edge)
            G_train.remove_edge(edge[0], edge[1]) # We remove the val edges from the graph G

   
        

    # From here on n and m describe the training graph.
    n = G_train.number_of_nodes()
    m = G_train.number_of_edges()
    train_edges = list(G_train.edges())

    print('Number of nodes:', n, 'number of edges:', m, 'in the Training set')
    print('len(nodes)', len(nodes))

    y_val = [1]*len(val_edges)

    n_val_edges = len(val_edges)
    
    # Draw as many random node pairs as there are positive validation edges.
    # NOTE(review): sampled pairs are not checked against existing edges (the
    # check is commented out in the loop condition), so a "negative" pair may
    # be a real edge or a self-pair. The membership test inside the loop only
    # prints, and scans the `train_edges` list linearly.
    print('Creating random val_edges...')
    for i in range(n_val_edges):
        n1 = nodes[randint(0, n-1)]
        n2 = nodes[randint(0, n-1)]
        (n1, n2) = (min(n1, n2), max(n1, n2))
        while n2 >= n: #or (n1, n2) in train_edges:
            if (n1, n2) in train_edges:
                print((n1, n2), 'in train_edges:')
            n1 = nodes[randint(0, n-1)]
            n2 = nodes[randint(0, n-1)]
            (n1, n2) = (min(n1, n2), max(n1, n2))
        val_edges.append((n1, n2))

    y_val.extend([0]*(n_val_edges))
    
    # Convert validation pairs from node labels to positions in `nodes`.
    # The array is created with np.zeros, so its dtype is float.
    val_indices = np.zeros((2,len(val_edges)))
    for i,edge in enumerate(val_edges):
        val_indices[0,i] = node_to_idx[edge[0]]
        val_indices[1,i] = node_to_idx[edge[1]]
    
    print('Returned G_train, train_edges, val_edges, y_val, nodes and node_to_idx objects')
    print('Loaded from', path[path.rfind('/')+1:], 'and with a training validation split ratio =', val_ratio)
    
    
    
    return G, G_train, train_edges, val_edges, val_indices, y_val, nodes, node_to_idx, permutation, mapping_permutation

def random_walk(G, node, walk_length):
    """Perform one uniform random walk on ``G`` starting at ``node``.

    At each of the ``walk_length - 1`` steps a neighbour of the current node
    is picked uniformly at random. When the current node has no neighbour the
    step is skipped, so the returned walk can be shorter than ``walk_length``.

    Args:
        G: graph object exposing ``neighbors(node)``.
        node: starting node.
        walk_length: maximum number of nodes in the walk.

    Returns:
        A new list of visited nodes, starting with ``node``. Node labels are
        returned as-is (no string conversion is applied).
    """
    path = [node]
    for _ in range(walk_length - 1):
        candidates = list(G.neighbors(path[-1]))
        if not candidates:
            # Dead end: nothing to append on this step.
            continue
        path.append(choice(candidates))
    return list(path)


def generate_walks(G, num_walks, walk_length):
    """Run ``num_walks`` random walks from every node of ``G``.

    Args:
        G: graph exposing ``nodes()`` and whatever ``random_walk`` needs.
        num_walks: number of passes over all nodes.
        walk_length: maximum length of each walk.

    Returns:
        A list with one walk (list of nodes) per node per pass, in pass order.
    """
    started = time()
    print('Start generating walks....')
    all_walks = []
    for _ in range(num_walks):
        all_walks.extend(random_walk(G, start, walk_length) for start in G.nodes())
    print('Random walks generated in in {}s!'.format(round(time()-started)))
    return all_walks

def apply_word2vec_on_features(features, nodes, vector_size=128, window=5, min_count=0, sg=1, workers=8):
    """Train a Word2Vec model on ``features`` and return one vector per node.

    Args:
        features: iterable of token sequences used as the training corpus.
        nodes: keys to look up in the trained model, in output row order.
        vector_size, window, min_count, sg, workers: forwarded to ``Word2Vec``.

    Returns:
        A numpy array whose i-th row is ``wv_model.wv[nodes[i]]``.
    """
    started = time()
    print('Start applying Word2Vec...')
    wv_model = Word2Vec(
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=sg,
        workers=workers,
    )
    wv_model.build_vocab(features)
    wv_model.train(features, total_examples=wv_model.corpus_count, epochs=5)
    print('Word2vec model trained on features in {} min!'.format(round((time()-started)/60)))

    # Stack the learned vectors in the order given by `nodes`.
    features_np = np.array([wv_model.wv[node] for node in nodes])
    print(features_np.shape, 'features numpy array created in {} min!'.format(round((time()-started)/60)))
    return features_np


import numpy as np
import scipy.sparse as sp

def normalize_adjacency(A):
    """Row-normalize ``A`` after adding self-loops: ``D^-1 (A + I)``.

    Args:
        A: square sparse adjacency matrix.

    Returns:
        Sparse matrix in which every row of ``A + I`` is divided by its row sum.
    """
    size = A.shape[0]
    with_self_loops = A + identity(size)
    # Row sums of (A + I), obtained by multiplying with a vector of ones.
    row_degrees = with_self_loops.dot(np.ones(size))
    inverse_degree = diags(np.power(row_degrees, -1))
    return inverse_degree.dot(with_self_loops)


# Alternative (symmetric) adjacency normalization; this is the one create_and_normalize_adjacency below actually calls.
def normalize_adj(adj):
    """Scale an adjacency matrix on both sides by the inverse square root of its row sums.

    Computes ``D^-1/2 * adj^T * D^-1/2`` where ``D`` is the diagonal matrix of
    row sums of ``adj`` (for a symmetric ``adj`` this equals
    ``D^-1/2 * adj * D^-1/2``). Rows whose sum is zero get a scaling factor
    of 0 instead of infinity.

    Args:
        adj: square adjacency matrix (anything ``scipy.sparse.coo_matrix`` accepts).

    Returns:
        The normalized matrix in COO format.
    """
    adj = sp.coo_matrix(adj)
    rowsum = np.array(adj.sum(1))
    # Zero-degree rows produce inf here and are zeroed on the next line;
    # silence the divide-by-zero RuntimeWarning they would otherwise emit.
    with np.errstate(divide='ignore'):
        d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()



def create_and_normalize_adjacency(G):
    """Build the normalized adjacency matrix of ``G`` and the coordinates of its non-zeros.

    Args:
        G: networkx graph.

    Returns:
        ``(adj, indices)`` where ``adj`` is the output of ``normalize_adj`` and
        ``indices`` is a ``(2, nnz)`` array of the (row, col) positions of its
        non-zero entries.
    """
    raw_adjacency = nx.adjacency_matrix(G)
    adj = normalize_adj(raw_adjacency)
    print('Created a normalized adjancency matrix of shape', adj.shape)

    nonzero_rows, nonzero_cols = adj.nonzero()
    indices = np.array((nonzero_rows, nonzero_cols))
    print('Created indices', indices.shape, 'with the positions of non zeros in adj matrix')
    return adj, indices

def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a float32 torch sparse COO tensor.

    Args:
        sparse_mx: scipy sparse matrix (any format exposing ``tocoo()``).

    Returns:
        A ``torch`` sparse COO tensor with the same shape and non-zero values,
        stored as float32 with int64 indices.
    """
    coo = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((coo.row, coo.col)).astype(np.int64))
    values = torch.from_numpy(coo.data)
    shape = torch.Size(coo.shape)
    # torch.sparse.FloatTensor(...) is the deprecated legacy constructor;
    # torch.sparse_coo_tensor is its supported replacement.
    return torch.sparse_coo_tensor(indices, values, shape)


def text_to_list(text):
    """Transliterate ``text`` to ASCII, strip unwanted characters and split it on commas.

    Only ASCII letters, whitespace, '.' and ',' survive the cleaning step.

    Args:
        text: comma-separated string (e.g. a list of author names).

    Returns:
        The list of comma-separated pieces of the cleaned text.
    """
    ascii_text = unidecode(text)
    cleaned = re.sub(r"[^a-zA-Z\s.,]", "", ascii_text)
    pieces = cleaned.split(',')
    return pieces

def intersection(lst1, lst2):
    """Count the items of ``lst1`` that also appear in ``lst2``, with both results offset by one.

    Duplicates in ``lst1`` are counted each time they occur.

    Returns:
        ``(count + 1, flag + 1)`` where ``count`` is the number of items of
        ``lst1`` found in ``lst2`` and ``flag`` is 1 when ``count > 0`` and
        0 otherwise. The results are therefore in ``1..`` and ``{1, 2}``.
    """
    shared_count = sum(1 for item in lst1 if item in lst2)
    has_shared = 1 if shared_count > 0 else 0
    return shared_count + 1, has_shared + 1


def add_authors_to_pairs (pairs, authors):
    """Attach shared-author features to a set of paper pairs.

    Args:
        pairs: ``(2, n_pairs)`` array-like or torch tensor of paper ids
            (values are matched against the ``paper_permut`` column).
        authors: table with ``paper_id``, ``authors`` and ``paper_permut`` columns.

    Returns:
        A LongTensor on ``device`` with four rows: ``paper_1``, ``paper_2``,
        ``is_common_author`` and ``nb_common_author`` (the last two as returned
        by ``intersection``, i.e. offset by one).
    """
    authors = pd.DataFrame(authors)
    # Torch tensors are moved to the host; any other input is used as-is.
    # (Previously a bare `except: pass` hid every failure of this conversion.)
    if hasattr(pairs, 'detach'):
        pairs = pairs.detach().cpu().numpy()

    pairs_df = pd.DataFrame(np.transpose(pairs)).rename(columns={0: "paper_1", 1: "paper_2"})
    pairs_df = pairs_df.merge(authors, left_on='paper_1', right_on='paper_permut', how='left').rename(columns={'authors': "authors_1"})
    pairs_df = pairs_df.merge(authors, left_on='paper_2', right_on='paper_permut', how='left').rename(columns={'authors': "authors_2"})
    pairs_df.drop(['paper_id_x', 'paper_id_y'], axis=1, inplace=True)

    # Evaluate intersection() once per pair and split its two outputs.
    overlaps = [
        intersection(first, second)
        for first, second in zip(pairs_df['authors_1'], pairs_df['authors_2'])
    ]
    pairs_df['nb_common_author'] = [overlap[0] for overlap in overlaps]
    pairs_df['is_common_author'] = [overlap[1] for overlap in overlaps]

    pairs_tensor = torch.LongTensor(np.transpose(pairs_df[["paper_1", "paper_2", 'is_common_author', 'nb_common_author']].values.tolist())).to(device)

    return pairs_tensor


In [4]:
def read_and_clean_abstracts (nodes, sample_length=-1, abstracts_path = 'https://www.lix.polytechnique.fr/~nikolentzos/files/aai/challenge/abstracts.txt'):
    """Download the abstracts file and return a cleaned abstract per requested node.

    Each line is expected to look like ``<node>|--|<abstract>``. The abstract is
    lower-cased, stripped of stop words and non-alphanumeric characters, and
    every token except the last is lemmatized.

    Args:
        nodes: collection of line numbers (node ids) to keep.
        sample_length: stop after this many lines; a negative value reads everything.
        abstracts_path: URL of the abstracts file.

    Returns:
        Dict mapping node id (int) to its cleaned abstract string.
    """
    t = time()
    abstracts = dict()
    # Set membership is O(1); testing against the original list made the scan quadratic.
    wanted = set(nodes)

    # The context manager closes the HTTP response even when the loop breaks early.
    with urlopen(abstracts_path) as f:
        for i, line in tqdm(enumerate(f)):
            if i == sample_length:
                break
            if i not in wanted:
                continue

            # `line` is bytes, so str(line) is its repr ("b'...\\n'"). After the
            # character filter below the escaped newline leaves a stray 'n' glued
            # to the last token. This is kept as-is because the rest of the
            # notebook drops the last token of every abstract.
            node, abstract = str(line).lower().split('|--|')
            abstract = remove_stopwords(abstract)
            abstract = re.sub(r"[^a-zA-Z0-9\s]", "", abstract)
            abstract = remove_stopwords(abstract)

            for word in abstract.split()[:-1]:
                # Chain the lemmatizer through several parts of speech.
                lemma = word
                for pos in ('n', 's', 'a', 'n', 'v', 'r'):
                    lemma = lemmatizer.lemmatize(lemma, pos=pos)
                # NOTE(review): str.replace also rewrites occurrences of `word`
                # inside longer tokens — confirm this is acceptable.
                abstract = abstract.replace(word, lemma)

            node = re.sub("[^0-9]", "", node)
            if i != int(node):
                print('i and node not the same', i, node)
            abstracts[int(node)] = abstract

    print('Text loaded and cleaned in {:.0f} min'.format((time()-t)/60))
    return abstracts

def doc_counter (documents, word):
    """Return how many entries of ``documents`` contain ``word``.

    ``documents`` is iterated for its keys and each ``documents[key]`` is
    tested with ``in``, so for string values this is a substring test.
    """
    return sum(1 for key in documents if word in documents[key])



In [5]:
class Vocabulary:
    """Incrementally built word/sentence index.

    For every word it records an integer index, an occurrence count and the
    list of nodes it was seen in; for every sentence it keeps the raw text
    and its token list. The last whitespace-separated token of each sentence
    is ignored.
    """

    def __init__(self, name):
        self.name = name
        # Word-level bookkeeping.
        self.word2index = {}
        self.index2word = {}
        self.word2count = {}
        self.word_occurrence = {}
        self.word2node = {}
        self.words_list = []
        self.num_words = 0
        # Sentence-level bookkeeping.
        self.sentences_list = []
        self.sentences_list_words = []
        self.num_sentences = 0
        self.longest_sentence = 0

    def add_word(self, word, node):
        """Register one occurrence of ``word`` seen in ``node``."""
        if word in self.word2index:
            # Known word: bump both counters and remember the node if new.
            self.word2count[word] += 1
            self.word_occurrence[word] += 1
            seen_in = self.word2node[word]
            if node not in seen_in:
                seen_in.append(node)
            return

        # First time this word is seen: give it the next free index.
        new_index = self.num_words
        self.words_list.append(word)
        self.word2index[word] = new_index
        self.index2word[new_index] = word
        self.word2count[word] = 1
        self.word_occurrence[word] = 1
        self.word2node[word] = [node]
        self.num_words = new_index + 1

    def add_sentence(self, sentence, node):
        """Register every token of ``sentence`` except the last one."""
        tokens = sentence.split()[:-1]
        for token in tokens:
            self.add_word(token, node)
        self.longest_sentence = max(self.longest_sentence, len(tokens))
        self.num_sentences += 1
        self.sentences_list.append(sentence)
        self.sentences_list_words.append(tokens)

    def to_word(self, index):
        """Return the word stored under ``index``."""
        return self.index2word[index]

    def to_index(self, word):
        """Return the index assigned to ``word``."""
        return self.word2index[word]

    def words(self):
        """Return the list of known words in insertion order."""
        return self.words_list



In [81]:
def prepare_data_to_train (features, authors, adj, auth_matrix, indices, val_indices, y_val):
    """Convert the training inputs to torch tensors on ``device``.

    Args:
        features: node feature matrix.
        authors: accepted but not used by this function.
        adj: scipy sparse adjacency matrix.
        auth_matrix: scipy sparse matrix.
        indices: ``(2, n_edges)`` array of training edge positions.
        val_indices: ``(2, n_val)`` array of validation pair positions.
        y_val: validation labels.

    Returns:
        ``(features, adj, auth_matrix, indices, y, val_indices, y_val)`` where
        ``y`` has length ``2 * n_edges``: ones for the first half and zeros
        for the second half.
    """
    print('Preparing the data for training...')
    started = time()

    y_val = torch.LongTensor(y_val).to(device)

    # Labels: first half ones, second half zeros.
    n_edges = indices.shape[1]
    labels = np.zeros(2*n_edges)
    labels[:n_edges] = 1
    y = torch.LongTensor(labels).to(device)

    features = torch.FloatTensor(features).to(device)
    indices = torch.LongTensor(indices).to(device)
    val_indices = torch.LongTensor(val_indices).to(device)

    adj = sparse_mx_to_torch_sparse_tensor(adj).to(device)
    auth_matrix = sparse_mx_to_torch_sparse_tensor(auth_matrix).to(device)

    print('Data converted into torch tensors and authors added to indices in {:.0f} min'.format((time()-started)/60))

    return features, adj, auth_matrix, indices, y, val_indices, y_val

In [7]:
def map_features_with_permutation(features, permutation):
    """Reorder feature rows so that output row ``i`` is ``features[permutation[i]]``.

    Returns:
        A new float array of shape ``(len(features), len(features[0]))``.
    """
    n_rows, n_cols = len(features), len(features[0])
    remapped = np.zeros((n_rows, n_cols))
    for target_row in range(n_rows):
        source_row = permutation[target_row]
        remapped[target_row] = features[source_row]
    return remapped

# Load graph and authors data from sources

In [8]:
# Load the graph and build the train/validation split.
# With shuffle=False the node labels are left as downloaded.
t = time()
shuffle = False
G, G_train, train_edges, val_edges, val_indices, y_val, nodes, node_to_idx, permutation, mapping_permutation = read_train_val_graph(val_ratio=0.1, shuffle=shuffle)

print('graph loaded and seperated, val indices generated and node to index mapping returned in {:.0f} s'.format(time()-t))

Number of nodes: 138499 number of edges: 1091955 in the Complete set
Number of nodes: 138499 number of edges: 982833 in the Training set
len(nodes) 138499
Creating random val_edges...
Returned G_train, train_edges, val_edges, y_val, nodes and node_to_idx objects
Loaded from edgelist.txt and with a training validation split ratio = 0.1
graph loaded and seperated, val indices generated and node to index mapping returned in 11 s


In [9]:
# Normalized adjacency of the training graph and the (row, col) positions of its non-zeros.
adj, indices = create_and_normalize_adjacency(G_train)


  adj = nx.adjacency_matrix(G) # Obtains the adjacency matrix of the training graph


Created a normalized adjancency matrix of shape (138499, 138499)
Created indices (2, 1965666) with the positions of non zeros in adj matrix


  d_inv_sqrt = np.power(rowsum, -0.5).flatten()


In [10]:
# Download the authors file ('|'-separated, no header row); column 0 becomes
# `paper_id` and column 2 becomes `authors`.
authors = pd.read_csv(urlopen('https://www.lix.polytechnique.fr/~nikolentzos/files/aai/challenge/authors.txt'), sep = '|', header=None)
authors = authors.rename(columns={0: "paper_id", 2: "authors"})
# Turn each raw author string into a list of cleaned names.
authors['authors'] = authors['authors'].apply(text_to_list)
authors = authors[["paper_id", "authors"]]
# Keep only papers whose id is a valid node label of G.
authors = authors[authors['paper_id'] <= max(G.nodes())]
# Id of each paper after applying the node permutation returned with the graph.
authors['paper_permut'] = permutation[authors['paper_id']]
authors.head()

Unnamed: 0,paper_id,authors,paper_permut
0,0,"[James H. Niblock, JianXun Peng, Karen R. McMe...",0
1,1,"[JianXun Peng, Kang Li, DeShuang Huang]",1
2,2,[J. Heikkila],2
3,3,"[L. Teslic, B. Hartmann, O. Nelles, I. Skrjanc]",3
4,4,"[Long Zhang, Kang Li, ErWei Bai, George W. Irwin]",4


In [66]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix


# get the unique list of authors
authors_lst_ppr = list(set([a for authors_list in tqdm(authors['authors']) for a in authors_list]))

# create a mapping of author to index
author_to_index = {author: i for i, author in tqdm(enumerate(authors_lst_ppr))}

# dimensions of the paper x author incidence matrix
nrows = len(authors)
ncols = len(authors_lst_ppr)

# Collect one (row, col) coordinate per (paper, author) membership.
# The previous version kept a single slot per paper and overwrote it in the
# inner loop, so only the last author of each paper reached the matrix (and
# papers without authors were wrongly linked to column 0).
row_ind = []
col_ind = []
for i, authors_list in tqdm(enumerate(authors['authors'])):
    # set() keeps the matrix binary when a name is repeated within one paper
    for author in set(authors_list):
        row_ind.append(i)
        col_ind.append(author_to_index[author])
row_ind = np.asarray(row_ind, dtype=np.int64)
col_ind = np.asarray(col_ind, dtype=np.int64)
data = np.ones(len(row_ind))
auth_matrix = csr_matrix((data, (row_ind, col_ind)), shape=(nrows, ncols))

# print the shape of the resulting sparse matrix
print(auth_matrix.shape)


  0%|          | 0/138499 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

(138499, 147481)


In [15]:
# Inspect the parsed author list stored under row label 1.
authors['authors'][1]

['JianXun Peng', 'Kang Li', 'DeShuang Huang']

In [None]:
# get the (row, col) coordinates of the non-zero elements of auth_matrix
row_idx, col_idx = auth_matrix.nonzero()

# display the coordinates of the first non-zero element
print([row_idx[0], col_idx[0]])

In [50]:
t = time()
n = -1 #length of the sample to develop and test the pipeline (-1 or negative values to take all the dataset)

# Download and clean every abstract whose line number is in `nodes`
# (the recorded run below reports 7 min for the full dataset).
abstracts = read_and_clean_abstracts(nodes, sample_length=n)  #149s #194s
#abstracts_dict_list_words = {i: abstracts[i].split()[:-1] for i in nodes}
#abstracts_list_sentences = [list(item)[1][:-3] for item in abstracts.items()]

#we create a vacabulary of words and sentences (abstracts)
#we take only a sample of 3 abstracts (i=2) to explore the approach



0it [00:00, ?it/s]

Text loaded and cleaned in 7 min


In [24]:
# Inspect the cleaned abstract of node 1.
abstracts[1]

'paper propose novel hybrid forward algorithm hfa construction radial basis function rbf neural network tunable node main objective efficiently effectively produce parsimonious rbf neural network generalize study achieve simultaneous network structure determination parameter optimization continuous parameter space mix integer hard problem propose hfa tackle problem integrate analytic framework lead significantly improve network performance reduce memory usage network construction computational complexity analysis confirm efficiency propose algorithm simulation result demonstrate effectivenessn'

In [51]:
def clean_auth(text):
    """Transliterate ``text`` to ASCII and keep only letters, whitespace, '.' and ','.

    Same cleaning as ``text_to_list`` but the result is returned as a single
    string instead of being split on commas.
    """
    ascii_text = unidecode(text)
    return re.sub(r"[^a-zA-Z\s.,]", "", ascii_text)

# Re-download the authors table. Unlike the earlier authors cell, this version
# is not filtered by node id and has no `paper_permut` column.
authors = pd.read_csv(urlopen('https://www.lix.polytechnique.fr/~nikolentzos/files/aai/challenge/authors.txt'), sep = '|', header=None)
authors = authors.rename(columns={0: "paper_id", 2: "authors"})
authors['authors'] = authors['authors'].apply(text_to_list)
authors = authors[["paper_id", "authors"]]


# Prepend every author name of paper i to its abstract so that author names
# become part of the text.
# NOTE(review): `abstracts` is modified in place, so re-running this cell
# prepends the names a second time.
# NOTE(review): assumes the keys of `abstracts` are exactly 0..len(abstracts)-1 — confirm.
for i in tqdm(range(len(abstracts))):
    for author in authors['authors'][i]:
        abstracts[i] = author + " " + abstracts[i]


  0%|          | 0/138499 [00:00<?, ?it/s]

In [49]:
# Inspect the parsed author list stored under row label 0.
authors['authors'][0]

['James H. Niblock', 'JianXun Peng', 'Karen R. McMenemy', 'George W. Irwin']

In [52]:
# First author of row `i`; `i` is whatever value the most recently executed loop left behind.
authors['authors'][i][0]

'Pietro Morerio'

In [53]:
# Build the vocabulary over the author-augmented abstracts, one sentence per node.
# Vocabulary.add_sentence ignores the last token of each abstract.
voc_auth = Vocabulary('abstracts') 
for i in tqdm(nodes):
    voc_auth.add_sentence(abstracts[i], i)

# `t` was set in an earlier cell, so the reported time spans everything run since then.
print('Text cleaned and vocab built in {:.0f} min'.format((time()-t)/60))

  0%|          | 0/138499 [00:00<?, ?it/s]

Text cleaned and vocab built in 45 min


In [56]:
# Inspect the raw sentences stored in the vocabulary (author names followed by the cleaned abstract).
voc_auth.sentences_list

['George W. Irwin Karen R. McMenemy JianXun Peng James H. Niblock development automate quality assessment aerodrome grind light agl accordance associate standard recommendation present compose image sensor place inside cockpit aircraft record image agl normal descent aerodrome modelbased methodology ascertain optimum match template agl actual image data order calculate position orientation camera instant image acquire camera position orientation data pixel grey level image luminaire estimate value luminous intensity give luminaire compare expect brightness luminaire ensure operate require standard metric quality agl pattern determine experiment real image data present demonstrate application effectiveness systemn',
 'DeShuang Huang Kang Li JianXun Peng paper propose novel hybrid forward algorithm hfa construction radial basis function rbf neural network tunable node main objective efficiently effectively produce parsimonious rbf neural network generalize study achieve simultaneous netw

In [90]:
# Compute a TF-IDF matrix with sublinear (logarithmic) term frequency.
from sklearn.feature_extraction.text import TfidfVectorizer
t = time()

# Restrict the vectorizer to the vocabulary collected in voc_auth.
# NOTE(review): the `>= 1` threshold keeps every word, and word_occurrence
# counts total occurrences rather than the number of abstracts a word appears
# in. If the intent was "words present in at least two abstracts", this
# filter does not implement it.
# NOTE(review): TfidfVectorizer is left at its default lowercase setting while
# the vocabulary contains capitalised author names — verify those entries can
# still be matched.
words_multiple = {key:value for (key, value) in voc_auth.word_occurrence.items() if value >= 1}
vectorizer = TfidfVectorizer(vocabulary=list(words_multiple.keys()), sublinear_tf=True)


# Fit the vectorizer to the sentences and transform them into a TF-IDF matrix

tfidf_matrix_complete = vectorizer.fit_transform(voc_auth.sentences_list)

# Report how long the TF-IDF computation took
print('tf-idf matrix generated in {:.0f} sec'.format(time()-t))

#del (abstracts_list_sentences, voc, abstracts, abstracts_dict_list_words)




tf-idf matrix generated in 12 sec


In [91]:
# Display the summary (shape, dtype, stored elements) of the TF-IDF matrix.
tfidf_matrix_complete

<138499x275240 sparse matrix of type '<class 'numpy.float64'>'
	with 8449259 stored elements in Compressed Sparse Row format>

In [92]:
import os
from scipy import sparse

# Output file name. The path is relative and contains no '~', so expanduser
# leaves it unchanged and the file is written to the current working directory.
save_path = os.path.expanduser("tfidf_complete_author.npz")

# Save the tfidf_matrix
sparse.save_npz(save_path, tfidf_matrix_complete)

print("TF-IDF matrix saved successfully.")


TF-IDF matrix saved successfully.


In [None]:
# Encode every stored sentence with a sentence-embedding model.
# NOTE(review): neither `model` nor `voc` is defined in the cells shown above
# (the vocabulary built here is `voc_auth`) — confirm where they come from.
sentence_embeddings = model.encode(voc.sentences_list)
sentence_embeddings.shape

In [None]:
# Display the shape of the sentence embeddings.
sentence_embeddings.shape

In [None]:
# Pickle the sentence embeddings to disk.
# NOTE(review): the file extension is '.plk' — possibly a typo for '.pkl'.
with open('sentence_embeddings_BERT.plk', 'wb') as f:
    pickle.dump(sentence_embeddings, f)

# Read processed data

In [59]:
# Download the precomputed random-walk Word2Vec embeddings (.npy) and load them into memory.
t = time()
from io import BytesIO

walks_url = 'https://storage.googleapis.com/link_prediction_processed_data/walks_wv.npy'
with urlopen(walks_url) as url:
    data = url.read()

# Create a seekable file-like object from the data
fileobj = BytesIO(data)

# Load the data from the file object
walks_wv = np.load(fileobj)
print('wv walks loaded from GCP in {:.0f} sec'.format(time()-t))
walks_wv.shape

wv walks loaded from GCP in 0 sec


(138499, 64)

In [None]:
# # Load max embeddings wv_192

# url='https://storage.googleapis.com/link_prediction_processed_data/max_abstract_embedding.pkl'

# response = requests.get(url)
# data = response.content
# max_abstract_embedding = pickle.loads(data)

In [None]:
# # Load the BART embedding torch tensor

# url = "https://storage.googleapis.com/link_prediction_processed_data/bart_embeddings.pt"
# response = requests.get(url)

# with open("bart_embeddings.pt", "wb") as f:
#     f.write(response.content)

# abstracts_bart_embeddings = torch.load('bart_embeddings.pt')


In [None]:
# Load the wv300 mean embedding of the abstracts
url = 'https://storage.googleapis.com/link_prediction_processed_data/embedded_abstracts_local_wv300.npy'

# NOTE(review): the HTTP status is not checked, so an error page would be
# written to disk and only fail later in np.load.
response = requests.get(url)

# Save the downloaded bytes next to the notebook
with open('embedded_abstracts_local_wv300.npy', 'wb') as f:
    f.write(response.content)

# Load the numpy array from the saved file
local_wv300_abstracts = np.load('embedded_abstracts_local_wv300.npy')

local_wv300_abstracts.shape


In [None]:
# # Load the goog300 mean embedding
# url = 'https://storage.googleapis.com/link_prediction_processed_data/embedded_abstracts_goog300.npy'

# response = requests.get(url)

# with open('embedded_abstracts_goog300.npy', 'wb') as f:
#     f.write(response.content)

# # Load the numpy array from the saved file
# goog300_abstracts = np.load('embedded_abstracts_goog300.npy')

# goog300_abstracts.shape


In [None]:
# Load the precomputed TF-IDF matrix from cloud storage

url = "https://storage.googleapis.com/link_prediction_processed_data/tfidf_matrix.npz"

# NOTE(review): the HTTP status is not checked before the content is saved.
response = requests.get(url)

# Save the downloaded bytes, then read them back as a scipy sparse matrix
with open("tfidf_matrix.npz", "wb") as f:
    f.write(response.content)

tfidf_matrix = sparse.load_npz("tfidf_matrix.npz")

tfidf_matrix.shape

In [94]:
from sklearn.decomposition import TruncatedSVD

# Initialize TruncatedSVD with desired number of components
# NOTE(review): no random_state is set, so the decomposition may differ between runs.
n_components = 1000  # Adjust the number of components as needed
svd = TruncatedSVD(n_components=n_components)

# Apply TruncatedSVD to the downloaded TF-IDF matrix. This is `tfidf_matrix`,
# not the `tfidf_matrix_complete` computed earlier in the notebook.
tfidf_reduced = svd.fit_transform(tfidf_matrix)

# The tfidf_reduced matrix will have the reduced dimensionality based on TruncatedSVD
print(tfidf_reduced.shape)


(138499, 1000)


In [None]:
# # Load words 192 embedding
# import gzip
# import pickle

# with gzip.open('embedded_abstracts_dict_192array.pkl.gz', 'rb') as f:
#     words_embedding_192 = pickle.load(f)

# len(words_embedding_192)

In [None]:
# len(words_embedding_192)

In [None]:
# from tqdm.notebook import tqdm
# import numpy as np

# words_embedding_192_trunc128 = dict ()
# for i in tqdm(range(len(words_embedding_192))):
#     if len(words_embedding_192[i])>0:
#         arr = np.zeros((128, 192))
#         vec = words_embedding_192[i][:128, :]
#         arr[:vec.shape[0],:] = vec
#         words_embedding_192_trunc128[i] = torch.tensor(arr).to(device)
#     else:
#         words_embedding_192_trunc128[i] = []


In [None]:
# tensor_list = []
# i = 0
# for i in tqdm(range(len(words_embedding_192_trunc128))):
#     if len(words_embedding_192_trunc128[i])>0:
#         tensor_list.append(words_embedding_192_trunc128[i])
#     else:
#         tensor_list.append(torch.zeros((128, 192)))


In [None]:
# del(words_embedding_192_trunc128)

In [None]:
# import gzip

# filename = 'words_embedding_192_trunc128.pkl.gz'

# # open the file in binary mode and write the dictionary to it, compressing the data with gzip
# with gzip.open(filename, 'wb') as f:
#     pickle.dump(words_embedding_192_trunc128, f)

In [None]:
# tensor_list_float = []
# for i, tensor in tqdm(enumerate(tensor_list)):
#     tensor_list_float.append(tensor_list[i].float())
# del(tensor_list)

In [None]:
# t = time ()
# url = 'https://storage.googleapis.com/link_prediction_processed_data/words_embedding_192_trunc128.pkl.gz'


# # download the file from the URL
# response = requests.get(url)

# # save the file to disk
# with open(filename, 'wb') as f:
#     f.write(response.content)

# # load the data from the file
# with gzip.open(filename, 'rb') as f:
#     words_embedding_192_trunc128 = pickle.load(f)

# print('padded truncated embeddings loaded in {:.0f} sec'.format(time()-t))

# Models

In [61]:
class GNN(nn.Module):
    """Two-layer message-passing network that scores node pairs.

    Text features are first projected to ``3 * n_hidden`` dimensions and
    concatenated with the node features. Two rounds of
    ``relu(adj @ linear(h))`` follow, after which the representations of both
    nodes of each pair are concatenated and passed through three linear
    layers. The output is log-probabilities over ``n_class`` classes.

    ``n_auth`` and the ``auth`` argument of ``forward`` are accepted but not
    used, and ``self.sigmoid`` is created but never applied.
    """

    def __init__(self, n_text, n_auth, n_feat, n_hidden, n_class, sub_class, dropout):
        super().__init__()
        text_dim = 3*n_hidden
        self.abstract_emb = nn.Linear(n_text, text_dim)
        self.fc1 = nn.Linear(n_feat+text_dim, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(2*n_hidden, n_hidden)
        self.fc4 = nn.Linear(n_hidden, sub_class)
        self.fc5 = nn.Linear(sub_class, n_class)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def _propagate(self, layer, adj, hidden):
        """Apply ``layer``, aggregate through ``adj``, then ReLU and dropout."""
        return self.dropout(self.relu(torch.spmm(adj, layer(hidden))))

    def _dense(self, layer, hidden):
        """Apply ``layer`` followed by ReLU and dropout."""
        return self.dropout(self.relu(layer(hidden)))

    def forward(self, x_in, abstract, auth, adj, pairs):
        """Return log-softmax class scores for every column of ``pairs``.

        Args:
            x_in: node feature matrix.
            abstract: node text feature matrix.
            auth: unused.
            adj: sparse matrix used for neighbourhood aggregation.
            pairs: index tensor; rows 0 and 1 hold the two node indices of each pair.
        """
        text_repr = self._dense(self.abstract_emb, abstract)
        node_repr = torch.cat((x_in, text_repr), dim=1)

        # Two rounds of neighbourhood aggregation.
        node_repr = self._propagate(self.fc1, adj, node_repr)
        node_repr = self._propagate(self.fc2, adj, node_repr)

        # Pair representation: both endpoint embeddings side by side.
        pair_repr = torch.cat((node_repr[pairs[0]], node_repr[pairs[1]]), dim=1)
        pair_repr = self._dense(self.fc3, pair_repr)
        pair_repr = self._dense(self.fc4, pair_repr)

        return F.log_softmax(self.fc5(pair_repr), dim=1)




# Training functions

In [62]:
class EarlyStopping:
    """Track the best validation loss, checkpoint the model on improvement and flag when to stop.

    Args:
        model: module whose ``state_dict`` is saved and restored.
        patience: number of consecutive non-improving calls after which
            ``early_stop`` becomes True.
        delta: stored on the instance but not used in the comparison
            (the improvement threshold is effectively 0).
        path: accepted for compatibility but not stored; the checkpoint
            location is the ``path`` argument of ``__call__``.
    """

    def __init__(self, model, patience, delta, path='checkpoint.pt'):
        self.patience = patience
        self.delta = delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.model = model
        # np.Inf was removed in NumPy 2.0; np.inf is the supported spelling.
        self.val_loss_min = np.inf

    def __call__(self, val_loss, path='checkpoint.pt'):
        """Record ``val_loss``; save a checkpoint to ``path`` when it is not worse than the best so far."""
        score = val_loss

        if self.best_score is None:
            # First call: any loss is the best one so far.
            self.best_score = score
            self.save_checkpoint(path)
        elif score > self.best_score:
            # Strictly worse than the best loss: one more step without improvement.
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(path)
            self.counter = 0

    def save_checkpoint(self, path):
        """Write the model's ``state_dict`` to ``path``."""
        torch.save(self.model.state_dict(), path)

    def load_checkpoint(self, path):
        """Load the ``state_dict`` stored at ``path`` back into the model."""
        self.model.load_state_dict(torch.load(path))

In [None]:
# Split the (node, node) pairs held in the first two rows of pairs_torch into
# a first half (tensor1) and a second half (tensor2), one pair per row.
# NOTE(review): `pairs_torch` is not defined in the cells shown above — confirm where it comes from.
tensor1 = (pairs_torch[0:2].permute(1, 0)[:int(len(pairs_torch[0])/2)])
tensor2 = pairs_torch[0:2].permute(1, 0)[int(len(pairs_torch[0])/2):]

In [None]:
# View tensor1 with an extra axis inserted after the first one (used for broadcasting below).
tensor1[:, None]

In [None]:
# Compare every row of tensor1 with every row of tensor2 via broadcasting and
# keep the (i, j) positions where all elements of the two rows match.
# NOTE(review): the intermediate comparison has len(tensor1) x len(tensor2)
# entries, which may be very large for full-size inputs.
common_indices = torch.nonzero(torch.all(torch.eq(tensor1[:, None], tensor2[None, :]), dim=-1))

# Print the common indices
print(common_indices)

In [None]:
# Element-wise membership test of first-half values in the second half.
# NOTE(review): torch.isin tests individual elements, not whole (node, node) rows.
torch.isin((pairs_torch[0:2].permute(1, 0)[:int(len(pairs_torch[0])/2)]), (pairs_torch[0:2].permute(1, 0)[int(len(pairs_torch[0])/2):]))
           

In [None]:
import torch

# Example tensors: every row is one (a, b) pair
tensor1 = torch.tensor([[1, 2], [3, 4], [5, 6]])
tensor2 = torch.tensor([[3, 4], [7, 8], [9, 10]])

# Insert broadcast axes so that every row of tensor1 meets every row of tensor2.
# (Flattening both tensors and comparing them position by position, as done
# before, never compares [3, 4] with [3, 4] and wrongly reports no match.)
pairs1 = tensor1[:, None, :]
pairs2 = tensor2[None, :, :]

# A pair is shared when all of its elements match for some (row, row) combination
common_pairs = torch.eq(pairs1, pairs2).all(dim=-1).any()

# Print the result
print(common_pairs.item())  # True if there are common pairs, False otherwise


In [None]:
# Element-wise membership: True wherever a single value of tensor1 occurs anywhere in tensor2.
torch.isin(tensor1, tensor2)

In [87]:
def train_model(model, learning_rate, abstract, auth, features, adj, indices, y, val_indices, y_val, epochs, run_number, window = 10):
    """Train ``model`` full-batch with Adam, lowering the learning rate on plateaus.

    Each epoch draws random index pairs with the same shape as ``indices``
    (values in ``[0, features.shape[0])``), appends them to ``indices`` along
    dim 1, and minimises ``F.nll_loss`` of the model output against ``y``.
    The validation loss drives an ``EarlyStopping`` instance: every time it
    triggers, the learning rate is divided by 10 and a fresh optimizer and
    stopper are created; the fifth trigger ends training.

    Args:
        model: module called as ``model(features, abstract, auth, adj, pairs)``.
        learning_rate: initial Adam learning rate.
        abstract, auth, features, adj: inputs forwarded unchanged to ``model``.
        indices: index pairs, one pair per column (presumably the existing
            edges; confirm against the data preparation code).
        y: targets for the concatenated ``indices`` + random pairs.
        val_indices, y_val: validation pairs and their targets.
        epochs: maximum number of epochs.
        run_number: identifier embedded in the periodic checkpoint file names.
        window: unused; kept for interface compatibility.

    Returns:
        tuple: ``(model, list_loss_val, list_loss_train, list_epochs)``.
    """
    start_time = time()

    print('Initializing the optimizer with learning rate:', learning_rate)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # Only tolerate "directory already exists"; a bare ``except`` would also
    # hide permission errors and KeyboardInterrupt.
    os.makedirs('./outputs', exist_ok=True)

    today = datetime.today().strftime('%Y-%m-%d-%H:%M')
    list_loss_val = []
    list_loss_train = []
    list_epochs = []

    halving_lr = 0  # number of learning-rate reductions performed so far
    patience = 16
    early_stopping = EarlyStopping(model, patience=patience, delta=0.1, path='checkpoint.pt')
    print('Start training...')
    for epoch in range(epochs):
        t = time()
        optimizer.zero_grad()

        # Fresh random pairs every epoch, same shape as the given pairs.
        rand_indices = torch.randint(0, features.shape[0], size=(indices.shape[0], indices.shape[1])).to(device)
        pairs = torch.cat((indices, rand_indices), dim=1)

        model.train()
        output = model(features, abstract, auth, adj, pairs).to(device)
        loss_train = F.nll_loss(output, y)
        # Accuracy is only reported in the progress message.
        acc_train = accuracy_score(torch.argmax(output, dim=1).detach().cpu().numpy(), y.cpu().numpy())
        loss_train.backward()
        optimizer.step()

        model.eval()
        # Validation needs no gradients: skip building the autograd graph.
        with torch.no_grad():
            output = model(features, abstract, auth, adj, val_indices).to(device)
            loss_val = F.nll_loss(output, y_val)
        list_loss_val.append(loss_val.item())
        list_loss_train.append(loss_train.item())
        list_epochs.append(epoch)
        acc_val = accuracy_score(torch.argmax(output, dim=1).detach().cpu().numpy(), y_val.cpu().numpy())

        if epoch % 5 == 0:
            print('Epoch: {:03d}'.format(epoch+1),
                  'loss_train: {:.4f}'.format(loss_train.item()),
                  'loss_val: {:.4f}'.format(loss_val.item()),
                  'acc_train: {:.4f}'.format(acc_train.item()),
                  'acc_val: {:.4f}'.format(acc_val.item()),
                  'time: {} s'.format(int(round(time()) - round(t))),
                  'total_time: {} min'.format(round((time() - start_time)/60)))

        # Periodic snapshot (every multiple of 50 is also a multiple of 5).
        if epoch % 50 == 0:
            model_path = "outputs/{}-model-{}epochs-{}.pt".format(today, epoch, run_number)
            torch.save(model.state_dict(), model_path)

        # Pass a plain float so the stopper never holds on to a tensor.
        early_stopping(loss_val.item())

        if early_stopping.early_stop:
            halving_lr += 1
            if halving_lr > 4:
                break
            learning_rate = learning_rate/10
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)
            early_stopping = EarlyStopping(model, patience=patience, delta=0.01, path='checkpoint.pt')
            print('Deviding the learning rate by 10. New learning rate: {}'.format(learning_rate))

    print("Optimization Finished in {} min!".format(round((time() - start_time)/60)))
    return model, list_loss_val, list_loss_train, list_epochs



# Train the model

In [82]:
# Build the training inputs via prepare_data_to_train (defined elsewhere in the
# notebook); the cell output reports them as torch tensors.
features_torch, adj_torch, auth_torch, indices_torch, y_torch, val_indices_torch, y_val_torch = prepare_data_to_train(walks_wv, authors, adj, auth_matrix, indices, val_indices, y_val)
# Alternative text embeddings tried earlier, kept for reference:
#max_abstract_embedding_torch = torch.FloatTensor(max_abstract_embedding_array).to(device)
#abstracts_bart_embeddings = abstracts_bart_embeddings.to(device)
#goog300_abstracts_torch = torch.FloatTensor(goog300_abstracts).to(device)
#local_wv300_abstracts_torch = torch.FloatTensor(local_wv300_abstracts).to(device)
# Text embedding currently in use: the TF-IDF matrix as a torch sparse tensor on `device`.
tfidf_matrix_torch = sparse_mx_to_torch_sparse_tensor(tfidf_matrix).to(device)

Preparing the data for training...
Data converted into torch tensors and authors added to indices in 0 min


In [None]:
# Inspect the shape of the author tensor.
auth_torch.shape

In [83]:
# Hyper-parameters handed to the GNN constructor below.
n_hidden = 64
dropout_rate = 0.2
sub_class = 32
n_class = 2
# Input widths are read from the second dimension of each input tensor.
text_embedding = tfidf_matrix_torch
n_text = text_embedding.shape[1]
n_auth = auth_torch.shape[1] 
n_features = features_torch.shape[1]


# Creates the model
model = GNN(n_text, n_auth, n_features, n_hidden, n_class, sub_class, dropout_rate).to(device)


In [86]:
# Sanity check: shapes of every tensor passed to train_model.
auth_torch.shape, features_torch.shape, adj_torch.shape, indices_torch.shape, y_torch.shape, val_indices_torch.shape, y_val_torch.shape

(torch.Size([138499, 147481]),
 torch.Size([138499, 64]),
 torch.Size([138499, 138499]),
 torch.Size([2, 1965666]),
 torch.Size([3931332]),
 torch.Size([2, 218244]),
 torch.Size([218244]))

In [88]:
# Release cached GPU memory left over from earlier runs before training.
torch.cuda.empty_cache()


epochs = 400
# Random tag used in the checkpoint file names written by train_model.
run_number = randint(0, 1000)
learning_rate = 0.01


# train_model returns the model plus the per-epoch loss histories plotted below.
trained_model, list_loss_val, list_loss_train, list_epochs = train_model(model, learning_rate, text_embedding, 
                            auth_torch, features_torch, adj_torch, indices_torch, 
                            y_torch, val_indices_torch, y_val_torch, epochs, run_number)


Initializing the optimizer with learning rate: 0.01
Start training...
Epoch: 001 loss_train: 0.6963 loss_val: 0.6712 acc_train: 0.5000 acc_val: 0.5000 time: 24 s total_time: 0 min
Epoch: 006 loss_train: 0.6655 loss_val: 0.6443 acc_train: 0.6429 acc_val: 0.7426 time: 23 s total_time: 2 min


KeyboardInterrupt: 

In [None]:
%matplotlib inline


# Keep at most the first 400 recorded epochs of each history.
x = pd.Series(list_epochs[:400])
y1 = pd.Series(list_loss_val[:400])
y2 = pd.Series(list_loss_train[:400])

# One curve per loss history, drawn against the epoch index.
for _curve, _legend in ((y1, 'Validation Loss'), (y2, 'Training Loss')):
    plt.plot(x, _curve, label=_legend)

plt.title('Loss over Epochs, G not shuffled')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Inspect the shape of pairs_torch (defined elsewhere in the notebook).
pairs_torch.shape

In [None]:
# `break` outside a loop is a SyntaxError, so running this cell fails;
# presumably a deliberate barrier that stops "Run All" here (confirm).
break

In [None]:
# NOTE(review): this call passes 11 positional arguments and no `auth` tensor,
# while the train_model defined in cell In [87] takes 12 and returns a 4-tuple.
# Presumably written for an earlier signature; update before re-running.
torch.cuda.empty_cache()


epochs = 602
run_number = randint(0, 1000)
learning_rate = 0.01


trained_model = train_model(model, learning_rate, text_embedding, 
                            features_torch, adj_torch, pairs_torch, 
                            y_torch, val_indices_torch, y_val_torch, epochs, run_number)


In [None]:
# before shuffling
# NOTE(review): 11 positional arguments and no `auth` tensor; the train_model
# from cell In [87] takes 12. Presumably an older signature; confirm.
torch.cuda.empty_cache()


epochs = 602
run_number = randint(0, 1000)
learning_rate = 0.01


trained_model = train_model(model, learning_rate, text_embedding, 
                            features_torch, adj_torch, pairs_torch, 
                            y_torch, val_indices_torch, y_val_torch, epochs, run_number)


In [None]:
# Compare the shapes of the training pairs/targets and validation pairs/targets.
np.shape(pairs_torch), y_torch.shape, val_indices_torch.shape, y_val_torch.shape

In [None]:
# NOTE(review): same stale call shape as the runs above: 11 positional
# arguments, no `auth` tensor; the In [87] train_model takes 12. Confirm
# which signature this was meant for.
torch.cuda.empty_cache()


epochs = 602
run_number = randint(0, 1000)
learning_rate = 0.01


trained_model = train_model(model, learning_rate, text_embedding, 
                            features_torch, adj_torch, pairs_torch, 
                            y_torch, val_indices_torch, y_val_torch, epochs, run_number)


In [None]:
# `break` outside a loop is a SyntaxError, so running this cell fails;
# presumably a deliberate barrier that stops "Run All" here (confirm).
break

In [None]:
# with dense2 1024 of BART
# Uses the BART abstract embeddings as the text input.
# NOTE(review): 11 positional arguments, no `auth` tensor; the train_model
# from cell In [87] takes 12. Presumably an older signature; confirm.
torch.cuda.empty_cache()


epochs = 200
run_number = randint(0, 1000)
learning_rate = 0.005


trained_model = train_model(model, learning_rate, abstracts_bart_embeddings, features_torch, adj_torch, pairs_torch, y_torch, val_indices_torch, y_val_torch, epochs, run_number)


In [None]:
# `break` outside a loop is a SyntaxError, so running this cell fails;
# presumably a deliberate barrier that stops "Run All" here (confirm).
break

In [None]:
# Without dense 1000
# Uses the TF-IDF tensor as the text input with a fixed learning rate of 0.01.
# NOTE(review): 11 positional arguments, no `auth` tensor; the train_model
# from cell In [87] takes 12. Presumably an older signature; confirm.
torch.cuda.empty_cache()


epochs = 200
run_number = randint(0, 1000)


trained_model = train_model(model, 0.01, tfidf_matrix_torch, features_torch, adj_torch, pairs_torch, y_torch, val_indices_torch, y_val_torch, epochs, run_number)


# Generate test file

In [None]:
from datetime import datetime
import re

# Download the test pairs, score them with the trained model and write the
# probability of class 1 for each pair to a dated submission CSV.
test_path = 'https://www.lix.polytechnique.fr/~nikolentzos/files/aai/challenge/test.txt'
node_pairs = list()

# The context manager closes the HTTP response once every line is read.
with urlopen(test_path) as f:
    for line in f:
        # Decode the raw bytes instead of parsing their repr ("b'..\\n'");
        # the regex still strips everything that is not a digit.
        t = line.decode('utf-8', errors='replace').split(',')
        t[0] = int(re.sub(r"[^0-9]", "", t[0]))
        t[1] = int(re.sub(r"[^0-9]", "", t[1]))
        node_pairs.append((node_to_idx[t[0]], node_to_idx[t[1]]))

node_pairs = np.transpose(node_pairs)
node_pairs = add_authors_to_pairs(node_pairs, authors)
#node_pairs = torch.LongTensor(node_pairs).to(device)

adj_torch = sparse_mx_to_torch_sparse_tensor(adj).to(device)
features_torch = torch.FloatTensor(walks_wv).to(device)

# Inference: switch off dropout and skip building the autograd graph.
model.eval()
with torch.no_grad():
    # NOTE(review): train_model calls model(features, abstract, auth, adj, pairs);
    # this call passes no `auth` tensor and `node_pairs` is not a torch tensor.
    # Confirm the argument list against the current GNN.forward.
    test_output = model(features_torch, text_embedding, adj_torch, node_pairs)
# The training loop applies nll_loss to this output, i.e. treats it as
# log-probabilities; exp() turns them back into probabilities.
y_pred = torch.exp(test_output)
y_pred = y_pred.detach().cpu().numpy()

# Keep only the second column (class 1) of every prediction row.
y_pred_true = list(y_pred[:, 1])

today = datetime.today().strftime('%Y-%m-%d')
random_nb = randint(0, 1000)
model_nb = 4

pd.DataFrame(y_pred_true, columns=['predicted']).to_csv(
"{}-submission-{}-{}.csv".format(today, model_nb, random_nb), header=True, index=True, index_label='id'
)

In [None]:
# Inspect the shape of `features` (defined elsewhere in the notebook).
features.shape

# BART

In [None]:
import torch
from transformers import BartTokenizer, BartModel

# Load the BART model and tokenizer
# NOTE(review): this rebinds the global name `model`, which earlier cells use
# for the GNN; cells run afterwards that reference `model` get the BART model.
model = BartModel.from_pretrained('facebook/bart-large')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')



In [None]:
abstracts_bart_embeddings = []

# Embed a text as the average of the model's last hidden states
def get_bart_embeddings(text, model):
    """Return the mean of ``model``'s last hidden states for ``text``.

    The text is tokenized with the module-level ``tokenizer`` (padding and
    truncation enabled, PyTorch tensors). The last hidden state is averaged
    over dim 1 and size-1 dimensions are squeezed away. No gradients are
    tracked.
    """
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        hidden_states = model(**tokens).last_hidden_state
        pooled = hidden_states.mean(dim=1)
        embeddings = pooled.squeeze()

    return embeddings

In [None]:
# Starting the range at len(abstracts_bart_embeddings) makes the cell resumable:
# re-running it after an interruption only embeds the sentences not yet processed.
for i in tqdm(range(len(abstracts_bart_embeddings), len(voc.sentences_list))):
    abstract = voc.sentences_list[i]
    abstracts_bart_embeddings.append(get_bart_embeddings(abstract, model))


In [None]:
# Keep a second reference to the list of embeddings. This is an alias, not a
# copy: both names point at the same list object until one of them is rebound.
saved_abstracts_bart_embeddings = abstracts_bart_embeddings

In [None]:
# Check that the number of embeddings matches the number of sentences.
len(saved_abstracts_bart_embeddings), len(abstracts_bart_embeddings), len(voc.sentences_list)

In [None]:
# Stack the per-abstract embeddings along a new first dimension; torch.stack
# requires every embedding to have the same shape.
abstracts_bart_embeddings = torch.stack(abstracts_bart_embeddings)
abstracts_bart_embeddings.shape

In [None]:
# Tensor.to() returns a new tensor instead of moving the existing one in place,
# so the result has to be assigned back for the move to take effect.
abstracts_bart_embeddings = abstracts_bart_embeddings.to(device)

In [None]:
# Persist the stacked embeddings to 'bart_embeddings.pt' so they can be reused
# without recomputing them.
torch.save(abstracts_bart_embeddings, 'bart_embeddings.pt')


#Draft

In [None]:
# Testing (draft): score `node_pairs` with `model` and write a submission CSV.
# NOTE(review): relies on `features`, `adj`, `node_pairs` and `model_nb` from
# other cells, and on `model` still being the link-prediction network.
model.eval()
node_pairs = np.array(np.transpose(node_pairs))
node_pairs = torch.LongTensor(node_pairs).to(device)

# Inference only: no autograd graph is needed.
with torch.no_grad():
    test_output = model(features, adj, node_pairs)
y_pred = torch.exp(test_output)
y_pred = y_pred.detach().cpu().numpy()

# Keep only the second column (class 1) of every prediction row.
y_pred_true = list(y_pred[:, 1])

today = datetime.today().strftime('%Y-%m-%d')
random_nb = randint(0, 1000)

# `columns` must be an ordered list-like; pandas rejects a set here.
pd.DataFrame(y_pred_true, columns=['predicted']).to_csv(
"{}-submission-{}-{}.csv".format(today, model_nb, random_nb), header=True, index=True, index_label='id'
)

In [None]:
#### New script with batches

def early_stopping(loss_train, list_loss_train, loss_val, list_loss_val, 
                   tolerance=0.01, patience=15):
    """Decide whether training has stalled, based on recent loss history.

    Only the last ``patience`` entries of each history are considered, and a
    history takes part only when it holds exactly ``patience`` entries.
    Returns ``True`` when either

    * the validation loss is above the mean of its window and exceeds the
      training loss by more than ``tolerance``, or
    * the training loss is above the mean of its own window;

    otherwise returns ``False``.
    """
    recent_val = list(list_loss_val)[-patience:]
    recent_train = list(list_loss_train)[-patience:]

    def _exceeds_window_mean(current, window):
        # The length test comes first so the mean is never taken over a
        # window that is not completely filled.
        return len(window) == patience and current > sum(window) / len(window)

    if _exceeds_window_mean(loss_val, recent_val) and loss_train + tolerance < loss_val:
        return True
    if _exceeds_window_mean(loss_train, recent_train):
        return True
    return False
    

    
def train_model(model, learning_rate, features, adj, indices_mc, y, val_indices, 
                y_val, epochs, batch_size, wv_walk_size, 
                tolerence = 0.01, patience = 15, run_number=None):
    """Mini-batch training loop with plateau-driven learning-rate reduction.

    Each epoch builds random pairs with the shape of ``indices_mc``, passes
    them through ``add_authors_to_pairs`` (using the module-level ``authors``),
    concatenates them to ``indices_mc`` along axis 1 and trains on shuffled
    batches of ``batch_size`` pairs. After every epoch the validation loss is
    computed; when ``early_stopping`` fires, the learning rate is divided by
    10, and the sixth firing ends training. Training also stops as soon as the
    validation loss, truncated to an int, exceeds 5.

    Args:
        model: module called as ``model(features, adj, pairs, wv_walk_size)``.
        learning_rate: initial Adam learning rate.
        features, adj, wv_walk_size: forwarded unchanged to ``model``.
        indices_mc: array of index pairs, one pair per column.
        y: targets indexable by the shuffled pair positions.
        val_indices, y_val: validation pairs and targets.
        epochs: maximum number of epochs.
        batch_size: number of pairs per optimisation step.
        tolerence, patience: forwarded to ``early_stopping``.
        run_number: tag for checkpoint file names; drawn at random for each
            call when left as ``None``.

    Returns:
        The trained ``model``.
    """
    # A ``randint`` default would be evaluated once at definition time, giving
    # every call the same "random" tag; draw it per call instead.
    if run_number is None:
        run_number = randint(0, 1000)

    start_time = time()
    print('Initializing the optimizer with learning rate:', learning_rate)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # Only tolerate "directory already exists" rather than every exception.
    os.makedirs('./outputs', exist_ok=True)
    print('Preparing the data for training...')        

    today = datetime.today().strftime('%Y-%m-%d-%H:%M')
    list_loss_val = []
    list_loss_train = []

    halving_lr = 0  # number of learning-rate reductions performed so far
    print('Start training...')
    for epoch in range(epochs):
        t = time()

        # Random pairs used alongside the given ones.
        # NOTE(review): the upper bound len(indices_mc) is the size of the
        # FIRST dimension of indices_mc, not a node count; confirm that this
        # is the intended sampling range.
        rand_indices = np.random.randint(0, len(indices_mc), size=(indices_mc.shape[0], indices_mc.shape[1]))
        rand_indices = add_authors_to_pairs(rand_indices, authors)
        pairs = np.concatenate((indices_mc, rand_indices), axis=1)
        pairs = torch.LongTensor(pairs).to(device)

        permutation = torch.randperm(pairs.size()[1])

        model.train()
        for i in range(0, pairs.size()[1], batch_size):
            optimizer.zero_grad()

            elts_indices = permutation[i:i+batch_size]
            batch_pairs = pairs[:, elts_indices]
            batch_y = y[elts_indices]

            output = model(features, adj, batch_pairs, wv_walk_size).to(device)
            loss_train = F.nll_loss(output, batch_y)
            # Accuracy of the last batch is only used in the progress message.
            acc_train = accuracy_score(torch.argmax(output, dim=1).detach().cpu().numpy(), batch_y.cpu().numpy())
            loss_train.backward()
            optimizer.step()

        model.eval()
        # Validation needs no gradients: skip building the autograd graph.
        with torch.no_grad():
            output = model(features, adj, val_indices, wv_walk_size).to(device)
            loss_val = F.nll_loss(output, y_val)
        list_loss_val.append(loss_val.item())
        # The recorded training loss is that of the epoch's final batch.
        list_loss_train.append(loss_train.item())
        acc_val = accuracy_score(torch.argmax(output, dim=1).detach().cpu().numpy(), y_val.cpu().numpy())

        if epoch % 5 == 0:
            print('Epoch: {:03d}'.format(epoch+1),
                  'loss_train: {:.4f}'.format(loss_train.item()),
                  'loss_val: {:.4f}'.format(loss_val.item()),
                  'acc_train: {:.4f}'.format(acc_train.item()),
                  'acc_val: {:.4f}'.format(acc_val.item()),
                  'time: {} s'.format(int(round(time()) - round(t))),
                  'total_time: {} min'.format(round((time() - start_time)/60)))
        if epoch % 50 == 0:
            model_path = "outputs/{}-model-{}epochs-{}.pt".format(today, epoch, run_number)
            torch.save(model.state_dict(), model_path)

        # Divergence guard: give up once the validation loss reaches 6 or more.
        if int(loss_val.item()) > 5:
            break

        # Honour the caller's tolerance/patience instead of fixed values.
        early = early_stopping(loss_train.item(), list_loss_train, loss_val.item(), list_loss_val,
                               tolerance=tolerence, patience=patience)
        if early:
            halving_lr += 1
            if halving_lr > 5:
                break
            # Only the validation history is restarted after a reduction.
            list_loss_val=[]
            learning_rate = learning_rate/10
            optimizer = optim.Adam(model.parameters(), lr=learning_rate)
            # The message now states the factor that is actually applied.
            print('Deviding the learning rate by 10. New learning rate: {:.6f}'.format(learning_rate))

    print("Optimization Finished in {} min!".format(round((time() - start_time)/60)))
    return model



In [None]:
# NOTE(review): this call passes 10 positional arguments (including `authors`),
# which matches neither train_model signature in this notebook: the batched
# draft requires 11 (with batch_size and wv_walk_size) and the In [87] version
# requires 12. Presumably written for an older signature; confirm.
epochs = 1000

trained_model = train_model(model, 0.01, features, authors, adj, indices, y, torch.tensor(val_indices).to(device), torch.tensor(y_val).to(device), epochs)
