In [70]:
import pandas as pd
import networkx as nx
import csv
import numpy as np
from random import randint
from sklearn.linear_model import LogisticRegression


In [71]:
# Create a graph
G = nx.read_edgelist('edgelist.txt', delimiter=',', create_using=nx.Graph(), nodetype=int)
nodes = list(G.nodes())
n = G.number_of_nodes()
m = G.number_of_edges()
print('Number of nodes:', n)
print('Number of edges:', m)

# Read the abstract of each paper
abstracts = dict()
with open('abstracts.txt', 'r', encoding="utf8") as f:
    for line in f:
        node, abstract = line.split('|--|')
        abstracts[int(node)] = abstract.replace('\n', '')

# Map text to set of terms
#for node in abstracts:
#    abstracts[node] = set(abstracts[node].split())

Number of nodes: 138499
Number of edges: 1091955


## Data Cleaning and SEAL Implementation

In [74]:
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [79]:
l_edges = [[],[]]
for edge in G.edges:
    l_edges[0].append(edge[0])
    l_edges[1].append(edge[1])
edge_index = torch.tensor(l_edges)
edge_index = torch.cat([edge_index, edge_index.flip(0)], 1) # undirected graph

#### 1. First separation

In [83]:
from torch_geometric.data import Data

ModuleNotFoundError: No module named 'torch_geometric'

In [81]:
# train, test, val masks for each node
train_mask = torch.tensor([True] * round(edge_index.unique().size(0) * 0.8) + 
                          [False]* (edge_index.unique().size(0) - round(edge_index.unique().size(0) * 0.8)))
test_mask = torch.tensor([False] * round(edge_index.unique().size(0) * 0.8) + 
                         [True]* (round(edge_index.unique().size(0) * 0.1)) + 
                         [False]* (edge_index.unique().size(0) - round(edge_index.unique().size(0) * 0.8) 
                                   - round(edge_index.unique().size(0) * 0.1)))
val_mask = torch.tensor([False] * round(edge_index.unique().size(0) * 0.8) + 
                        [False]* (round(edge_index.unique().size(0) * 0.1)) + 
                        [True]* (edge_index.unique().size(0) - round(edge_index.unique().size(0) * 0.8) 
                                 - round(edge_index.unique().size(0) * 0.1)))

In [82]:
new_data = Data(edge_index=edge_index, 
                x=x, 
                train_mask = train_mask, 
                val_mask=val_mask, 
                test_mask=test_mask)

batch_size = 1

NameError: name 'Data' is not defined

In [None]:
# features (just random here)
n_feats = 10
x = torch.randint(low=0, high=4, size=(edge_index.unique().size(0), n_feats)) 

# train, test, val masks for each node
train_mask = torch.tensor([True] * round(edge_index.unique().size(0) * 0.8) + 
                          [False]* (edge_index.unique().size(0) - round(edge_index.unique().size(0) * 0.8)))
test_mask = torch.tensor([False] * round(edge_index.unique().size(0) * 0.8) + 
                         [True]* (round(edge_index.unique().size(0) * 0.1)) + 
                         [False]* (edge_index.unique().size(0) - round(edge_index.unique().size(0) * 0.8) 
                                   - round(edge_index.unique().size(0) * 0.1)))
val_mask = torch.tensor([False] * round(edge_index.unique().size(0) * 0.8) + 
                        [False]* (round(edge_index.unique().size(0) * 0.1)) + 
                        [True]* (edge_index.unique().size(0) - round(edge_index.unique().size(0) * 0.8) 
                                 - round(edge_index.unique().size(0) * 0.1)))

new_data = Data(edge_index=edge_index, 
                x=x, 
                train_mask = train_mask, 
                val_mask=val_mask, 
                test_mask=test_mask)

In [73]:
nx.number_connected_components(G)

57

In [18]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [26]:
data = list(abstracts.values())

In [48]:
# ~4 mins
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]

In [49]:
tagged_data

[TaggedDocument(words=['the', 'development', 'of', 'an', 'automated', 'system', 'for', 'the', 'quality', 'assessment', 'of', 'aerodrome', 'ground', 'lighting', '(', 'agl', ')', ',', 'in', 'accordance', 'with', 'associated', 'standards', 'and', 'recommendations', ',', 'is', 'presented', '.', 'the', 'system', 'is', 'composed', 'of', 'an', 'image', 'sensor', ',', 'placed', 'inside', 'the', 'cockpit', 'of', 'an', 'aircraft', 'to', 'record', 'images', 'of', 'the', 'agl', 'during', 'a', 'normal', 'descent', 'to', 'an', 'aerodrome', '.', 'a', 'model-based', 'methodology', 'is', 'used', 'to', 'ascertain', 'the', 'optimum', 'match', 'between', 'a', 'template', 'of', 'the', 'agl', 'and', 'the', 'actual', 'image', 'data', 'in', 'order', 'to', 'calculate', 'the', 'position', 'and', 'orientation', 'of', 'the', 'camera', 'at', 'the', 'instant', 'the', 'image', 'was', 'acquired', '.', 'the', 'camera', 'position', 'and', 'orientation', 'data', 'are', 'used', 'along', 'with', 'the', 'pixel', 'grey', 'l

In [51]:
from time import time 
max_epochs = 200
vec_size = 256
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    t_start = time()
    
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=1)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
    print('Epoch {0}. Elapsed {1} s'.format(epoch, time() - t_start))

model.save("d2v.model")
print("Model Saved")

Epoch 0. Elapsed 19.854040384292603 s
Epoch 1. Elapsed 20.038440704345703 s
Epoch 2. Elapsed 18.997846126556396 s
Epoch 3. Elapsed 18.85793447494507 s
Epoch 4. Elapsed 18.689804553985596 s
Epoch 5. Elapsed 18.472702026367188 s
Epoch 6. Elapsed 18.37824821472168 s
Epoch 7. Elapsed 18.47257423400879 s
Epoch 8. Elapsed 18.429821729660034 s
Epoch 9. Elapsed 18.312389850616455 s
Epoch 10. Elapsed 18.512078523635864 s
Epoch 11. Elapsed 18.89220356941223 s
Epoch 12. Elapsed 18.343973875045776 s
Epoch 13. Elapsed 18.300127267837524 s
Epoch 14. Elapsed 19.2231228351593 s
Epoch 15. Elapsed 16.9656720161438 s
Epoch 16. Elapsed 16.982479333877563 s
Epoch 17. Elapsed 17.039225816726685 s
Epoch 18. Elapsed 16.694501876831055 s
Epoch 19. Elapsed 16.993483066558838 s
Epoch 20. Elapsed 16.986959218978882 s
Epoch 21. Elapsed 16.56943917274475 s
Epoch 22. Elapsed 16.78674864768982 s
Epoch 23. Elapsed 16.52907657623291 s
Epoch 24. Elapsed 16.438615083694458 s
Epoch 25. Elapsed 16.52637219429016 s
Epoch 26

In [69]:
from gensim.models.doc2vec import Doc2Vec

model= Doc2Vec.load("d2v.model")
# to find most similar doc using tags
similar_doc = model.docvecs.most_similar('1', topn=350)
print(similar_doc)

[('127115', 0.38494032621383667), ('6', 0.38255950808525085), ('58362', 0.3761834502220154), ('107826', 0.3757513165473938), ('84673', 0.375234991312027), ('21827', 0.3747912049293518), ('15596', 0.3682512938976288), ('55030', 0.36822032928466797), ('114408', 0.36764729022979736), ('119572', 0.3651387095451355), ('123153', 0.36297106742858887), ('106031', 0.36038586497306824), ('9559', 0.3597432076931), ('62847', 0.3573704659938812), ('136135', 0.35543155670166016), ('16861', 0.3548899292945862), ('86555', 0.35439884662628174), ('27023', 0.3510464131832123), ('15444', 0.34916529059410095), ('130957', 0.34889888763427734), ('130483', 0.34800687432289124), ('111163', 0.34764745831489563), ('106529', 0.3476121723651886), ('69782', 0.347492516040802), ('45161', 0.3465355336666107), ('132370', 0.34526506066322327), ('62989', 0.34417662024497986), ('134394', 0.34367290139198303), ('123086', 0.34145280718803406), ('52729', 0.34074264764785767), ('125982', 0.34062305092811584), ('138233', 0.34

  similar_doc = model.docvecs.most_similar('1', topn=350)


In [45]:
abstracts[7]

'Extreme learning machine (ELM) proposed by Huang et al. was developed for generalized single hidden layer feedforward networks (SLFNs) with a wide variety of hidden nodes. It proved to be very fast and effective especially for solving function approximation problems with a predetermined network structure. However, the method for determining the network structure of preliminary ELM may be tedious and may not lead to a parsimonious solution. In this paper, a systematic two-stage algorithm (named TS-ELM) is introduced to handle the problem. In the first stage, a forward recursive algorithm is applied to select the hidden nodes from the candidates randomly generated in each step and add them to the network until the stopping criterion achieves its minimum. The significance of each hidden node is then reviewed in the second stage and the insignificance ones are removed from the network, which drastically reduces the network complexity. The effectiveness of TS-ELM is verified by the empiric

In [46]:
abstracts[1]

'This paper proposes a novel hybrid forward algorithm (HFA) for the construction of radial basis function (RBF) neural networks with tunable nodes. The main objective is to efficiently and effectively produce a parsimonious RBF neural network that generalizes well. In this study, it is achieved through simultaneous network structure determination and parameter optimization on the continuous parameter space. This is a mixed integer hard problem and the proposed HFA tackles this problem using an integrated analytic framework, leading to significantly improved network performance and reduced memory usage for the network construction. The computational complexity analysis confirms the efficiency of the proposed algorithm, and the simulation results demonstrate its effectiveness'