In [1]:
import umap
import numpy
import scipy as sp
import networkx as nx
from node2vec import Node2Vec
from pprint import pprint as pprint
import tqdm

In [2]:
G = nx.DiGraph()

with open('graph.desc', 'r') as fh:
    for line_number, line in enumerate(fh):
        if line_number>=2:
            article_name = line.split("\t")[1][1:][:-2]
            line_number = int(line.split("\t")[0])
            G.add_node(line_number, article_name=article_name)
            
# for i in range(0, len(G.nodes)):
#     print(G.nodes[i])

In [3]:
with open('graph.graph', 'r') as fh:
    for line_number, edge in enumerate(fh):
        src = (edge.split("\t")[0]).split(" ")[0].rstrip()
        dest = (edge.split("\t")[0]).split(" ")[1].rstrip()
        G.add_edge(src,dest)
        # print(src, '->', dest)

In [4]:
pprint(nx.info(G))

('Name: \n'
 'Type: DiGraph\n'
 'Number of nodes: 459513\n'
 'Number of edges: 827118\n'
 'Average in degree:   1.8000\n'
 'Average out degree:   1.8000')


In [5]:
min_degree = 20

for n in list(G.nodes):
   if G.in_degree(n)<min_degree:
       G.remove_node(n)

In [6]:
pprint(nx.info(G))

('Name: \n'
 'Type: DiGraph\n'
 'Number of nodes: 855\n'
 'Number of edges: 3955\n'
 'Average in degree:   4.6257\n'
 'Average out degree:   4.6257')


In [7]:
nodes_count = len(G.nodes)

# Precompute probabilities and generate walks
node2vec = Node2Vec(G, dimensions=100, walk_length=50, num_walks=500, p=1, q=1, weight_key='weight',
                 workers=8, sampling_strategy=None)

Computing transition probabilities: 100%|██████████| 855/855 [00:00<00:00, 4190.31it/s]
0it [00:00, ?it/s]CPU: 1):   0%|          | 0/1 [00:00<?, ?it/s]
Generating walks (CPU: 1): 100%|██████████| 1/1 [00:00<00:00,  7.61it/s]

0it [00:00, ?it/s]CPU: 3): : 0it [00:00, ?it/s]
0it [00:00, ?it/s]CPU: 4): : 0it [00:00, ?it/s]
0it [00:00, ?it/s]CPU: 5): : 0it [00:00, ?it/s]
0it [00:00, ?it/s]CPU: 6): : 0it [00:00, ?it/s]
Generating walks (CPU: 8): : 0it [00:00, ?it/s]


In [8]:
sorted(G.degree, key=lambda x: x[1], reverse=True)[:10]

[('251890', 78),
 ('36497', 77),
 ('251899', 61),
 ('228840', 56),
 ('227969', 53),
 ('79025', 52),
 ('128596', 51),
 ('246320', 51),
 ('122206', 48),
 ('55120', 48)]

In [9]:
print(node2vec.walks[:1])

[['54519', '86528', '244824', '103283', '244824']]


In [10]:
# Embed
model = node2vec.fit(window=10, min_count=1, batch_words=8)  # Any keywords acceptable by gensim.Word2Vec can be passed, `diemnsions` and `workers` are automatically passed (from the Node2Vec constructor)
print(model)

Word2Vec(vocab=855, size=10, alpha=0.025)


In [11]:
# Look for most similar nodes
# print (nx.get_node_attributes(G, "Horseland"))

model.wv.most_similar('55120')  # Output node names are always strings

[('259546', 0.8220720887184143),
 ('124385', 0.8178898096084595),
 ('90633', 0.8156998157501221),
 ('191832', 0.807579755783081),
 ('57724', 0.7341687083244324),
 ('29690', 0.7323005199432373),
 ('95195', 0.7192438840866089),
 ('48569', 0.7169404625892639),
 ('13839', 0.711948037147522),
 ('95196', 0.7053947448730469)]

In [12]:
'62737' in G

False

In [13]:
node_vectors = model.wv
print(node_vectors)

<gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x1a47651390>


In [14]:
vector_1 = node_vectors['24982']
print(vector_1)

[-0.03611172  0.00370547 -0.02995202  0.01948346  0.00898747  0.03227345
 -0.03029198  0.03423879  0.03030616  0.03784813]


In [15]:
# Save embeddings for later use

node_vectors.save_word2vec_format("art-embedding.txt")

In [16]:
# Load embeddings

from gensim.models import KeyedVectors
node_vectors_loaded = KeyedVectors.load_word2vec_format("art-embedding.txt")

In [17]:
print(node_vectors_loaded)

<gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x1a2acc9be0>


In [19]:
vector_2 = node_vectors_loaded['62737']
print(vector_2)

[-0.04512537  0.00796047 -0.01502777  0.02045568  0.0051362   0.03891236
  0.04613927 -0.02415276 -0.00679287 -0.04590474]


In [20]:
node_vectors_loaded.most_similar('62737') 

[('259546', 0.8220720887184143),
 ('124385', 0.8178898096084595),
 ('90633', 0.8156998157501221),
 ('191832', 0.807579755783081),
 ('57724', 0.7341687083244324),
 ('29690', 0.7323005199432373),
 ('95195', 0.7192438840866089),
 ('48569', 0.7169404625892639),
 ('13839', 0.711948037147522),
 ('95196', 0.7053947448730469)]

In [21]:
'259546' in G

True