In [1]:
import umap
import numpy
import scipy as sp
import networkx as nx
from node2vec import Node2Vec
from pprint import pprint as pprint
import tqdm

In [2]:
G = nx.DiGraph()

with open('graph.desc', 'r') as fh:
    for line_number, line in enumerate(fh):
        if line_number>=2:
            article_name = line.split("\t")[1][1:][:-2]
            line_number = int(line.split("\t")[0])
            G.add_node(line_number, article_name=article_name)
            
# for i in range(0, len(G.nodes)):
#     print(G.nodes[i])

In [3]:
with open('graph.graph', 'r') as fh:
    for line_number, edge in enumerate(fh):
        src = (edge.split("\t")[0]).split(" ")[0].rstrip()
        dest = (edge.split("\t")[0]).split(" ")[1].rstrip()
        G.add_edge(src,dest)
        # print(src, '->', dest)

In [4]:
pprint(nx.info(G))

('Name: \n'
 'Type: DiGraph\n'
 'Number of nodes: 459513\n'
 'Number of edges: 827118\n'
 'Average in degree:   1.8000\n'
 'Average out degree:   1.8000')


In [5]:
for n in list(G.nodes):
   if G.in_degree(n)<20:
       G.remove_node(n)

In [6]:
pprint(nx.info(G))

('Name: \n'
 'Type: DiGraph\n'
 'Number of nodes: 4927\n'
 'Number of edges: 36504\n'
 'Average in degree:   7.4090\n'
 'Average out degree:   7.4090')


In [7]:
nodes_count = len(G.nodes)

# Precompute probabilities and generate walks
node2vec = Node2Vec(G, dimensions=100, walk_length=50, num_walks=500, p=1, q=1, weight_key='weight',
                 workers=8, sampling_strategy=None)

Computing transition probabilities: 100%|██████████| 4927/4927 [00:02<00:00, 2031.04it/s]
Generating walks (CPU: 4): 100%|██████████| 63/63 [16:47<00:00, 14.96s/it]









In [8]:
sorted(G.degree, key=lambda x: x[1], reverse=True)[:10]

[('251890', 302),
 ('251899', 260),
 ('36497', 223),
 ('122206', 206),
 ('128596', 185),
 ('228840', 179),
 ('68725', 176),
 ('53144', 163),
 ('55120', 159),
 ('79025', 158)]

In [9]:
print(node2vec.walks[:1])

[['149592', '145998']]


In [10]:
# Embed
model = node2vec.fit(window=10, min_count=1, batch_words=8)  # Any keywords acceptable by gensim.Word2Vec can be passed, `diemnsions` and `workers` are automatically passed (from the Node2Vec constructor)
print(model)

Word2Vec(vocab=4927, size=100, alpha=0.025)


In [11]:
# Look for most similar nodes
# print (nx.get_node_attributes(G, "Horseland"))

model.wv.most_similar('55120')  # Output node names are always strings

[('62737', 0.6340166330337524),
 ('8584', 0.5250604748725891),
 ('62740', 0.491012841463089),
 ('2131', 0.4751129746437073),
 ('43400', 0.4646091163158417),
 ('231293', 0.46132171154022217),
 ('63512', 0.45839983224868774),
 ('13523', 0.422024667263031),
 ('246320', 0.421928733587265),
 ('79027', 0.4158029556274414)]

In [12]:
G.node['62737']

{}

In [13]:
node_vectors = model.wv
print(node_vectors)

<gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x1a3c074be0>


In [14]:
vector_2 = node_vectors['24982']
print(vector_2)

[ 0.8148498  -2.6389446   2.9474733   3.3352592  -2.0999053  -3.3599775
 -1.4791487  -1.7040913  -3.968327    1.4743989   5.075324    2.5635593
  0.82248795  1.785244    0.56000215 -0.5750287  -2.074587    0.29938474
 -2.7532523   3.4682868  -4.0334725  -0.75760746  4.2015185  -3.3961043
 -1.5602596  -2.0747516  -0.6009316  -2.979457    5.849955   -5.8088856
  6.7519     -0.85770917  3.3671327   0.58310235 -1.542631    1.5882665
 -0.08063639  4.701636   -0.778013   -6.0782914  -6.3227034   5.8850565
  1.8910761  -3.2353685  -0.740532   -0.17759982 -6.182778    6.027771
  9.850884    0.7536562  -2.191642   -5.49071     1.3249635  -1.9801431
  1.5911679   5.305081   -1.2546628   4.9157577   0.8803714   8.278783
 -2.6281285  -2.6519084   0.30053893  4.2290354   1.7059455   1.7679937
 -1.3829061   0.44505718  7.844903   -7.5429816   4.781097    8.886375
 -0.9373578  -3.708006    0.44118953 -4.5547285   1.3938085   4.1830735
  7.770332   -0.68546194  1.6505066  -1.0915666   2.161071    6.31

In [15]:
# Save embeddings for later use
model.wv.save_word2vec_format("art-embedding.txt")

In [16]:
# model = Word2Vec.load(EMBEDDING_FILENAME)

In [17]:
# Save model for later use
# model.save(EMBEDDING_MODEL_FILENAME)

In [18]:
# embed = umap.UMAP().fit_transform(node2vec)