In [1]:
import umap
import numpy
import scipy as sp
import networkx as nx
from node2vec import Node2Vec
from pprint import pprint as pprint
import tqdm
import matplotlib.pyplot as plt
from ipywidgets import interact, interactive, fixed, interact_manual
import seaborn
import mplcursors
import pickle
import os
%matplotlib nbagg

In [2]:
G = nx.Graph()

with open('graph.desc', 'r') as fh:
    for line_number, line in enumerate(fh):
        if line_number>=2:
            article_name = line.split("\t")[1][1:][:-2]
            line_number = int(line.split("\t")[0])
            G.add_node(line_number, article_name=article_name)
            
# for i in range(0, len(G.nodes)):
#     print(G.nodes[i])

In [3]:
with open('graph.graph', 'r') as fh:
    for line_number, edge in enumerate(fh):
        src = (edge.split("\t")[0]).split(" ")[0].rstrip()
        dest = (edge.split("\t")[0]).split(" ")[1].rstrip()
        G.add_edge(src,dest)
        # print(src, '->', dest)

In [4]:
article_name = nx.get_node_attributes(G, name='article_name')
article_name[251890]

'Soul music'

In [5]:
pprint(nx.info(G))

('Name: \n'
 'Type: Graph\n'
 'Number of nodes: 459513\n'
 'Number of edges: 678216\n'
 'Average degree:   2.9519')


In [6]:
min_degree = 20

nodes = [node for node, degree in G.degree() if not degree<10]


In [7]:
list(G.nodes(data=True))[:10]

[(0, {'article_name': 'As Crazy as It Gets'}),
 (1, {'article_name': 'Road to Yesterday (film)'}),
 (2, {'article_name': 'UK Blak'}),
 (3, {'article_name': 'Beach of the War Goddess'}),
 (4, {'article_name': 'Good Vibrations'}),
 (5, {'article_name': 'Cool Jerk'}),
 (6, {'article_name': 'Can We Still Be Friends'}),
 (7, {'article_name': 'I Saw the Light (Todd Rundgren song)'}),
 (8, {'article_name': 'La-La (Means I Love You)'}),
 (9, {'article_name': 'Bang the Drum All Day'})]

In [8]:
subgraph_G = G.subgraph(nodes)

In [9]:
list(subgraph_G.nodes(data=True))[:10]

[('251890', {}),
 ('251899', {}),
 ('22143', {}),
 ('174660', {}),
 ('6197', {}),
 ('45250', {}),
 ('154299', {}),
 ('226450', {}),
 ('4', {}),
 ('117638', {})]

In [10]:
pprint(nx.info(subgraph_G))

('Name: \n'
 'Type: SubGraph\n'
 'Number of nodes: 33733\n'
 'Number of edges: 263038\n'
 'Average degree:  15.5953')


In [11]:
article_name = nx.get_node_attributes(G, name='article_name')
article_name[251890]

'Soul music'

In [14]:
nodes_count = len(subgraph_G.nodes)

# Precompute probabilities and generate walks
files = os.listdir(".")
if "node2vec.pickle" in files:
    with open("node2vec.pickle", "rb") as file:
        node2vec = pickle.load(file)
else:
    node2vec = Node2Vec(subgraph_G, dimensions=100, walk_length=50, num_walks=500, p=1, q=1, weight_key='weight',
                 workers=8, sampling_strategy=None)

In [13]:
# node2vec = Node2Vec(G, dimensions=100, walk_length=50, num_walks=500, p=1, q=1, weight_key='weight',
#                  workers=8, sampling_strategy=None)

Computing transition probabilities:  49%|████▊     | 223722/459513 [00:00<00:00, 745671.97it/s]


KeyboardInterrupt: 

In [15]:
files = os.listdir(".")
if "node2vec.pickle" not in files:
    with open("node2vec.pickle", "wb") as file:
        pickle.dump(node2vec, file)

In [16]:
sorted(subgraph_G.degree, key=lambda x: x[1], reverse=True)[:10]

[('251899', 1577),
 ('251890', 1418),
 ('36497', 981),
 ('128596', 819),
 ('122206', 803),
 ('228840', 754),
 ('225260', 712),
 ('55120', 709),
 ('227969', 661),
 ('169207', 658)]

In [None]:
# Embed
# model = node2vec.fit(window=10, min_count=1, batch_words=8)  # Any keywords acceptable by gensim.Word2Vec can be passed, `diemnsions` and `workers` are automatically passed (from the Node2Vec constructor)
# print(model)

In [None]:
# Look for most similar nodes
# print (nx.get_node_attributes(G, "Horseland"))

# `model.wv.most_similar('55120')  # Output node names are always strings

In [None]:
# '62737' in G

In [None]:
# node_vectors = model.wv
# print(node_vectors)

In [None]:
# vector_1 = node_vectors['24982']
# print(vector_1)

In [None]:
# Save embeddings for later use

# node_vectors.save_word2vec_format("art-embedding.txt")

In [23]:
# Load embeddings

from gensim.models import KeyedVectors
node_vectors_loaded = KeyedVectors.load_word2vec_format("art-embedding.txt")
wv = node_vectors_loaded.vectors

In [24]:
def interactive_umap(n_neighbors, min_dist, n_components):
    umap_obj = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components)
    transformed = umap_obj.fit_transform(wv)
    plt.figure(figsize=(7.5,7.5))
    seaborn.regplot(x=transformed[:, 0], y=transformed[:, 1], fit_reg=False)    
    
    plt.show()

In [28]:
interactive(interactive_umap, n_neighbors=15, min_dist=0.5, n_components=3, metric='correlation')