In [16]:
import umap
import numpy
import scipy as sp
import networkx as nx
from node2vec import Node2Vec
from pprint import pprint as pprint
import tqdm
import matplotlib.pyplot as plt
from ipywidgets import interact, interactive, fixed, interact_manual
import seaborn
import mplcursors
import pickle
import os
%matplotlib nbagg

In [46]:
G = nx.DiGraph()

with open('graph.desc', 'r') as fh:
    for line_number, line in enumerate(fh):
        if line_number>=2:
            article_name = line.split("\t")[1][1:][:-2]
            line_number = int(line.split("\t")[0])
            G.add_node(line_number, article_name=article_name)
            
# for i in range(0, len(G.nodes)):
#     print(G.nodes[i])

In [47]:
with open('graph.graph', 'r') as fh:
    for line_number, edge in enumerate(fh):
        src = (edge.split("\t")[0]).split(" ")[0].rstrip()
        dest = (edge.split("\t")[0]).split(" ")[1].rstrip()
        G.add_edge(src,dest)
        # print(src, '->', dest)

In [48]:
article_name = nx.get_node_attributes(G, name='article_name')
article_name[251890]

'Soul music'

In [49]:
pprint(nx.info(G))

('Name: \n'
 'Type: DiGraph\n'
 'Number of nodes: 459513\n'
 'Number of edges: 827118\n'
 'Average in degree:   1.8000\n'
 'Average out degree:   1.8000')


In [66]:
min_degree = 10

nodes = [node for node, degree in G.degree() if degree >= min_degree]

In [67]:
list(G.nodes(data=True))[:10]

[(0, {'article_name': 'As Crazy as It Gets'}),
 (1, {'article_name': 'Road to Yesterday (film)'}),
 (2, {'article_name': 'UK Blak'}),
 (3, {'article_name': 'Beach of the War Goddess'}),
 (4, {'article_name': 'Good Vibrations'}),
 (5, {'article_name': 'Cool Jerk'}),
 (6, {'article_name': 'Can We Still Be Friends'}),
 (7, {'article_name': 'I Saw the Light (Todd Rundgren song)'}),
 (8, {'article_name': 'La-La (Means I Love You)'}),
 (9, {'article_name': 'Bang the Drum All Day'})]

In [68]:
subgraph_G = G.subgraph(nodes)

In [69]:
list(subgraph_G.nodes(data=True))[:10]

[('67987', {}),
 ('259394', {}),
 ('131794', {}),
 ('207393', {}),
 ('21338', {}),
 ('133784', {}),
 ('11952', {}),
 ('37214', {}),
 ('12755', {}),
 ('156467', {})]

In [70]:
pprint(nx.info(subgraph_G))

('Name: \n'
 'Type: SubDiGraph\n'
 'Number of nodes: 44612\n'
 'Number of edges: 386418\n'
 'Average in degree:   8.6618\n'
 'Average out degree:   8.6618')


In [71]:
article_name = nx.get_node_attributes(G, name='article_name')
article_name[251890]

'Soul music'

In [104]:
print(subgraph_G.degree("67987"))

10


In [107]:
# Count betweenness

subgraph_G_betweenness = nx.betweenness_centrality(subgraph_G, k=10000)

In [108]:
print({k: subgraph_G_betweenness[k] for k in list(subgraph_G_betweenness)[:10]})

{'67987': 6.867663752067565e-05, '259394': 6.842818969238338e-06, '131794': 0.0, '207393': 3.9938960771694634e-05, '21338': 0.0, '133784': 0.0, '11952': 2.267116049505658e-06, '37214': 0.0, '12755': 5.515209907430383e-07, '156467': 2.5614800632086612e-06}


In [110]:
files = os.listdir(".")
with open("graph-betweenness-10000.desc", "w") as file:
    for node in subgraph_G_betweenness:
        file.write(str(node) + ' ' + str(subgraph_G_betweenness[node]) + ' "'+article_name[int(node)]+'"\n')

In [None]:
# Node2Vec

In [None]:
nodes_count = len(subgraph_G.nodes)

# Precompute probabilities and generate walks
files = os.listdir(".")
if "node2vec.pickle" in files:
    with open("node2vec.pickle", "rb") as file:
        node2vec = pickle.load(file)
else:
    node2vec = Node2Vec(subgraph_G, dimensions=100, walk_length=50, num_walks=500, p=1, q=1, weight_key='weight',
                 workers=8, sampling_strategy=None)

Computing transition probabilities: 100%|██████████| 33733/33733 [04:40<00:00, 120.30it/s]
Generating walks (CPU: 2):   3%|▎         | 2/63 [04:21<2:12:56, 130.76s/it]

In [None]:
files = os.listdir(".")
if "node2vec.pickle" not in files:
    with open("node2vec.pickle", "wb") as file:
        pickle.dump(node2vec, file)

In [None]:
sorted(subgraph_G.degree, key=lambda x: x[1], reverse=True)[:10]

In [None]:
# Embed
# model = node2vec.fit(window=10, min_count=1, batch_words=8)  # Any keywords acceptable by gensim.Word2Vec can be passed, `diemnsions` and `workers` are automatically passed (from the Node2Vec constructor)
# print(model)

In [None]:
# Look for most similar nodes
# print (nx.get_node_attributes(G, "Horseland"))

# `model.wv.most_similar('55120')  # Output node names are always strings

In [None]:
# '62737' in G

In [None]:
# node_vectors = model.wv
# print(node_vectors)

In [None]:
# vector_1 = node_vectors['24982']
# print(vector_1)

In [None]:
# Save embeddings for later use

# node_vectors.save_word2vec_format("art-embedding.txt")

In [23]:
# Load embeddings

from gensim.models import KeyedVectors
node_vectors_loaded = KeyedVectors.load_word2vec_format("art-embedding.txt")
wv = node_vectors_loaded.vectors

In [24]:
def interactive_umap(n_neighbors, min_dist, n_components):
    umap_obj = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components)
    transformed = umap_obj.fit_transform(wv)
    plt.figure(figsize=(7.5,7.5))
    seaborn.regplot(x=transformed[:, 0], y=transformed[:, 1], fit_reg=False)    
    
    plt.show()

In [28]:
interactive(interactive_umap, n_neighbors=15, min_dist=0.5, n_components=3, metric='correlation')