# Neptune to NetworkX to JSON for D3 Visualization via Gremlin

In [1]:
#!pip install nest-asyncio

In [2]:
#!pip install igraph

In [3]:
import networkx as nx
import pandas as pd
import igraph as ig
import json
from networkx.readwrite import json_graph

from __future__  import print_function  # Python 2/3 compatibility

from gremlin_python import statics
from gremlin_python.structure.graph import Graph
from gremlin_python.process.graph_traversal import __
from gremlin_python.process.strategies import *
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection

In [4]:
# Required to avoid the "Cannot run the event loop while another loop is running"
# error when issuing Gremlin queries from this notebook.
# NOTE(review): presumably the kernel already runs an event loop and the Gremlin
# driver tries to start its own — confirm.
import nest_asyncio
nest_asyncio.apply()

### Create connection to DB. 

A graph query is often called a traversal, because that is exactly what it does: it traverses the graph from a starting point to an ending point. A traversal consists of one or more steps (essentially methods) that are chained together.

In [29]:
# Build the traversal source `g`, bound to the remote Neptune Gremlin endpoint.
# The endpoint can be overridden with the NEPTUNE_GREMLIN_URL environment
# variable so the notebook can target another cluster without a code edit;
# when the variable is unset or empty, the original endpoint is used.
import os

NEPTUNE_GREMLIN_URL = (
    os.environ.get('NEPTUNE_GREMLIN_URL')
    or 'ws://mgt-prd-infr-neptune-alb-122161610.us-east-1.elb.amazonaws.com:8182/gremlin'
)

graph = Graph()

# The second argument names the traversal source to use on the server.
remoteConn = DriverRemoteConnection(NEPTUNE_GREMLIN_URL, 'g')
g = graph.traversal().withRemote(remoteConn)

Sample to show connection worked: Return 10 vertices. 

In [6]:
# Smoke test: fetch at most 10 vertices to confirm the connection works.
print(g.V().limit(10).toList())

[v[1], v[2], v[3], v[4], v[5], v[6], v[7], v[8], v[9], v[10]]


### Sample Gremlin Queries

Count Nodes

In [7]:
# Total number of vertices in the graph.
g.V().count().next()

8039

In [8]:
# List the vertices whose *label* equals "@minboxradio".
# NOTE(review): this returned [] when run. The valueMap samples later in this
# notebook show the vertex label as 'Tweeter' and '@MinBoxRadio' as a property
# value, so a property filter may have been intended — confirm.
print(g.V().hasLabel("@minboxradio").toList())

[]


Count Edges

In [10]:
# Total number of edges in the graph.
g.E().count().next()

13771

What to pull out for the NetworkX conversion: every vertex → edge → vertex path, with the values of each element

In [11]:
# From every vertex, follow each incident edge (in either direction) to the
# vertex at the other end, and keep the whole vertex -> edge -> vertex path.
# Each path element is rendered through valueMap(True); the samples below show
# that this yields the element's property values together with its label and id.
sg = g.V().bothE().otherV().path().by(__.valueMap(True)).toList()

In [20]:
# First path, element 0: values of the starting vertex.
list(sg[0][0].values())

[['@iooner'], 'Tweeter', '1351']

In [14]:
# First path, element 1: values of the edge.
list(sg[0][1].values())

[0.06812110418521816, 'mentioned_by', 'e14']

In [15]:
# First path, element 2: values of the vertex at the other end of the edge.
list(sg[0][2].values())

[['@MinBoxRadio'], 'Tweeter', '1']

In [22]:
# Position 2 of a vertex's values is what the conversion below uses as the node id.
list(sg[0][0].values())[2]

'1351'

In [27]:
# Position 1 of an edge's values is what the conversion below stores as `etype`.
list(sg[0][1].values())[1]

'mentioned_by'

### Convert into networkx graph

In [30]:
# Build a directed NetworkX graph from the Gremlin paths.
#
# Each path is [vertex, edge, vertex], and every element is a valueMap(True)
# dict whose values are read by position:
#   vertex values -> [[screen_name], label, id]
#   edge values   -> [weight, label, id]
#
# NOTE(review): positional access relies on the key order the server returns;
# confirm it is stable before reusing this on other graphs.
# NOTE(review): bothE() reaches every edge from both of its endpoints, so each
# edge ends up in the DiGraph in both directions — confirm that is intended.
G = nx.DiGraph()
sg = g.V().bothE().otherV().path().by(__.valueMap(True)).toList()
for path in sg:
    # Materialise each element's values once rather than once per attribute.
    src_vals = list(path[0].values())
    edge_vals = list(path[1].values())
    dst_vals = list(path[2].values())

    src_id = src_vals[2]
    dst_id = dst_vals[2]

    # Add Nodes
    G.add_node(src_id, screen_name=src_vals[0][0], ntype=src_vals[1])
    # The far-end node's type comes from its own values (path[2]); it was
    # previously read from the start vertex (path[0]) by mistake.
    G.add_node(dst_id, screen_name=dst_vals[0][0], ntype=dst_vals[1])

    # Add Edges
    G.add_edge(
        src_id,
        dst_id,
        elabel=edge_vals[2],
        weight=edge_vals[0],
        etype=edge_vals[1],
    )

Extract to show edge and node lists

In [31]:
# Flatten the graph's edges into a DataFrame and preview the first rows.
G_edgelist = nx.to_pandas_edgelist(G)
# Disabled filter that would drop rows whose `elabel` is "retweeted".
# NOTE(review): `elabel` is populated from the edge id when G is built, so this
# filter likely meant to test `etype` — confirm before re-enabling.
#G_edgelist = G_edgelist[G_edgelist['elabel'] != "retweeted"]
G_edgelist.head()

Unnamed: 0,source,target,etype,weight,elabel
0,1351,1,mentioned_by,0.068121,e14
1,1,1351,mentioned_by,0.068121,e14
2,1,1367,mentioned_by,0.04675,e30
3,1,1383,mentioned_by,0.038736,e46
4,1,1399,mentioned_by,0.033393,e62


In [32]:
# One row per node: the node id goes into a `node` column, and the node's
# attribute dict supplies the remaining columns.
node_records = list(G.nodes(data=True))
G_nodelist = (
    pd.DataFrame(
        [attrs for node_id, attrs in node_records],
        index=[node_id for node_id, attrs in node_records],
    )
    .rename_axis('node')
    .reset_index()
)
G_nodelist

Unnamed: 0,node,screen_name,ntype
0,1351,@iooner,Tweeter
1,1,@MinBoxRadio,Tweeter
2,1367,@Retalyx,Tweeter
3,1383,@WTCraft,Tweeter
4,1399,@Foulimage,Tweeter
...,...,...,...
8034,7909,@MpM2dPlh3c9lomDJD0q120yjZbcpzIiZ6jiW5l,Tweeter
8035,1316,@ONkki4n+E0yCYHDIn24IsYFBbTsBbH28W1UeFmxPyY=,Tweeter
8036,8009,@curious4fun44,Tweeter
8037,8016,@Keskin_Sozler1,Tweeter


### Export to JSON for data viz

In [33]:
# Serialise the graph in node-link form and write it to disk for the D3 front end.
with open('networkdata1.json', 'w') as json_file:
    json.dump(json_graph.node_link_data(G), json_file)

### Close DB Connection

In [233]:
# Close the remote connection opened at the top of the notebook.
remoteConn.close()