# Neptune Query Examples

In [1]:
#!pip install nest-asyncio

In [2]:
#!pip install igraph

In [1]:
from __future__  import print_function  # Python 2/3 compatibility

import json
import os

import igraph as ig
import networkx as nx
import pandas as pd
from networkx.readwrite import json_graph

from gremlin_python import statics
from gremlin_python.structure.graph import Graph
from gremlin_python.process.graph_traversal import __
from gremlin_python.process.strategies import *
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection

In [2]:
# Necessary to avoid the "Cannot run the event loop while another loop is running" error:
# nest_asyncio.apply() patches asyncio so that event loops can be nested.
# NOTE(review): presumably needed because the notebook kernel already runs its own loop — confirm
import nest_asyncio
nest_asyncio.apply()

### Create connection to DB. 

A graph query is often referred to as a traversal as that is what we are in fact doing. We are traversing the graph from a starting point to an ending point. Traversals consist of one or more steps (essentially methods) that are chained together.

In [109]:
# Gremlin websocket endpoint of the Neptune cluster. Set the NEPTUNE_ENDPOINT
# environment variable to point the notebook at another cluster; when it is
# unset the original endpoint is used, so existing runs behave as before.
NEPTUNE_ENDPOINT = os.environ.get(
    'NEPTUNE_ENDPOINT',
    'ws://mgt-prd-infr-neptune-alb-122161610.us-east-1.elb.amazonaws.com:8182/gremlin',
)

graph = Graph()

# Second argument is the name of the traversal source on the server ('g')
remoteConn = DriverRemoteConnection(NEPTUNE_ENDPOINT, 'g')
# g is the remote traversal source every query cell below starts from
g = graph.traversal().withRemote(remoteConn)

Sample to show connection worked: Return 10 vertices. 

In [4]:
# Smoke test: fetch the first 10 vertices to confirm the remote connection works
print(g.V().limit(10).toList())

[v[1], v[2], v[3], v[4], v[5], v[6], v[7], v[8], v[9], v[10]]


### Sample Gremlin Queries

Count Nodes

In [5]:
# Count every vertex in the graph; next() pulls the single count value out of the traversal
g.V().count().next()

64425

In [6]:
# Show id, label and properties of the first 10 vertices
g.V().limit(10).elementMap().toList()

[{<T.id: 1>: '1',
  <T.label: 4>: 'Movie',
  'node': 'The Other Side of the Wind (2018)'},
 {<T.id: 1>: '2', <T.label: 4>: 'Movie', 'node': 'Pál Adrienn (2010)'},
 {<T.id: 1>: '3', <T.label: 4>: 'Movie', 'node': 'Foodfight! (2012)'},
 {<T.id: 1>: '4', <T.label: 4>: 'Movie', 'node': 'Chinango (2009)'},
 {<T.id: 1>: '5', <T.label: 4>: 'Movie', 'node': 'Wazir (2016)'},
 {<T.id: 1>: '6', <T.label: 4>: 'Movie', 'node': 'The Wicker Tree (2011)'},
 {<T.id: 1>: '7', <T.label: 4>: 'Movie', 'node': 'Coraline (2009)'},
 {<T.id: 1>: '8', <T.label: 4>: 'Movie', 'node': 'On the Road (2012)'},
 {<T.id: 1>: '9', <T.label: 4>: 'Movie', 'node': 'The Future (2011)'},
 {<T.id: 1>: '10', <T.label: 4>: 'Movie', 'node': 'The Watch (2012)'}]

Count Edges

In [8]:
# Count every edge in the graph; next() pulls the single count value out of the traversal
g.E().count().next()

175640

In [9]:
# Show id, label and the IN / OUT vertex (id + label) of the first 10 edges
g.E().limit(10).elementMap().toList()

[{<T.id: 1>: 'e14210',
  <T.label: 4>: 'Directed',
  <Direction.IN: 2>: {<T.id: 1>: '13628', <T.label: 4>: 'Movie'},
  <Direction.OUT: 3>: {<T.id: 1>: '27529', <T.label: 4>: 'Director'}},
 {<T.id: 1>: 'e14274',
  <T.label: 4>: 'Directed',
  <Direction.IN: 2>: {<T.id: 1>: '13687', <T.label: 4>: 'Movie'},
  <Direction.OUT: 3>: {<T.id: 1>: '27555', <T.label: 4>: 'Director'}},
 {<T.id: 1>: 'e14338',
  <T.label: 4>: 'Directed',
  <Direction.IN: 2>: {<T.id: 1>: '13748', <T.label: 4>: 'Movie'},
  <Direction.OUT: 3>: {<T.id: 1>: '27244', <T.label: 4>: 'Director'}},
 {<T.id: 1>: 'e14402',
  <T.label: 4>: 'Directed',
  <Direction.IN: 2>: {<T.id: 1>: '13810', <T.label: 4>: 'Movie'},
  <Direction.OUT: 3>: {<T.id: 1>: '26764', <T.label: 4>: 'Director'}},
 {<T.id: 1>: 'e14466',
  <T.label: 4>: 'Directed',
  <Direction.IN: 2>: {<T.id: 1>: '13872', <T.label: 4>: 'Movie'},
  <Direction.OUT: 3>: {<T.id: 1>: '27625', <T.label: 4>: 'Director'}},
 {<T.id: 1>: 'e14530',
  <T.label: 4>: 'Directed',
  <Direct

### Example Queries

#### Nodes

In [22]:
# Look up the vertex with id "3". valueMap(True) includes the id and label next to the
# properties; unfold() splits that map into one single-key entry per field.
g.V("3").valueMap(True).unfold().toList()

[{'node': ['Foodfight! (2012)']}, {<T.id: 1>: '3'}, {<T.label: 4>: 'Movie'}]

In [25]:
# Look up the edge with id "e33" and list its id, label and any properties, one entry per field
g.E("e33").valueMap(True).unfold().toList()

[{<T.id: 1>: 'e33'}, {<T.label: 4>: 'Directed'}]

In [28]:
# 'node' property (the title) of the first 10 vertices labelled 'Movie'
g.V().hasLabel('Movie').values('node').limit(10).toList()

['The Other Side of the Wind (2018)',
 'Pál Adrienn (2010)',
 'Foodfight! (2012)',
 'Chinango (2009)',
 'Wazir (2016)',
 'The Wicker Tree (2011)',
 'Coraline (2009)',
 'On the Road (2012)',
 'The Future (2011)',
 'The Watch (2012)']

In [43]:
# 'node' property (the name) of the first 20 vertices labelled 'Actor'
g.V().hasLabel('Actor').values('node').limit(20).toList()

['Éva Gábor',
 'George Peppard',
 'Rachel Nichols',
 'Kuno Becker',
 'Brad Pitt',
 'Anika Noni Rose',
 'Steve Buscemi',
 'Renée Zellweger',
 'Brené Brown',
 'James McAvoy',
 'Tom Hanks',
 'Owen Wilson',
 'Evan McGuire',
 'Dana Delany',
 'Josh Brolin',
 'Alex Frost',
 'John Corbett',
 'Alicia Vikander',
 'Brooke Nevin',
 'Travis Fimmel']

In [45]:
# Match on the 'node' property alone, with no label filter: more than one vertex
# can carry the same name (the recorded output shows two matches)
g.V().has('node','Tom Hanks').toList()

[v[17873], v[52421]]

In [47]:
# Match 'node' == 'Tom Hanks' only among vertices labelled 'Actor'
g.V().has('Actor','node','Tom Hanks').toList()

[v[52421]]

In [50]:
# values() without keys returns every property value of the matched 'Actor' vertex
g.V().has('Actor','node','Tom Hanks').values().toList()

['Tom Hanks']

In [64]:
# Number of vertices labelled 'Movie'
g.V().hasLabel('Movie').count().next()

16838

#### Edges

In [33]:
# First 10 edges labelled 'Directed'
g.E().hasLabel('Directed').limit(10).toList()

[e[e1][16839-Directed->1],
 e[e2][16840-Directed->2],
 e[e3][16841-Directed->3],
 e[e4][29228-Directed->3],
 e[e5][20839-Directed->4],
 e[e6][20840-Directed->5],
 e[e7][16842-Directed->6],
 e[e8][20841-Directed->7],
 e[e9][20842-Directed->8],
 e[e10][20843-Directed->9]]

In [80]:
# Follow every outgoing edge (any label) from the 'Tom Hanks' actor vertex and
# gather the neighbours' 'node' values into a single list via fold()
(
    g.V()
    .has('Actor', 'node', 'Tom Hanks')
    .out()
    .values('node')
    .fold()
    .toList()
)

[['California Typewriter (2016)',
  'nm0429069',
  'nm0004056',
  'nm1445746',
  'Toy Story 4 (2019)',
  'nm0005124',
  'Toy Story 3 (2010)',
  'nm0881279',
  'Marielle Heller',
  'Doug Nichol',
  'nm2155757',
  'nm1754526',
  'nm1578335',
  'A Beautiful Day in the Neighborhood (2019)',
  'nm1406764',
  'nm0566489',
  'nm2873116',
  'Lee Unkrich',
  'nm0405271',
  'nm5403126',
  'Larry Crowne (2011)',
  'Josh Cooley']]

In [84]:
# Titles reached over outgoing 'Directed' edges from the 'Steven Spielberg'
# director vertex, gathered into a single list via fold()
(
    g.V()
    .has('Director', 'node', 'Steven Spielberg')
    .out('Directed')
    .values('node')
    .fold()
    .toList()
)

[['Bridge of Spies (2015)',
  'The BFG (2016)',
  'The Post (2017)',
  'Ready Player One (2018)',
  'War Horse (2011)',
  'Lincoln (2012)',
  'The Adventures of Tintin (2011)']]

In [107]:
# Titles reached over outgoing 'Starred In' edges from the 'Brad Pitt'
# actor vertex, gathered into a single list via fold()
(
    g.V()
    .has('Actor', 'node', 'Brad Pitt')
    .out('Starred In')
    .values('node')
    .fold()
    .toList()
)

[['Inglourious Basterds (2009)']]

In [103]:
# For each outgoing edge of the 'Steven Spielberg' director vertex, return the
# full vertex -> edge -> vertex path that was walked
(
    g.V()
    .has('Director', 'node', 'Steven Spielberg')
    .outE()
    .inV()
    .path()
    .toList()
)

[path[v[20872], e[e76][20872-Directed->68], v[68]],
 path[v[20872], e[e444][20872-Directed->415], v[415]],
 path[v[20872], e[e3105][20872-Directed->2951], v[2951]],
 path[v[20872], e[e3739][20872-Directed->3559], v[3559]],
 path[v[20872], e[e10675][20872-Directed->10214], v[10214]],
 path[v[20872], e[e10709][20872-Directed->10248], v[10248]],
 path[v[20872], e[e14921][20872-Directed->14312], v[14312]]]

In [60]:
# Counts outgoing 'Directed' edges from 'Director' vertices (director-movie credits), not distinct directors
g.V().hasLabel('Director').outE('Directed').count().next() # 17,262 'Directed' edges; the graph holds 16,838 'Movie' vertices

17262

What to Pull out for NetworkX conversion

In [10]:
# From every vertex, walk each incident edge (either direction) to the vertex on the
# other end and return the [vertex, edge, vertex] path, with each element rendered
# as its valueMap(True).
# NOTE(review): this is unbounded over the whole graph; the recorded run of this cell was interrupted
sg = g.V().bothE().otherV().path().by(__.valueMap(True)).toList()

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.


KeyboardInterrupt



In [None]:
# Values of the first path's start-vertex map
list(sg[0][0].values())

In [None]:
# Values of the first path's edge map
list(sg[0][1].values())

In [None]:
# Values of the first path's other-end vertex map
list(sg[0][2].values())

In [None]:
# Third value of the start-vertex map; the conversion cell uses this position as the node key
list(sg[0][0].values())[2]

In [None]:
# Second value of the edge map; the conversion cell stores this position as the edge's etype
list(sg[0][1].values())[1]

ERROR:root:
Received error message '{'requestId': 'a11e068a-f53a-4ccf-864c-4fdfd3d4e0dd', 'status': {'message': '{"detailedMessage":"A timeout occurred within the script during evaluation.","requestId":"a11e068a-f53a-4ccf-864c-4fdfd3d4e0dd","code":"TimeLimitExceededException"}', 'code': 500, 'attributes': {}}, 'result': {'data': None, 'meta': {}}}'

With results dictionary '{'a11e068a-f53a-4ccf-864c-4fdfd3d4e0dd': <gremlin_python.driver.resultset.ResultSet object at 0x7ff606a3f250>}'


In [None]:
#print(g.V().hasLabel("@minboxradio").toList())

### Convert into networkx graph

In [30]:
G = nx.DiGraph()
# Each path is [start vertex, edge, other vertex]; every element is rendered as
# its valueMap(True).
sg = g.V().bothE().otherV().path().by(__.valueMap(True)).toList()
for path in sg:
    # Fields are read by position in each map's values.
    # NOTE(review): this relies on the server returning the map entries in a
    # stable order (vertex: [property list, label, id]; edge: [weight, label, id],
    # judging by the recorded output) — confirm against the target graph.
    src = list(path[0].values())
    edge = list(path[1].values())
    dst = list(path[2].values())

    # Add Nodes
    G.add_node(src[2], screen_name=src[0][0], ntype=src[1])
    # Fix: the target's ntype used to be read from the source vertex (path[0]),
    # so a node could end up carrying its neighbour's type.
    G.add_node(dst[2], screen_name=dst[0][0], ntype=dst[1])
    # Add Edges
    # NOTE(review): bothE() yields every edge once from each endpoint, so each
    # edge is added in both directions — confirm that is intended for a DiGraph.
    G.add_edge(src[2], dst[2], elabel=edge[2], weight=edge[0], etype=edge[1])

Extract to show edge and node lists

In [31]:
# Flatten the graph into a DataFrame with one row per edge: source, target and the edge attributes
G_edgelist = nx.to_pandas_edgelist(G)
# Optional filter, currently disabled:
#G_edgelist = G_edgelist[G_edgelist['elabel'] != "retweeted"]
G_edgelist.head()

Unnamed: 0,source,target,etype,weight,elabel
0,1351,1,mentioned_by,0.068121,e14
1,1,1351,mentioned_by,0.068121,e14
2,1,1367,mentioned_by,0.04675,e30
3,1,1383,mentioned_by,0.038736,e46
4,1,1399,mentioned_by,0.033393,e62


In [32]:
# One row per node: the node key becomes the 'node' column and the node's
# attribute dict supplies the remaining columns.
node_attrs = [attrs for _node_id, attrs in G.nodes(data=True)]
node_ids = [node_id for node_id, _attrs in G.nodes(data=True)]
G_nodelist = (
    pd.DataFrame(node_attrs, index=node_ids)
    .rename_axis('node')
    .reset_index()
)
G_nodelist

Unnamed: 0,node,screen_name,ntype
0,1351,@iooner,Tweeter
1,1,@MinBoxRadio,Tweeter
2,1367,@Retalyx,Tweeter
3,1383,@WTCraft,Tweeter
4,1399,@Foulimage,Tweeter
...,...,...,...
8034,7909,@MpM2dPlh3c9lomDJD0q120yjZbcpzIiZ6jiW5l,Tweeter
8035,1316,@ONkki4n+E0yCYHDIn24IsYFBbTsBbH28W1UeFmxPyY=,Tweeter
8036,8009,@curious4fun44,Tweeter
8037,8016,@Keskin_Sozler1,Tweeter


### Export to JSON for data viz

In [33]:
# Serialise the graph in node-link form and write it to networkdata1.json
with open('networkdata1.json', 'w') as outfile1:
    node_link = json_graph.node_link_data(G)
    serialized = json.dumps(node_link)
    outfile1.write(serialized)

### Close DB Connection

In [233]:
# Close the remote connection opened in the connection cell
remoteConn.close()