# pydgraph example notebook

## Self-managed cluster version

This version of the example notebook uses an existing Dgraph cluster that you control. If you have Docker, the TL;DR version is:

```sh
docker run --rm -it -p 8080:8080 -p 9080:9080 -p 5080:5080 dgraph/standalone:latest
```

For more information on starting Dgraph with Docker or Docker Compose, see this [document](https://dgraph.io/docs/learn/data-engineer/get-started-with-dgraph/tutorial-1/). This notebook was tested both via a local Jupyter environment and on Google Colab.

This example notebook uses a schema and data from the [Dgraph ICIJ offshore leaks repository](https://github.com/dgraph-io/vlg). Please refer to that repo for a discussion of the schema and data.

Covered in this example:
* load a GraphQL schema
* use the Dgraph Live Loader to load data
* create a DQL-based pydgraph client
* perform DQL queries and mutations
* create a GraphQL client
* perform a GraphQL query
* perform a recursive query using DQL
* visualize query results using Graphistry

**Please note that this notebook updates the schema and loads data into the configured cluster.**

In [None]:
# Set the hostname of the Dgraph alpha service.
# Later cells combine this value with the HTTP, gRPC and zero ports to reach the cluster.
dgraph_hostname = "localhost"

In [None]:
# This cell checks that the required ports for the Dgraph cluster are accessible from this notebook. It also sets
# important port variables used in later cells

import socket

def check_port(url, port):
    """
    check_port returns True if the port at the url is accepting TCP connections.

    url is handed to the socket as the host part of the address, so it is
    expected to be a hostname or IP address. port is the TCP port number.

    Returns False when the connection attempt yields a non-zero error code
    (for example, connection refused) or when a socket error is raised.
    """
    try:
        # The context manager closes the socket on every path, including when
        # connect_ex raises (e.g. the hostname cannot be resolved).
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(3)  # Set a timeout value for the connection attempt
            # connect_ex reports failure as an error code instead of raising
            return sock.connect_ex((url, port)) == 0
    except socket.error:
        return False

# Default Dgraph ports; change these to match your custom setup if different.
dgraph_http_port = 8080
dgraph_grpc_port = 9080
dgraph_zero_port = 5080

# Probe each port in turn (HTTP, gRPC, zero) and stop at the first one that is unreachable.
for candidate_port in (dgraph_http_port, dgraph_grpc_port, dgraph_zero_port):
    if not check_port(dgraph_hostname, candidate_port):
        raise Exception(f"Port {candidate_port} at {dgraph_hostname} not responding, is the server running?")

print("Required ports accepting connections")

In [None]:
# Apply a GraphQL Schema to the cluster

# download the schema file from the dgraph-io/vlg repository into the working directory
!curl -Ss https://raw.githubusercontent.com/dgraph-io/vlg/main/schema/schema.graphql --output schema.graphql
# update the schema in the cluster by posting the downloaded file to the /admin/schema HTTP endpoint
admin_endpoint = f"http://{dgraph_hostname}:{dgraph_http_port}/admin/schema"
!curl --data-binary '@./schema.graphql' {admin_endpoint}

In [None]:
# Load data into the cluster with the Dgraph Live Loader (`dgraph live`)

# download the rdf file
!curl -Ss https://raw.githubusercontent.com/dgraph-io/vlg/main/rdf-subset/data.rdf.gz --output data.rdf.gz

# find ways to load data into the cluster; options are tried in this order:
# Docker, a dgraph binary already on the PATH, then (Linux only) a fresh dgraph install
import shutil, os, platform

pwd = os.getcwd()
if shutil.which('docker'):
    # run the live loader from the dgraph/standalone image, with the working directory mounted at /data
    docker_host = dgraph_hostname
    if dgraph_hostname == 'localhost':
        # NOTE(review): presumably swapped because 'localhost' inside the container refers to the
        # container itself; host.docker.internal may not resolve on every Docker setup - confirm
        docker_host = 'host.docker.internal'
    !docker run -it -v {pwd}:/data dgraph/standalone:latest dgraph live -f /data/data.rdf.gz --alpha {docker_host}:{dgraph_grpc_port} --zero {docker_host}:{dgraph_zero_port}
elif shutil.which('dgraph'):
    # a dgraph binary is available locally, so load the downloaded file directly
    !dgraph live -f ./data.rdf.gz --alpha {dgraph_hostname}:{dgraph_grpc_port} --zero {dgraph_hostname}:{dgraph_zero_port}
elif platform.system() == "Linux":
    # run the install script from get.dgraph.io, then load the downloaded file
    !curl https://get.dgraph.io -sSf | bash -s -- -y
    !dgraph live -f ./data.rdf.gz --alpha {dgraph_hostname}:{dgraph_grpc_port} --zero {dgraph_hostname}:{dgraph_zero_port}
else:
    raise Exception("Unable to find a way to load data into your cluster.")
    

In [None]:
# Install pydgraph

%pip install pydgraph

In [None]:
# Create a pydgraph client

import pydgraph

# gRPC address of the alpha, plus a 1 GiB cap on the size of received messages
grpc_address = f"{dgraph_hostname}:{dgraph_grpc_port}"
grpc_options = [('grpc.max_receive_message_length', 1024*1024*1024)]

client_stub = pydgraph.DgraphClientStub(addr=grpc_address, options=grpc_options)
pyd_client = pydgraph.DgraphClient(client_stub)

# querying the version doubles as a check that the client can reach the cluster
print("Dgraph Version:", pyd_client.check_version())

In [None]:
# Perform a DQL query (stem search on the name predicate)

import json

# Parameterised DQL query: up to 10 records whose name matches the $name text
query = """
query fulltext($name: string) {
  q(func: anyoftext(Record.name, $name), first: 10) {
    uid
    id: Record.nodeID
    name: Record.name
  }
}
"""

# run the query in a read-only transaction, binding $name to "living"
read_txn = pyd_client.txn(read_only=True)
res = read_txn.query(query=query, variables={"$name": "living"})

# the response body is JSON text; parse it, then pretty-print the result
records = json.loads(res.json)
print(json.dumps(records, indent=2))


In [None]:
# Perform a DQL mutation

# Rename the first record returned by the previous query
txn = pyd_client.txn()
try:
    first_record = records['q'][0]
    uid = first_record['uid']
    name = f"New Name (formerly {first_record['name']})"
    # setting 'uid' targets the existing node rather than creating a new one
    p = {'uid': uid, 'Record.name': name}
    response = txn.mutate(set_obj=p)
    txn.commit()
finally:
    # always release the transaction, whether or not the commit succeeded
    txn.discard()
    

In [None]:
# Install an open source GraphQL client

%pip install python-graphql-client

In [None]:
# Create a GraphQL client

from python_graphql_client import GraphqlClient

# GraphQL requests go to the /graphql path on the cluster's HTTP port
gql_client = GraphqlClient(endpoint=f"http://{dgraph_hostname}:{dgraph_http_port}/graphql")

In [None]:
# Perform a GraphQL query (stem search on the name predicate)

# GraphQL query over Entity records; the filter and page size are passed in as variables
ft_query = """
query ($filter: EntityFilter, $first: Int) {
  queryEntity(filter: $filter, first: $first) {
    id: nodeID
    type: __typename
    name
  }
}
"""

# match entities whose name contains the text "living", limited to 10 results
variables = {
    "filter": {"name": {"anyoftext": "living"}},
    "first": 10,
}

data = gql_client.execute(query=ft_query, variables=variables)

# print the name of each entity returned
entities = data['data']['queryEntity']
for entity in entities:
    print(entity['name'])

In [None]:
# Perform a recursive DQL query on a list of known records (these records are highly-connected)

highly_connected_records = ['236724', '230000018', '54662', '23000136', '240000001', '23000147', '81027146', '23000156', '23000330', '81027090', '23000133', '32000236', '11001746', '81029389', '23000213', '298333', '288469', '23000046', '23000280', '11011863', '12160432', '96909', '11008027', '298293', '23000381', '11001708', '285729', '11012037', '23000198', '23000219', '294268', '230000057', '49684', '23000362', '23000228', '11007372', '230000005', '80000191', '11009351', '23000400', '23000235', '23000406', '23000162', '23000365', '80011301', '23000281', '80011987', '58007938', '88002083', '11011539', '264051', '298258', '240230001', '297687', '230000038', '24000074', '20642', '230000007', '11010643', '23000222', '58922', '81027087', '279944', '23000377', '240360001', '298170', '24883', '11012290', '11009218', '23000130', '43724', '225000056', '11009139', '298147', '237148', '23000396', '230000054', '237076', '237583', '23000146', '11006103', '230000021', '11012118', '120001922', '230000066', '236748', '23000131', '295141', '298166', '230000025', '230000020', '11000489', '23000204', '23000260', '11012146', '56917', '11011469', '271169', '236832', '81001128', '33000151', '81073055', '11010502', '75595', '32000238', '240110001', '23000256', '23000001', '32000226', '23000237', '11014056', '56072048', '50622', '23000437', '23000307', '32000235', '24000031', '14025646', '263908', '11010460', '23000145', '230000070', '260937', '23000360', '23000166', '271677', '58009618', '297689', '263996', '14026068', '230000004', '230000016', '23000161', '23000157', '298020', '297596', '11003948', '230000017', '58044817', '23000141', '230000003', '290240', '58034506', '81038065', '88007148', '82019954', '23000343', '56072081', '80051573', '80086304']

# DQL template for the recursive query. {LIST} is a placeholder that is replaced below with the
# JSON-encoded list of node IDs.
# NOTE(review): `RecordRecord.addressFor` and `RecordRecord.connectedTo` use a doubled "Record"
# prefix, unlike the other `Record.*` predicates - possibly a typo; confirm against the schema.
recurse_query = """
{
    q(func: eq(Record.nodeID, {LIST})) @recurse(depth: 8) {
        # predicates to return for each recurse
        id: Record.nodeID
        name: Record.name
        type: <dgraph.type>
        # predicates to loop through
        hasaddress: Record.hasAddress
        addressFor: RecordRecord.addressFor
        hasOfficer: Record.hasOfficer
        officerFor: Record.officerFor
        hasIntermediary: Record.hasIntermediary
        intermediaryFor: Record.intermediaryFor
        connectedTo: RecordRecord.connectedTo  
  }
}
"""

# substitute the node ID list into the template, run the query read-only, and parse the JSON result
recurse_query = recurse_query.replace("{LIST}", json.dumps(highly_connected_records))
res = pyd_client.txn(read_only=True).query(recurse_query)
data = json.loads(res.json)


In [None]:
# Convert the nested JSON DQL result to a dictionary of nodes and an array of edges using a utility
# function in pydgraph (convert.extract_dict).
# These structures are common requirements for graph analysis and visualization

from pydgraph import convert

# empty containers handed to extract_dict; its return value is not used, so it is
# presumably filling these two in place - the counts are printed afterwards
nodes = {}
edges = []
convert.extract_dict(nodes, edges, data)
print("nodes count", len(nodes), ", edges count", len(edges))

In [None]:
# Install pandas

%pip install pandas

In [None]:
# Transform the node dictionary to a Pandas dataframe

import pandas as pd

# orient='index' makes each key of the nodes dictionary a row of the dataframe
nodes_df = pd.DataFrame.from_dict(nodes, orient='index')
# display 5 randomly chosen rows as a quick sanity check
nodes_df.sample(5)

In [None]:
# Transform the edges array to a Pandas dataframe

# the visualization cell reads the 'src' and 'dst' columns of this dataframe
edges_df = pd.DataFrame(edges)
# display 5 randomly chosen rows as a quick sanity check
edges_df.sample(5)

In [None]:
# Set up Graphistry

# Sign up for a free Graphistry account at https://hub.graphistry.com. Use the "Create Account" flow in which you
# specify a username and password (not the OAuth flow).

%pip install graphistry

import graphistry
print("Graphistry version", graphistry.__version__)

# replace these <place holders> with your credentials
# NOTE: avoid saving or sharing the notebook with real credentials filled in
graphistry_username = "<YOUR GRAPHISTRY USERNAME>"
graphistry_password = "<YOUR GRAPHISTRY PASSWORD>"
# register the credentials with the Graphistry Hub server over HTTPS
graphistry.register(api=3, protocol="https", server="hub.graphistry.com", username=graphistry_username, password=graphistry_password)


In [None]:
# Visualize the recursively-derived subgraph in Graphistry

# colour assigned to each node type
type_colors = {
    'Entity': '#3bdbdb',
    'Intermediary': '#E99233',
    'Officer': '#6DB364',
    'Address': '#F7D82F',
}

# font awesome icon assigned to each node type
type_icons = {
    'Entity': 'fa-building',
    'Intermediary': 'fa-handshake-o',
    'Address': 'fa-map-marker',
    'Officer': 'fa-user',
}

# bind the node and edge dataframes, using each node's name as its title
g = graphistry.nodes(nodes_df, 'id').edges(edges_df, 'src', 'dst').bind(point_title='name')

# colour by type, falling back to gray for types not listed above
g2 = g.encode_point_color('type', categorical_mapping=type_colors, default_mapping='gray')

# icon by type; shape="circle" clips any excess
g3 = g2.encode_point_icon(
    'type',
    shape="circle",
    categorical_mapping=type_icons,
    default_mapping="question",
)

# render the visualization
g3.plot()