
Notebook to explore data similarity in a Dgraph cluster


**Prerequisites**
- Dgraph
  - Get a [Dgraph Cloud account](https://cloud.dgraph.io/)
  - Have your account user name and password available
  - Have a Dgraph cluster running in your Dgraph Cloud account
  - Obtain the GraphQL Endpoint of the Dgraph cluster from the [cloud dashboard](https://cloud.dgraph.io/_/dashboard)
  - Obtain an Admin API key for the Dgraph Cluster from the [settings](https://cloud.dgraph.io/_/settings?tab=api-keys) tab.



  The first step is to import the packages needed.

-  ``pydgraph``, the official [python client library for Dgraph Query Language](https://dgraph.io/docs/dql/clients/python/)
-  ``GraphqlClient``, a GraphQL client to invoke the GraphQL API generated from your schema and the GraphQL admin API of Dgraph.

**Make sure to update the endpoints with the correct values for your Dgraph cluster!**


In [None]:
!pip install pydgraph python-graphql-client ipycytoscape
import sys
import pydgraph
import json
import base64
import getpass
import pandas as pd 
from python_graphql_client import GraphqlClient



In [None]:

# Connection settings: replace the local defaults with your Dgraph Cloud values.
#
# gRPC endpoint (used by the DQL client), listed at https://cloud.dgraph.io/_/settings
# dgraph_grpc = "xyz.grpc.us-east-1.aws.cloud.dgraph.io:443"
dgraph_grpc = "localhost:9080"

# GraphQL endpoint, listed at https://cloud.dgraph.io/_/dashboard
# dgraph_graphql_endpoint = "https://xyz.us-east-1.aws.cloud.dgraph.io/graphql"
dgraph_graphql_endpoint = "http://localhost:8080/graphql"

# The admin and health endpoints share the GraphQL endpoint's base URL:
# swap the "/graphql" path for "/admin" and "/health" respectively.
dgraph_graphql_admin, dgraph_graphql_health = (
    dgraph_graphql_endpoint.replace("/graphql", path)
    for path in ("/admin", "/health")
)



Enter your credentials and test the clients


In [None]:

# Credentials
# An Admin API key generated at https://cloud.dgraph.io/_/settings?tab=api-keys
# is needed for DQL alter/query on Dgraph Cloud and is sent to the GraphQL
# admin endpoint as the "Dg-Auth" header.
# (Cloud login credentials are only needed to upload Lambda code, which this
# cell does not do.)

API_KEY = getpass.getpass("DGRAPH API KEY?")

# DQL client: a cloud address needs the API-key-authenticated stub,
# any other address (e.g. localhost) uses a plain stub.
if "cloud" in dgraph_grpc:
    client_stub = pydgraph.DgraphClientStub.from_cloud(dgraph_grpc, API_KEY)
else:
    client_stub = pydgraph.DgraphClientStub(addr=dgraph_grpc)

client = pydgraph.DgraphClient(client_stub)

# GraphQL client, admin client (authenticated) and health client
gql_client = GraphqlClient(endpoint=dgraph_graphql_endpoint)
headers = {"Dg-Auth": API_KEY}
gql_admin_client = GraphqlClient(endpoint=dgraph_graphql_admin, headers=headers)
gql_health_client = GraphqlClient(endpoint=dgraph_graphql_health)


#
#  Testing the connection to the Dgraph cluster
#
# The health endpoint is called with an empty query; any reported error aborts the cell.
data = gql_health_client.execute(query="")
if 'errors' in data:
    raise Exception(data['errors'][0]['message'])

print("Check cluster health:", json.dumps(data, indent=2))

#
#  Testing the DQL connection
#
txn = client.txn(read_only=True)
try:
    query = "schema{}"
    res = txn.query(query)
    dqlschema = json.loads(res.json)
finally:
    # Always release the transaction, even when the query fails.
    txn.discard()
print("get DQL schema - succeeded")


In [None]:
# function to get embeddings from existing Dgraph cluster
# update the query to match your schema, entity and embedding predicate


def fetchEmbeddings(entity, emdedding_predicate, name_predicate):
    """Fetch uid, name and embedding of every node of a type, page by page.

    Example: ``fetchEmbeddings("Product", "Product.embedding", "Product.title")``.

    Nodes are read from the module-level Dgraph ``client`` in pages of 500,
    using the uid of the last item of a page as the ``after`` cursor of the
    next one.

    Args:
        entity: Dgraph type name used in ``func: type(...)``.
        emdedding_predicate: Predicate holding the embedding; only nodes that
            have this predicate are returned.
        name_predicate: Predicate returned under the ``name`` alias.

    Returns:
        A DataFrame with columns ``uid``, ``name`` and ``embedding``
        (empty when no node matches).
    """
    columns = ['uid', 'name', 'embedding']
    # NOTE: type and predicate names are interpolated into the query text,
    # so they must come from trusted input.
    query_template = """
    query queryItems() {{
        items(func:type({1}) {0},first:500)  @filter(has({2})) {{
          uid
          name: {3}
          embedding:{2}
        }}
    }}
    """
    pages = []
    after = ""
    txn = client.txn(read_only=True)
    try:
        while True:
            print("fetching", entity, "after", after)
            query = query_template.format(
                after, entity, emdedding_predicate, name_predicate
            )
            res = txn.query(query)
            # Treat a missing or null "items" key like an empty page.
            items = json.loads(res.json).get('items') or []
            print(len(items))
            if not items:
                break
            pages.append(pd.json_normalize(items))
            after = ",after: {}".format(items[-1]['uid'])
    finally:
        # Release the read-only transaction even if a query fails.
        txn.discard()

    if not pages:
        return pd.DataFrame(columns=columns)

    # Concatenate once instead of once per page, then make sure the three
    # documented columns exist (and come first) even if no item carried a name.
    result = pd.concat(pages, ignore_index=True)
    extra = [column for column in result.columns if column not in columns]
    return result.reindex(columns=columns + extra)


# Characters that must not appear unescaped inside a double-quoted RDF literal.
_RDF_LITERAL_ESCAPES = str.maketrans({
    "\\": "\\\\",
    '"': '\\"',
    "\n": "\\n",
    "\r": "\\r",
})


def _rdf_literal(value):
    """Return *value* as a double-quoted, escaped RDF string literal."""
    return '"{}"'.format(str(value).translate(_RDF_LITERAL_ESCAPES))


def _is_missing(value):
    """Return True for scalar null markers (None, NaN, pd.NA); lists never count."""
    return pd.api.types.is_scalar(value) and bool(pd.isna(value))


def dataframe_to_rdf(
    data,
    filehandle=sys.stdout,
    *,
    entity="Product",
    embedding_predicate="Product.embedding",
    name_predicate="Product.name",
):
    """Write the rows of *data* as RDF triples to *filehandle*.

    For each row, up to three triples are written with the subject
    ``<_:uid>``: the embedding, the name, and the ``dgraph.type``.
    Literal values are escaped so that quotes, backslashes and line breaks
    in a name do not break the RDF syntax. A missing name or embedding
    (None/NaN) is skipped instead of being written as the text "nan".

    Args:
        data: DataFrame with ``uid``, ``name`` and ``embedding`` columns.
        filehandle: Writable text stream; defaults to standard output.
        entity: Value written for ``dgraph.type``.
        embedding_predicate: Predicate name used for the embedding triple.
        name_predicate: Predicate name used for the name triple.

    Returns:
        None.
    """
    for uid, embedding, name in zip(data['uid'], data['embedding'], data['name']):
        subject = "<_:{}>".format(uid)
        triples = []
        if not _is_missing(embedding):
            triples.append(
                "{} <{}> {} .\n".format(subject, embedding_predicate, _rdf_literal(embedding))
            )
        if not _is_missing(name):
            triples.append(
                "{} <{}> {} .\n".format(subject, name_predicate, _rdf_literal(name))
            )
        triples.append("{} <dgraph.type> {} .\n".format(subject, _rdf_literal(entity)))
        # One write per row keeps the three triples of a node together.
        filehandle.write("".join(triples))
    return

In [None]:
import gzip

# Fetch every Product that has an embedding and keep a CSV copy of the result.
data = fetchEmbeddings("Product", "Product.embedding", "Product.name")
data.to_csv("products_with_embedding.csv.gz", index=False, compression='gzip', header=True)
# The gzip file must be opened in text mode ("wt") because RDF is written as str.
# The encoding is pinned to UTF-8 so non-ASCII product names are written the
# same way on every platform instead of using the locale default.
with gzip.open("./products.rdf.gz", "wt", encoding="utf-8") as f:
    dataframe_to_rdf(data, f)