
Notebook to explore data similarity in a Dgraph cluster


**Prerequisites**
- Dgraph
  - Get a [Dgraph Cloud account](https://cloud.dgraph.io/)
  - Have your account user name and password available
  - Have a Dgraph cluster running in your Dgraph Cloud account
  - Obtain the GraphQL Endpoint of the Dgraph cluster from the [cloud dashboard](https://cloud.dgraph.io/_/dashboard)
  - Obtain an Admin API key for the Dgraph Cluster from the [settings](https://cloud.dgraph.io/_/settings?tab=api-keys) tab.



  The first step is to import the packages needed.

-  ``pydgraph``, the official [python client library for Dgraph Query Language](https://dgraph.io/docs/dql/clients/python/)
-  ``GraphqlClient``, a GraphQL client to invoke the GraphQL API generated from your schema and the GraphQL admin API of Dgraph.

**Make sure to update the endpoints with the correct values for your Dgraph cluster!**


## Use a local learning environment

docker pull dgraph/standalone
docker run --name dgraph_learn -d -p "8080:8080" -p "9080:9080" -v <local path to /dgraph-data>:/dgraph dgraph/standalone:latest

### load data set
cp products.rdf.gz <local path to /dgraph-data>
cp products.schema <local path to /dgraph-data>

docker exec -it dgraph_learn  dgraph live -f /dgraph/products.rdf.gz -s /dgraph/products.schema

In [None]:
!pip install pydgraph python-graphql-client ipycytoscape
import pydgraph
import json
import base64
import getpass
import pandas as pd 

from python_graphql_client import GraphqlClient



In [None]:

# Copy your Dgraph endpoints here (exactly one active assignment per setting).
# The GraphQL endpoint is found at https://cloud.dgraph.io/_/dashboard
# dgraph_graphql_endpoint = "https://black-grass.us-east-1.aws.cloud.dgraph.io/graphql"
# dgraph_graphql_endpoint = "https://withered-bird.us-east-1.aws.cloud.dgraph.io/graphql"
dgraph_graphql_endpoint = "http://localhost:8080/graphql"


# The gRPC endpoint is found at https://cloud.dgraph.io/_/settings
# dgraph_grpc = "black-grass.grpc.us-east-1.aws.cloud.dgraph.io:443"
# dgraph_grpc = "withered-bird.grpc.us-east-1.aws.cloud.dgraph.io:443"
dgraph_grpc = "localhost:9080"

# The admin and health endpoints are derived from the GraphQL endpoint by
# swapping its path. Only the LAST "/graphql" is replaced, so a host name that
# itself contains "/graphql" (e.g. "https://graphql-x...") is left intact.
_endpoint_root = dgraph_graphql_endpoint.rsplit("/graphql", 1)[0]
# graph admin endpoint is /admin
dgraph_graphql_admin = _endpoint_root + "/admin"
# graph health endpoint is /health
dgraph_graphql_health = _endpoint_root + "/health"



Enter your credentials and test the different clients


In [None]:

# Cloud credentials
# we need the cloud login credential to upload the Lambda code.
# we need an Admin API key generated at https://cloud.dgraph.io/_/settings?tab=api-keys for DQL alter and query


API_KEY = getpass.getpass("DGRAPH API KEY?")



# DQL Client
# A gRPC address containing "cloud" is treated as a Dgraph Cloud endpoint and
# authenticated with the API key; any other address gets a plain stub.
# `in` is used rather than `find(...) > 0`, which would miss a match at index 0.
if "cloud" in dgraph_grpc:
    client_stub = pydgraph.DgraphClientStub.from_cloud(dgraph_grpc, API_KEY)
else:
    client_stub = pydgraph.DgraphClientStub(addr=dgraph_grpc)

client = pydgraph.DgraphClient(client_stub)

# GraphQL client and admin client
gql_client = GraphqlClient(endpoint=dgraph_graphql_endpoint)
headers = {"Dg-Auth": API_KEY}
gql_admin_client = GraphqlClient(endpoint=dgraph_graphql_admin, headers=headers)
gql_health_client = GraphqlClient(endpoint=dgraph_graphql_health)

#
#  Testing the connection
#
# NOTE(review): this sends an empty GraphQL query to the /health endpoint and
# prints whatever JSON comes back — confirm the endpoint accepts this request.
data = gql_health_client.execute(query="")
if 'errors' in data:
    raise Exception(data['errors'][0]['message'])

print("Check cluster health:", json.dumps(data, indent=2))

#
#  Testing the DQL connection
#
txn = client.txn(read_only=True)
try:
    query = "schema{}"
    res = txn.query(query)
    dqlschema = json.loads(res.json)
finally:
    # Always release the transaction, even when the query or parsing fails.
    txn.discard()
print("get DQL schema - succeeded")


In [None]:
# Deploy the GraphQL Schema

graphql_schema = """
type Product {
  id: ID!
  name: String @id  @search(by: [hash,term])
  embedding: [Float!] @embedding
}
"""
mutation = """
mutation($sch: String!) {
  updateGQLSchema(input: { set: { schema: $sch}})
  {
    gqlSchema {
      schema
      generatedSchema
    }
  }
}
"""
variables = {"sch": graphql_schema}
schemadata = gql_admin_client.execute(query=mutation, variables=variables)
# Surface a rejected schema update with the server's message instead of
# failing later while indexing into the response (same check as the health
# test uses).
if 'errors' in schemadata:
    raise Exception(schemadata['errors'][0]['message'])
print("GraphQL Schema after Update")
print(schemadata['data']['updateGQLSchema']['gqlSchema']['schema'])

In [None]:
# reset the index
def reset_index(predicate, index):
    """Remove and then re-create the vector index on ``predicate``.

    Two schema alterations are sent through the module-level DQL ``client``:
    the predicate is first declared as a bare ``float32vector`` (no
    ``@index``), then declared again with ``@index(<index>)``. A progress
    message and the result of each alter call are printed.

    Args:
        predicate: predicate name, e.g. ``"Product.embedding"``.
        index: index specification placed verbatim inside ``@index(...)``.
    """
    steps = (
        (f"remove index for {predicate}",
         f"{predicate}: float32vector ."),
        (f"create index for {predicate} {index}",
         f"{predicate}: float32vector @index({index}) ."),
    )
    for message, schema in steps:
        print(message)
        print(client.alter(pydgraph.Operation(schema=schema)))

In [None]:
# Load the products; the 'embedding' column is stored as JSON text in the CSV.
data = pd.read_csv('products_with_embedding.csv.gz', compression='gzip')
# Decode the whole column in one pass instead of indexing row by row with
# data.iloc[i], which builds a full row Series on every iteration.
data['embedding'] = data['embedding'].apply(json.loads)

In [None]:
# Ground truth for the benchmark: exact nearest neighbours (KNN) by brute force.
from sklearn import metrics

embeddings = data['embedding'].tolist()
# Full pairwise Euclidean distance matrix between all embeddings.
# NOTE(review): this is quadratic in the number of rows — fine for a small
# catalogue, confirm before running on a large data set.
distances = metrics.pairwise_distances(
    embeddings,
    embeddings,
    metric='euclidean',
)
# Row i holds the row positions of all products ordered by increasing
# distance to product i.
knn_all = distances.argsort(axis=1)


In [None]:
# Sanity check: name of the product at row position 3, followed by the names
# of the first 10 entries of its exact-neighbour ranking.
print(data.iloc[3]['name'])
data['name'].iloc[knn_all[3][:10]].tolist()



In [None]:
# get topK ANN from Graph - Approximate Nearest Neighbors
def topK_ANN(name, k=10):
    """Return the ``k`` approximate nearest neighbours of the product ``name``.

    Calls the ``querySimilarProductById`` GraphQL query through the
    module-level ``gql_client``, searching by the ``embedding`` field.

    Args:
        name: value of ``Product.name`` identifying the reference product.
        k: number of neighbours requested (``topK``).

    Returns:
        A pandas DataFrame built from the returned list, with columns
        ``uid`` and ``name`` when results are present.

    Raises:
        Exception: if the response carries no result list; the message is the
            first GraphQL error when one is reported.
    """
    query = """
    query QuerySimilarById($name: String!, $k: Int!) {{
        list:querySimilar{}ById(by: embedding, topK: $k, name: $name) {{
            uid:id
            name:name
        }}
    }}
    """.format("Product")
    variables = {"name": name, "k": k}
    res = gql_client.execute(query, variables)
    payload = res.get('data')
    if not payload or payload.get('list') is None:
        # Report the server-side reason rather than an opaque TypeError
        # from indexing into a missing 'data' section.
        errors = res.get('errors') or [{'message': 'no data returned by querySimilarProductById'}]
        raise Exception(errors[0]['message'])
    return pd.json_normalize(payload['list'])

# Sanity check: approximate neighbours of the product at row position 3.
topK_ANN(name=data.iloc[3]['name'], k=10)

In [None]:
# compute precision
def compute_item_precision(index, k, verbose = False):
    """Compare the index's top-``k`` answer with the exact top-``k`` for one product.

    The exact neighbours come from the precomputed ``knn_all`` ranking, the
    approximate ones from ``topK_ANN``. Both are compared by product name.

    Args:
        index: row position of the reference product in ``data``.
        k: number of neighbours to compare; must be at least 1.
        verbose: when true, print the reference name and both neighbour lists.

    Returns:
        The fraction of distinct exact-neighbour names that also appear in
        the approximate result (a float between 0 and 1).

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    # k == 0 would divide by zero below, and a negative k would silently
    # slice "all but the last |k|" neighbours, so reject both up front.
    if k < 1:
        raise ValueError("k must be a positive integer")
    name = data.iloc[index]['name']
    if verbose:
        print("compute_precision: ",index,name)
        print("Ground Truth:",name)
    knn = data.iloc[knn_all[index][:k]]['name'].tolist()
    if verbose:
        print("KNN:",knn)
    knn = set(knn)
    ann = topK_ANN(name,k)['name'].tolist()
    if verbose:
        print("ANN:",ann)
    ann = set(ann)
    precision_at_k = len(knn.intersection(ann))/len(knn)
    return precision_at_k

# compute precision for first m products
def average_precision(first = 10, k = 10):
    """Average ``compute_item_precision`` over the first ``first`` products.

    Args:
        first: number of products (row positions ``0 .. first-1``) to evaluate;
            must be at least 1.
        k: number of neighbours compared for each product.

    Returns:
        The arithmetic mean of the per-product precision values.

    Raises:
        ValueError: if ``first`` is smaller than 1.
    """
    # An empty range would otherwise end in a division by zero (first == 0)
    # or a meaningless -0.0 (first < 0).
    if first < 1:
        raise ValueError("first must be a positive integer")
    total = sum(compute_item_precision(i, k) for i in range(first))
    return total / first
# Sanity check: precision@5 for row position 3, then averaged over the first 10 rows.
print(compute_item_precision(index=3, k=5))
print(average_precision(first=10, k=5))



In [None]:
# Benchmark: rebuild the vector index with each HNSW configuration and report
# the average precision@k over the first 100 products.
# NOTE(review): the metric spelling "euclidian" is sent verbatim to Dgraph —
# confirm it against the server's accepted metric names before "correcting" it.
HNSW_INDEX_SPECS = (
    'hnsw(metric: "euclidian")',
    'hnsw(metric: "euclidian", maxLevels: "3", efSearch: "40", efConstruction: "100")',
    'hnsw(metric: "euclidian", maxLevels: "3", efSearch: "40", efConstruction: "400")',
)
TOP_K_VALUES = (1, 3, 5, 10)

for index in HNSW_INDEX_SPECS:
    reset_index("Product.embedding", index)
    print("index = ", index)
    for k in TOP_K_VALUES:
        p_at_k = average_precision(100, k)
        print(f"Precision@{k} = {p_at_k}")