
Notebook to explore data similarity in a Dgraph cluster


**Prerequisites**
- Dgraph
  - Get a [Dgraph Cloud account](https://cloud.dgraph.io/)
  - Have your account user name and password available
  - Have a Dgraph cluster running in your Dgraph Cloud account
  - Obtain the GraphQL Endpoint of the Dgraph cluster from the [cloud dashboard](https://cloud.dgraph.io/_/dashboard)
  - Obtain an Admin API key for the Dgraph Cluster from the [settings](https://cloud.dgraph.io/_/settings?tab=api-keys) tab.



  The first step is to import the packages needed.

-  ``pydgraph``, the official [python client library for Dgraph Query Language](https://dgraph.io/docs/dql/clients/python/)
-  ``GraphqlClient``, a GraphQL client to invoke the GraphQL API generated from your schema and the GraphQL admin API of Dgraph.

**Make sure to update the endpoints with the correct values for your Dgraph cluster!**


## Use a local learning environment

```bash
docker pull dgraph/standalone
docker run --name dgraph_learn -d -p "8080:8080" -p "9080:9080" -v <local path to /dgraph-data>:/dgraph dgraph/standalone:latest
```

### Load the data set

```bash
cp products.rdf.gz <local path to /dgraph-data>
cp products.schema <local path to /dgraph-data>

docker exec -it dgraph_learn  dgraph live -f /dgraph/products.rdf.gz -s /dgraph/products.schema
```

In [1]:
!pip install pydgraph python-graphql-client ipycytoscape
import pydgraph
import json
import base64
import getpass
import pandas as pd 
from python_graphql_client import GraphqlClient




[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:

# Copy your Dgraph endpoints here. Exactly one assignment per endpoint should be active;
# the alternatives are kept as commented-out examples.
# The GraphQL endpoint is found at https://cloud.dgraph.io/_/dashboard
# dgraph_graphql_endpoint = "https://black-grass.us-east-1.aws.cloud.dgraph.io/graphql"
# dgraph_graphql_endpoint = "https://withered-bird.us-east-1.aws.cloud.dgraph.io/graphql"
dgraph_graphql_endpoint = "http://localhost:8080/graphql"


# The gRPC endpoint is found at https://cloud.dgraph.io/_/settings
# dgraph_grpc = "black-grass.grpc.us-east-1.aws.cloud.dgraph.io:443"
# dgraph_grpc = "withered-bird.grpc.us-east-1.aws.cloud.dgraph.io:443"
dgraph_grpc = "localhost:9080"

# The admin and health endpoints live next to /graphql on the same host.
# Only the trailing "/graphql" path segment is stripped, so a host name that itself
# contains "/graphql" (e.g. "https://graphql.example.com/graphql") is left intact.
dgraph_base_url = dgraph_graphql_endpoint.rsplit("/graphql", 1)[0]
# graph admin endpoint is /admin
dgraph_graphql_admin = dgraph_base_url + "/admin"
# graph health endpoint is /health
dgraph_graphql_health = dgraph_base_url + "/health"



Enter your credentials and test the different clients


In [3]:

# Cloud credentials
# We need the cloud login credentials to upload the Lambda code.
# We need an Admin API key generated at https://cloud.dgraph.io/_/settings?tab=api-keys for DQL alter and query.
# The key is only handed to the gRPC stub for cloud endpoints, but it is always sent
# as the Dg-Auth header to the GraphQL admin endpoint.


API_KEY = getpass.getpass("DGRAPH API KEY?")



# DQL Client
# A membership test is used instead of find() > 0 so that an address which
# *starts* with "cloud" is also recognised as a cloud endpoint.
if "cloud" in dgraph_grpc:
    client_stub = pydgraph.DgraphClientStub.from_cloud(dgraph_grpc, API_KEY)
else:
    client_stub = pydgraph.DgraphClientStub(addr=dgraph_grpc)

client = pydgraph.DgraphClient(client_stub)

# GraphQL client and admin client
gql_client = GraphqlClient(endpoint=dgraph_graphql_endpoint)
headers = { "Dg-Auth": API_KEY }
gql_admin_client = GraphqlClient(endpoint=dgraph_graphql_admin, headers=headers)
gql_health_client = GraphqlClient(endpoint=dgraph_graphql_health)

#
#  Testing the connection
#
# The health payload gets its own name so it is not confused with the product
# DataFrame that a later cell stores in `data`.
health = gql_health_client.execute(query="")
if 'errors' in health:
    raise Exception(health['errors'][0]['message'])

print("Check cluster health:", json.dumps(health, indent=2))

#
#  Testing the DQL connection
#
txn = client.txn(read_only=True)
query = "schema{}"
try:
    res = txn.query(query)
    dqlschema = json.loads(res.json)
finally:
    # Always release the transaction, even when the query fails.
    txn.discard()
print("get DQL schema - succeeded")


Check cluster health: [
  {
    "instance": "alpha",
    "address": "dgraph-alpha:7080",
    "status": "healthy",
    "group": "1",
    "version": "v24.0.2",
    "uptime": 8628,
    "lastEcho": 1725995508,
    "ongoing": [
      "opRollup"
    ],
    "ee_features": [
      "backup_restore",
      "cdc"
    ],
    "max_assigned": 294131
  }
]
get DQL schema - succeeded


In [21]:
# Deploy the GraphQL Schema
# The schema is pushed through the updateGQLSchema mutation of the admin endpoint.

graphql_schema = """
type Product {
  id: ID!
  name: String @id  @search(by: [hash,term])
  embedding: [Float!] @embedding
}
"""
mutation = """
mutation($sch: String!) {
  updateGQLSchema(input: { set: { schema: $sch}})
  {
    gqlSchema {
      schema
      generatedSchema
    }
  }
}
"""
variables = {"sch": graphql_schema}
schemadata = gql_admin_client.execute(query=mutation, variables=variables)
# Surface the server's own error message instead of failing later with an
# opaque TypeError/KeyError when the 'data' payload is missing.
if schemadata.get('errors'):
    raise Exception(schemadata['errors'][0]['message'])
print("GraphQL Schema after Update")
print(schemadata['data']['updateGQLSchema']['gqlSchema']['schema'])

GraphQL Schema after Update

type Product {
  id: ID!
  name: String @id  @search(by: [hash,term])
  embedding: [Float!] @embedding
}



In [46]:
# reset the index
def reset_index(predicate, index):
    """Re-declare `predicate` without an index, then again with `@index(index)`.

    Two schema alterations are sent through the module-level DQL `client`:
    first the predicate is declared as a plain ``float32vector``, then as a
    ``float32vector`` carrying the requested index specification. A progress
    line and the result of each alter call are printed.

    Args:
        predicate: DQL predicate name, e.g. ``"Product.embedding"``.
        index: Index specification placed inside ``@index(...)``.
    """
    steps = (
        (f"remove index for {predicate}", f"{predicate}: float32vector ."),
        (f"create index for {predicate} {index}", f"{predicate}: float32vector @index({index}) ."),
    )
    for banner, schema in steps:
        print(banner)
        print(client.alter(pydgraph.Operation(schema=schema)))

In [4]:
# Load the products; the 'embedding' column is stored in the CSV as a JSON-encoded string.
data = pd.read_csv('products_with_embedding.csv.gz', compression='gzip')
# Decode column-wise: indexing data.iloc[i] in a loop builds a full row Series per element.
data['embedding'] = data['embedding'].map(json.loads)

In [5]:
# build a ground truth similarity index (KNN)
# Brute-force exact neighbours; this is the reference the ANN results are scored against.
from sklearn import metrics
# One embedding per row of `data`, in row order.
embeddings = data['embedding'].tolist()
# Full pairwise Euclidean distance matrix between all embeddings
# (one row and one column per product, so memory grows quadratically).
distances = metrics.pairwise_distances(embeddings, embeddings, metric='euclidean')
# knn_all[i] lists row positions of `data` ordered from nearest to farthest from row i;
# the first entry is normally i itself (distance 0).
knn_all = distances.argsort(axis=1)


In [16]:
# Show the product at row 3 and the names of its 10 exact nearest neighbours.
print(data['name'].iloc[3])
data['name'].iloc[knn_all[3][:10]].tolist()



Butters


['Butters',
 'Butter',
 'Fruit Butters',
 'Nut Butters',
 'Seed Butters',
 'Ice Cream Butter',
 'Apple Butter',
 'Butter Beans',
 'Salted Butter',
 'Flavored Butter']

In [37]:
# get topK ANN from Graph - Approximate Nearest Neighbors
def topK_ANN(name, k=10):
    """Return the `k` products most similar to the product called `name`.

    Runs the ``querySimilarProductById`` GraphQL query (similarity on the
    ``embedding`` field) through the module-level ``gql_client``.

    Args:
        name: Value of the product's ``name`` field, used to identify it.
        k: Number of neighbours requested (``topK``).

    Returns:
        A DataFrame with columns ``uid`` and ``name``, in the order returned
        by the server. It is empty (but still has both columns) when the
        server returns no match.

    Raises:
        Exception: with the server's message when the response carries
            GraphQL errors.
    """
    query = """
    query QuerySimilarById($name: String!, $k: Int!) {{
        list:querySimilar{}ById(by: embedding, topK: $k, name: $name) {{
            uid:id
            name:name
        }}
    }}
    """.format("Product")
    variables = {"name": name, "k": k}
    res = gql_client.execute(query=query, variables=variables)
    # Report server-side failures explicitly, like the health check does.
    if res.get('errors'):
        raise Exception(res['errors'][0]['message'])
    rows = (res.get('data') or {}).get('list') or []
    if not rows:
        # Keep the column contract so callers can still select ['name'].
        return pd.DataFrame(columns=["uid", "name"])
    return pd.json_normalize(rows)

# Example: approximate neighbours of the product at row 3.
topK_ANN(name=data.iloc[3]['name'], k=10)

Unnamed: 0,uid,name
0,0x493e1,Butters
1,0x29332b,Butter
2,0xdbbf7,Fruit Butters
3,0x249f14,Nut Butters
4,0x249f58,Seed Butters
5,0x7a18e,Ice Cream Butter
6,0x3b,Apple Butter
7,0x249f43,Butter Beans
8,0x493e9,Salted Butter
9,0x19f0e7,Flavored Butter


In [52]:
# compute precision
def compute_item_precision(index, k, verbose = False):
    """Score the ANN result for one product against its exact neighbours.

    The product at row `index` of `data` is looked up by name. Its first `k`
    exact neighbours come from `knn_all`, its approximate neighbours from
    `topK_ANN`. Both are compared as sets of product names.

    Args:
        index: Row position of the product in `data`.
        k: Number of neighbours taken from each source.
        verbose: When true, print the product and both neighbour lists.

    Returns:
        Number of exact-neighbour names also found by the ANN query,
        divided by the number of distinct exact-neighbour names.
    """
    name = data.iloc[index]['name']
    if verbose:
        print("compute_precision: ",index,name)
        print("Ground Truth:",name)

    exact_names = data.iloc[knn_all[index][:k]]['name'].tolist()
    if verbose:
        print("KNN:",exact_names)

    approx_names = topK_ANN(name,k)['name'].tolist()
    if verbose:
        print("ANN:",approx_names)

    exact = set(exact_names)
    hits = exact & set(approx_names)
    return len(hits)/len(exact)

# compute precision for first m products
def average_precision(first = 10, k = 10):
    """Mean of `compute_item_precision(i, k)` over rows 0 .. first-1 of `data`.

    Args:
        first: Number of leading products to evaluate.
        k: Number of neighbours compared for each product.

    Returns:
        The arithmetic mean of the per-product scores.
    """
    per_item = (compute_item_precision(i, k) for i in range(first))
    # Start from 0.0 and add in row order, then divide by the number of rows.
    return sum(per_item, 0.0) / first
# Score for the product at row 3, then the mean over the first 10 products (k = 5).
print(compute_item_precision(index=3, k=5))
print(average_precision(first=10, k=5))



0.6
0.44000000000000006


compute_precision:  0 Ginkgo Biloba
Ground Truth: Ginkgo Biloba
KNN: ['Ginkgo Biloba', 'Ginkgo Biloba Extract', 'Ginkgo Biloba Leaf Powder', 'Barbacoa', 'Gin']
ANN: ['Ginkgo Biloba', 'Ginkgo Biloba Extract', 'Ginkgo Biloba Leaf Powder', 'Gin', 'Siberian Ginseng']
Precision@5: 0.8
0 0.8
compute_precision:  1 Fish Oil Powder
Ground Truth: Fish Oil Powder
KNN: ['Fish Oil Powder', 'Fish Oil', 'Coconut Oil Powder', 'Shea Butter Oil Powder', 'CLA Oil Powder']
ANN: ['Fish Oil Powder', 'Fish Oil', 'Coconut Oil Powder', 'CLA Oil Powder', 'Borage Oil Powder']
Precision@5: 0.8
1 0.8
compute_precision:  2 Ecklonia Cava
Ground Truth: Ecklonia Cava
KNN: ['Ecklonia Cava', 'Calvados', 'Cascara Sagrada', 'Asafetida', 'Calamari']
ANN: ['Ecklonia Cava', 'Cascara Sagrada', 'Asafetida', 'Rioja', 'Kombucha']
Precision@5: 0.6
2 0.6
compute_precision:  3 Butters
Ground Truth: Butters
KNN: ['Butters', 'Butter', 'Fruit Butters', 'Nut Butters', 'Seed Butters']
ANN: ['Butters', 'Butter', 'Fruit Butters', 'Nut But

0.792

In [66]:
# Rebuild the vector index with each HNSW configuration and report the average
# score over the first 100 products at several values of k.
# NOTE(review): the saved output below shows a different third configuration
# (maxLevels "4", efConstruction "100") than the one listed here - re-run to refresh.
index_specs = (
    'hnsw(metric: "euclidian")',
    'hnsw(metric: "euclidian", maxLevels: "3", efSearch: "40", efConstruction: "100")',
    'hnsw(metric: "euclidian", maxLevels: "3", efSearch: "40", efConstruction: "400")',
)
k_values = (1, 3, 5, 10)

for index in index_specs:
    reset_index("Product.embedding", index)
    print("index = ",index)
    for k in k_values:
        p_at_k = average_precision(100, k)
        print("Precision@{} = {}".format(k, p_at_k))

remove index for Product.embedding

create index for Product.embedding hnsw(metric: "euclidian")

index =  hnsw(metric: "euclidian")
Precision@1 = 0.51
Precision@3 = 0.46333333333333326
Precision@5 = 0.4599999999999999
Precision@10 = 0.46
remove index for Product.embedding

create index for Product.embedding hnsw(metric: "euclidian", maxLevels: "3", efSearch: "40", efConstruction: "100")

index =  hnsw(metric: "euclidian", maxLevels: "3", efSearch: "40", efConstruction: "100")
Precision@1 = 0.9
Precision@3 = 0.8266666666666667
Precision@5 = 0.7879999999999994
Precision@10 = 0.7539999999999997
remove index for Product.embedding

create index for Product.embedding hnsw(metric: "euclidian", maxLevels: "4", efSearch: "40", efConstruction: "100")

index =  hnsw(metric: "euclidian", maxLevels: "4", efSearch: "40", efConstruction: "100")
Precision@1 = 0.81
Precision@3 = 0.7800000000000001
Precision@5 = 0.7839999999999996
Precision@10 = 0.7429999999999995
