<a href="https://colab.research.google.com/github/chihpoc/chihpoc/blob/main/session11/Going_Meta_11_Graph_Expectations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install neo4j 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
pip install rdflib

In [None]:
pip install -i https://test.pypi.org/simple/ graphexpectations

**STEP1: import the graphexpectations package**

In [None]:
import os
from getpass import getpass

import graphexpectations as ge
import pandas as pd

**STEP2: Create a set of graph expectations per category (label) or based on a query**

In [None]:
# Each ge.Set groups the expectations for one selection of nodes: either every
# node of a given type (nodeType=...) or the nodes matched by a query (query=...).
# The `message` given to an expectation is what shows up in the `msg` column of
# the validation results.

# Suppliers: country must consist of ASCII letters only, and each supplier must
# have at least two incoming `supplied_by` relationships.
supplierExpectations = ge.Set(nodeType="Supplier")
supplierExpectations.expect_property_values_to_match_regex(property="country", regex="^[A-Za-z]+$", message="R001_INVALID_COUNTRY")
supplierExpectations.expect_number_of_incoming_relationship_to_be_between(relationship="supplied_by", min=2, message="R002_LOW_PRODUCT_OFFERING")

# Products: exactly one unitPrice value, and 10 <= unitPrice < 500.
productExpectations = ge.Set(nodeType="Product")
productExpectations.expect_number_of_property_values_to_be_between(property="unitPrice", min=1, max=1, message="R003_SINGLE_PRICE")
productExpectations.expect_property_values_to_be_between(property="unitPrice", minInclusive=10, maxExclusive=500, message="R004_PRICE_LIMIT")

# Customers: `places` must point at Order nodes, and every customer must have
# at least one `places` relationship.
customerExpectations = ge.Set(nodeType="Customer")
customerExpectations.expect_outgoing_relationship_to_connect_to_nodes_of_type(relationship="places", targetType="Order", message="R005_CUST_BAD_SCHEMA")
# `min` is passed as an integer so that it serialises as a numeric minCount,
# like the min/max counts of the Product expectations.
customerExpectations.expect_number_of_outgoing_relationship_to_be_between(relationship="places", min=1, message="R006_CUST_NO_ORDERS")

# Query-based selection: products supplied by a supplier whose country is 'USA'
# must have productID > 10.
americanProducts = ge.Set(query=" (focus:Product)-[:supplied_by]->(:Supplier { country: 'USA' }) ")
americanProducts.expect_property_values_to_be_between(property="productID", minExclusive=10, message="R007_US_PROD_ID")

**STEP3: Combine expectation sets in a suite**

In [None]:
# Gather the four expectation sets into a single suite.
expectation_sets = [
    supplierExpectations,
    productExpectations,
    customerExpectations,
    americanProducts,
]
s = ge.Suite(desc="suite of expectations for my Neo4j Northwind KG")
s.add_expectations(expectation_sets)

Serialise the expectations (as SHACL shapes) so they can be saved for version control, etc.

In [None]:
# Serialise the whole suite and print the result (SHACL shapes, per the output below).
shacl_shapes = s.serialise()
print(shacl_shapes)

@prefix ns1: <http://www.w3.org/ns/shacl#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

[] a ns1:NodeShape ;
    ns1:property [ ns1:maxExclusive 500 ;
            ns1:message "R004_PRICE_LIMIT" ;
            ns1:minInclusive 10 ;
            ns1:path <neo4j://graph.schema#unitPrice> ],
        [ ns1:maxCount 1 ;
            ns1:message "R003_SINGLE_PRICE" ;
            ns1:minCount 1 ;
            ns1:path <neo4j://graph.schema#unitPrice> ] ;
    ns1:targetClass <neo4j://graph.schema#Product> .

[] a ns1:NodeShape ;
    ns1:property [ ns1:message "R006_CUST_NO_ORDERS" ;
            ns1:minCount "1" ;
            ns1:path <neo4j://graph.schema#places> ],
        [ ns1:class <neo4j://graph.schema#Order> ;
            ns1:message "R005_CUST_BAD_SCHEMA" ;
            ns1:path <neo4j://graph.schema#places> ] ;
    ns1:targetClass <neo4j://graph.schema#Customer> .

[] a ns1:NodeShape ;
    ns1:property [ ns1:message "R007_US_PROD_ID" ;
            ns1:minExclusive 10 ;
            n

**STEP4: Bind suite to DB to create an execution context**

In [None]:
# Connection settings are read from the environment so that the database
# password is not stored in the notebook. URI and user fall back to the
# values this notebook was originally written against.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://34.203.42.32:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
# Prompt interactively only when NEO4J_PASSWORD is unset (or empty).
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")

# Bind the suite to the database; the returned context is what gets run in the next step.
context = s.bind_to_db(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)

context successfully bound to DB


**STEP5: Run the validations and print out the results (or persist them in an RDB, or save to a file...)**

In [None]:
# Columns shown in the report, in display order.
REPORT_COLUMNS = ['node', 'nodeType', 'violationType', 'offendingValue', 'schemaElement', 'comment', 'msg']

# Run the suite against the bound database and load the results into a
# DataFrame (one row per reported violation, judging by the output).
df = pd.DataFrame(context.run())

# Display settings: show all rows and columns, wide lines, centred headers.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.precision', 3)

# reindex() instead of df[[...]]: when the run yields no rows (a DataFrame
# with no columns) or a column is missing, this shows an empty / NaN-filled
# table rather than raising KeyError.
display(df.reindex(columns=REPORT_COLUMNS))

Unnamed: 0,node,nodeType,violationType,offendingValue,schemaElement,comment,msg
0,"{'unitPrice': 22.0, 'unitsInStock': 53, 'reord...",[query-based selection],MinExclusiveConstraintComponent,4.0,productID,,R007_US_PROD_ID
1,"{'unitPrice': 21.35, 'reorderLevel': 0, 'units...",[query-based selection],MinExclusiveConstraintComponent,5.0,productID,,R007_US_PROD_ID
2,"{'unitPrice': 25.0, 'unitsInStock': 120, 'reor...",[query-based selection],MinExclusiveConstraintComponent,6.0,productID,,R007_US_PROD_ID
3,"{'unitPrice': 30.0, 'unitsInStock': 15, 'reord...",[query-based selection],MinExclusiveConstraintComponent,7.0,productID,,R007_US_PROD_ID
4,"{'unitPrice': 40.0, 'unitsInStock': 6, 'reorde...",[query-based selection],MinExclusiveConstraintComponent,8.0,productID,,R007_US_PROD_ID
5,"{'country': 'Spain', 'contactTitle': 'Accounti...",Customer,MinCountConstraintComponent,,places,cardinality (0) is outside the defined min-max...,R006_CUST_NO_ORDERS
6,"{'country': 'France', 'contactTitle': 'Owner',...",Customer,MinCountConstraintComponent,,places,cardinality (0) is outside the defined min-max...,R006_CUST_NO_ORDERS
7,"{'unitsInStock': 24, 'reorderLevel': 5, 'unitP...",Product,ValueRangeConstraintComponent,6.0,unitPrice,,R004_PRICE_LIMIT
8,"{'unitPrice': 9.2, 'unitsInStock': 25, 'reorde...",Product,ValueRangeConstraintComponent,9.2,unitPrice,,R004_PRICE_LIMIT
9,"{'unitsInStock': 61, 'reorderLevel': 25, 'unit...",Product,ValueRangeConstraintComponent,9.0,unitPrice,,R004_PRICE_LIMIT


In [None]:
import plotly

# Route pandas' .plot accessor through plotly.
pd.options.plotting.backend = "plotly"

# Number of rows (non-null `node` values) per message code.
violations_by_msg = df[["msg", "node"]].groupby(["msg"]).count()
aggregate = violations_by_msg.rename(columns={'node': 'violation_count'})

# One bar per message code.
fig = aggregate.plot.bar()
fig.show()