In [9]:
import json
from neo4j import GraphDatabase
import numpy as np
import os
import pandas as pd
from pprint import pprint
import time
from tqdm import tqdm

from langchain_openai import ChatOpenAI
from langchain_community.graphs import Neo4jGraph
from langchain.chains import GraphCypherQAChain

# Display settings: never elide columns, and show up to 1000 rows per frame.
pd.options.display.max_columns = None
pd.options.display.max_rows = 1000

## Introduction

To run this notebook, you need both the full tabular data set and the Senzing entity-resolved data set loaded into Neo4j; the `load_json_data.ipynb` and `load_senzing_data.ipynb` notebooks take care of that.  We will then use LangChain to create a Cypher QA chain to ask questions of this graph, much like we did in `sql_qa.ipynb` with the tabular data.

In [2]:
# Connection settings for the Neo4j instance.
#
# Values already present in the environment take precedence, so real
# credentials can be supplied from outside the notebook instead of being
# committed with it. The fallbacks are the notebook's original local
# development values (the password fallback is a blank placeholder).
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
USER = os.environ.get("NEO4J_USERNAME", "neo4j")
PWD = os.environ.get("NEO4J_PASSWORD", " ")
AUTH = (USER, PWD)  # (username, password) pair

# Publish the resolved settings under the NEO4J_* names so that code which
# looks the connection up from the environment sees the same values.
os.environ["NEO4J_URI"] = URI
os.environ["NEO4J_USERNAME"] = USER
os.environ["NEO4J_PASSWORD"] = PWD

### Connecting LangChain to Neo4j...

In [5]:
# Open the LangChain graph connection, handing over the connection settings
# explicitly rather than relying on the NEO4J_* environment lookup.
graph = Neo4jGraph(url=URI, username=USER, password=PWD)

In [6]:
# Sanity check: count every node currently in the database.
node_count_cypher = "MATCH (n) RETURN COUNT(n)"
graph.query(node_count_cypher)

[{'COUNT(n)': 86374}]

In [7]:
# Re-read the schema from the database, then print the text description
# stored on the graph object.
graph.refresh_schema()
schema_text = graph.schema
print(schema_text)

Node properties:
SGEntity {record_type: STRING, data_source: STRING, name: STRING, mailing_verified_status: STRING, category_tags: STRING, top_category: STRING, naics_code: STRING, brands: STRING, address: STRING, sub_category: STRING, uid: STRING, record_id: STRING}
PPPEntity {date_approved: STRING, loan_range: STRING, data_source: STRING, business_type: STRING, jobs_reported: INTEGER, name: STRING, lender: STRING, naics_code: INTEGER, address: STRING, record_type: STRING, uid: STRING, record_id: INTEGER}
DOLEntity {case_id: INTEGER, naics_code_description: STRING, legal_name_org: STRING, record_type: STRING, data_source: STRING, naics_code: INTEGER, address: STRING, name: STRING, case_violtn_cnt: INTEGER, uid: STRING, record_id: INTEGER}
Entity {data_source: STRING, uid: STRING, name: STRING}
Relationship properties:
RELATED_TO {is_disclosed: INTEGER, is_ambiguous: INTEGER, match_level_code: STRING, match_level: INTEGER, match_key: STRING}
The relationships:
(:SGEntity)-[:RELATED_TO]

## Creating a chain

We use the LangChain class `GraphCypherQAChain` to convert questions into Cypher queries that are passed to the database for generating answers.

In [10]:
# Chat model used by the chain; temperature 0 is requested for every call.
llm = ChatOpenAI(temperature=0, model="gpt-4o-2024-05-13")

# Chain options: validate the generated Cypher and log each step verbosely.
chain_options = {"validate_cypher": True, "verbose": True}

# Question-answering chain over the Neo4j graph connection created earlier.
chain = GraphCypherQAChain.from_llm(llm=llm, graph=graph, **chain_options)

In [12]:
# Open-ended question about a single business, identified by name.
question = "Tell me about Mandalay Bay Resort & Casino"
response = chain.invoke({"query": question})
response



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (e:Entity {name: "Mandalay Bay Resort & Casino"})
OPTIONAL MATCH (e)-[:RELATED_TO|RESOLVES*1..2]-(related)
RETURN e, related
[0m
Full Context:
[32;1m[1;3m[{'e': {'uid': 'SAFEGRAPH.222-22g@5yv-j8g-rff', 'name': 'Mandalay Bay Resort & Casino', 'data_source': 'SAFEGRAPH'}, 'related': {'naics_code_description': 'Casino Hotels', 'naics_code': 72112, 'uid': 'DOL_WHISARD.111491', 'record_id': 111491, 'case_violtn_cnt': 0, 'address': '3950 Las Vegas Blvd.  South Las Vegas NV 89117', 'case_id': 1444602, 'legal_name_org': 'Mandalay Bay Resort & Casino', 'name': 'Mandalay Bay', 'record_type': 'ORGANIZATION', 'data_source': 'DoL_WHISARD'}}, {'e': {'uid': 'SAFEGRAPH.222-22g@5yv-j8g-rff', 'name': 'Mandalay Bay Resort & Casino', 'data_source': 'SAFEGRAPH'}, 'related': {'naics_code': '7211', 'uid': 'SAFEGRAPH.222-22g@5yv-j8g-rff', 'record_id': '222-22g@5yv-j8g-rff', 'brands': '[]', 'address': '3960 Las 

{'query': 'Tell me about Mandalay Bay Resort & Casino',
 'result': 'Mandalay Bay Resort & Casino, located at 3950 Las Vegas Blvd. South, Las Vegas, NV 89117, is categorized under Casino Hotels with a NAICS code of 72112. It is also listed at 3960 Las Vegas Blvd S, Las Vegas, NV 89119-1046, and falls under the top category of Traveler Accommodation. The resort has had various records and addresses associated with it, including 3950 S. Las Vegas Blvd, Las Vegas, NV 89119, and 3950 Las Vegas Blvd South, Las Vegas, NV 89119. The legal names associated with it include Mandalay Bay Resort & Casino and Mandalay Corporation. The resort has had some case violations, with counts ranging from 0 to 2 in different records.'}

### That is great, but...

...it still missed the fact that Mandalay Bay is owned by MGM Grand, which has the most violations.  However, it is nice to see that the chain was able to provide some meaningful information about the multiple addresses associated with Mandalay Bay.  We should notice that the generated Cypher statement assumed that we were looking for something of node label `:Entity`, which is a bad assumption on its part.  It also was only looking for nodes within 2 hops, which again might not be the best assumption.

Let's try again...

In [19]:
# Same style of question, this time about the MGM Grand Hotel.
question = "Tell me about MGM Grand Hotel"
response = chain.invoke({"query": question})
response



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (e:Entity {name: "MGM Grand Hotel"})
OPTIONAL MATCH (e)-[:RELATED_TO|RESOLVES]-(related)
RETURN e, collect(related) AS related_entities
[0m
Full Context:
[32;1m[1;3m[{'e': {'uid': 'SAFEGRAPH.222-22x@5yv-j8h-cyv', 'name': 'MGM Grand Hotel', 'data_source': 'SAFEGRAPH'}, 'related_entities': [{'naics_code': '722511', 'uid': 'SAFEGRAPH.zzy-223@5yv-j8h-ct9', 'record_id': 'zzy-223@5yv-j8h-ct9', 'brands': '[]', 'address': '3799 Las Vegas Blvd S Las Vegas NV 89109-4319', 'sub_category': 'Full-Service Restaurants', 'mailing_verified_status': 'VERIFIED_DELIVERY_POINT', 'name': 'Pan Asian Express', 'category_tags': '[ "Asian Food", "Brunch" ]', 'top_category': 'Restaurants and Other Eating Places', 'record_type': 'ORGANIZATION', 'data_source': 'SAFEGRAPH'}, {'naics_code': '722511', 'uid': 'SAFEGRAPH.222-22r@5yv-j8h-cyv', 'record_id': '222-22r@5yv-j8h-cyv', 'brands': '[]', 'address': '3799 Las Vegas 

{'query': 'Tell me about MGM Grand Hotel',
 'result': "The MGM Grand Hotel, located at 3799 Las Vegas Blvd S, Las Vegas, NV 89109-4319, is a prominent establishment. It is categorized under Casino Hotels and has several related entities, including various full-service and limited-service restaurants such as Pan Asian Express, Hecho en Vegas, Fiamma Italian Kitchen, Pieology Pizzeria, and McDonald's MGM Grand The District Food Court. Additionally, it features the MGM Grand Poker Room and the MGM Grand Express Store. The hotel is associated with multiple legal names and has been involved in various cases, as recorded by DoL_WHISARD."}

### Much better!

Again, the LLM really prefers to only search for the `:Entity` node label, which is unfortunate.  But I like that it came back with a lot more related nodes, like Pan Asian Express and Pieology Pizzeria, which are restaurants within MGM Grand.  Unfortunately, it was not able to probe much deeper:

In [21]:
# Aggregate question: total violations across entities related to one restaurant.
question = "How many violations have entities Fiamma Italian Kitchen is related to been involved in?"
response = chain.invoke({"query": question})
response



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (e:Entity {name: "Fiamma Italian Kitchen"})-[:RELATED_TO*1..2]-(d:DOLEntity)
RETURN SUM(d.case_violtn_cnt) AS total_violations
[0m
Full Context:
[32;1m[1;3m[{'total_violations': 0}][0m

[1m> Finished chain.[0m


{'query': 'How many violations have entities Fiamma Italian Kitchen is related to been involved in?',
 'result': 'Entities related to Fiamma Italian Kitchen have been involved in 0 violations.'}