In [1]:
import json
from neo4j import GraphDatabase
import numpy as np
import os
import pandas as pd
from pprint import pprint
import time
from tqdm import tqdm

from langchain_openai import ChatOpenAI
from langchain_community.graphs import Neo4jGraph
from langchain.chains import GraphCypherQAChain

# Notebook-wide pandas display settings: never truncate columns, and allow
# up to 1000 rows to be rendered before pandas abbreviates the frame.
pd.options.display.max_columns = None
pd.options.display.max_rows = 1000

## Introduction

To run this notebook, you need both the full tabular data set and the Senzing entity-resolved data set loaded into Neo4j; this is done in the `load_json_data.ipynb` and `load_senzing_data.ipynb` notebooks. We will then use LangChain to create a Cypher QA chain to ask questions of this graph, much like we did in `sql_qa.ipynb` with the tabular data.

In [2]:
# Connection settings for the Neo4j instance.
#
# Each value is read from the environment when it is already exported there,
# so real credentials never have to be typed into (or committed with) the
# notebook. The literals are only fallbacks that reproduce the notebook's
# original defaults when nothing is exported.
URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
USER = os.environ.get("NEO4J_USERNAME", "neo4j")
# Placeholder fallback only -- export NEO4J_PASSWORD rather than editing this line.
PWD = os.environ.get("NEO4J_PASSWORD", " ")
AUTH = (USER, PWD)

# Write the resolved values back so that anything reading the NEO4J_*
# environment variables later in the notebook sees the same settings.
os.environ["NEO4J_URI"] = URI
os.environ["NEO4J_USERNAME"] = USER
os.environ["NEO4J_PASSWORD"] = PWD

### Connecting LangChain to Neo4j...

In [3]:
# Create the LangChain wrapper around the Neo4j database.
# No arguments are passed here; the connection settings are expected to come
# from the NEO4J_URI / NEO4J_USERNAME / NEO4J_PASSWORD environment variables
# exported in the configuration cell.
graph = Neo4jGraph()

In [4]:
# Sanity check: count every node in the graph to confirm the data is loaded.
node_count_cypher = "MATCH (n) RETURN COUNT(n)"
graph.query(node_count_cypher)

[{'COUNT(n)': 86374}]

In [5]:
# Re-read the schema from the database, then show the text description
# (node labels, relationship types and their properties) held on the wrapper.
graph.refresh_schema()
schema_text = graph.schema
print(schema_text)

Node properties:
SGEntity {data_source: STRING, top_category: STRING, name: STRING, record_type: STRING, mailing_verified_status: STRING, category_tags: STRING, naics_code: STRING, brands: STRING, address: STRING, sub_category: STRING, uid: STRING, record_id: STRING, tracking_closed_since: STRING, closed_on: STRING}
PPPEntity {date_approved: STRING, loan_range: STRING, data_source: STRING, business_type: STRING, jobs_reported: INTEGER, name: STRING, lender: STRING, naics_code: INTEGER, address: STRING, record_type: STRING, uid: STRING, record_id: INTEGER}
DOLEntity {case_id: INTEGER, naics_code_description: STRING, legal_name_org: STRING, record_type: STRING, data_source: STRING, naics_code: INTEGER, address: STRING, name: STRING, case_violtn_cnt: INTEGER, uid: STRING, record_id: INTEGER}
Entity {data_source: STRING, uid: STRING, name: STRING}
Relationship properties:
RELATED_TO {match_key: STRING, is_ambiguous: INTEGER, match_level_code: STRING, match_level: INTEGER, is_disclosed: INT

## Creating a chain

We use LangChain's `GraphCypherQAChain` class to convert natural-language questions into Cypher queries. Each generated query is run against the database, and the query results are then passed back to the LLM to compose the final answer.

In [6]:
# Chat model used by the chain. The model version is pinned and the
# temperature is set to 0.
llm = ChatOpenAI(
    model="gpt-4o-2024-05-13",
    temperature=0,
)

# Question-answering chain over the Neo4j graph.
# verbose=True makes each run print the generated Cypher and the full
# context returned by the database, which the cells below rely on for
# inspection.
chain = GraphCypherQAChain.from_llm(
    llm=llm,
    graph=graph,
    verbose=True,
    validate_cypher=True,
)

In [9]:
# Open-ended question about a single company; the bare `response` on the
# last line displays the chain's result dictionary.
question = "Tell me about Union Cab Co"
response = chain.invoke({"query": question})
response



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (e:Entity {name: "Union Cab Co"})-[:RELATED_TO|RESOLVES*1..2]-(related)
RETURN e, related
[0m
Full Context:
[32;1m[1;3m[{'e': {'uid': 'DOL_WHISARD.109039', 'name': 'Union Cab Co', 'data_source': 'DOL_WHISARD'}, 'related': {'naics_code': '485310', 'brands': '[]', 'address': '5010 S Valley View Blvd Las Vegas NV 89118-1705', 'sub_category': 'Taxi Service', 'category_tags': '[ "Airport Shuttles", "Taxis" ]', 'data_source': 'SAFEGRAPH', 'record_type': 'ORGANIZATION', 'uid': 'SAFEGRAPH.222-224@5yv-j8k-djv', 'record_id': '222-224@5yv-j8k-djv', 'tracking_closed_since': '2019-07-01', 'mailing_verified_status': 'VERIFIED_DELIVERY_POINT', 'closed_on': '2022-03-01', 'name': 'ABC Union Cab Co', 'top_category': 'Taxi and Limousine Service'}}, {'e': {'uid': 'DOL_WHISARD.109039', 'name': 'Union Cab Co', 'data_source': 'DOL_WHISARD'}, 'related': {'naics_code_description': 'Taxi Service', 'naics_code': 4

{'query': 'Tell me about Union Cab Co',
 'result': 'Union Cab Co, also known as ABC Union Cab Co, was located at 5010 S Valley View Blvd, Las Vegas, NV 89118-1705. It operated in the Taxi Service industry, specifically under the NAICS code 485310. The company was categorized under "Taxi and Limousine Service" and had tags such as "Airport Shuttles" and "Taxis." Union Cab Co had a verified mailing status and was tracked as closed since July 1, 2019, with its official closure on March 1, 2022. The company had one recorded case violation and was related to other entities like Vegas Western Cab, Inc and A NLV Cab Co.'}

### Interesting!

Again, the LLM strongly prefers to start its search from nodes with the `:Entity` label, which is unfortunate because the other labels (`:SGEntity`, `:PPPEntity`, `:DOLEntity`) are never used for the initial name lookup. But I like that it came back with many more related nodes, such as Vegas Western Cab, Inc and A NLV Cab Co, which are cab companies located at the same address.

In [11]:
# Aggregate question that needs the relationships between entities; the
# bare `response` on the last line displays the chain's result dictionary.
question = "How many violations have entities Union Cab Co is related to been involved in?"
response = chain.invoke({"query": question})
response



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (e:Entity {name: "Union Cab Co"})-[:RELATED_TO]-(d:DOLEntity)
RETURN SUM(d.case_violtn_cnt) AS total_violations
[0m
Full Context:
[32;1m[1;3m[{'total_violations': 6}][0m

[1m> Finished chain.[0m


{'query': 'How many violations have entities Union Cab Co is related to been involved in?',
 'result': 'Entities related to Union Cab Co have been involved in a total of 6 violations.'}

## This is the correct answer!

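(Although, to be fair, I received a different answer each time I ran this cell, even though the LLM is configured with `temperature=0` — so treat any single run with caution and check the generated Cypher shown in the verbose output.)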