# GraphRAG Pipeline
1. Connect to Neo4j database
2. Cypher query
3. QA Pipeline with LangChain
4. Evaluation with DeepEval

### Setup Sample Database
Prerequisites: set up a Neo4j database connection and update the environment configuration in `private-config.env`.

In [1]:
# Start Neo4j Docker Container with the Graph Data Science library
! docker run -d --name neo4j --env NEO4J_PLUGINS='["graph-data-science", "genai"]'\
    -p 7474:7474 -p 7687:7687 \
    -e NEO4J_AUTH=neo4j/password \
    neo4j:latest

docker: Error response from daemon: Conflict. The container name "/neo4j" is already in use by container "fb2e377de8a98107475a3751b17ad74bb12a612e05b191a2cc2842a6da953b85". You have to remove (or rename) that container to be able to reuse that name.
See 'docker run --help'.


In [1]:
import os
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from graphdatascience import GraphDataScience
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain.graphs import Neo4jGraph
from langchain.prompts import PromptTemplate
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnableLambda
import gradio as gr

In [2]:
# Load connection settings and API keys from the local env file.
# Fail immediately when the file is missing instead of continuing with unset values.
if not os.path.exists('private-config.env'):
    raise Exception('private-config.env not found')
load_dotenv('private-config.env', override=True)

# Neo4j connection settings (each is None when the variable is not set).
NEO4J_URI = os.environ.get('NEO4J_URI')
NEO4J_USERNAME = os.environ.get('NEO4J_USERNAME')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD')
AURA_DS = os.environ.get('AURA_DS')

# LLM settings.
LLM = os.environ.get('LLM')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [3]:
# Open the Graph Data Science client that every later cell uses to run Cypher
# (via gds.run_cypher). Use Neo4j URI and credentials according to our setup.
gds = GraphDataScience(
    NEO4J_URI,
    auth=(NEO4J_USERNAME, NEO4J_PASSWORD),
    # aura_ds=AURA_DS,
    # NOTE(review): AURA_DS comes from os.getenv and is therefore a string (or None);
    # convert it to a bool before enabling the argument above.
)

# Select the database to run queries against.
# Necessary if you enabled Arrow on the db - this is true for AuraDS
gds.set_database("neo4j")

In [4]:
# Fetch the pre-formatted source tables, one CSV per table, in the order
# department -> product -> article -> customer -> transaction.
# To re-generate them from the Kaggle source, see the data-prep.ipynb notebook.
department_df, product_df, article_df, customer_df, transaction_df = (
    pd.read_csv(f'https://storage.googleapis.com/neo4j-workshop-data/genai-hm/{table_name}.csv')
    for table_name in ('department', 'product', 'article', 'customer', 'transaction')
)

In [5]:
# Create one uniqueness constraint per node label, on the property that the
# node-loading queries MERGE on. IF NOT EXISTS makes the cell safe to re-run.
for constraint_name, constrained_label, key_property in (
    ('unique_department_no', 'Department', 'departmentNo'),
    ('unique_product_code', 'Product', 'productCode'),
    ('unique_article_id', 'Article', 'articleId'),
    ('unique_customer_id', 'Customer', 'customerId'),
):
    gds.run_cypher(
        f'CREATE CONSTRAINT {constraint_name} IF NOT EXISTS '
        f'FOR (n:{constrained_label}) REQUIRE n.{key_property} IS UNIQUE'
    )

In [6]:
from typing import Tuple, Union
from numpy.typing import ArrayLike


def make_map(x):
    """Normalise a name specification into a tuple.

    A string is duplicated into a 2-tuple ``(x, x)``; a tuple is returned
    unchanged. Subclasses of ``str`` and ``tuple`` are accepted as well.

    Args:
        x: A single name, or a tuple of names.

    Returns:
        ``(x, x)`` when ``x`` is a string, otherwise ``x`` itself.

    Raises:
        TypeError: If ``x`` is neither a string nor a tuple.
    """
    # isinstance (rather than an exact type comparison) also accepts subclasses.
    if isinstance(x, str):
        return x, x
    if isinstance(x, tuple):
        return x
    raise TypeError(f"Entry must be of type string or tuple, got {type(x).__name__}")


def make_set_clause(prop_names: ArrayLike, element_name='n', item_name='rec'):
    """Build a Cypher ``SET`` clause that copies properties from a record.

    Args:
        prop_names: Names of the properties to copy.
        element_name: Cypher variable of the node or relationship being updated.
        item_name: Cypher variable of the record the values are read from.

    Returns:
        A string of the form ``SET n.a = rec.a, n.b = rec.b``. With no property
        names the result is just ``'SET '``, so callers should skip the clause
        in that case.
    """
    assignments = ', '.join(
        f'{element_name}.{name} = {item_name}.{name}' for name in prop_names
    )
    return f'SET {assignments}'


def make_node_merge_query(node_key_name: str, node_label: str, cols: ArrayLike):
    """Build the Cypher query that merges a batch of records into nodes.

    The query unwinds the ``$recs`` parameter, MERGEs one node per record on
    ``node_key_name`` and, when there are other columns, SETs them as properties.

    Args:
        node_key_name: Property (and record field) that identifies a node.
        node_label: Label of the nodes to merge.
        cols: All column names of the records, the key column included.

    Returns:
        The query text; it returns the merged-node count as ``nodeLoadedCount``.
    """
    query_lines = [
        'UNWIND $recs AS rec',
        f'MERGE(n:{node_label} {{{node_key_name}: rec.{node_key_name}}})',
    ]
    # The key is already set by the MERGE pattern; only the remaining columns need a SET.
    non_key_props = [col for col in cols if col != node_key_name]
    if non_key_props:
        query_lines.append(make_set_clause(non_key_props))
    query_lines.append('RETURN count(n) AS nodeLoadedCount')
    return '\n'.join(query_lines)


def make_rel_merge_query(source_target_labels: Union[Tuple[str, str], str],
                         source_node_key: Union[Tuple[str, str], str],
                         target_node_key: Union[Tuple[str, str], str],
                         rel_type: str,
                         cols: ArrayLike,
                         rel_key: str = None):
    """Build the Cypher query that merges a batch of records into relationships.

    Each of the first three arguments may be a single string (used for both
    positions) or a tuple; they are normalised with ``make_map``.

    Args:
        source_target_labels: ``(source label, target label)``.
        source_node_key: ``(node property, record field)`` identifying the source node.
        target_node_key: ``(node property, record field)`` identifying the target node.
        rel_type: Relationship type to merge.
        cols: All column names of the records.
        rel_key: Optional record field that is also matched as a relationship
            property in the MERGE pattern.

    Returns:
        The query text; it returns the merged-relationship count as ``relLoadedCount``.
    """
    labels = make_map(source_target_labels)
    source_key = make_map(source_node_key)
    target_key = make_map(target_node_key)

    # Without a relationship key the MERGE matches on type and endpoints only.
    if rel_key is None:
        merge_clause = f'MERGE(s)-[r:{rel_type}]->(t)'
    else:
        merge_clause = f'MERGE(s)-[r:{rel_type} {{{rel_key}: rec.{rel_key}}}]->(t)'

    # The leading whitespace of each line is part of the generated query text.
    query = '\n'.join([
        '\tUNWIND $recs AS rec',
        f'    MATCH(s:{labels[0]} {{{source_key[0]}: rec.{source_key[1]}}})',
        f'    MATCH(t:{labels[1]} {{{target_key[0]}: rec.{target_key[1]}}})',
        '\t' + merge_clause,
    ])

    # Columns already consumed by the MATCH / MERGE patterns are not set again.
    consumed_fields = (rel_key, source_key[1], target_key[1])
    rel_props = [col for col in cols if col not in consumed_fields]
    if rel_props:
        query += '\n\t' + make_set_clause(rel_props, 'r')
    return query + '\n\tRETURN count(r) AS relLoadedCount'


def chunks(xs, n=10_000):
    """Split a sequence into consecutive slices of at most ``n`` items.

    Args:
        xs: A sliceable sequence with a length.
        n: Maximum slice size; values below 1 are treated as 1.

    Returns:
        A list of slices of ``xs`` in order; empty when ``xs`` is empty.
    """
    step = max(1, n)
    batches = []
    for start in range(0, len(xs), step):
        batches.append(xs[start:start + step])
    return batches


def load_nodes(gds: GraphDataScience, node_df: pd.DataFrame, node_key_col: str, node_label: str, chunk_size=10_000):
    """Merge the rows of a DataFrame into the graph as nodes, batch by batch.

    Args:
        gds: Client whose ``run_cypher`` executes the merge query.
        node_df: One row per node; every column becomes a node property.
        node_key_col: Column that identifies a node (used in the MERGE pattern).
        node_label: Label given to the nodes.
        chunk_size: Maximum number of records sent per query.

    Prints the generated query and a running total of loaded nodes; returns nothing.
    """
    rows = node_df.to_dict('records')
    print(f'======  loading {node_label} nodes  ======')
    row_count = len(rows)
    print(f'staging {row_count:,} records')
    merge_query = make_node_merge_query(node_key_col, node_label, node_df.columns.copy())
    print(f'\nUsing This Cypher Query:\n```\n{merge_query}\n```\n')
    loaded_so_far = 0
    for batch in chunks(rows, chunk_size):
        batch_result = gds.run_cypher(merge_query, params={'recs': batch})
        # The query returns a single row holding the count for this batch.
        loaded_so_far = loaded_so_far + batch_result.iloc[0, 0]
        print(f'Loaded {loaded_so_far:,} of {row_count:,} nodes')


def load_rels(gds: GraphDataScience,
              rel_df: pd.DataFrame,
              source_target_labels: Union[Tuple[str, str], str],
              source_node_key: Union[Tuple[str, str], str],
              target_node_key: Union[Tuple[str, str], str],
              rel_type: str,
              rel_key: str = None,
              chunk_size=10_000):
    """Merge the rows of a DataFrame into the graph as relationships, batch by batch.

    Args:
        gds: Client whose ``run_cypher`` executes the merge query.
        rel_df: One row per relationship.
        source_target_labels: Source and target node labels (string or tuple).
        source_node_key: Key identifying the source node (string or tuple).
        target_node_key: Key identifying the target node (string or tuple).
        rel_type: Relationship type to merge.
        rel_key: Optional column that is also matched as a relationship property.
        chunk_size: Maximum number of records sent per query.

    Prints the generated query and a running total of loaded relationships;
    returns nothing.
    """
    rows = rel_df.to_dict('records')
    print(f'======  loading {rel_type} relationships  ======')
    row_count = len(rows)
    print(f'staging {row_count:,} records')
    merge_query = make_rel_merge_query(
        source_target_labels,
        source_node_key,
        target_node_key,
        rel_type,
        rel_df.columns.copy(),
        rel_key,
    )
    print(f'\nUsing This Cypher Query:\n```\n{merge_query}\n```\n')
    loaded_so_far = 0
    for batch in chunks(rows, chunk_size):
        batch_result = gds.run_cypher(merge_query, params={'recs': batch})
        # The query returns a single row holding the count for this batch.
        loaded_so_far = loaded_so_far + batch_result.iloc[0, 0]
        print(f'Loaded {loaded_so_far:,} of {row_count:,} relationships')

In [7]:
# Keep only the first SAMPLE_SIZE rows of every table so the demo loads quickly.
# The limit used to live in a variable named `slice`, which shadowed Python's
# builtin `slice` for the rest of the notebook.
SAMPLE_SIZE = 50
department_df = department_df.head(SAMPLE_SIZE)
product_df = product_df.head(SAMPLE_SIZE)
article_df = article_df.head(SAMPLE_SIZE)
customer_df = customer_df.head(SAMPLE_SIZE)
transaction_df = transaction_df.head(SAMPLE_SIZE)

# Display the resulting shapes as a sanity check.
department_df.shape, product_df.shape, article_df.shape, customer_df.shape, transaction_df.shape

((50, 4), (50, 8), (50, 9), (50, 7), (50, 6))

In [8]:
# Populate the graph. Order matters: nodes first, then the relationships that
# MATCH those nodes, then the property clean-up queries.

# load nodes
load_nodes(gds, department_df, 'departmentNo', 'Department')
# Article nodes are loaded without productCode / departmentNo: those two columns
# are turned into VARIANT_OF / FROM_DEPARTMENT relationships below.
load_nodes(gds, article_df.drop(columns=['productCode', 'departmentNo']), 'articleId', 'Article')
load_nodes(gds, product_df, 'productCode', 'Product')
load_nodes(gds, customer_df, 'customerId', 'Customer')

# load relationships
# (Article)-[:FROM_DEPARTMENT]->(Department)
load_rels(gds, article_df[['articleId', 'departmentNo']], source_target_labels=('Article', 'Department'),
                      source_node_key='articleId', target_node_key='departmentNo',
                      rel_type='FROM_DEPARTMENT')
# (Article)-[:VARIANT_OF]->(Product)
load_rels(gds, article_df[['articleId', 'productCode']], source_target_labels=('Article', 'Product'),
                      source_node_key='articleId',target_node_key='productCode',
                      rel_type='VARIANT_OF')
# (Customer)-[:PURCHASED]->(Article); rel_key='txId' makes the MERGE also match on
# the relationship's txId property, the remaining columns become relationship properties.
load_rels(gds, transaction_df, source_target_labels=('Customer', 'Article'),
                      source_node_key='customerId', target_node_key='articleId', rel_key='txId',
                      rel_type='PURCHASED')

# convert transaction dates: replace r.tDat with date(r.tDat)
# (presumably tDat is loaded from the CSV as a date string - TODO confirm)
gds.run_cypher('''
MATCH (:Customer)-[r:PURCHASED]->()
SET r.tDat = date(r.tDat)
''')

# convert NaN product descriptions: any detailDesc that is not a non-null string
# is replaced by an empty string, so the text concatenation below always has a string.
gds.run_cypher('''
MATCH (n:Product) WHERE valueType(n.detailDesc) <> "STRING NOT NULL"
SET n.detailDesc = ""
RETURN n
''')

# create combined text property. This will help simplify later with semantic search and RAG
# (p.text is the property embedded in the "generate embeddings" cell).
gds.run_cypher("""
    MATCH(p:Product)
    SET p.text = 'Product-- ' +
        'Name: ' + p.prodName + ' || ' +
        'Type: ' + p.productTypeName + ' || ' +
        'Group: ' + p.productGroupName + ' || ' +
        'Garment Type: ' + p.garmentGroupName + ' || ' +
        'Description: ' + p.detailDesc
    RETURN count(p) AS propertySetCount
    """)

# write dummy urls to illustrate sourcing in future retrieval
# (placeholder domain followed by the product code; not a real link)
gds.run_cypher("""
MATCH(p:Product)
SET p.url = 'https://representative-domain/product/' + p.productCode
""")

staging 50 records

Using This Cypher Query:
```
UNWIND $recs AS rec
MERGE(n:Department {departmentNo: rec.departmentNo})
SET n.departmentName = rec.departmentName, n.sectionNo = rec.sectionNo, n.sectionName = rec.sectionName
RETURN count(n) AS nodeLoadedCount
```

Loaded 50 of 50 nodes
staging 50 records

Using This Cypher Query:
```
UNWIND $recs AS rec
MERGE(n:Article {articleId: rec.articleId})
SET n.prodName = rec.prodName, n.productTypeName = rec.productTypeName, n.graphicalAppearanceNo = rec.graphicalAppearanceNo, n.graphicalAppearanceName = rec.graphicalAppearanceName, n.colourGroupCode = rec.colourGroupCode, n.colourGroupName = rec.colourGroupName
RETURN count(n) AS nodeLoadedCount
```

Loaded 50 of 50 nodes
staging 50 records

Using This Cypher Query:
```
UNWIND $recs AS rec
MERGE(n:Product {productCode: rec.productCode})
SET n.prodName = rec.prodName, n.productTypeNo = rec.productTypeNo, n.productTypeName = rec.productTypeName, n.productGroupName = rec.productGroupName, n.gar

In [9]:
# generate embeddings
# - Only products with a non-empty detailDesc are embedded.
# - toInteger(rand()*$numberOfBatches) assigns every product a random partition
#   number below numberOfBatches; collect(n) then groups the products per partition,
#   giving one row (one list of nodes) per batch.
# - For each batch, genai.vector.encodeBatch embeds the nodes' `text` property with
#   the "OpenAI" provider; the API key is passed as the $token query parameter
#   rather than being written into the query text.
# - Each returned vector is stored on its node as `textEmbedding`.
# - IN TRANSACTIONS OF 1 ROW runs each batch in its own transaction.

gds.run_cypher('''
MATCH (n:Product) WHERE size(n.detailDesc) <> 0
WITH collect(n) AS nodes, toInteger(rand()*$numberOfBatches) AS partition
CALL {
    WITH nodes
    CALL genai.vector.encodeBatch([node IN nodes| node.text], "OpenAI", { token: $token})
    YIELD index, vector
    CALL db.create.setNodeVectorProperty(nodes[index], "textEmbedding", vector)
} IN TRANSACTIONS OF 1 ROW''', params={'token':OPENAI_API_KEY, 'numberOfBatches':100})

In [10]:
# create vector index
# Builds a cosine-similarity vector index named `product_text_embeddings` over the
# `textEmbedding` property of Product nodes. IF NOT EXISTS makes the cell re-runnable.

# Must match the length of the stored embedding vectors.
embedding_dimension = 1536 #default for OpenAI text-embedding-ada-002

gds.run_cypher('''
CREATE VECTOR INDEX product_text_embeddings IF NOT EXISTS FOR (n:Product) ON (n.textEmbedding)
OPTIONS {indexConfig: {
 `vector.dimensions`: toInteger($dimension),
 `vector.similarity_function`: 'cosine'
}}''', params={'dimension': embedding_dimension})

# wait for index to come online (second argument is the timeout passed to db.awaitIndex)
gds.run_cypher('CALL db.awaitIndex("product_text_embeddings", 300)')

## GraphRAG
1. Vector Search
2. Semantic Search
3. ML Technique (Optional)

### Vector Search

In [11]:
# Neo4jVector and OpenAIEmbeddings are already imported in the imports cell at the
# top of the notebook, so they are not imported again here.

# Embedding model used to embed search prompts at query time.
# NOTE(review): created with default settings - presumably it reads OPENAI_API_KEY
# from the environment populated by load_dotenv, and its output size must match the
# dimension of the `product_text_embeddings` index - confirm.
embedding_model = OpenAIEmbeddings()
embedding_dimension = 1536

# Vector store backed by the existing `product_text_embeddings` index in Neo4j.
kg_vector_search = Neo4jVector.from_existing_index(
    embedding=embedding_model,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name='product_text_embeddings')

In [14]:
# Instead of 2 steps (Embed search term, perform vector search) 
# LangChain handles the orchestration for us in a single line of code
search_prompt = "Tights"
# Retrieve the 10 best matches; `res` is reused below to build the LLM context.
res = kg_vector_search.similarity_search(search_prompt, k=10)
# Show the text of each hit as a one-column table.
pd.DataFrame([{'document': d.page_content} for d in res])

Unnamed: 0,document
0,Product-- Name: Box 4p Tights || Type: Underwe...
1,Product-- Name: 200 den 1p Tights || Type: Und...
2,Product-- Name: 40 den 2p Tights || Type: Unde...
3,Product-- Name: Fleece 1 p tights || Type: Leg...
4,Product-- Name: Support 40 den 1p Tights || Ty...
5,Product-- Name: Heavy plain 2 p tights || Type...
6,Product-- Name: Highwaist 30 den 1p Tights || ...
7,Product-- Name: Box 4p Kneehighs || Type: Unde...
8,Product-- Name: Control Top 100 den 1p Tights ...
9,Product-- Name: Shape Up 30 den 1p Tights || T...


In [15]:
# Chat model used to answer questions from the retrieved context.
# NOTE(review): the model name is hard-coded here, while the config cell reads an
# `LLM` value from the env file that is not used in this section - confirm which
# of the two is intended.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# Question to answer from the retrieved product texts.
query = "What is the product description for the tights?"

# Join the page content of the vector-search hits (`res`) into one context string,
# with a blank line between documents.
context = "\n\n".join([doc.page_content for doc in res])

def llm_invoke(llm, query, retrieved_context):
    """Ask the model to answer ``query`` using only the supplied context.

    Args:
        llm: Object with an ``invoke(prompt)`` method whose result has a
            ``content`` attribute.
        query: The question to answer.
        retrieved_context: Text placed in the prompt's "Context" section.

    Returns:
        The ``content`` of the model's response.
    """
    # Indentation and blank lines are part of the prompt text sent to the model.
    prompt = (
        "\n"
        "    Based on the following context, answer the question:\n\n"
        "    Context: \n\n"
        f"    {retrieved_context}\n\n"
        f"    Question: {query}\n\n"
        "    Answer:\n"
        "    "
    )
    return llm.invoke(prompt).content

# Answer the question from the plain vector-search context; the returned text is the cell output.
llm_invoke(llm, query, context)

'The product description for the tights varies by type. Here are the descriptions for each product:\n\n1. **Box 4p Tights**: Matt tights with an elasticated waist. 20 denier.\n2. **200 den 1p Tights**: Opaque matt tights. 200 denier.\n3. **40 den 2p Tights**: Tights with an elasticated waist. 40 denier.\n4. **Fleece 1 p tights**: Opaque tights with an elasticated waist and soft fleece inside.\n5. **Support 40 den 1p Tights**: Semi shiny tights that shape the tummy, thighs, and calves while also encouraging blood circulation in the legs. Elasticated waist.\n6. **Heavy plain 2 p tights**: Fine-knit tights with an elasticated waist.\n7. **Highwaist 30 den 1p Tights**: High-waisted tights that lift the bum and shape the waist and thighs. 30 denier.\n8. **Box 4p Kneehighs**: Four pairs of knee highs. 20 denier.\n9. **Control Top 100 den 1p Tights**: Matt opaque tights with a control top to hold in the tummy and bum. 100 denier.\n10. **Shape Up 30 den 1p Tights**: Tights with built-in suppor

### Semantic Search

In [16]:
# Vector store over the same index, with a custom retrieval query that re-ranks the
# hits using the purchase history of one customer.
# - `node` / `score` are the inputs the retrieval query starts from
#   (presumably supplied by the vector index search - confirm against the LangChain docs).
# - The OPTIONAL MATCH looks for articles `a` that were purchased both by the
#   customer given as $customerId and by a customer who purchased a variant of the
#   candidate product; purchaseScore is the number of such matches (0 when none).
# - The returned score is (1 + purchaseScore) * searchScore; results are ordered by
#   purchaseScore, then searchScore, and capped at 15.
# - $customerId has to be supplied through `params` when searching.
kg_personalized_search = Neo4jVector.from_existing_index(
    embedding=embedding_model,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    index_name='product_text_embeddings',
    retrieval_query="""
    WITH node AS product, score AS searchScore

    OPTIONAL MATCH(product)<-[:VARIANT_OF]-(:Article)<-[:PURCHASED]-(:Customer)
    -[:PURCHASED]->(a:Article)<-[:PURCHASED]-(:Customer {customerId: $customerId})

    WITH count(a) AS purchaseScore, product.text AS text, searchScore, product.productCode AS productCode
    RETURN text,
        (1+purchaseScore)*searchScore AS score,
        {productCode: productCode, purchaseScore:purchaseScore, searchScore:searchScore} AS metadata
    ORDER BY purchaseScore DESC, searchScore DESC LIMIT 15
    """)

In [19]:
# Customer whose purchase history personalises the ranking.
CUSTOMER_ID = "057b674a88d197c9601fb334b36d74d9f7cfccabf6adf9e6b8b8c9a588fca398"

# Same search prompt as before; the customer id fills $customerId in the retrieval query.
res = kg_personalized_search.similarity_search(
    search_prompt,
    k=10,
    params={'customerId': CUSTOMER_ID},
)

# Visualize as a dataframe: one row per hit with its text and both scores.
vector_kg_res = pd.DataFrame([
    dict(
        productCode=hit.metadata['productCode'],
        document=hit.page_content,
        searchScore=hit.metadata['searchScore'],
        purchaseScore=hit.metadata['purchaseScore'],
    )
    for hit in res
])

In [20]:
# Display the personalised search results table.
vector_kg_res

Unnamed: 0,productCode,document,searchScore,purchaseScore
0,156231,Product-- Name: Box 4p Tights || Type: Underwe...,0.93689,0
1,111609,Product-- Name: 200 den 1p Tights || Type: Und...,0.935867,0
2,200182,Product-- Name: 40 den 2p Tights || Type: Unde...,0.934692,0
3,182909,Product-- Name: Fleece 1 p tights || Type: Leg...,0.933228,0
4,111593,Product-- Name: Support 40 den 1p Tights || Ty...,0.93248,0
5,201219,Product-- Name: Heavy plain 2 p tights || Type...,0.931961,0
6,158340,Product-- Name: Highwaist 30 den 1p Tights || ...,0.93187,0
7,156227,Product-- Name: Box 4p Kneehighs || Type: Unde...,0.931442,0
8,179208,Product-- Name: Control Top 100 den 1p Tights ...,0.930557,0
9,111586,Product-- Name: Shape Up 30 den 1p Tights || T...,0.929825,0


In [21]:
# Build the LLM context from the personalised search hits, one document per
# paragraph, and answer the same question as before.
semantic_context = "\n\n".join(doc.page_content for doc in res)

response = llm_invoke(llm, query, semantic_context)
response

'The product description for the tights varies by type. Here are the descriptions for each product:\n\n1. **Box 4p Tights**: Matt tights with an elasticated waist. 20 denier.\n2. **200 den 1p Tights**: Opaque matt tights. 200 denier.\n3. **40 den 2p Tights**: Tights with an elasticated waist. 40 denier.\n4. **Fleece 1 p tights**: Opaque tights with an elasticated waist and soft fleece inside.\n5. **Support 40 den 1p Tights**: Semi shiny tights that shape the tummy, thighs, and calves while also encouraging blood circulation in the legs. Elasticated waist.\n6. **Heavy plain 2 p tights**: Fine-knit tights with an elasticated waist.\n7. **Highwaist 30 den 1p Tights**: High-waisted tights that lift the bum and shape the waist and thighs. 30 denier.\n8. **Box 4p Kneehighs**: Four pairs of knee highs. 20 denier.\n9. **Control Top 100 den 1p Tights**: Matt opaque tights with a control top to hold in the tummy and bum. 100 denier.\n10. **Shape Up 30 den 1p Tights**: Tights with built-in suppor