https://docs.llamaindex.ai/en/stable/examples/property_graph/property_graph_advanced/
https://github.com/run-llama/llama_index/blob/main/docs/docs/examples/property_graph/property_graph_advanced.ipynb

In [1]:
# Install LlamaIndex plus the Neo4j graph-store, OpenAI program and OpenAI LLM
# integration packages into the kernel's environment (--quiet trims pip's log).
# NOTE(review): no versions are pinned, so a later re-run may pull newer
# releases than the ones this notebook was executed with — consider pinning.
%pip install --quiet llama-index
%pip install --quiet llama-index-graph-stores-neo4j
%pip install --quiet llama-index-program-openai
%pip install --quiet llama-index-llms-openai

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Create the data directory (-p: no error if it already exists) and download
# the Paul Graham essay into it; wget's -O writes to exactly this path, which
# is the directory the next cell reads its documents from.
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'

--2024-08-24 14:38:18--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 75042 (73K) [text/plain]
Saving to: ‘data/paul_graham/paul_graham_essay.txt’


2024-08-24 14:38:18 (3.08 MB/s) - ‘data/paul_graham/paul_graham_essay.txt’ saved [75042/75042]



In [3]:
from llama_index.core import SimpleDirectoryReader

# Point a reader at the download directory, then load everything it finds
# there into `documents` for the indexing step further down.
essay_reader = SimpleDirectoryReader(input_dir="./data/paul_graham/")
documents = essay_reader.load_data()

In [4]:
import nest_asyncio

# Patch asyncio so an already-running event loop can be re-entered.
# Presumably required because the notebook kernel runs its own loop while the
# extraction step below uses async workers — TODO confirm.
nest_asyncio.apply()

In [5]:
import os

from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Base URL of the OpenAI-compatible endpoint shared by the chat model and the
# embedding model. It can be overridden by exporting OPENAI_API_BASE before
# starting the kernel; when the variable is unset the original local endpoint
# is used unchanged.
# NOTE(review): 0.0.0.0 is a wildcard *bind* address; connecting to it works on
# some platforms only. If the connection fails, set OPENAI_API_BASE to
# e.g. http://127.0.0.1:4000 instead.
api_base = os.environ.get("OPENAI_API_BASE", "http://0.0.0.0:4000")

# temperature=0.0 keeps the extraction output as repeatable as the model allows.
llm = OpenAI(model="gpt-4o", temperature=0.0, api_base=api_base)
embed_model = OpenAIEmbedding(model="text-embedding-3-small", api_base=api_base)

In [6]:
from typing import Literal

# Label vocabularies handed to the schema-guided extractor.
# Upper-case labels are the recommended convention.
entities = Literal["PERSON", "PLACE", "ORGANIZATION"]
relations = Literal["HAS", "PART_OF", "WORKED_ON", "WORKED_WITH", "WORKED_AT"]

# Alternative spelling of the same vocabularies as plain lists (not used here):
# entities = ["PERSON", "PLACE", "ORGANIZATION"]
# relations = ["HAS", "PART_OF", "WORKED_ON", "WORKED_WITH", "WORKED_AT"]

# For every entity type, the relation types it is allowed to take part in.
validation_schema = dict(
    PERSON=["HAS", "PART_OF", "WORKED_ON", "WORKED_WITH", "WORKED_AT"],
    PLACE=["HAS", "PART_OF", "WORKED_AT"],
    ORGANIZATION=["HAS", "PART_OF", "WORKED_WITH"],
)

# Alternative form of the schema as explicit (source, relation, target)
# triples (not used here):
# validation_schema = [
#     ("ORGANIZATION", "HAS", "PERSON"),
#     ("PERSON", "WORKED_AT", "ORGANIZATION"),
#     ("PERSON", "WORKED_WITH", "PERSON"),
#     ("PERSON", "WORKED_ON", "ORGANIZATION"),
#     ("PERSON", "PART_OF", "ORGANIZATION"),
#     ("ORGANIZATION", "PART_OF", "ORGANIZATION"),
#     ("PERSON", "WORKED_AT", "PLACE"),
# ]

In [7]:
from llama_index.core.indices.property_graph import (
    DynamicLLMPathExtractor,
    SchemaLLMPathExtractor,
)

# Schema-guided extractor: the LLM is given the entity/relation vocabularies
# and the validation schema defined in the previous cell.
# strict=True is used here; with strict=False values outside of the schema
# are allowed, which is useful for treating the schema as a suggestion only.
kg_extractor = SchemaLLMPathExtractor(
    llm=llm,
    num_workers=4,
    strict=True,
    possible_entities=entities,
    possible_relations=relations,
    kg_validation_schema=validation_schema,
)

# Alternative extractor configuration kept for reference (not used here):
# kg_extractor = DynamicLLMPathExtractor(
#     llm=llm,
#     max_triplets_per_chunk=20,
#     num_workers=4,
#     allowed_entity_types=["PERSON", "PLACE", "ORGANIZATION"],
#     allowed_relation_types=["HAS", "PART_OF", "WORKED_ON", "WORKED_WITH", "WORKED_AT"],
# )

In [8]:
import os
from getpass import getpass

from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore

# Connection settings are read from the environment so that no credential is
# stored in the notebook. Username and URL fall back to the previous local
# defaults; the password has no default and is prompted for when
# NEO4J_PASSWORD is not set.
# NOTE(review): a password was previously committed in this cell — treat it
# as leaked and rotate it on the Neo4j instance.
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")
url = os.environ.get("NEO4J_URI", "bolt://localhost:7687")

graph_store = Neo4jPropertyGraphStore(
    username=username,
    password=password,
    url=url,
)

# No separate vector store is configured; this None is what the index-building
# cell passes as `vector_store`.
vec_store = None



In [9]:
from llama_index.core import PropertyGraphIndex

# Build the property-graph index from the loaded documents: paths come from
# the schema-guided extractor, embeddings from `embed_model`, and results are
# written to the configured graph store (and `vec_store`, which is None here).
# show_progress=True renders the progress bars for each stage.
index = PropertyGraphIndex.from_documents(
    documents,
    property_graph_store=graph_store,
    vector_store=vec_store,
    embed_model=embed_model,
    kg_extractors=[kg_extractor],
    show_progress=True,
)

  from .autonotebook import tqdm as notebook_tqdm
Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 35.09it/s]
Extracting paths from text with schema: 100%|██████████| 22/22 [00:17<00:00,  1.24it/s]
Generating embeddings: 100%|██████████| 1/1 [00:00<00:00, 15.05it/s]
Generating embeddings: 0it [00:00, ?it/s]


In [None]:
# Ask the index's graph store for its triplets; the bare name on the last
# line makes the notebook display the result.
triplets = index.property_graph_store.get_triplets()
triplets

In [None]:
from llama_index.core.indices.property_graph import (
    LLMSynonymRetriever,
    VectorContextRetriever,
)

# Both sub-retrievers work against the index's own graph store and are
# created with include_text=False.
pg_store = index.property_graph_store

llm_synonym = LLMSynonymRetriever(pg_store, include_text=False, llm=llm)
vector_context = VectorContextRetriever(
    pg_store, include_text=False, embed_model=embed_model
)

# Combine the two into the single retriever used by the query cell.
retriever = index.as_retriever(sub_retrievers=[llm_synonym, vector_context])

In [None]:
# Run one question through the combined retriever and print the text of
# every node it returns, one per line.
question = "What happened at Interleaf?"
nodes = retriever.retrieve(question)

for node in nodes:
    print(node.text)

In [None]:
# Query the graph store for its triplets once more (this repeats the earlier
# inspection cell); the bare name on the last line displays the result.
triplets_after_retrieval = index.property_graph_store.get_triplets()
triplets_after_retrieval