Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 33 additions & 49 deletions examples/docs_to_knowledge_graph/main.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
This example shows how to extract relationships from Markdown documents and build a knowledge graph.
This example shows how to extract relationships from documents and build a knowledge graph.
"""
import dataclasses
from dotenv import load_dotenv
Expand All @@ -13,17 +13,17 @@ class DocumentSummary:

@dataclasses.dataclass
class Relationship:
"""Describe a relationship between two nodes."""
"""Describe a relationship between two entities."""
subject: str
predicate: str
object: str

@cocoindex.flow_def(name="DocsToKG")
def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
"""
Define an example flow that extracts triples from files and build knowledge graph.
Define an example flow that extracts relationships from files and builds a knowledge graph.
"""

# configure neo4j connection
conn_spec = cocoindex.add_auth_entry(
"Neo4jConnection",
cocoindex.storages.Neo4jConnection(
Expand All @@ -41,78 +41,66 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
entity_mention = data_scope.add_collector()

with data_scope["documents"].row() as doc:
doc["chunks"] = doc["content"].transform(
cocoindex.functions.SplitRecursively(),
language="markdown", chunk_size=10000)

# extract summary from document
doc["summary"] = doc["content"].transform(
cocoindex.functions.ExtractByLlm(
llm_spec=cocoindex.LlmSpec(
# Supported LLM: https://cocoindex.io/docs/ai/llm
api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
output_type=DocumentSummary,
instruction="Please summarize the content of the document."))
document_node.collect(
filename=doc["filename"], title=doc["summary"]["title"],
summary=doc["summary"]["summary"])

with doc["chunks"].row() as chunk:
chunk["relationships"] = chunk["text"].transform(
cocoindex.functions.ExtractByLlm(
llm_spec=cocoindex.LlmSpec(
api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
# Replace with the spec below to use the Ollama API instead of OpenAI
# llm_spec=cocoindex.LlmSpec(
# api_type=cocoindex.LlmApiType.OLLAMA, model="llama3.2"),
# extract relationships from document
doc["relationships"] = doc["content"].transform(
cocoindex.functions.ExtractByLlm(
llm_spec=cocoindex.LlmSpec(
# Supported LLM: https://cocoindex.io/docs/ai/llm
api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
output_type=list[Relationship],
instruction=(
"Please extract relationships from CocoIndex documents. "
"Focus on concepts and ingnore specific examples. "
"Each relationship should be a tuple of (subject, predicate, object).")))

with chunk["relationships"].row() as relationship:
relationship["subject_embedding"] = relationship["subject"].transform(
cocoindex.functions.SentenceTransformerEmbed(
model="sentence-transformers/all-MiniLM-L6-v2"))
relationship["object_embedding"] = relationship["object"].transform(
cocoindex.functions.SentenceTransformerEmbed(
model="sentence-transformers/all-MiniLM-L6-v2"))
entity_relationship.collect(
id=cocoindex.GeneratedField.UUID,
subject=relationship["subject"],
subject_embedding=relationship["subject_embedding"],
object=relationship["object"],
object_embedding=relationship["object_embedding"],
predicate=relationship["predicate"],
)
entity_mention.collect(
id=cocoindex.GeneratedField.UUID, entity=relationship["subject"],
filename=doc["filename"], location=chunk["location"],
)
entity_mention.collect(
id=cocoindex.GeneratedField.UUID, entity=relationship["object"],
filename=doc["filename"], location=chunk["location"],
)
with doc["relationships"].row() as relationship:
# relationship between two entities
entity_relationship.collect(
id=cocoindex.GeneratedField.UUID,
subject=relationship["subject"],
object=relationship["object"],
predicate=relationship["predicate"],
)
# mention of an entity in a document, for subject
entity_mention.collect(
id=cocoindex.GeneratedField.UUID, entity=relationship["subject"],
filename=doc["filename"],
)
# mention of an entity in a document, for object
entity_mention.collect(
id=cocoindex.GeneratedField.UUID, entity=relationship["object"],
filename=doc["filename"],
)


# export to neo4j
document_node.export(
"document_node",
cocoindex.storages.Neo4j(
connection=conn_spec,
mapping=cocoindex.storages.NodeMapping(label="Document")),
primary_key_fields=["filename"],
)
# Declare the "Entity" node as a referenced node, so relationships can reference entity nodes
flow_builder.declare(
cocoindex.storages.Neo4jDeclarations(
connection=conn_spec,
referenced_nodes=[
cocoindex.storages.ReferencedNode(
label="Entity",
primary_key_fields=["value"],
vector_indexes=[
cocoindex.VectorIndexDef(
field_name="embedding",
metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
),
],
)
]
)
Expand All @@ -128,17 +116,13 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
fields=[
cocoindex.storages.TargetFieldMapping(
source="subject", target="value"),
cocoindex.storages.TargetFieldMapping(
source="subject_embedding", target="embedding"),
]
),
target=cocoindex.storages.NodeReferenceMapping(
label="Entity",
fields=[
cocoindex.storages.TargetFieldMapping(
source="object", target="value"),
cocoindex.storages.TargetFieldMapping(
source="object_embedding", target="embedding"),
]
),
),
Expand Down