cocoindex-io · badmonster0 · Apr 26, 2025 · Apr 26, 2025
diff --git a/examples/docs_to_knowledge_graph/main.py b/examples/docs_to_knowledge_graph/main.py
@@ -1,5 +1,5 @@
 """
-This example shows how to extract relationships from Markdown documents and build a knowledge graph.
+This example shows how to extract relationships from documents and build a knowledge graph.
 """
 import dataclasses
 from dotenv import load_dotenv
@@ -13,17 +13,17 @@ class DocumentSummary:
 
 @dataclasses.dataclass
 class Relationship:
-    """Describe a relationship between two nodes."""
+    """Describe a relationship between two entities."""
     subject: str
     predicate: str
     object: str
 
 @cocoindex.flow_def(name="DocsToKG")
 def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
     """
-    Define an example flow that extracts triples from files and build knowledge graph.
+    Define an example flow that extracts relationships from files and builds a knowledge graph.
     """
-
+    # configure neo4j connection
     conn_spec = cocoindex.add_auth_entry(
         "Neo4jConnection",
         cocoindex.storages.Neo4jConnection(
@@ -41,78 +41,66 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
     entity_mention = data_scope.add_collector()
 
     with data_scope["documents"].row() as doc:
-        doc["chunks"] = doc["content"].transform(
-            cocoindex.functions.SplitRecursively(),
-            language="markdown", chunk_size=10000)
-
+        # extract summary from document
         doc["summary"] = doc["content"].transform(
             cocoindex.functions.ExtractByLlm(
                 llm_spec=cocoindex.LlmSpec(
+                    # Supported LLM: https://cocoindex.io/docs/ai/llm
                     api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
                 output_type=DocumentSummary,
                 instruction="Please summarize the content of the document."))
         document_node.collect(
             filename=doc["filename"], title=doc["summary"]["title"],
             summary=doc["summary"]["summary"])
 
-        with doc["chunks"].row() as chunk:
-            chunk["relationships"] = chunk["text"].transform(
-                cocoindex.functions.ExtractByLlm(
-                    llm_spec=cocoindex.LlmSpec(
-                        api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
-                    # Replace by this spec below, to use Ollama API instead of OpenAI
-                    #   llm_spec=cocoindex.LlmSpec(
-                    #       api_type=cocoindex.LlmApiType.OLLAMA, model="llama3.2"),
+        # extract relationships from document
+        doc["relationships"] = doc["content"].transform(
+            cocoindex.functions.ExtractByLlm(
+                llm_spec=cocoindex.LlmSpec(
+                    # Supported LLM: https://cocoindex.io/docs/ai/llm
+                    api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
                     output_type=list[Relationship],
                     instruction=(
                         "Please extract relationships from CocoIndex documents. "
                         "Focus on concepts and ingnore specific examples. "
                         "Each relationship should be a tuple of (subject, predicate, object).")))
 
-            with chunk["relationships"].row() as relationship:
-                relationship["subject_embedding"] = relationship["subject"].transform(
-                    cocoindex.functions.SentenceTransformerEmbed(
-                        model="sentence-transformers/all-MiniLM-L6-v2"))
-                relationship["object_embedding"] = relationship["object"].transform(
-                    cocoindex.functions.SentenceTransformerEmbed(
-                        model="sentence-transformers/all-MiniLM-L6-v2"))
-                entity_relationship.collect(
-                    id=cocoindex.GeneratedField.UUID,
-                    subject=relationship["subject"],
-                    subject_embedding=relationship["subject_embedding"],
-                    object=relationship["object"],
-                    object_embedding=relationship["object_embedding"],
-                    predicate=relationship["predicate"],
-                )
-                entity_mention.collect(
-                    id=cocoindex.GeneratedField.UUID, entity=relationship["subject"],
-                    filename=doc["filename"], location=chunk["location"],
-                )
-                entity_mention.collect(
-                    id=cocoindex.GeneratedField.UUID, entity=relationship["object"],
-                    filename=doc["filename"], location=chunk["location"],
-                )
+        with doc["relationships"].row() as relationship:
+            # relationship between two entities
+            entity_relationship.collect(
+                id=cocoindex.GeneratedField.UUID,
+                subject=relationship["subject"],
+                object=relationship["object"],
+                predicate=relationship["predicate"],
+            )
+            # mention of an entity in a document, for subject
+            entity_mention.collect(
+                id=cocoindex.GeneratedField.UUID, entity=relationship["subject"],
+                filename=doc["filename"],
+            )
+            # mention of an entity in a document, for object
+            entity_mention.collect(
+                id=cocoindex.GeneratedField.UUID, entity=relationship["object"],
+                filename=doc["filename"],
+            )
+
 
+    # export to neo4j
     document_node.export(
         "document_node",
         cocoindex.storages.Neo4j(
             connection=conn_spec,
             mapping=cocoindex.storages.NodeMapping(label="Document")),
         primary_key_fields=["filename"],
     )
+    # Declare the referenced "Entity" node that relationships point to
     flow_builder.declare(
         cocoindex.storages.Neo4jDeclarations(
             connection=conn_spec,
             referenced_nodes=[
                 cocoindex.storages.ReferencedNode(
                     label="Entity",
                     primary_key_fields=["value"],
-                    vector_indexes=[
-                        cocoindex.VectorIndexDef(
-                            field_name="embedding",
-                            metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
-                        ),
-                    ],
                 )
             ]
         )
@@ -128,17 +116,13 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
                     fields=[
                         cocoindex.storages.TargetFieldMapping(
                             source="subject", target="value"),
-                        cocoindex.storages.TargetFieldMapping(
-                            source="subject_embedding", target="embedding"),
                     ]
                 ),
                 target=cocoindex.storages.NodeReferenceMapping(
                     label="Entity",
                     fields=[
                         cocoindex.storages.TargetFieldMapping(
                             source="object", target="value"),
-                        cocoindex.storages.TargetFieldMapping(
-                            source="object_embedding", target="embedding"),
                     ]
                 ),
             ),