diff --git a/examples/docs_to_knowledge_graph/main.py b/examples/docs_to_knowledge_graph/main.py
index b2c4b4a8..4e7905ac 100644
--- a/examples/docs_to_knowledge_graph/main.py
+++ b/examples/docs_to_knowledge_graph/main.py
@@ -1,5 +1,5 @@
 """
-This example shows how to extract relationships from Markdown documents and build a knowledge graph.
+This example shows how to extract relationships from documents and build a knowledge graph.
 """
 import dataclasses
 from dotenv import load_dotenv
@@ -13,7 +13,7 @@ class DocumentSummary:

 @dataclasses.dataclass
 class Relationship:
-    """Describe a relationship between two nodes."""
+    """Describe a relationship between two entities."""
     subject: str
     predicate: str
     object: str
@@ -21,9 +21,9 @@
 @cocoindex.flow_def(name="DocsToKG")
 def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
     """
-    Define an example flow that extracts triples from files and build knowledge graph.
+    Define an example flow that extracts relationships from files and builds a knowledge graph.
     """
-
+    # configure Neo4j connection
     conn_spec = cocoindex.add_auth_entry(
         "Neo4jConnection",
         cocoindex.storages.Neo4jConnection(
@@ -41,13 +41,11 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
     entity_mention = data_scope.add_collector()

     with data_scope["documents"].row() as doc:
-        doc["chunks"] = doc["content"].transform(
-            cocoindex.functions.SplitRecursively(),
-            language="markdown", chunk_size=10000)
-
+        # extract summary from document
         doc["summary"] = doc["content"].transform(
             cocoindex.functions.ExtractByLlm(
                 llm_spec=cocoindex.LlmSpec(
+                    # Supported LLM: https://cocoindex.io/docs/ai/llm
                     api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
                 output_type=DocumentSummary,
                 instruction="Please summarize the content of the document."))
@@ -55,44 +53,39 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
             filename=doc["filename"], title=doc["summary"]["title"],
             summary=doc["summary"]["summary"])

-        with doc["chunks"].row() as chunk:
-            chunk["relationships"] = chunk["text"].transform(
-                cocoindex.functions.ExtractByLlm(
-                    llm_spec=cocoindex.LlmSpec(
-                        api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
-                    # Replace by this spec below, to use Ollama API instead of OpenAI
-                    # llm_spec=cocoindex.LlmSpec(
-                    #     api_type=cocoindex.LlmApiType.OLLAMA, model="llama3.2"),
+        # extract relationships from document
+        doc["relationships"] = doc["content"].transform(
+            cocoindex.functions.ExtractByLlm(
+                llm_spec=cocoindex.LlmSpec(
+                    # Supported LLM: https://cocoindex.io/docs/ai/llm
+                    api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
                 output_type=list[Relationship],
                 instruction=(
                     "Please extract relationships from CocoIndex documents. "
                     "Focus on concepts and ignore specific examples. "
                     "Each relationship should be a tuple of (subject, predicate, object).")))

-            with chunk["relationships"].row() as relationship:
-                relationship["subject_embedding"] = relationship["subject"].transform(
-                    cocoindex.functions.SentenceTransformerEmbed(
-                        model="sentence-transformers/all-MiniLM-L6-v2"))
-                relationship["object_embedding"] = relationship["object"].transform(
-                    cocoindex.functions.SentenceTransformerEmbed(
-                        model="sentence-transformers/all-MiniLM-L6-v2"))
-                entity_relationship.collect(
-                    id=cocoindex.GeneratedField.UUID,
-                    subject=relationship["subject"],
-                    subject_embedding=relationship["subject_embedding"],
-                    object=relationship["object"],
-                    object_embedding=relationship["object_embedding"],
-                    predicate=relationship["predicate"],
-                )
-                entity_mention.collect(
-                    id=cocoindex.GeneratedField.UUID, entity=relationship["subject"],
-                    filename=doc["filename"], location=chunk["location"],
-                )
-                entity_mention.collect(
-                    id=cocoindex.GeneratedField.UUID, entity=relationship["object"],
-                    filename=doc["filename"], location=chunk["location"],
-                )
+        with doc["relationships"].row() as relationship:
+            # relationship between two entities
+            entity_relationship.collect(
+                id=cocoindex.GeneratedField.UUID,
+                subject=relationship["subject"],
+                object=relationship["object"],
+                predicate=relationship["predicate"],
+            )
+            # mention of an entity in a document, for subject
+            entity_mention.collect(
+                id=cocoindex.GeneratedField.UUID, entity=relationship["subject"],
+                filename=doc["filename"],
+            )
+            # mention of an entity in a document, for object
+            entity_mention.collect(
+                id=cocoindex.GeneratedField.UUID, entity=relationship["object"],
+                filename=doc["filename"],
+            )
+
+    # export to Neo4j
     document_node.export(
         "document_node",
         cocoindex.storages.Neo4j(
@@ -100,6 +93,7 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
             mapping=cocoindex.storages.NodeMapping(label="Document")),
         primary_key_fields=["filename"],
     )
+    # Declare the Entity node type, so relationships can reference it
     flow_builder.declare(
         cocoindex.storages.Neo4jDeclarations(
             connection=conn_spec,
@@ -107,12 +101,6 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
                 cocoindex.storages.ReferencedNode(
                     label="Entity",
                     primary_key_fields=["value"],
-                    vector_indexes=[
-                        cocoindex.VectorIndexDef(
-                            field_name="embedding",
-                            metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
-                        ),
-                    ],
                 )
             ]
         )
@@ -128,8 +116,6 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
                     fields=[
                        cocoindex.storages.TargetFieldMapping(
                            source="subject", target="value"),
-                        cocoindex.storages.TargetFieldMapping(
-                            source="subject_embedding", target="embedding"),
                    ]
                ),
                target=cocoindex.storages.NodeReferenceMapping(
@@ -137,8 +123,6 @@ def docs_to_kg_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.D
                    fields=[
                        cocoindex.storages.TargetFieldMapping(
                            source="object", target="value"),
-                        cocoindex.storages.TargetFieldMapping(
-                            source="object_embedding", target="embedding"),
                    ]
                ),
            ),