In [1]:
import os
import dotenv

# Load variables from a local .env file into the process environment.
# Presumably this supplies the API key(s) the Cohere clients below rely on — TODO confirm.
dotenv.load_dotenv()

True

In [49]:
from llama_index.core import SimpleDirectoryReader, PropertyGraphIndex, KnowledgeGraphIndex
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import MarkdownNodeParser

from llama_index.llms.cohere import Cohere
from llama_index.embeddings.cohere import CohereEmbedding

from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore

import nest_asyncio; nest_asyncio.apply()

# Cohere models: `llm` is handed to the index for path extraction and
# `embed_model` for generating embeddings.
llm = Cohere(model="command-r")
embed_model = CohereEmbedding(model_name="embed-english-v3.0")

# --- Load documents -------------------------------------------------------
# Only the NYU file is enabled; the other datasets are kept commented out so
# they can be switched back on individually.
reader = SimpleDirectoryReader(
    input_files=[
        # "datasets/cds/md/bama.md",
        # "datasets/cds/md/mississippi-state.md",
        "datasets/cds/md/nyu.md",
        # "datasets/cds/md/uw-madison.md",
        # "datasets/cds/md/penn-state.md",
    ],
)

documents = reader.load_data()

# --- Parse documents into nodes -------------------------------------------
pipeline = IngestionPipeline(
    transformations=[MarkdownNodeParser()],
)

nodes = pipeline.run(documents=documents)

# --- Connect to the graph store -------------------------------------------
# Connection settings are read from the environment (`os` is imported in the
# first cell, which also calls dotenv.load_dotenv()) instead of being
# hardcoded in the notebook. The fallbacks are the values this cell used
# previously, so an unconfigured local setup behaves exactly as before; set
# NEO4J_URI / NEO4J_USERNAME / NEO4J_PASSWORD in .env for anything that is
# not a throwaway local instance.
graph_store = Neo4jPropertyGraphStore(
    username=os.getenv("NEO4J_USERNAME", "neo4j"),
    password=os.getenv("NEO4J_PASSWORD", "llamaindex"),
    url=os.getenv("NEO4J_URI", "bolt://localhost:7687"),
)

# --- Build the property graph index ---------------------------------------
# This is the expensive step of the cell: it calls the LLM and the embedding
# model for the parsed nodes and writes into `graph_store`.
index = PropertyGraphIndex(
    nodes,
    llm=llm,
    embed_model=embed_model,
    property_graph_store=graph_store,
    show_progress=True,
)

# index = KnowledgeGraphIndex(
#     documents,
#     llm=llm,
#     embed_model=embed_model,
#     show_progress=True,
# )

Extracting paths from text: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 74/74 [00:24<00:00,  3.04it/s]
Extracting implicit paths: 100%|████████████████████████████████████████████████████████████████████████████████████████| 74/74 [00:00<00:00, 15050.11it/s]
Generating embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 14.37it/s]
Generating embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 114/114 [00:05<00:00, 20.83it/s]


In [57]:
# Show the knowledge-graph extractors held by the index.
# NOTE(review): `_kg_extractors` is a private attribute (leading underscore),
# so treat this cell as a debugging aid rather than a stable API.
index._kg_extractors

[SimpleLLMPathExtractor(llm=Cohere(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x2b53bff50>, system_prompt=None, messages_to_prompt=<function messages_to_prompt at 0x16c8a59e0>, completion_to_prompt=<function default_completion_to_prompt at 0x16c950720>, output_parser=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>, query_wrapper_prompt=None, model='command-r', temperature=None, max_retries=10, additional_kwargs={}, max_tokens=8192), extract_prompt=PromptTemplate(metadata={'prompt_type': <PromptType.KNOWLEDGE_TRIPLET_EXTRACT: 'knowledge_triplet_extract'>}, template_vars=['max_knowledge_triplets', 'text'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, template="Some text is provided below. Given the text, extract up to {max_knowledge_triplets} knowledge triplets in the form of (subject, predicate, object). Avoid stopwords.\n---------------------\nExample:Text: Alice is Bob's mother.Triplets:\n(Alic

In [47]:
# Sample retrieval against the index, requesting the 10 most similar results.
# The list returned by retrieve() is the cell's displayed value.
gpa_question = "what is the average GPA at nyu?"
retriever = index.as_retriever(similarity_top_k=10)
retriever.retrieve(gpa_question)

[NodeWithScore(node=TextNode(id_='fd7cf493-88cb-495c-9e07-7f72e6e3c116', embedding=None, metadata={'file_path': 'datasets/cds/md/nyu.md', 'file_name': 'nyu.md', 'file_size': 72624, 'creation_date': '2024-06-29', 'last_modified_date': '2024-06-26'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='548b898d-e447-46cf-8b17-c069eae192cf', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'datasets/cds/md/nyu.md', 'file_name': 'nyu.md', 'file_size': 72624, 'creation_date': '2024-06-29', 'last_modified_date': '2024-06-26'}, hash='7885a9bc980539ed6530d750bfba1ec31a0cf16220f3259e7f930f0627799f0b')}, text="Here are some facts extracted from the provided text:\n\nNyu -> Link for more info on postponement ->

In [48]:
# Ask the same kind of question through a query engine, which returns a
# Response object (the cell's displayed value).
# NOTE: despite its name, `chat_engine` is built with as_query_engine(); the
# name is kept in case later cells refer to it.
gpa_score_question = "What is the average GPA score at nyu?"
chat_engine = index.as_query_engine(llm=llm)
chat_engine.query(gpa_score_question)

Response(response="Sorry, I couldn't find the average GPA score at NYU. However, I did find that NYU uses the SAT and ACT tests for placement and the latest date for these scores is January 31st.", source_nodes=[NodeWithScore(node=TextNode(id_='1c00e647-7212-43e7-b342-ea198f98fb45', embedding=None, metadata={'file_path': 'datasets/cds/md/nyu.md', 'file_name': 'nyu.md', 'file_size': 72624, 'creation_date': '2024-06-29', 'last_modified_date': '2024-06-26'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='d64e0977-ea46-4840-90fe-84b0527c2392', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'datasets/cds/md/nyu.md', 'file_name': 'nyu.md', 'file_size': 72624, 'creation_date': '2024-06-29', 'last_mo