In [1]:
import os
import dotenv

# Load settings from a local .env file into the process environment before any
# client is constructed. Left as the cell's last expression on purpose so the
# call's return value is displayed as the cell output.
# NOTE(review): presumably this is where the Cohere API key comes from, since
# no key is passed explicitly to the Cohere clients later on -- confirm.
dotenv.load_dotenv()

True

In [46]:
from llama_index.core import SimpleDirectoryReader, PropertyGraphIndex, KnowledgeGraphIndex
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import MarkdownNodeParser

from llama_index.llms.cohere import Cohere
from llama_index.embeddings.cohere import CohereEmbedding

from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore

import nest_asyncio; nest_asyncio.apply()

# --- Models ------------------------------------------------------------------
# Cohere chat model and embedding model handed to the index below.
# NOTE(review): no API key is passed here; presumably it is read from the
# environment populated by dotenv.load_dotenv() in the first cell -- confirm.
llm = Cohere(model="command-r")
embed_model = CohereEmbedding(model_name="embed-english-v3.0")

# --- Load documents ----------------------------------------------------------
# Only nyu.md is indexed right now; un-comment other entries to include them.
reader = SimpleDirectoryReader(
    input_files=[
        # "datasets/cds/md/bama.md",
        # "datasets/cds/md/mississippi-state.md",
        "datasets/cds/md/nyu.md",
        # "datasets/cds/md/uw-madison.md",
        # "datasets/cds/md/penn-state.md",
    ],
)
documents = reader.load_data()

# --- Split into nodes --------------------------------------------------------
# The markdown parser is the only transformation in the pipeline.
pipeline = IngestionPipeline(
    transformations=[MarkdownNodeParser()],
)
nodes = pipeline.run(documents=documents)

# --- Graph store -------------------------------------------------------------
# Connection settings come from the environment (e.g. the .env file loaded in
# the first cell, which also imports `os`) so real credentials never need to be
# committed with the notebook. The fallbacks are the previous hard-coded
# local-development values, so behaviour is unchanged when nothing is set.
graph_store = Neo4jPropertyGraphStore(
    username=os.environ.get("NEO4J_USERNAME", "neo4j"),
    password=os.environ.get("NEO4J_PASSWORD", "llamaindex"),
    url=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
)

# --- Build the property graph index ------------------------------------------
# Building the index is the slow step of this cell (it drives the progress bars
# in the cell output); show_progress=True keeps that visible.
index = PropertyGraphIndex(
    nodes,
    llm=llm,
    embed_model=embed_model,
    property_graph_store=graph_store,
    show_progress=True,
)

Extracting paths from text: 100%|██████████████████████████████████████████████████████████████████████████████████████████| 74/74 [00:22<00:00,  3.24it/s]
Extracting implicit paths: 100%|████████████████████████████████████████████████████████████████████████████████████████| 74/74 [00:00<00:00, 22130.37it/s]
Generating embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 12.69it/s]
Generating embeddings: 100%|█████████████████████████████████████████████████████████████████████████████████████████████| 109/109 [00:03<00:00, 30.23it/s]


In [47]:
# Pull up to ten similar nodes for a sample question so the raw retrieval
# results (node text, metadata, scores) can be inspected directly.
gpa_retrieval_question = "what is the average GPA at nyu?"

retriever = index.as_retriever(similarity_top_k=10)
retriever.retrieve(gpa_retrieval_question)

[NodeWithScore(node=TextNode(id_='fd7cf493-88cb-495c-9e07-7f72e6e3c116', embedding=None, metadata={'file_path': 'datasets/cds/md/nyu.md', 'file_name': 'nyu.md', 'file_size': 72624, 'creation_date': '2024-06-29', 'last_modified_date': '2024-06-26'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='548b898d-e447-46cf-8b17-c069eae192cf', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'datasets/cds/md/nyu.md', 'file_name': 'nyu.md', 'file_size': 72624, 'creation_date': '2024-06-29', 'last_modified_date': '2024-06-26'}, hash='7885a9bc980539ed6530d750bfba1ec31a0cf16220f3259e7f930f0627799f0b')}, text="Here are some facts extracted from the provided text:\n\nNyu -> Link for more info on postponement ->

In [35]:
# Ask the same kind of question end-to-end: retrieve from the index and let the
# Cohere LLM compose the answer.
# NOTE(review): the recorded output of this cell is stale. It was produced at
# execution count 35, before the index was rebuilt at count 46, and it cites
# uw-madison.md rather than nyu.md. Re-run after a Restart & Run All.
gpa_question = "What is the average GPA score at nyu?"

chat_engine = index.as_query_engine(llm=llm)
chat_engine.query(gpa_question)

Response(response="I'm afraid I wasn't able to find the average GPA score at UW Madison. However, according to one source, the GPA is maintained by the secondary school the student attends.", source_nodes=[NodeWithScore(node=TextNode(id_='34b2afc1-6803-4a32-88c7-10f73f904eee', embedding=None, metadata={'file_path': 'datasets/cds/md/uw-madison.md', 'file_name': 'uw-madison.md', 'file_size': 145955, 'creation_date': '2024-06-12', 'last_modified_date': '2024-06-12'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='441ec929-8841-43ea-b7db-4c1e4155b17d', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'datasets/cds/md/uw-madison.md', 'file_name': 'uw-madison.md', 'file_size': 145955, 'creation_date'