In [1]:
!pip install llama-index-readers-file
!pip install neo4j
!pip install unstructured
!pip install "unstructured[pdf]"
!pip install llama-index-llms-ollama
!pip install llama-index-llms-openai
!pip install llama-index-graph-stores-neo4j
!pip install llama-index-embeddings-ollama


Collecting llama-index-readers-file
  Downloading llama_index_readers_file-0.1.22-py3-none-any.whl.metadata (5.3 kB)
Collecting beautifulsoup4<5.0.0,>=4.12.3 (from llama-index-readers-file)
  Downloading beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting llama-index-core<0.11.0,>=0.10.1 (from llama-index-readers-file)
  Downloading llama_index_core-0.10.37.post1-py3-none-any.whl.metadata (2.6 kB)
Collecting pypdf<5.0.0,>=4.0.1 (from llama-index-readers-file)
  Downloading pypdf-4.2.0-py3-none-any.whl.metadata (7.4 kB)
Collecting striprtf<0.0.27,>=0.0.26 (from llama-index-readers-file)
  Downloading striprtf-0.0.26-py3-none-any.whl.metadata (2.1 kB)
Collecting aiohttp<4.0.0,>=3.8.6 (from llama-index-core<0.11.0,>=0.10.1->llama-index-readers-file)
  Downloading aiohttp-3.9.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (7.5 kB)
Collecting dataclasses-json (from llama-index-core<0.11.0,>=0.10.1->llama-index-readers-file)
  Downloading dataclasses_

In [2]:
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file import (
    DocxReader,
    HWPReader,
    PDFReader,
    EpubReader,
    FlatReader,
    HTMLTagReader,
    ImageCaptionReader,
    ImageReader,
    ImageVisionLLMReader,
    IPYNBReader,
    MarkdownReader,
    MboxReader,
    PptxReader,
    PandasCSVReader,
    VideoAudioReader,
    UnstructuredReader,
    PyMuPDFReader,
    ImageTabularChartReader,
    XMLReader,
    PagedCSVReader,
    CSVReader,
    RTFReader,
)

In [3]:
# PDF Reader with `SimpleDirectoryReader`
parser = PDFReader()
file_extractor = {".pdf": parser}
documents = SimpleDirectoryReader(
    "/data-transfer/papers", file_extractor=file_extractor
).load_data()

In [4]:
!ls /data-transfer/papers

2305.14449v3.pdf


In [5]:
# Check loaded documents

print(f"Number of documents: {len(documents)}")

for doc in documents:
    print(doc.doc_id)
    print(doc.text[:500] + '...')
    

Number of documents: 8
ce4e5b12-d978-4e25-bdd0-0b6a837baca6
Graph Meets LLM: A Novel Approach to Collaborative Filtering
for Robust Conversational Understanding
Zheng Chen∗
zgchen@amazon.com
Amazon Alexa AI
Seattle, WA, USAZiyan Jiang∗
ziyjiang@amazon.com
Amazon Alexa AI
Seattle, WA, USAFan Yang∗
ffanyang@amazon.com
Amazon Alexa AI
Seattle, WA, USA
Eunah Cho†
eunahch@amazon.com
Amazon Alexa AI
Seattle, WA, USAXing Fan†
fanxing@amazon.com
Amazon Alexa AI
Seattle, WA, USAXiaojiang Huang†
xjhuang@amazon.com
Amazon Alexa AI
Seattle, WA, USA
Yanbin Lu†
luyanb...
b3bfd0b8-f73b-48e1-8848-fb99fc56b183
Conference acronym ’XX, June 03–05, 2018, Woodstock, NY Zheng and Ziyan and Fan, et al.
Personalized query rewriting (Personalized QR) takes into account
individual preferences or unique error patterns identified from a
user’s historical interactions with the conversational AI. It plays
a crucial role in addressing a wide range of user-specific defects,
particularly in the torso and tail distribu

In [6]:
# llama-parse is async-first, running the async code in a notebook requires the use of nest_asyncio
import nest_asyncio
nest_asyncio.apply()

import os
# API access to llama-cloud
os.environ["LLAMA_CLOUD_API_KEY"] = "LLAMACLOUD-API-KEY"

# Using OpenAI API for embeddings/llms
os.environ["OPENAI_API_KEY"] = "OPENAI-API-KEY"

In [16]:
#from llama_index.llms.openai import OpenAI
from llama_index.llms.ollama import Ollama
#from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import  VectorStoreIndex
from llama_index.core import Settings

EMBEDDING_MODEL  = "nomic-embed-text"
GENERATION_MODEL = "llama3"

#llm = OpenAI(model=GENERATION_MODEL)


llm = Ollama(model=GENERATION_MODEL, base_url="http://192.168.1.102:11434", request_timeout=120.0)


Settings.llm = llm


In [12]:
username = "neo4j"
password = "abc123abc123"
url = "bolt://neo4j-1:7687"
database = "neo4j"

In [17]:
from llama_index.embeddings.ollama import OllamaEmbedding

embedding_llm = OllamaEmbedding(
    model_name="nomic-embed-text",
    base_url="http://192.168.1.102:11434",
    ollama_additional_kwargs={"mirostat": 0},
)

In [19]:
from llama_index.core import KnowledgeGraphIndex, SimpleDirectoryReader
from llama_index.core import StorageContext, ServiceContext
from llama_index.graph_stores.neo4j import Neo4jGraphStore

from IPython.display import Markdown, display

graph_store = Neo4jGraphStore(
    username=username,
    password=password,
    url=url,
    database=database,
)

# Initialize Service Context
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embedding_llm,
    chunk_size_limit=512
)

storage_context = StorageContext.from_defaults(graph_store=graph_store)

# NOTE: can take a while!
index = KnowledgeGraphIndex.from_documents(
    documents,
    storage_context=storage_context,
    service_context=service_context,
    max_triplets_per_chunk=2,
)

  service_context = ServiceContext.from_defaults(
