<a href="https://colab.research.google.com/github/edgarbc/llm-knowledge-extractor/blob/main/my_RAG_example_PDFs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Example of a simple retrieval-augmented generation (RAG) pipeline that queries an LLM over local data loaded from PDF files.

Edgar Bermudez

November, 2023

In [10]:
!pip install openai
!pip install llama-index
!pip install tiktoken



In [13]:
import openai
import tiktoken
from llama_index import ServiceContext, LLMPredictor, OpenAIEmbedding, PromptHelper
from llama_index.llms import OpenAI
from llama_index.text_splitter import TokenTextSplitter
from llama_index.node_parser import SimpleNodeParser
from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index import set_global_service_context

Set the `OPENAI_API_KEY` environment variable so that the gpt-3.5-turbo LLM from OpenAI can be used.

TODO: extend to an open-source LLM from Hugging Face.

In [8]:
import os

In [9]:
# Do not hardcode the key in the notebook. Keep a key that is already present
# in the environment; otherwise prompt for it without echoing the input.
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

Load the PDF documents stored in the local directory `data_dir`.

In [None]:
from llama_index.readers.file.base import SimpleDirectoryReader

# Read every file found under the local "data_dir" folder into `documents`.
documents = SimpleDirectoryReader(
    input_dir="data_dir",
).load_data()

In [16]:
# Token-based splitter configured with the gpt-3.5-turbo tiktoken encoder as
# its tokenizer, a chunk size of 1024 and an overlap of 20 between chunks.
text_splitter = TokenTextSplitter(
    tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode,
    chunk_size=1024,
    chunk_overlap=20,
    separator=" ",
    backup_separators=["\n"],
)

In [18]:
# Build the node parser from the splitter configured in the previous cell.
# Passing a fresh TokenTextSplitter() here would silently discard that
# configuration (chunk size, overlap, separators, tokenizer).
node_parser = SimpleNodeParser.from_defaults(
    text_splitter=text_splitter
)

In [None]:
# Alternative splitter based on SentenceSplitter. It is imported here because
# the notebook's import cell does not bring SentenceSplitter into scope.
# NOTE: node_parser was already built in an earlier cell, so rebinding
# `text_splitter` here has no effect on it unless node_parser is rebuilt.
from llama_index.text_splitter import SentenceSplitter

text_splitter = SentenceSplitter(
    separator=" ",
    chunk_size=1024,
    chunk_overlap=20,
    paragraph_separator="\n\n\n",
    secondary_chunking_regex="[^,.; ]+[,.; ]?",
    tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode
)



In [20]:
from llama_index.embeddings.openai import OpenAIEmbedding

# Prompt sizing: 4096-token context window with 256 tokens reserved for output.
prompt_helper = PromptHelper(
    chunk_size_limit=None,
    chunk_overlap_ratio=0.1,
    num_output=256,
    context_window=4096,
)

# Embedding model (library defaults) and the chat model used for answers.
embed_model = OpenAIEmbedding()
llm = OpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
    max_tokens=256,
)

# Bundle the LLM, embedding model, node parser and prompt helper together so
# they can be handed to the index and the query engine as one object.
service_context = ServiceContext.from_defaults(
    prompt_helper=prompt_helper,
    node_parser=node_parser,
    embed_model=embed_model,
    llm=llm,
)

In [None]:
from llama_index.indices.vector_store.base import VectorStoreIndex

# Build a vector store index over the loaded documents using the service
# context configured above.
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

In [None]:
# Create a query engine from the index, passing the same service context.
query_engine = index.as_query_engine(
    service_context=service_context,
)


In [None]:
# Send a sample question to the query engine and print the response.
response = query_engine.query(
    "What is HNSW?"
)
print(response)