In [1]:
# Record the Python interpreter version this notebook was run with.
! python --version

Python 3.12.2


## Install the required packages
- `%%capture` is used to suppress the output of the installation commands.

In [2]:
%%capture
! pip install llama-index-readers-file pymupdf
! pip install llama-index-vector-stores-postgres
! pip install llama-index-embeddings-huggingface
! pip install psycopg2-binary
! pip install ipywidgets
! pip install SQLAlchemy

# Set up the embedding model
- Here we set up the embedding model and measure the size of its embeddings, which we later pass to PgVector.

In [3]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Get embeddings using Hugging Face embeddings (can be local or remote).
# NOTE(review): the original note justified this with "the OpenAI API embedding model
# cannot be used in Llama Index" — confirm which limitation was actually meant.

embedding_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

# Embed one sample sentence to discover the embedding dimension; `vector_size`
# is reused later when the vector store table is created.
text_embedding = embedding_model.get_text_embedding("Once upon a time, there was a cat.")
vector_size = len(text_embedding)

print(text_embedding[:5])
print(f"Embedding length: {vector_size}")

[-0.04875882342457771, -0.04734064266085625, 0.020610108971595764, 0.02316340245306492, 0.04693278670310974]
Emedding length: 768


# Set up the LLM
- Here we set up the LLM, which is hosted through LM Studio.

In [4]:

from llama_index.llms.lmstudio import LMStudio

# Client for the model served by LM Studio at the local base URL below.
# temperature=0 is passed through to the LLM constructor.
llm = LMStudio(
    model_name="lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF",
    base_url="http://localhost:1234/v1",
    temperature=0,
)


# Set up the PostgreSQL database for PgVector
- In the code below, we drop and recreate the database every time, just to ensure that we are starting from scratch. This is not recommended in production.

In [5]:
import psycopg2
import nest_asyncio

# Connection settings shared with later cells (`connection_string`, `db_name`).
# They are plain assignments, so they live outside the try block and stay defined
# even if the database reset below fails.
# NOTE: hardcoded demo credential — do not reuse outside a local sandbox.
pg_pw = "mysecretpassword"
pg_db = "vector_store"
connection_string = f"postgresql://postgres:{pg_pw}@localhost:5432"
db_name = pg_db

try:
    conn = psycopg2.connect(connection_string)
    try:
        # DROP/CREATE DATABASE cannot run inside a transaction block, hence autocommit.
        # With autocommit enabled no explicit commit is needed afterwards.
        conn.autocommit = True

        with conn.cursor() as c:
            # IF EXISTS keeps the very first run (no database yet) from raising,
            # which previously skipped CREATE DATABASE and everything after it.
            c.execute(f"DROP DATABASE IF EXISTS {db_name} WITH (FORCE);")
            c.execute(f"CREATE DATABASE {db_name};")
    finally:
        # Always release the connection, even when one of the statements fails.
        conn.close()

    nest_asyncio.apply()

except Exception as e:
    # Best-effort cell: report the problem instead of aborting the notebook.
    print(e)

# Download the PDF/s we want to ingest
- We download the PDF/s we want to ingest and store them in the `data` folder.


In [6]:
! mkdir data
! wget --user-agent "Mozilla" "https://arxiv.org/pdf/2401.05856.pdf" -O "data/RAG_Failure_Points.pdf"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


mkdir: data: File exists


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


--2024-07-11 12:50:06--  https://arxiv.org/pdf/2401.05856.pdf
Resolving arxiv.org (arxiv.org)... 151.101.3.42, 151.101.195.42, 151.101.67.42, ...
Connecting to arxiv.org (arxiv.org)|151.101.3.42|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://arxiv.org/pdf/2401.05856 [following]
--2024-07-11 12:50:06--  http://arxiv.org/pdf/2401.05856
Connecting to arxiv.org (arxiv.org)|151.101.3.42|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 570647 (557K) [application/pdf]
Saving to: ‘data/RAG_Failure_Points.pdf’


2024-07-11 12:50:07 (5.31 MB/s) - ‘data/RAG_Failure_Points.pdf’ saved [570647/570647]



# PDF data file ingestion
- We configure LlamaIndex's global `Settings` with the models we want to use and the chunk size.
- We ingest the PDF data file into the database, which uses the `pgvector` extension.


In [7]:

from llama_index.core import Settings
from llama_index.core import SimpleDirectoryReader
from sqlalchemy import make_url
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext
from llama_index.vector_stores.postgres import PGVectorStore

# Configuring the LLM (LLM2)
# Stored on LlamaIndex's global Settings object.
Settings.llm = llm

# Configuring the embedding model (LLM1)
Settings.embed_model = embedding_model
# NOTE(review): 768 is also the embedding dimension printed earlier, but chunk_size
# is a chunk length, not a vector size — confirm the value is intentional and fits
# within the embedding model's input limit.
Settings.chunk_size = 768
Settings.chunk_overlap = 20

# Directory the documents are read from by simple_RAG.
BASE_DIR = "./data"

def simple_RAG(vector_size, directory=None, table_name="basic_rag"):
    """
    Simple Retrieval Augmented Generation (RAG) using Llama Index.

    Loads the documents found in a directory and builds a vector index over
    them, stored through a `PGVectorStore`.

    Args:
        vector_size: Dimension of the embedding vectors; passed to the vector
            store as `embed_dim`.
        directory: Folder to load documents from. When omitted, the
            notebook-level `BASE_DIR` is used.
        table_name: Table name passed to the vector store. Defaults to
            "basic_rag".

    Returns:
        The `VectorStoreIndex` built from the loaded documents.

    Connection settings come from the notebook-level `connection_string` and
    `db_name` variables.
    """
    if directory is None:
        # Resolved at call time so a later change to BASE_DIR is honoured.
        directory = BASE_DIR

    url = make_url(connection_string)
    print(f"Url {url}")

    vector_store = PGVectorStore.from_params(
        database=db_name,
        host=url.host,
        password=url.password,
        port=url.port,
        user=url.username,
        table_name=table_name,
        embed_dim=vector_size,
    )
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    documents = ingest_documents(directory)

    # The label is kept as-is for output compatibility; the value is the count
    # of items returned by ingest_documents.
    print(f"Number of nodes: {len(documents)}")

    return VectorStoreIndex.from_documents(
        documents, storage_context=storage_context, show_progress=True
    )

def ingest_documents(directory):
    """
    Load the contents of `directory` and return the loaded documents.

    The loading is delegated to `SimpleDirectoryReader.load_data`, with a
    progress bar enabled. Nothing is written to the vector store here.
    """
    return SimpleDirectoryReader(input_dir=directory).load_data(show_progress=True)

# Run the ingestion
- Now we run the data ingestion.
- We also set up the query engines, including the streaming version of the query engine.

In [8]:
# Ingest the documents and build the index, using the embedding size measured earlier.
index = simple_RAG(vector_size=vector_size)
# Two query engines over the same index: a regular one and one created with streaming=True.
query_engine = index.as_query_engine(verbose=True)
streaming_query_engine = index.as_query_engine(verbose=True, streaming=True)

Url postgresql://postgres:***@localhost:5432


Loading files: 100%|██████████| 1/1 [00:00<00:00,  5.29file/s]

Number of nodes: 6





Parsing nodes:   0%|          | 0/6 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/12 [00:00<?, ?it/s]

# Query the data
- we query the data, then pretty print the results
- also show the node source

In [13]:
from llama_index.core.response.pprint_utils import pprint_response

# Ask a question, then pretty-print the response together with its source nodes.
response = query_engine.query("What is the first RAG Failure Point? Please provide a brief description.")
pprint_response(response, show_source=True)

Final Response: FP1: Missing Content  The first failure point occurs
when asking a question that cannot be answered from the available
documents. In this scenario, the RAG system will respond with
something like "Sorry, I don't know". However, for questions related
to the content but without answers, the system could be fooled into
giving a response.
______________________________________________________________________
Source Node 1/2
Node ID: 07b3ecd1-ecd5-4ab6-a4cd-25b28b493a0a
Similarity: 0.6897327077707449
Text: CAIN 2024, April 2024, Lisbon, Portugal Scott Barnett, Stefanus
Kurniawan, Srikanth Thudumu, Zach Brannelly, Mohamed Abdelrazek Case
Study Domain Doc Types Dataset Size RAG Stages Sample Questions
Cognitive Reviewer*Research PDFs (Any size) Chunker, Rewriter, Re-
triever, ReaderWhat are the key points covered in this paper? AI
Tutor* Education V...
______________________________________________________________________
Source Node 2/2
Node ID: 3b683016-1673-49d1-be35-d7af8e

# Query the data using the streaming query engine
- we query the data using the streaming query engine, then print the results


In [12]:
# Same kind of query through the streaming engine; the response is printed as a stream.
streaming_response = streaming_query_engine.query("What is the third RAG Failure Point? Please provide a brief description.")
streaming_response.print_response_stream()

The third RAG failure point is "Not in Context - Consolidation strategy Limitations". This refers to a situation where documents with the correct answers are retrieved from the database, but do not make it into the context for generating an answer. This occurs when many documents are returned from the database and a consolidation process takes place to retrieve the answer.

# What about the prompts?
- By default, the prompts are already set to default values. You can change them to your liking.
- Below we are displaying the default prompts.
- Assignment: Investigate how to change the default prompts and see how it affects the results.

In [11]:
from IPython.display import Markdown, display

# define prompt viewing function
def display_prompt_dict(prompts_dict):
    """
    Show every prompt in `prompts_dict`.

    For each entry the key is rendered as a Markdown header, the template text
    (from the value's `get_template()`) is printed, and a Markdown spacer follows.
    """
    for prompt_key in prompts_dict:
        prompt = prompts_dict[prompt_key]
        header_md = f"**Prompt Key**: {prompt_key}<br>**Text:** <br>"
        display(Markdown(header_md))
        print(prompt.get_template())
        display(Markdown("<br><br>"))
        

# Fetch the prompts currently used by the query engine and display them.
prompts_dict = query_engine.get_prompts()
display_prompt_dict(prompts_dict)


**Prompt Key**: response_synthesizer:text_qa_template<br>**Text:** <br>

Context information is below.
---------------------
{context_str}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {query_str}
Answer: 


<br><br>

**Prompt Key**: response_synthesizer:refine_template<br>**Text:** <br>

The original query is as follows: {query_str}
We have provided an existing answer: {existing_answer}
We have the opportunity to refine the existing answer (only if needed) with some more context below.
------------
{context_msg}
------------
Given the new context, refine the original answer to better answer the query. If the context isn't useful, return the original answer.
Refined Answer: 


<br><br>

# Review of the Basic RAG Process

![Review of the Basic RAG Process](./images/4-basic-rag-system.png)