<a href="https://colab.research.google.com/github/chueneelvin/Databricks/blob/main/Multi_Modal_RAG_with_Unstructured_io.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Build a RAG system with Llama 3B-Instruct for your PDFs

In this quick tutorial, we'll build a simple RAG system with an LLM from Meta AI - Llama 3, specifically the `Llama-3-8B-Instruct` version that you can get on the Hugging Face Hub.
We'll use [Unstructured Serverless API](https://unstructured.io/) for preprocessing PDF files, LangChain for RAG, FAISS for vector storage, and HuggingFace `transformers` to get the model. Let's go!

Install all the libraries, and get your [Unstructured API key](https://unstructured.io/api-key-hosted) - it comes with a 14-day trial, and a cap of 1000 pages/day.

In [1]:
!pip install -qU "unstructured-ingest[pdf]" unstructured langchain langchain-community transformers accelerate bitsandbytes sentence-transformers faiss-cpu langchain_groq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m3.4 MB/s[0

In [2]:
import os

os.environ["UNSTRUCTURED_API_KEY"] = "aQajECaUnBCGzUqJZq9n62u7Vrwmhw" # Add your key here
os.environ["UNSTRUCTURED_URL"] ="https://api.unstructuredapp.io/general/v0/general" # You can find the URL in your personalized dashboard
os.environ["GROQ_API_KEY"] = "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"
os.environ["LANGCHAIN_API_KEY"] = "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"
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "multi-modal-rag"

In [3]:
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
from unstructured_ingest.v2.interfaces import ProcessorConfig
from unstructured_ingest.v2.processes.connectors.local import (
    LocalIndexerConfig,
    LocalDownloaderConfig,
    LocalConnectionConfig,
    LocalUploaderConfig
)
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
from unstructured_ingest.v2.processes.chunker import ChunkerConfig

We will use the ingest functionality to partition PDF files in a local directory.

In [4]:
directory_with_pdfs="/content/data"
directory_with_results="/content/output"

Pipeline.from_configs(
    context=ProcessorConfig(),
    indexer_config=LocalIndexerConfig(input_path=directory_with_pdfs),
    downloader_config=LocalDownloaderConfig(),
    source_connection_config=LocalConnectionConfig(),
    partitioner_config=PartitionerConfig(
        partition_by_api=True,
        api_key=os.getenv("UNSTRUCTURED_API_KEY"),
        partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
        strategy="hi_res",
        additional_partition_args={
            "split_pdf_page": True,
            "split_pdf_concurrency_level": 15,
            },
        ),
    uploader_config=LocalUploaderConfig(output_dir=directory_with_results)
).run()


2024-09-30 11:40:29,548 MainProcess INFO     created index with configs: {"input_path": "/content/data", "recursive": false}, connection configs: {"access_config": "**********"}
2024-09-30 11:40:29,556 MainProcess INFO     Created download with configs: {"download_dir": null}, connection configs: {"access_config": "**********"}
2024-09-30 11:40:29,558 MainProcess INFO     created partition with configs: {"strategy": "hi_res", "ocr_languages": null, "encoding": null, "additional_partition_args": {"split_pdf_page": true, "split_pdf_concurrency_level": 15}, "skip_infer_table_types": null, "fields_include": ["element_id", "text", "type", "metadata", "embeddings"], "flatten_metadata": false, "metadata_exclude": [], "metadata_include": [], "partition_endpoint": null, "partition_by_api": true, "api_key": "*******", "hi_res_model_name": null}
2024-09-30 11:40:29,560 MainProcess INFO     Created upload with configs: {"output_dir": "/content/output"}, connection configs: {"access_config": "*****

Load document elements from json outputs, create LangChain documents from document chunks and their metadata, and ingest those documents into the FAISS vectorstore.

Set up the retriever.

In [None]:
from unstructured.staging.base import elements_from_json

def load_processed_files(directory_path):
    elements = []
    for filename in os.listdir(directory_path):
        if filename.endswith('.json'):
            file_path = os.path.join(directory_path, filename)
            try:
                elements.extend(elements_from_json(filename=file_path))
            except IOError:
                print(f"Error: Could not read file {filename}.")

    return elements

elements = load_processed_files(directory_with_results)

In [None]:
elements[0]

In [None]:
from langchain_core.documents import Document
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

documents = []
for element in elements:
    metadata = element.metadata.to_dict()
    documents.append(Document(page_content=element.text, metadata=metadata))

db = FAISS.from_documents(documents, HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5"))
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})

  db = FAISS.from_documents(documents, HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5"))
  from tqdm.autonotebook import tqdm, trange
INFO: Use pytorch device_name: cuda
INFO: Load pretrained SentenceTransformer: BAAI/bge-base-en-v1.5
Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

INFO: Loading faiss with AVX512 support.
INFO: Successfully loaded faiss with AVX512 support.


In [None]:
retriever.invoke("What is Multimodal Fusion Network?")

[Document(metadata={'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 6, 'filename': '2402.07016v1.pdf', 'data_source': {'record_locator': {'path': '/content/data/2402.07016v1.pdf'}, 'date_modified': '1727693858.264926', 'date_processed': '1727693873.626695', 'permissions_data': [{'mode': 33188}]}}, page_content='4.4 Multimodal Fusion Network'),
 Document(metadata={'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 4, 'parent_id': '1cb6845cb007fd775279f24b971eb640', 'filename': '2402.07016v1.pdf', 'data_source': {'record_locator': {'path': '/content/data/2402.07016v1.pdf'}, 'date_modified': '1727693858.264926', 'date_processed': '1727693873.626695', 'permissions_data': [{'mode': 33188}]}}, page_content='• Multimodal Fusion Network gets embedding hi from in- put modality Xi and fuses them in an adaptive way to get an enhances representation z.'),
 Document(metadata={'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 8, 'parent_id': 'a1

# Initialize LLM

In [None]:
from langchain_groq import ChatGroq
llm = ChatGroq(temperature=0, model_name="llama3-70b-8192")
llm

ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x7d10c9029120>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x7d10c90286d0>, model_name='llama3-70b-8192', temperature=1e-08, model_kwargs={}, groq_api_key=SecretStr('**********'))

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


system_prompt = (
    "Use the given context to answer the question. "
    "If you don't know the answer, say you don't know. "
    "Use three sentence maximum and keep the answer concise. "
    "Context: {context}"
)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)
question_answer_chain = create_stuff_documents_chain(llm, prompt)
chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
query = "What is the Mortality Outcome Prediction for MPIM?"

chain.invoke({"input": query})

INFO: HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


{'input': 'What is the Mortality Outcome Prediction for MPIM?',
 'context': [Document(metadata={'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 8, 'parent_id': '8e4a6f58c2ff711a2e3219104d298385', 'filename': '2402.07016v1.pdf', 'data_source': {'record_locator': {'path': '/content/data/2402.07016v1.pdf'}, 'date_modified': '1727693858.264926', 'date_processed': '1727693873.626695', 'permissions_data': [{'mode': 33188}]}}, page_content='Table 2: In-hospital mortality and readmission prediction results on MIMIC-III. Bold indicates the best performance. All metrics are multi- plied by 100 for readability purposes.'),
  Document(metadata={'text_as_html': '<table><thead><tr><th rowspan="3">Methods</th><th colspan="4">Outcome</th><th colspan="4">Readmission Prediction</th></tr><tr><th>AUROC</th><th>Mortality AUPRC</th><th>Prediction min(+P, Se)</th><th>F1</th><th>AUROC</th><th>30-Day AUPRC</th><th>min(+P, Se)</th><th>Fl1</th></tr><tr><th></th><th></th><th></th><th></th><th>

In [1]:
query = "Which model has lowest retrieval time for top 5 accuracy?"

chain.invoke({"input": query})

NameError: name 'chain' is not defined