# A Simple Mistral 7B Offline RAG with Reranker

<img src="../images/simple-rag-1-reranker.webp" width=700>

### 1) Load the embedding model and LLM:

In [1]:
from llama_index.core import Settings, PromptTemplate
from llama_index.core.embeddings import resolve_embed_model
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM

import torch

Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

Settings.llm = HuggingFaceLLM(
    context_window=2048,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.25, "do_sample": False},
    model_name="mistralai/Mistral-7B-Instruct-v0.2",
    tokenizer_name="mistralai/Mistral-7B-Instruct-v0.2",
    device_map="auto",
    
    model_kwargs={
        "torch_dtype": torch.bfloat16,
        # Since we are using a small GPU with limited memory
        # Set to False if you have a large GPU to speed things up.
        "offload_buffers": True, 
    }
)

Settings.chunk_size = 512

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

### 2) Load data / read in documents

In [3]:
from pathlib import Path
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

data_dir =  Path("..") / "data" / "example-1"

documents = SimpleDirectoryReader(data_dir).load_data()

# Sanity checks
unique_docs = set(d.metadata["file_name"] for d in documents)
print(f"Read documents: {unique_docs}")

Read documents: {'Basic-Scientific-Food-Preparation-Lab-Manual.pdf', 'Basic-Scientific-Food-Preparation-Lab-Manual.txt'}


### 3) Create vector database

- VectorStoreIndex is an in-memory vector database

In [4]:
index = VectorStoreIndex.from_documents(
    documents,
    show_progress=True
)

num_chunks = len(documents)
print(f"Database consists of {num_chunks} chunks")

Parsing nodes:   0%|          | 0/252 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/567 [00:00<?, ?it/s]

Database consists of 252 chunks


### 4) Set up custom prompt template

In [5]:
from llama_index.core import PromptTemplate

template = (
    "We have provided context information below. \n"
    "---------------------\n"
    "{context_str}"
    "\n---------------------\n"
    "Given this information, please answer the question: {query_str}\n"
)
qa_template = PromptTemplate(template)
query_engine = index.as_query_engine()
query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": qa_template}
)

### 5) Query the vector database

In [6]:
response = query_engine.query("What is dehydration?")
print(response)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Dehydration is a method of preserving food by removing most of the water content. It can be done through various methods such as sun drying, oven drying, or using a dehydrator. Properly dehydrated food can be stored for long periods of time without spoiling and can be rehydrated or used as is for consumption. In this lab, students will be pretreating, drying, storing, and serving some commonly dehydrated fruits and vegetables, and observing the characteristics of various dried fruits.


In [7]:
response = query_engine.query("How many tbsp. butter to use for the Cream Puffs?")
print(response)
# Correct answer is 2

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Answer: 2 tbsp. butter are required for the Cream Puffs.


In [11]:
response = query_engine.query("How many tbsp. butter to use for the Cream Puffs?")
print(response)
# Correct answer is 2

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Answer: 2 tbsp. butter are required for the Cream Puffs.


In [12]:
response = query_engine.query(
    "What are the toppings for the Braided Bread "
    "of the Braids, Coffeecake, and Sweet Rolls recipe?"
)
print(response)
# Correct answer is 
# 2 tsp. caraway seeds and ½ cup shredded Cheddar cheese.
# ½ cup diced Swiss cheese and paprika.
# ...

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



The toppings for the Braided Bread of the Braids, Coffeecake, and Sweet Rolls recipe are:

1. 2 tsp. caraway seeds and ½ cup shredded Cheddar cheese.
2. ½ cup diced Swiss cheese and paprika.
