In [1]:
!pip install --quiet langchain sentence_transformers faiss-gpu ctransformers
!pip install --quiet huggingface-hub
!pip install --quiet argostranslate
!pip install --quiet gdown
!pip install --quiet streamlit

Use LangChain to apply retrieval-augmented generation (RAG). We will need to:

1. Load the long document and use a chunker to split it into N overlapping segments.
2. Use Embeddings from an encoder model. We use Microsoft's MiniLM which is fast and has very good performance.
3. Use in-memory vector database FAISS which stores the embeddings of the text and retrieves the data based on similarity.
4. Use the RetrievalQA chain to make the model answer the user's questions.

In [1]:
import os
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings as TextEmbeddings
from langchain.vectorstores import FAISS as faiss_vdb

from langchain.llms import CTransformers
from langchain import PromptTemplate
from langchain.chains import RetrievalQA, LLMChain
import torch


## Data Preparation

Below are the experiment configurations. The performance of the solution depends on:

1. Chunk size: you may need to change it a couple of times before you get the best results.
2. Chunk overlap: we do overlap to avoid incoherent chunks
3. Model type: please use Mistral for the best and fastest retrieval capabilities, or Llama for chat capabilities.

In [2]:
# Configurations

# --- Input documents and chunking (sizes are counted in characters) ---
FILES_DIR: str = "./tmp/"
CHUNK_SIZES_CHARS: int = 500
CHUNKS_OVERLAP_CHARS: int = 50
DOWNLOAD_SAMPLE = True

# --- Vector database ---
DB_NAME = "local_db"

# --- Hardware: fall back to the CPU when CUDA is not available ---
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

# --- Language model and text generation ---
MODEL_TYPE = "LLAMA"  # MISTRAL or LLAMA
GENERATION_TEMP = 0.01
MAX_GENERATION_NEW_TOKENS = 256

In [3]:
# Optionally fetch the sample text file into FILES_DIR.
if DOWNLOAD_SAMPLE:
    import gdown  # only needed when the sample is downloaded

    # makedirs(exist_ok=True) also creates missing parent folders and is safe to re-run.
    os.makedirs(FILES_DIR, exist_ok=True)

    # os.path.join avoids the doubled slash produced by f"{FILES_DIR}/a1.txt"
    # when FILES_DIR already ends with "/".
    sample_path = os.path.join(FILES_DIR, "a1.txt")

    # Skip the network round-trip when the sample is already on disk, so that
    # "Restart & Run All" stays cheap.
    if not os.path.exists(sample_path):
        downloaded_path = gdown.download(
            "https://drive.google.com/uc?id=1AQUKn-0dreSEWUtJSRccmo-akPzKwg5W",
            output=sample_path,
        )
        # NOTE(review): some gdown releases return None on failure instead of
        # raising; fail loudly here rather than later with an empty document list.
        if downloaded_path is None:
            raise RuntimeError(f"Could not download the sample document to {sample_path}")

Downloading...
From: https://drive.google.com/uc?id=1AQUKn-0dreSEWUtJSRccmo-akPzKwg5W
To: /home/ahmed/work/code/rag-llama/tmp/a1.txt
100%|██████████| 13.9k/13.9k [00:00<00:00, 13.6MB/s]


In [4]:
# Loader for every *.txt file found in FILES_DIR; each match is read with TextLoader.
data_loader = DirectoryLoader(
    FILES_DIR,
    glob="*.txt",
    loader_cls=TextLoader,
)

# Splitter configured with the chunk size and overlap from the configuration cell.
text_chunker = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNKS_OVERLAP_CHARS,
    chunk_size=CHUNK_SIZES_CHARS,
)


In [5]:
# Run the loader and show the resulting list of documents as the cell output.
documents = data_loader.load()
documents

[Document(page_content='Diriyah[1] (Arabic: الدِرْعِيّة, ad-Dir‘īyah, approximate meaning ‘place of armor’ [2]), formerly romanized as Dereyeh[3] and Dariyya,[4] is a town and governorate in Saudi Arabia located on the northwestern outskirts of the Saudi capital, Riyadh. Diriyah was the original home of the Saudi royal family, and served as the capital of the Emirate of Diriyah under the first Saudi dynasty from 1727 to 1818.[5] Today, the town is the seat of the Diriyah Governorate—which also includes the villages of Uyayna, Jubayla, and Al-Ammariyyah, among others—and is part of Ar Riyad Province.\n\nThe Turaif district, the first capital of Saudis in Diriyah, was declared a UNESCO World Heritage Site in 2010.[6][7] The layout of the city itself can be easily studied in the National Museum of Saudi Arabia with the help of a large-scale detailed model of the city on display there. Diriyah also hosts the Diriyah ePrix race for the Formula E championship.\n\nLocation\nThe ruins of the o

In [6]:
# Split the loaded documents with the configured splitter and show the chunks.
chunks = text_chunker.split_documents(documents)
chunks

[Document(page_content='Diriyah[1] (Arabic: الدِرْعِيّة, ad-Dir‘īyah, approximate meaning ‘place of armor’ [2]), formerly romanized as Dereyeh[3] and Dariyya,[4] is a town and governorate in Saudi Arabia located on the northwestern outskirts of the Saudi capital, Riyadh. Diriyah was the original home of the Saudi royal family, and served as the capital of the Emirate of Diriyah under the first Saudi dynasty from 1727 to 1818.[5] Today, the town is the seat of the Diriyah Governorate—which also includes the villages of', metadata={'source': 'tmp/a1.txt'}),
 Document(page_content='Governorate—which also includes the villages of Uyayna, Jubayla, and Al-Ammariyyah, among others—and is part of Ar Riyad Province.', metadata={'source': 'tmp/a1.txt'}),
 Document(page_content='The Turaif district, the first capital of Saudis in Diriyah, was declared a UNESCO World Heritage Site in 2010.[6][7] The layout of the city itself can be easily studied in the National Museum of Saudi Arabia with the hel

In [7]:
# Number of chunks produced by the splitter.
len(chunks)

43

In [8]:
# Embedding model used to vectorise the chunks; it is placed on DEVICE.
# Alternative checkpoint noted by the author: "microsoft/Multilingual-MiniLM-L12-H384"
embedder = TextEmbeddings(
    model_kwargs={'device': DEVICE},
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)


  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Reuse the FAISS index saved under DB_NAME when it can be loaded; otherwise
# build it from `chunks` and save it for the next run.
#
# Two defects of the previous version are fixed here:
#   * the loaded index was assigned to the misspelled name `db_hander`, leaving
#     `db_handler` undefined for the following cells;
#   * `load_local` was called without the embeddings object it needs to embed
#     queries, so loading could never succeed and the bare `except:` hid it.
#
# The saved index is NOT invalidated when the documents or the chunking
# configuration change: delete the DB_NAME folder to force a rebuild.
# NOTE(review): recent LangChain releases additionally require
# `allow_dangerous_deserialization=True` to load a pickled index; without it
# the load fails and this cell falls back to rebuilding. Confirm against the
# installed version.
try:
    db_handler = faiss_vdb.load_local(DB_NAME, embedder)
except Exception as load_error:
    # Best effort: any load failure (missing folder, incompatible file, ...)
    # results in a fresh index, but the reason is no longer swallowed silently.
    print(f"Could not load '{DB_NAME}' ({load_error!r}); building a new index.")
    db_handler = faiss_vdb.from_documents(chunks, embedder)
    db_handler.save_local(DB_NAME)


## Chat Model and Prompt Preparation

Download the model in GGUF format: a compressed format that can be run by a C++ runtime, which makes inference as fast as possible.

In [10]:
if not os.path.exists("./model/"):
    os.mkdir('./model/')

if MODEL_TYPE == "MISTRAL":
  model_key = "mistral-7b-instruct-v0.2.Q4_K_M.gguf"
  if not os.path.exists(f"./model/{model_key}"):
    !huggingface-cli download TheBloke/Mistral-7B-Instruct-v0.2-GGUF mistral-7b-instruct-v0.2.Q4_K_M.gguf  --local-dir ./model --local-dir-use-symlinks False

elif MODEL_TYPE == "LLAMA":
  model_key = "firefly-llama2-13b-chat.Q4_0.gguf"
  if not os.path.exists(f"./model/{model_key}"):
    !huggingface-cli download TheBloke/firefly-llama2-13B-chat-GGUF firefly-llama2-13b-chat.Q4_0.gguf  --local-dir ./model --local-dir-use-symlinks False

else:
  raise Exception(f"Unsupported model type: {MODEL_TYPE}. Supported types are: [ `MISTRAL`,  `LLAMA` ]")


In [11]:
# Load the language model from the downloaded GGUF file.

#! Note: if you get this error:
# OSError: /lib64/libm.so.6: version `GLIBC_2.29' not found (required by ~/lib/python3.12/site-packages/ctransformers/lib/avx2/libctransformers.so)
#! please follow the steps in this answer to solve it: https://stackoverflow.com/a/75630806

# GPU offloading is a ctransformers setting, so it is passed through `config`
# under the key `gpu_layers`. The previous top-level `n_gpu_layers=15` argument
# is not a field of the LangChain CTransformers wrapper and never reached the
# model. Layers are only offloaded when CUDA was detected (DEVICE == "cuda").
# NOTE(review): offloading presumably also needs a CUDA-enabled ctransformers
# build — confirm for the target environment.
language_model = CTransformers(
    model=f"./model/{model_key}",
    model_type='llama',
    config={
        'max_new_tokens': MAX_GENERATION_NEW_TOKENS,
        'temperature': GENERATION_TEMP,
        'gpu_layers': 15 if DEVICE == "cuda" else 0,
    },
)


Tell the database that we need it to work as a retriever. It uses similarity measures for high-dimensional vectors and returns the top 3 results. These results will be given to the LLM to help it answer the user's questions.

In [12]:
# Expose the vector store as a retriever that returns the 3 most similar chunks.
# `search_type` is an argument of `as_retriever` itself; `search_kwargs` only
# carries the options of the search call (here `k`), so the two are now
# passed separately instead of nesting `search_type` inside `search_kwargs`.
db_retriever = db_handler.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 3},
)


In [13]:
# Prepare the prompt we will use with the language model.
# We are using a minimal template to save time and speed up model inference.
# {context} and {question} are the two placeholders filled in by the prompt template.

prompt_format = """Use the following pieces of information to answer the user's question. If the question is in Arabic, respond in Arabic.
If you don't know the answer just say 'Information is NOT available' and don't try to make up an answer.
Context: {context}
Question: {question}
Only return the answer below and nothing else.
Answer:
"""

In [14]:
# Wrap the raw prompt text in a template that declares its two placeholders.
prompt_template = PromptTemplate(
    input_variables=['context', 'question'],
    template=prompt_format,
)

In [15]:
# Question-answering chain: the retriever supplies the context, the custom
# prompt frames it, and the source documents are returned with the answer.
qa_lm = RetrievalQA.from_chain_type(
    llm=language_model,
    retriever=db_retriever,
    chain_type='stuff',
    chain_type_kwargs={'prompt': prompt_template},
    return_source_documents=True,
)

## Test the model

DISCLAIMER: Please note that we are running on CPU because the models are too large for the GPUs available in Colab. It is normal for an example to take up to 7 minutes. Once you add GPU acceleration, it takes around 1 minute (depending on the type and size of the GPU).

In [16]:
# Inference: a question that the indexed document can answer.
# The question text had slipped into the comment above, leaving
# `user_prompt =` without a value (a SyntaxError); it is restored here.
user_prompt = "How is the summer in Diriyah?"
output = qa_lm({'query': user_prompt})
print(output["result"])


  warn_deprecated(


The summers in Diriyah are long, sweltering, and arid.


In [17]:
# Inference: a question unrelated to the indexed document.
user_prompt = "Who is the author of FftSharp? What is their favorite color?"
chain_inputs = {'query': user_prompt}
output = qa_lm(chain_inputs)
print(output["result"])


Information is NOT available.


## Test Translation

Use an external translator to translate the user's Arabic prompt to English, and translate the response back to Arabic. A lot of work in the literature shows that this technique achieves better results than passing the Arabic prompt directly to the LLM.

In [18]:
import argostranslate.package
import argostranslate.translate

# Language codes used by the translation cells: English ("en") and Arabic ("ar").
# The cells below pass them in both orders, so translation runs in both directions.
from_code = "en"
to_code = "ar"

In [19]:
# Install the Argos Translate package for the from_code -> to_code direction.
argostranslate.package.update_package_index()
available_packages = argostranslate.package.get_available_packages()

# next(..., None) plus an explicit check replaces the bare StopIteration that
# was raised when the package index has no entry for this language pair.
package_to_install = next(
    (
        package
        for package in available_packages
        if package.from_code == from_code and package.to_code == to_code
    ),
    None,
)
if package_to_install is None:
    raise RuntimeError(f"No Argos Translate package found for {from_code} -> {to_code}")

argostranslate.package.install_from_path(package_to_install.download())

In [20]:
# Install the Argos Translate package for the reverse direction, to_code -> from_code.
argostranslate.package.update_package_index()
available_packages = argostranslate.package.get_available_packages()

# next(..., None) plus an explicit check replaces the bare StopIteration that
# was raised when the package index has no entry for this language pair.
package_to_install = next(
    (
        package
        for package in available_packages
        if package.from_code == to_code and package.to_code == from_code
    ),
    None,
)
if package_to_install is None:
    raise RuntimeError(f"No Argos Translate package found for {to_code} -> {from_code}")

argostranslate.package.install_from_path(package_to_install.download())

In [21]:
# Translate the question with to_code as the source language and from_code as
# the target, then send the translated text through the QA chain.
original_question = "ماذا تعرف عن متحف السنين الماضية؟"
user_prompt = argostranslate.translate.translate(
    original_question,
    to_code,
    from_code,
)
output = qa_lm({'query': user_prompt})
print(output["result"])
    

Museum of Bygone Days (متحف السنين الماضية) is located north of al-Bujairi and houses a collection reflecting everyday life in central Arabia in the early and middle twentieth century.


In [22]:
# Translate the chain's answer with from_code as the source language and
# to_code as the target, then print it.
translatedText = argostranslate.translate.translate(
    output["result"],
    from_code,
    to_code,
)
print(translatedText)


ويقع متحف بيغون دايس (الموسم الثالث عشر) شمال البوجيري ويقيم مجموعة تعكس الحياة اليومية في وسط المملكة في أوائل ووسط القرن العشرين.
