# RAG with your PDFs for free in Colab using Hugging Face, MongoDB, LlamaIndex, and LangChain

## 1. Download libraries

In [18]:
# Needed to install llama-index in this environment; without it the install fails with
#   NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968
# so the preferred encoding reported by `locale` is forced to UTF-8.
import locale


def _utf8_preferred_encoding(do_setlocale=True):
    """Stand-in for ``locale.getpreferredencoding`` that always reports UTF-8.

    The optional ``do_setlocale`` argument of the stdlib function is accepted
    (and ignored) so that callers using ``locale.getpreferredencoding(False)``
    keep working instead of raising ``TypeError``.
    """
    return "UTF-8"


locale.getpreferredencoding = _utf8_preferred_encoding

In [None]:
!pip install llama-index
!pip install datasets pandas pymongo sentence_transformers
!pip install -U transformers
!pip install accelerate
!pip install langchain

## 2. Preprocess pdf

### 2.1 Load PDF with llama-index

In [4]:
import os
from google.colab import userdata
import nest_asyncio
# Applied before parsing; presumably LlamaParse runs async code inside the
# notebook's already-running event loop -- TODO confirm
nest_asyncio.apply()

# The API key is exposed through this environment variable (LlamaParse is not
# given a key explicitly below); the value comes from the Colab secret 'LLAMA_INDEX'.
os.environ['LLAMA_CLOUD_API_KEY'] = userdata.get('LLAMA_INDEX')
from llama_parse import LlamaParse

documents = LlamaParse(result_type="markdown").load_data('lbg_relationship_tnc.pdf')
if not documents:
    # Indexing an empty result would only raise an opaque IndexError.
    raise RuntimeError("LlamaParse returned no documents for 'lbg_relationship_tnc.pdf'")

# load_data returns a list of documents. Keep the text of all of them, joined with
# the same "\n---\n" separator that already appears between pages in the parsed
# output. With a single document this equals documents[0].text.
llama_text = "\n---\n".join(doc.text for doc in documents)

Started parsing the file under job_id 4a8721e2-e60c-412e-95c5-de04b0a881e0


In [5]:
# Peek at the first 500 characters of the parsed markdown
llama_text[:500]

'NO_CONTENT_HERE\n---\nNO_CONTENT_HERE\n---\n|Contents|Page|\n|---|---|\n|Important information|1|\n|General| |\n|Information about our relationship with you|5|\n|Definitions and interpretation|6|\n|Your relationship with us|9|\n|Providing services to you|9|\n|Your warranties|9|\n|Who is authorised to give instructions to us|11|\n|Confidentiality|12|\n|Changes to the Terms and Conditions|13|\n|Terms applying to charges|14|\n|Interest paid and charged|16|\n|Third Party Providers| |\n|Partnerships|18|\n|Suspension of '

### 2.2 Chunk PDF text using langchain

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Cut the parsed markdown into chunks using a target chunk size of 1000
# and no overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)
llama_split = text_splitter.split_text(text=llama_text)

In [7]:
# Sanity check: container type, element type, and number of chunks
type(llama_split), type(llama_split[0]), len(llama_split)

(list, str, 156)

### 2.3 Embed chunks

For more embedding models: https://huggingface.co/spaces/mteb/leaderboard

In [8]:
from sentence_transformers import SentenceTransformer

# https://huggingface.co/thenlper/gte-large
# NOTE: the vector search index in section 3 is declared with numDimensions 1024,
# so whatever model is loaded here must produce 1024-dimensional embeddings.
emb_model = SentenceTransformer("thenlper/gte-large")

def embed(text):
    """Return the embedding of ``text`` as a plain Python list.

    Blank or whitespace-only input is not sent to the model: a notice is
    printed and an empty list is returned instead.
    """
    if text.strip():
        vector = emb_model.encode(text)
        return vector.tolist()

    print("Attempted to embed an empty string")
    return []

# Pair every chunk with its embedding; these dicts are what gets stored in MongoDB
texts_n_embs = [
    {'text': chunk, 'embedding': embed(chunk)}
    for chunk in llama_split
]

print(len(texts_n_embs))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

156


## 3. Set up mongodb

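1. Create a MongoDB Atlas account.
2. Create a cluster.
3. Create a *vector search index* named `vector_index` on your collection (this is the index name the `vector_search` function below queries), using this definition: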
```
{
 "fields": [{
     "numDimensions": 1024,
     "path": "embedding",
     "similarity": "cosine",
     "type": "vector"
   }]
}
```


### 3.1 Connect to the db

In [9]:
import pymongo

def get_mongo_client(mongo_uri):
    """Create a MongoDB client for ``mongo_uri`` and verify the server is reachable.

    ``pymongo.MongoClient`` connects lazily, so constructing it does not prove
    that the server can be reached. A ``ping`` command is issued to force a
    real round trip before success is reported.

    Args:
        mongo_uri: MongoDB connection string.

    Returns:
        The ``pymongo.MongoClient`` on success, or ``None`` if the connection
        could not be established.
    """
    client = None
    try:
        client = pymongo.MongoClient(mongo_uri)
        # Forces server selection; raises ConnectionFailure (or a subclass) if unreachable.
        client.admin.command("ping")
        print("[SUCCESS] Connected to MongoDB")
        return client
    except pymongo.errors.ConnectionFailure as e:
        if client is not None:
            # Release the background resources of the unusable client.
            client.close()
        print(f"[FAILED] Connection failed: {e}")
        return None

# Connect -> Drivers -> Select Python as driver -> Copy connection string
mongo_uri = userdata.get("MONGO_URI")
if not mongo_uri:
    # Stop here: continuing without a URI only fails later with a less obvious error.
    raise ValueError("MONGO_URI not in env")

mongo_client = get_mongo_client(mongo_uri)
if mongo_client is None:
    # get_mongo_client returns None after a failed connection; indexing None
    # below would raise an unrelated TypeError.
    raise ConnectionError("Could not connect to MongoDB; check MONGO_URI and network access")

db = mongo_client["my_db"] # any name
collection = db["my_collection"] # any name

[SUCCESS] Connected to MongoDB


### 3.2 Delete existing (if any), and insert your data

In [11]:
# The empty filter {} removes whatever is already in the collection, so
# re-running this cell replaces the data instead of duplicating it.
collection.delete_many({})
collection.insert_many(texts_n_embs)
print("[SUCCESS] Data inserted into MongoDB!")

[SUCCESS] Data inserted into MongoDB!


## 4. Find relevant texts

### 4.1 Perform vector search in db

In [12]:
def vector_search(query, collection, index_name="vector_index", num_candidates=100, limit=5):
    """Run a ``$vectorSearch`` aggregation for ``query`` against ``collection``.

    Args:
        query: Text to search for; it is embedded with ``embed``.
        collection: Collection object on which ``aggregate`` is called.
        index_name: Name of the vector search index to query.
        num_candidates: Number of candidate matches to consider.
        limit: Number of top matches to return.

    Returns:
        A list of dicts, each with the matched ``text`` and its ``score``.
        An empty list is returned when ``query`` yields no embedding
        (``embed`` returns ``[]`` for blank input).
    """
    query_embedding = embed(query)
    if not query_embedding:
        # Nothing to search with; avoid sending an empty vector to the server.
        return []

    # define the vector search pipeline
    pipeline = [
        {
            "$vectorSearch": {
                "index": index_name,
                "queryVector": query_embedding,
                "path": "embedding",
                "numCandidates": num_candidates,  # Number of candidate matches to consider
                "limit": limit,  # Return top n matches
            }
        },
        {
            "$project": {
                "_id": 0,  # 0 -> exclude
                "text": 1, # 1 -> include
                "score": {"$meta": "vectorSearchScore"},  # include the search score
            }
        },
    ]

    results = collection.aggregate(pipeline) # type: CommandCursor

    return list(results) # list of dicts with text and score

In [15]:
def get_context(query, collection):
    """Build the context string for ``query`` from the vector search hits.

    Each hit contributes its ``'text'`` field (``'N/A'`` when the field is
    absent) followed by a newline; the pieces are concatenated in result order.
    """
    hits = vector_search(query, collection)
    return "".join(f"{hit.get('text', 'N/A')}\n" for hit in hits)

In [16]:
# Quick retrieval check: show the context assembled for a sample question
get_context('What is a business day?', collection)

'Business Day: means 9am to 5pm every Monday to Friday other than public or bank holidays in England and Wales, unless you are transacting through one of our branches which opens for shorter hours or we notify you of different times for the processing of payments.\n---\n## Our Website\n\nmeans our website appearing at lloydsbank.com/business (or any other URL as we may notify to you from time to time).\n\n## Payment Instrument\n\nmeans any:\n\n- i. personalised device; or\n- ii. personalised set of procedures agreed between you and us such as the use of a password, security details or a PIN, used by you to instruct us to execute payment transactions for you.\n\n## Payment Services Regulations\n\nmeans the Payment Services Regulations 2017 (S.I. 2017/752).\n\n## PIN\n\nmeans Personal Identification Number.\n\n## Product\n\nhas the meaning given to it in the section entitled "Information about our relationship with you".\n\n## Reference Interest Rate\n| |24.3.2 on a day which is not a Bu

## 5. Load gemma using huggingface

In [17]:
from huggingface_hub import login
# Interactive Hugging Face login; renders a widget in the cell output
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = 'google/gemma-2b-it'
# Tokenizer and model are both loaded by the same model id
tokenizer = AutoTokenizer.from_pretrained(model_id)
# .to('cuda') places the model on the GPU, so this cell needs a GPU runtime
model = AutoModelForCausalLM.from_pretrained(model_id).to('cuda')

tokenizer_config.json:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/888 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

## 6. Prompt engineering + Talk with your PDF

https://www.promptingguide.ai/models/gemma

In [64]:
query = "Do you pay or charge interest?"
context = get_context(query, collection)
# NOTE: no Gemma control tokens (<start_of_turn> / <end_of_turn>) are written by
# hand in this text. apply_chat_template below already wraps the whole message in
# them, so embedding them here would produce nested, malformed turn markers.
base_prompt = """You are a helpful assisstant to customers about a bank's terms and conditions.
Give yourself room to think by extracting relevant passages from the context before answering the query.
Don't return the thinking, only return the answer.
Make sure your answers are as clear and concise.
Use the following couple of examples as reference for the ideal answer style, but don't use the below example answers as answers to the query.
\nExample 1:
User query: I'm considering opening a new savings account with a competitive interest rate. However, I noticed a clause regarding minimum balance requirements. Could you elaborate on the potential implications of not maintaining this minimum balance?
Model answer: That's a prudent inquiry!  Many banks offer attractive interest rates on savings accounts, but they may stipulate a minimum balance requirement.  Failing to maintain this minimum can trigger various consequences, including incurring fees or forfeiting the advertised interest rate. Carefully review the minimum balance stipulation within the T&Cs to ensure it aligns with your financial situation.
\nExample 2:
User query: My bank has been sending frequent notifications regarding mobile banking security. While I appreciate the reminder, is utilizing mobile banking inherently risky?
Model answer: Mobile banking offers undeniable convenience but does necessitate vigilance. While not inherently risky, online transactions always carry a certain level of risk.  To mitigate these risks, ensure your mobile device is equipped with a strong password and avoid using public Wi-Fi networks for banking activities. Your bank's security notifications serve as a valuable reminder to prioritize online safety measures.
\nNow based on the following context items:
{context};
\n And answer the user's query:
User query: {query}
Model: answer:"""
# Fill in the retrieved context and the user's question.
base_prompt = base_prompt.format(context=context, query=query)

# Single user turn; the chat template adds the turn markers and, because
# add_generation_prompt=True, the opening of the model's turn.
dialogue_template = [
        {
            'role': 'user',
            'content': base_prompt
        }
    ]

prompt = tokenizer.apply_chat_template(conversation=dialogue_template, tokenize=False, add_generation_prompt=True)

In [65]:
# Inspect the final prompt string produced by the chat template
prompt

"<bos><start_of_turn>user\nYou are a helpful assisstant to customers about a bank's terms and conditions. \nGive yourself room to think by extracting relevant passages from the context before answering the query.\nDon't return the thinking, only return the answer.\nMake sure your answers are as clear and concise.\nUse the following couple of examples as reference for the ideal answer style, but don't use the below example answers as answers to the query.\n\nExample 1:\nUser query: I'm considering opening a new savings account with a competitive interest rate. However, I noticed a clause regarding minimum balance requirements. Could you elaborate on the potential implications of not maintaining this minimum balance?\nAI answer: That's a prudent inquiry!  Many banks offer attractive interest rates on savings accounts, but they may stipulate a minimum balance requirement.  Failing to maintain this minimum can trigger various consequences, including incurring fees or forfeiting the adverti

In [71]:
from transformers import TextStreamer
# Stream the answer to the cell output as it is generated, leaving out the
# prompt and special tokens.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
# `prompt` already begins with <bos> (inserted by apply_chat_template), so the
# tokenizer must not add special tokens a second time.
input_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to("cuda")
# Sampling is enabled (do_sample=True, temperature=0.7), so the answer can differ between runs.
response = model.generate(**input_ids, streamer=streamer, max_new_tokens=500, temperature=0.7, do_sample=True)
# tokenizer.decode(response[0])

 Yes, the Core Banking Agreement states that interest is paid and charged on a daily basis, and the interest rate applicable to your account(s) is stated in the Product & Services Terms & Conditions or, if no such terms are provided, on the website.
