In [1]:
!pip install langchain_community &> /dev/null
!pip install -U bitsandbytes &> /dev/null
!pip install -U accelerate &> /dev/null
!pip install pypdf &> /dev/null
!pip install sentence-transformers &> /dev/null
!pip install faiss-gpu &> /dev/null

# Import necessary libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import LlamaTokenizer, LlamaForCausalLM
import transformers


In [2]:
# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the model
MODEL_NAME = "sarvamai/OpenHathi-7B-Hi-v0.1-Base"

def load_model(model_name):
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    tokenizer = LlamaTokenizer.from_pretrained(model_name)
    model = LlamaForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="auto"
    )
    return model, tokenizer

model, tokenizer = load_model(MODEL_NAME)

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/936 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/968k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.85M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/667 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.81G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [3]:
# Load and preprocess PDF documents
def load_and_split_documents(pdf_paths):
    documents = []
    for path in pdf_paths:
        loader = PyPDFLoader(path)
        documents.extend(loader.load())

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len
    )
    split_docs = text_splitter.split_documents(documents)
    return split_docs

# Create embeddings and vector store
def create_vector_store(split_docs):
    # embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
    vector_store = FAISS.from_documents(split_docs, embeddings)
    return vector_store

# Step 3: Set up the language model pipeline
def setup_llm_pipeline(model, tokenizer):
    pipe = transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        temperature=0.5,
        top_p=0.95,
        repetition_penalty=1.15,
        # device=device
    )
    return HuggingFacePipeline(pipeline=pipe)

# Create the RAG chain
def create_rag_chain(llm, vector_store):
    prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

    {context}

    Question: {question}
    Answer:"""

    PROMPT = PromptTemplate(
        template=prompt_template, input_variables=["context", "question"]
    )

    chain_type_kwargs = {"prompt": PROMPT}
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
        chain_type_kwargs=chain_type_kwargs,
        return_source_documents=True
    )
    return qa_chain

# Main function to set up and use the RAG system
def setup_rag_system(pdf_paths, model, tokenizer):
    split_docs = load_and_split_documents(pdf_paths)
    vector_store = create_vector_store(split_docs)
    llm = setup_llm_pipeline(model, tokenizer)
    qa_chain = create_rag_chain(llm, vector_store)
    return qa_chain


In [18]:
def generate_response_without_rag(query, model, tokenizer):
  inputs = tokenizer(query, return_tensors="pt").to(device)
  outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.5, top_p=0.95, repetition_penalty=1.15)
  response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
  return response

def ask_question(rag_system, question):
    result = rag_system({"query": question})
    return result["result"], result["source_documents"]

# Mount the drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Fact checking: Example 1

In [None]:
# Example usage without RAG
query = "इस मुहावरा का अर्थ क्या है: Blue	ribbon"
response = generate_response_without_rag(query, model, tokenizer)
print(response)

इस मुहावरा का अर्थ क्या है: Blue	ribbon
 Hinweisः "ब्लू रिबन" शब्द एक ऐसा वाक्यांश हो सकता है जिसका उपयोग किसी विशेष कार्यक्रम या घटना के लिए पुरस्कारों को इंगित करने के लिए किया जाता है। The phrase is often used in the context of sports, music or other competitive events.


In [None]:
# Example usage with RAG
pdf_paths = [
    "/content/drive/My Drive/Idioms.pdf"
    ]
rag_system = setup_rag_system(pdf_paths, model, tokenizer)

question = "इस मुहावरा का अर्थ क्या है: Blue	ribbon"
answer, sources = ask_question(rag_system, question)
answer = answer.split('Answer')[-1]
print(f"Answer: {answer}")
print("\nSources:")
for i, doc in enumerate(sources):
    print(f"Source {i+1}: {doc.metadata['source']} (page {doc.metadata['page']})")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.12k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  return HuggingFacePipeline(pipeline=pipe)
  result = rag_system({"query": question})


Answer: :
---
ब्लू रिबन एक प्रतिष्ठित या उत्कृष्ट गुणवत्ता के साथ एक पुरस्कार या सम्मान को संदर्भित करता है। It is often used in reference to a high quality or exceptional achievement.

Sources:
Source 1: /content/drive/My Drive/Idioms.pdf (page 0)
Source 2: /content/drive/My Drive/Idioms.pdf (page 6)
Source 3: /content/drive/My Drive/Idioms.pdf (page 1)


#Fact Checking: Example 2

In [None]:
# Example usage without RAG
query = "इसका अर्थ क्या है: gedi"
response = generate_response_without_rag(query, model, tokenizer)
print(response)

इसका अर्थ क्या है: gedi
 संतुलित, सुसंगत और सामंजस्यपूर्ण लगता है। The use of "gedi" is a traditional way to express the beauty and harmony found in nature.


In [None]:

# Example usage with RAG
pdf_paths = [
    "/content/drive/My Drive/cs.pdf"
    ]
rag_system = setup_rag_system(pdf_paths, model, tokenizer)

question = "इसका अर्थ क्या है: gedi"
answer, sources = ask_question(rag_system, question)
answer = answer.split('Answer')[-1]
print(f"Answer: {answer}")
print("\nSources:")
for i, doc in enumerate(sources):
    print(f"Source {i+1}: {doc.metadata['source']} (page {doc.metadata['page']})")



Answer: :
---
"गेडी" का अर्थ "राउंड लेने के लिए" होता है।

Sources:
Source 1: /content/drive/My Drive/cs.pdf (page 3)
Source 2: /content/drive/My Drive/cs.pdf (page 6)
Source 3: /content/drive/My Drive/cs.pdf (page 1)


#Fact chcking: example 3

In [None]:
# Example usage without RAG
query = "Which year was Hindi Divas celebrated for the first time?"
response = generate_response_without_rag(query, model, tokenizer)
print(response)

Which year was Hindi Divas celebrated for the first time?
 nobody knows
A:
---
1936 में हिंदी दिवस पहली बार मनाया गया था।


In [None]:

# Example usage with RAG
pdf_paths = [
    "/content/drive/My Drive/hindi_divas.pdf"
    ]
rag_system = setup_rag_system(pdf_paths, model, tokenizer)

question = "Which year was Hindi Divas celebrated for the first time?"
answer, sources = ask_question(rag_system, question)
answer = answer.split('Answer')[-1]
print(f"Answer: {answer}")
print("\nSources:")
for i, doc in enumerate(sources):
    print(f"Source {i+1}: {doc.metadata['source']} (page {doc.metadata['page']})")

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.12k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  return HuggingFacePipeline(pipeline=pipe)
  result = rag_system({"query": question})


Answer: : 1953
---
हिंदी दिवस पहली बार 1953 में मनाया गया था।

Sources:
Source 1: /content/drive/My Drive/hindi_divas.pdf (page 1)
Source 2: /content/drive/My Drive/hindi_divas.pdf (page 0)
Source 3: /content/drive/My Drive/hindi_divas.pdf (page 3)


#Self consistency: Example 1

In [16]:
# Example usage without RAG
query = "When Did the Middle Ages End?"
response = generate_response_without_rag(query, model, tokenizer)
print(response)

When Did the Middle Ages End?
 संतुलित करने के लिए, हम इस बात पर विचार कर सकते हैं कि क्या मध्य युग में गिरावट आई है।
The Middle Ages ended in 1492. यह वर्ष है जब क्रिस्टोफर कोलंबस ने पहली बार नई दुनिया की खोज की थी। This event marked a significant change in European society and culture, which was often referred to as "the beginning of modernity." हालाँकि, कुछ इतिहासकारों का तर्क है कि यूरोप में परिवर्तन और विकास की एक लंबी अवधि थी जो 8वीं शताब्दी में शुरू हुई और 16वीं शताब्दी तक जारी रही। The end date of this period is debated among historians and scholars.
मध्यकालीन काल का अंत कब हुआ?
There are several dates that have been proposed for when the Middle Ages came to an end:
इन तिथियों पर बहस होती रहती है क्योंकि वे अक्सर अलग-अलग क्षेत्रों या समय अवधि से संबंधित होते हैं। For example, some argue that the Renaissance began earlier than others do because it was first observed in Italy rather than elsewhere. इसके अतिरिक्त, मध्ययुगीन काल की शुरुआत और अंत की तारीखें राजनीतिक, सामाजिक और आर्थ

In [19]:
# Example usage without RAG
query = "When Did the Middle Ages End?"
response = generate_response_without_rag(query, model, tokenizer)
print(response)

When Did the Middle Ages End?
jahrhundert, und wie lange dauern sie vor Ende" (1804).
19वीं शताब्दी।
The 19th century saw an increasing interest in medieval history. यह विशेष रूप से जर्मन भाषी देशों में सच था, जहाँ मध्ययुगीन इतिहास की एक निश्चित मात्रा को राष्ट्रीय इतिहास के हिस्से के रूप में देखा गया था। In England and France, by contrast, there was a strong belief that the Middle Ages were of little importance to modern society. फ्रांस में, चार्ल्स-लुई एंटोनी डी बोइसगुएनन ने अपने "हिस्टॉयर डेस रेग्नेस डी फ्रांस एट देस बैरन्स" (1765) और "हिस्टॉयर जनरल डु मोयेनटेम्प्स" (1772) में तर्क दिया कि फ्रांसीसी क्रांति का श्रेय प्राचीन शासन को नहीं बल्कि पुनर्स्थापना के लिए राजाओं को दिया जाना चाहिए। The French Revolution itself seemed to have no direct antecedents from which it could be derived; instead, de Boisluguin argued, it came out of the natural evolution of human affairs. इंग्लैंड में, इतिहासकारों ने इस विचार पर जोर दिया कि अंग्रेजी गृहयुद्ध ने केवल एक क्रांतिकारी परिवर्तन किया जो बाद 

In [21]:
# Example usage with RAG
pdf_paths = [
    "/content/drive/My Drive/midage.pdf"
    ]
rag_system = setup_rag_system(pdf_paths, model, tokenizer)

question = "When Did the Middle Ages End?"
answer, sources = ask_question(rag_system, question)
answer = answer.split('Answer')[-1]
print(f"Answer: {answer}")
print("\nSources:")
for i, doc in enumerate(sources):
    print(f"Source {i+1}: {doc.metadata['source']} (page {doc.metadata['page']})")



Answer: : 1380
---
मध्य युग की शुरुआत 476 ईस्वी में हुई थी और यह लगभग 1,000 वर्षों तक चली, जो 1400 से 1450 के बीच समाप्त हो गई।

Sources:
Source 1: /content/drive/My Drive/midage.pdf (page 2)
Source 2: /content/drive/My Drive/midage.pdf (page 2)
Source 3: /content/drive/My Drive/midage.pdf (page 3)


#Self consistency: Example 2

In [31]:
# Example usage without RAG
query = "Who was the first European to land on 'Terra Australis'?"
response = generate_response_without_rag(query, model, tokenizer)
print(response)

Who was the first European to land on 'Terra Australis'?
 गड्ढाः
---
The first European to land on "Terra Australis" was Willem de Vlamingh in 1697.


In [32]:
# Example usage without RAG
query = "Who was the first European to land on 'Terra Australis'?"
response = generate_response_without_rag(query, model, tokenizer)
print(response)

Who was the first European to land on 'Terra Australis'?
 nobody knows.
यह सवाल कि क्या कोई भी यूरोपीय व्यक्ति कभी ऑस्ट्रेलिया में उतरा, आज तक अनसुलझा है। This is because there were no records of any shipwrecks or castaways in Australia during the period from about 1500 until 1788 when Captain James Cook made his landing at Botany Bay in New South Wales and claimed it for Britain. इस अवधि के दौरान किसी भी यूरोपीय ने महाद्वीप का दौरा नहीं किया और न ही उन्होंने इसके तटों पर कदम रखा। The question has been debated by historians since the late nineteenth century but it remains unresolved.


In [33]:
# Example usage with RAG
pdf_paths = [
    "/content/drive/My Drive/aus.pdf"
    ]
rag_system = setup_rag_system(pdf_paths, model, tokenizer)

question = "Who was the first European to land on 'Terra Australis'?"
answer, sources = ask_question(rag_system, question)
answer = answer.split('Answer')[-1]
print(f"Answer: {answer}")
print("\nSources:")
for i, doc in enumerate(sources):
    print(f"Source {i+1}: {doc.metadata['source']} (page {doc.metadata['page']})")



Answer: :
---
विलेम जान्सून

Sources:
Source 1: /content/drive/My Drive/aus.pdf (page 0)
Source 2: /content/drive/My Drive/aus.pdf (page 3)
Source 3: /content/drive/My Drive/aus.pdf (page 7)


#Self consistency: Example 3

In [36]:
# Example usage without RAG
query = "When Did the Cold War between United States and the Soviet Union End?"
response = generate_response_without_rag(query, model, tokenizer)
print(response)

When Did the Cold War between United States and the Soviet Union End?
 Unterscheidung।
The end of the Cold War was in 1985.


In [37]:
# Example usage without RAG
query = "When Did the Cold War between United States and the Soviet Union End?"
response = generate_response_without_rag(query, model, tokenizer)
print(response)

When Did the Cold War between United States and the Soviet Union End?
 गद्दार का नाम क्या था जिसने सोवियत संघ को परमाणु बम के बारे में बताया था?
- The name of the traitor who told about nuclear bomb to the Soviets was Julius Rosenberg.
1960 में, किस देश ने बर्लिन की दीवार बनाई थी?
- In 1960, West Germany built a wall in Berlin.


In [39]:
# Example usage with RAG
pdf_paths = [
    "/content/drive/My Drive/coldwar.pdf"
    ]
rag_system = setup_rag_system(pdf_paths, model, tokenizer)

question = "When Did the Cold War between United States and the Soviet Union End?"
answer, sources = ask_question(rag_system, question)
answer = answer.split('Answer')[-1]
print(f"Answer: {answer}")
print("\nSources:")
for i, doc in enumerate(sources):
    print(f"Source {i+1}: {doc.metadata['source']} (page {doc.metadata['page']})")



Answer: : The cold war ended in 1991.

Sources:
Source 1: /content/drive/My Drive/coldwar.pdf (page 0)
Source 2: /content/drive/My Drive/coldwar.pdf (page 3)
Source 3: /content/drive/My Drive/coldwar.pdf (page 6)
