In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_community.llms import HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [None]:
import os

# Drive folder used as a persistent cache for the quantized model files.
MODEL_PATH = "/content/drive/MyDrive/mistral_7b_4bit"
# Create the cache folder up front; this is a no-op when it already exists.
os.makedirs(name=MODEL_PATH, exist_ok=True)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_name = "mistralai/Mistral-7B-Instruct-v0.1"

# 4-bit NF4 quantization with float16 compute and no double quantization.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)


def _has_saved_model(path):
    """Return True when `path` contains a config.json and at least one weight file.

    A plain "directory is not empty" test is not enough: the tokenizer is
    saved before the model, so an interrupted or failed first run leaves
    tokenizer files behind and every later run would then try (and fail) to
    load a model that was never written.
    """
    if not os.path.isfile(os.path.join(path, "config.json")):
        return False
    # NOTE(review): a weight file that was only partially written would still
    # pass this check; delete the folder manually if loading keeps failing.
    return any(
        entry.endswith((".safetensors", ".bin")) for entry in os.listdir(path)
    )


if not _has_saved_model(MODEL_PATH):
    print("Downloading model for first time...")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="auto"
    )

    # Save permanently to Drive
    tokenizer.save_pretrained(MODEL_PATH)
    model.save_pretrained(MODEL_PATH)

    print("Model saved to Drive!")
else:
    print("Loading model from Drive...")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        quantization_config=quantization_config,
        device_map="auto"
    )

Downloading model for first time...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]



tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Model saved to Drive!


In [None]:

# Text-generation pipeline around the quantized model.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=300,
    # `temperature` only takes effect in sampling mode; without do_sample=True
    # generation is greedy and the temperature setting is ignored.
    do_sample=True,
    temperature=0.7,
    # Return only the newly generated text. Otherwise the prompt is echoed
    # back in front of every completion and leaks into the answer, the
    # critique and the revision.
    return_full_text=False,
)

# LangChain wrapper so the pipeline can be composed with prompts and parsers.
llm = HuggingFacePipeline(pipeline=pipe)

Passing `generation_config` together with generation-related arguments=({'temperature', 'max_new_tokens'}) is deprecated and will be removed in future versions. Please pass either a `generation_config` object OR all generation parameters explicitly, but not both.
  llm = HuggingFacePipeline(pipeline=pipe)


In [None]:
# Embedding model used to index and query the document chunks.
# It is placed on the "cuda" device and asked to normalize its embeddings.
embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
    model_kwargs=dict(device="cuda"),
    encode_kwargs=dict(normalize_embeddings=True),
)

  embedding_model = HuggingFaceEmbeddings(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: BAAI/bge-base-en-v1.5
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Load the demo PDF into LangChain documents.
loader = PyPDFLoader(file_path="/content/demo_pdf.pdf")
documents = loader.load()

In [None]:
# Split the loaded documents into overlapping chunks for retrieval
# (chunk_size=500, chunk_overlap=100, using the splitter's default length measure).
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=500)

docs = text_splitter.split_documents(documents)

In [None]:
# Deterministic ids (source + chunk position) make this cell idempotent.
# The store is persisted to ./chroma_db, so without explicit ids every re-run
# would add the same chunks again under fresh random ids, and the top-k results
# would fill up with duplicates.
# NOTE(review): entries written by earlier runs without ids (or by a previous,
# longer version of the PDF) stay in ./chroma_db; delete the folder to reset.
chunk_ids = [
    f"{doc.metadata.get('source', 'doc')}:{idx}" for idx, doc in enumerate(docs)
]

vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embedding_model,
    ids=chunk_ids,
    persist_directory="./chroma_db"
)

# Retrieve the 3 most similar chunks for each query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

In [None]:

# The chains below feed a plain text-completion LLM, so plain string templates
# are used. A chat template would be rendered to text with a "Human: " role
# prefix in front of every prompt.

# Prompt + chain that drafts an answer from the retrieved context.
answer_prompt = PromptTemplate.from_template("""
You are a helpful assistant.
Answer the question using ONLY the context below.

Context:
{context}

Question:
{question}

Answer:
""")

answer_chain = answer_prompt | llm | StrOutputParser()


# Prompt + chain that asks the model whether the draft is grounded in the
# context. The reply format ("SUPPORTED: yes/no") is what self_rag inspects.
critique_prompt = PromptTemplate.from_template("""
You are a strict fact-checker.

Context:
{context}

Answer:
{answer}

Is the answer fully supported by the context?

Reply strictly in this format:
SUPPORTED: yes/no
REASON: short explanation
""")

critique_chain = critique_prompt | llm | StrOutputParser()


# Prompt + chain that rewrites a draft the critique judged unsupported.
revision_prompt = PromptTemplate.from_template("""
The previous answer was not fully supported by the context.

Improve it so it strictly follows the given context.

Context:
{context}

Original Answer:
{answer}

Revised Answer:
""")

revision_chain = revision_prompt | llm | StrOutputParser()


def _critique_rejects(critique: str) -> bool:
    """Return True when the critique contains an explicit "SUPPORTED: no" verdict.

    The match is case-insensitive and tolerant of the formatting drift small
    models produce: missing or repeated spaces around the colon and markdown
    asterisks ("**SUPPORTED:** no"). The word boundary keeps text such as
    "supported: nothing ..." from counting as a "no", and the format line
    "SUPPORTED: yes/no" never matches because "yes" follows the colon.
    """
    import re  # local import keeps this cell runnable on its own

    return re.search(r"\bsupported\s*:[\s*]*no\b", critique, flags=re.IGNORECASE) is not None


def self_rag(question: str) -> str:
    """Answer `question` from retrieved context, revising the draft if the critique rejects it.

    Steps: retrieve chunks with `retriever`, draft an answer with
    `answer_chain`, have `critique_chain` judge whether the draft is supported
    by the context, and call `revision_chain` only when the verdict is "no".
    Intermediate results are printed as they are produced.

    Args:
        question: The user question to answer.

    Returns:
        The revised answer when the critique says the draft is unsupported,
        otherwise the initial answer.
    """
    # Step 1: Retrieve documents
    retrieved = retriever.invoke(question)
    context = "\n\n".join(chunk.page_content for chunk in retrieved)

    # Step 2: Generate answer
    answer = answer_chain.invoke({
        "context": context,
        "question": question
    })

    print("Initial Answer:\n", answer)

    # Step 3: Critique
    critique = critique_chain.invoke({
        "context": context,
        "answer": answer
    })

    print("\nCritique:\n", critique)

    # Step 4: Keep the draft unless the critique explicitly rejects it
    if not _critique_rejects(critique):
        return answer

    revised = revision_chain.invoke({
        "context": context,
        "answer": answer
    })

    print("\nRevised Answer:\n", revised)
    return revised

In [None]:
# Run the full retrieve / answer / critique / revise flow on a sample question.
response = self_rag(question="What is the main topic of this document?")
print("\nFinal Answer:\n", response)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=300) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=300) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Initial Answer:
 Human: 
You are a helpful assistant.
Answer the question using ONLY the context below.

Context:
This query retrieves all the records from the Employees table. 
2. What is a Database? 
A database is an organized collection of data that is stored and managed 
electronically. It allows users to efficiently store, retrieve, update, and 
manage data. Databases are used to handle large amounts of information 
in various applications such as websites, business systems, and 
applications.  
Example:  
A customer database in an e-commerce website may store customer details

database. 
 SELECT, INSERT, UPDATE, DELETE 
3.DCL (Data Control Language) – Controls access to the data. 
 GRANT, REVOKE 
4.TCL (Transaction Control Language) – Manages transactions in the 
database. 
 COMMIT, ROLLBACK, SA VEPOINT 
5.DQL (Data Query Language) – Retrieves data from the database.

applications.  
Example:  
A customer database in an e-commerce website may store customer details 
like name, em