In [6]:
!pip install sentence-transformers chromadb tqdm transformers accelerate bitsandbytes -qq
!pip install rouge-score -qq
!pip install ragas datasets -qq

In [7]:
import os, re
import math
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import chromadb
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import gc
from sklearn.metrics import precision_score, recall_score, f1_score
from rouge_score import rouge_scorer
import numpy as np

In [9]:
dataset_path = "/content/"
file_name = "Baronness Orczy___The Heart of a Woman.txt"
file_path = os.path.join(dataset_path, file_name)

In [10]:
if not os.path.exists(file_path):
    print(f"Error: File not found at {file_path}. Please upload the file first.")

with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
    raw_text = f.read()

print(f"Raw text length: {len(raw_text)} characters")

Raw text length: 450293 characters


In [11]:
def clean_gutenberg_text(text):
    text = re.sub(r'[_*#=~‚Äú‚Äù"\'‚Äî]+', ' ', text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

paragraphs = raw_text.split('\n\n')

filtered_chunks = []
for p in paragraphs:
    p = clean_gutenberg_text(p)
    if not p:
        continue
    if p.upper().startswith("CHAPTER") or len(p.split()) < 10:
        continue

    filtered_chunks.append(p)


print(f"\nExtracted {len(filtered_chunks)} chunks (Paragraphs)")
print("\n ‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á Chunk ‡∏´‡∏•‡∏±‡∏á clean:")
for example in filtered_chunks[:3]:
    print("-", example[:150] + ("..." if len(example) > 150 else ""))


Extracted 1741 chunks (Paragraphs)

 ‡∏ï‡∏±‡∏ß‡∏≠‡∏¢‡πà‡∏≤‡∏á Chunk ‡∏´‡∏•‡∏±‡∏á clean:
- No! No! she was not going to gush!--Not even though there was nothing in the room at this moment to stand up afterward before her as dumb witness to a...
- But a wood fire crackled on the small hearth . . . and . . . and those citron-coloured carnations were favourite flowers of his . . . and his picture ...
- And Louisa counted herself one of the strong ones of this earth. Just think of her name. Have you ever known a Louisa who gushed? who called herself t...


In [12]:
model_emb = SentenceTransformer("all-mpnet-base-v2")
print("\nUsing Embedding Model: all-mpnet-base-v2 (768 dimensions)")

client = chromadb.PersistentClient(path="/content/chroma_db_optimized")
try:
    client.delete_collection(name="baroness_orczy_optimized")
except:
    pass
collection = client.get_or_create_collection(name="baroness_orczy_optimized")

batch_size = 256
num_chunks = len(filtered_chunks)
num_batches = math.ceil(num_chunks / batch_size)

print(f"Uploading {num_chunks} chunks in {num_batches} batches...")

for i in tqdm(range(num_batches), desc="Uploading Chunks"):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, num_chunks)

    batch_chunks = filtered_chunks[start_idx:end_idx]

    batch_embeddings = model_emb.encode(batch_chunks, show_progress_bar=False).tolist()

    batch_ids = [file_name + "_" + str(start_idx + j) for j in range(len(batch_chunks))]

    collection.add(
        documents=batch_chunks,
        embeddings=batch_embeddings,
        ids=batch_ids
    )

print("All chunks uploaded successfully!")
print("Total documents in collection:", collection.count())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Using Embedding Model: all-mpnet-base-v2 (768 dimensions)
Uploading 1741 chunks in 7 batches...


Uploading Chunks: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:14<00:00,  2.07s/it]

All chunks uploaded successfully!
Total documents in collection: 1741





In [None]:
if torch.cuda.is_available():
    del model_emb
    gc.collect()
    torch.cuda.empty_cache()
    print("VRAM cleared before LLM load.")

device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
llm_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)
print("Mistral-7B-Instruct-v0.2 loaded with 4-bit Quantization.")

VRAM cleared before LLM load.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Mistral-7B-Instruct-v0.2 loaded with 4-bit Quantization.


In [None]:
query_text = "Why is Louisa overwhelmed with happiness at the beginning of the story?"
print("\nQuery:", query_text)

model_emb_query = SentenceTransformer("all-mpnet-base-v2").to(device)
query_embedding = model_emb_query.encode([query_text])

results = collection.query(
    query_embeddings=query_embedding.tolist(),
    n_results=3
)

del model_emb_query
gc.collect()
torch.cuda.empty_cache()

print("\nTop relevant chunks (Context):\n")
context = ""
for i, doc in enumerate(results["documents"][0]):
    print(f"--- Chunk {i+1} ---")
    print(doc[:250] + "...")
    context += doc + "\n\n"

prompt = f"""You are a helpful assistant.
Use the following context from the book to answer the question clearly and concisely in one paragraph.
If the answer cannot be found in the context, state that clearly.

Context:
{context}

Question: {query_text}

Answer:"""

inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=4096).to(device)

outputs = llm_model.generate(
    **inputs,
    max_new_tokens=350,
    temperature=0.7,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id
)


Query: Why is Louisa overwhelmed with happiness at the beginning of the story?

Top relevant chunks (Context):

--- Chunk 1 ---
Louisa gave ungrudging admiration, and whispered praise to the young girl. She was proud of Edie s behaviour, and grateful to her too. This atmosphere of reserve did her good. She could not have endured a scene of weeping, and keep her own nerves in ...
--- Chunk 2 ---
But Louisa, though a modern product of an ultra-modern world, was an absolutely ordinary woman--just a commonplace, sensible creature who thought and felt in a straight and essentially wholesome manner. Though she had read Tolstoi and Dostoyefsky and...
--- Chunk 3 ---
Louisa smiled confidently, proudly. He held her hand and she felt that his--hot and dry--quivered in every muscle at her touch. The commonplace woman had opened the magic book of Love. She had turned its first pages, the opening chapters had been sim...


In [31]:
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("\n" + "="*50)
print(" **RAG Model Answer**".center(50))
print("="*50)

answer_start = answer.find("Answer:")

if answer_start != -1:
    final_answer = answer[answer_start + len("Answer:"):].strip()

    formatted_answer = final_answer.replace('\n', ' ').replace('\r', '') # ‡∏•‡∏ö Newline ‡πÄ‡∏Å‡πà‡∏≤

    words = formatted_answer.split()
    line_length = 0
    max_line_length = 85

    final_output = ""
    for word in words:
        if line_length + len(word) + 1 > max_line_length:
            final_output += "\n" + word + " "
            line_length = len(word) + 1
        else:
            final_output += word + " "
            line_length += len(word) + 1

    print(final_output.strip())

else:
    print(answer)

print("="*50 + "\n")


               **RAG Model Answer**               
Louisa is overwhelmed with happiness at the beginning of the story because she has 
finally experienced love and her feelings for Edith's behavior and Edith's love for 
her have opened the "magic book of Love" for her. Despite encountering the 
complexity and challenges that come with love, Louisa remains unaffected and 
unruffled, reveling in her heartfelt connection with Edith.



In [27]:
test_QA = [
    {
        "question": "Why is Louisa overwhelmed with happiness at the beginning of the story?",
        "answer": "Because she receives a letter from Luke confessing his deep love and intention to marry her."
    },
    {
        "question": "Why does Louisa suppress her excitement even when she is alone?",
        "answer": "Because she believes a ‚Äòsensible and strong‚Äô woman like her should not behave romantically or foolishly."
    },
    {
        "question": "How does Louisa view herself in contrast to romantic heroines?",
        "answer": "She sees herself as ordinary, plain, and not the type of woman who experiences dramatic romance."
    },
    {
        "question": "What event abruptly breaks Louisa‚Äôs romantic mood?",
        "answer": "She witnesses a murder in a taxicab during a storm on Boulevard Waterloo."
    },
    {
        "question": "What does the murder scene symbolize in relation to Louisa‚Äôs emotional state?",
        "answer": "It contrasts sharply with her dreamy happiness, pulling her back to the harshness of reality."
    }
]

print("Loaded", len(test_QA), "evaluation questions.\n")

Loaded 5 evaluation questions.



In [28]:
from sentence_transformers import SentenceTransformer
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

model_emb_query = SentenceTransformer("all-mpnet-base-v2").to(device)

print("Model loaded on:", device)

Model loaded on: cuda


In [None]:
def recall_at_k(gt_answer, retrieved_docs, k):
    retrieved_text = " ".join(retrieved_docs[:k]).lower()
    return 1 if any(token in retrieved_text for token in gt_answer.lower().split()) else 0

def mrr_score(gt_answer, retrieved_docs):
    retrieved_text = " ".join(retrieved_docs).lower()
    for rank, doc in enumerate(retrieved_docs, start=1):
        if any(token in doc.lower() for token in gt_answer.lower().split()):
            return 1.0 / rank
    return 0.0

def exact_match(pred, gt):
    return int(pred.strip().lower() == gt.strip().lower())

def f1(pred, gt):
    pred_tokens = pred.lower().split()
    gt_tokens = gt.lower().split()

    common = set(pred_tokens) & set(gt_tokens)
    if len(common) == 0:
        return 0.0

    precision = len(common) / len(pred_tokens)
    recall = len(common) / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)

rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

retrieval_recalls = []
retrieval_mrrs = []
gen_em = []
gen_f1 = []
gen_rouge = []

for item in test_QA:
    q = item["question"]
    true_a = item["answer"]

    # 1) retrieval ‚Äî FIX HERE
    q_emb = model_emb_query.encode([q]).tolist()
    retrieved = collection.query(
        query_embeddings=q_emb,
        n_results=5
    )["documents"][0]

    # Retrieval metrics
    retrieval_recalls.append(recall_at_k(true_a, retrieved, k=5))
    retrieval_mrrs.append(mrr_score(true_a, retrieved))

    # 2) generation
    ctx = "\n".join(retrieved)
    prompt = f"""
    Context:
    {ctx}

    Question: {q}
    Answer briefly:
    """

    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = llm_model.generate(**inputs, max_new_tokens=200)
    pred_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Generation metrics
    gen_em.append(exact_match(pred_answer, true_a))
    gen_f1.append(f1(pred_answer, true_a))
    gen_rouge.append(rouge.score(pred_answer, true_a)["rougeL"].fmeasure)

print("\n==============================")
print("üìä RAG Evaluation Results")
print("==============================\n")

print(f"Retrieval Recall@5: {np.mean(retrieval_recalls):.3f}")
print(f"Retrieval MRR:      {np.mean(retrieval_mrrs):.3f}\n")

print(f"Generation EM:      {np.mean(gen_em):.3f}")
print(f"Generation F1:      {np.mean(gen_f1):.3f}")
print(f"Generation ROUGE-L: {np.mean(gen_rouge):.3f}")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



üìä RAG Evaluation Results

Retrieval Recall@5: 1.000
Retrieval MRR:      1.000

Generation EM:      0.000
Generation F1:      0.026
Generation ROUGE-L: 0.025


In [26]:
!zip -r chroma_db_optimized.zip /content/chroma_db_optimized

  adding: content/chroma_db_optimized/ (stored 0%)
  adding: content/chroma_db_optimized/chroma.sqlite3 (deflated 39%)
  adding: content/chroma_db_optimized/c5119f26-3daa-466d-9602-6577e45dc7b6/ (stored 0%)
  adding: content/chroma_db_optimized/c5119f26-3daa-466d-9602-6577e45dc7b6/link_lists.bin (deflated 86%)
  adding: content/chroma_db_optimized/c5119f26-3daa-466d-9602-6577e45dc7b6/index_metadata.pickle (deflated 91%)
  adding: content/chroma_db_optimized/c5119f26-3daa-466d-9602-6577e45dc7b6/header.bin (deflated 61%)
  adding: content/chroma_db_optimized/c5119f26-3daa-466d-9602-6577e45dc7b6/length.bin (deflated 84%)
  adding: content/chroma_db_optimized/c5119f26-3daa-466d-9602-6577e45dc7b6/data_level0.bin (deflated 9%)
