<a href="https://colab.research.google.com/github/ghoshmoumita04/EnterpriseSearch/blob/main/Dummy_Enterprise_Knowledge_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Imports (TOP of file)

import wikipedia
import re
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer


In [None]:
#Text Cleaning Function

def clean_text(text):
    # Remove LaTeX blocks
    text = re.sub(r"\$.*?\$", " ", text)
    text = re.sub(r"\{\\displaystyle.*?\}", " ", text)

    # Remove equations and math-heavy lines
    text = re.sub(r"=.*?\]", " ", text)

    # Remove brackets and references
    text = re.sub(r"\[.*?\]", " ", text)
    text = re.sub(r"\(.*?\)", " ", text)

    # Remove non-ASCII junk
    text = re.sub(r"[^\x00-\x7F]+", " ", text)

    # Normalize whitespace
    text = re.sub(r"\n+", " ", text)
    text = re.sub(r"\s+", " ", text)

    return text.strip()




In [None]:
#Wikipedia Corpus Loader (Data Ingestion)

import wikipedia

def load_wikipedia_corpus(topics):
    documents = []

    for topic in topics:
        try:
            page = wikipedia.page(topic, auto_suggest=False)
            documents.append({
                "title": page.title,
                "content": page.content,
                "url": page.url
            })
        except Exception as e:
            print(f"Skipping {topic}: {e}")

    return documents


topics = [
    "Artificial intelligence",
    "Machine learning",
    "Transformer (machine learning)",
    "Vector embeddings",
    "Retrieval augmented generation"
]

documents = load_wikipedia_corpus(topics)
len(documents)


Skipping Vector embeddings: Page id "Vector embeddings" does not match any pages. Try another id!


4

In [None]:
#Chunking (Critical for RAG)

def chunk_text(text, chunk_size=500, overlap=50):
    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        start = end - overlap

    return chunks


chunked_docs = []
for doc in documents:
    chunks = chunk_text(doc["content"])
    for chunk in chunks:
        chunked_docs.append({
            "title": doc["title"],
            "content": chunk,
            "url": doc["url"]
        })

len(chunked_docs)


608

In [None]:
#Embedding Model (SentenceTransformer)

from sentence_transformers import SentenceTransformer
import numpy as np

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

texts = [d["content"] for d in chunked_docs]
embeddings = embedding_model.encode(texts, convert_to_numpy=True)

embeddings.shape


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


(608, 384)

In [None]:
#FAISS Vector Index

import faiss

dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

print("FAISS index size:", index.ntotal)


FAISS index size: 608


In [None]:
#Semantic Search (Top-K Retrieval)

def retrieve_top_k(query, top_k=5):
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, top_k)

    results = []
    for idx in indices[0]:
        results.append(chunked_docs[idx])

    return results


In [None]:
#Prompt Construction (RAG Core)

def build_prompt(query, retrieved_docs):
    context = "\n\n".join(
        [f"- {doc['content']}" for doc in retrieved_docs]
    )

    prompt = f"""
You are an expert assistant.
Answer the question ONLY using the context below.
If the answer is not present, say "I don't know".

Context:
{context}

Question:
{query}

Answer:
"""
    return prompt




In [None]:
#PEFT Fine-Tuned Model (Answer Generator)

!pip install bitsandbytes accelerate transformers --upgrade

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

base_model_name = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(base_model_name)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto"
)




Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]



In [None]:
#Apply PEFT (LoRA)

from peft import PeftModel

peft_model_path = "./lora_wikipedia_rag"

# model = PeftModel.from_pretrained(base_model, peft_model_path)
# model.eval()

# NOTE: The above lines are commented out because the PEFT model 'lora_wikipedia_rag' has not been trained and saved.
# If you wish to use a fine-tuned model, you need to first train and save it,
# or uncomment these lines if 'lora_wikipedia_rag' already exists at the specified path.
# For now, the 'base_model' loaded previously will be used for answer generation.

In [None]:
#Answer Generation (Grounded)

def generate_answer(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(base_model.device)

    with torch.no_grad():
        output = base_model.generate(
            **inputs,
            max_new_tokens=300,
            temperature=0.2,
            do_sample=False
        )

    return tokenizer.decode(output[0], skip_special_tokens=True)


In [None]:
#End-to-End Query Flow

query = "What is retrieval augmented generation?"

top_docs = retrieve_top_k(query, top_k=5)
prompt = build_prompt(query, top_docs)
answer = generate_answer(prompt)

print(answer)


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


KeyboardInterrupt: 

In [None]:
#PEFT Fine-Tuning (Training Skeleton)

from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    task_type="CAUSAL_LM"
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()


trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.0470


In [None]:
print("\nüîç Semantic Search (type 'exit' to quit)\n")

while True:
    query = input("Enter your query: ")

    if query.lower() == "exit":
        print("Goodbye üëã")
        break

    results = retrieve_top_k(query)

    print("\nTop Results:\n")
    for i, res in enumerate(results, 1):
        print(f"--- Result {i} ---")
        print(res["content"][:500])
        print()


üîç Semantic Search (type 'exit' to quit)


Top Results:

--- Result 1 ---
 
                          K
                          
                            
                              T
                            
                          
                        
                      
                      
                        
                          d
                          
                            k
                          
                        
                      
                    
                  
                  )
          

--- Result 2 ---
                
                                T
                              
                            
                          
                        
                        
                          
                            d
                            
                              k
                            
                          
                        
                 