# Install Required Packages

In [None]:
%pip install PyMuPDF
%pip install langchain
%pip install -U sentence-transformers
%pip install rank-bm25
%pip install chromadb

# PDF Text Extraction (Using PyMuPDF)

In [None]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    The document is opened with ``fitz.open`` and each page's ``get_text()``
    output is concatenated in page order, with no separator between pages.
    """
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)

# Read the full text of the source PDF into memory
pdf_text = extract_text_from_pdf(pdf_path="ds1.pdf")


# Chunk Text Using LangChain

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pickle

# Split the extracted text (chunk_size=500, chunk_overlap=100)
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
)
chunks = splitter.split_text(pdf_text)

# Persist a (chunks, metadata) tuple for later cells; each metadata entry is
# a dict holding the chunk under the "text" key
chunk_records = [{"text": chunk} for chunk in chunks]
with open("text_chunks.pkl", "wb") as f:
    pickle.dump((chunks, chunk_records), f)


# Initialize ChromaDB + BM25

In [None]:
import numpy as np
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from chromadb import PersistentClient

# Load embedding model
embedding_model = SentenceTransformer("BAAI/bge-small-en-v1.5")

# Init ChromaDB (persisted on disk under ./chromadb)
client = PersistentClient(path="./chromadb")
db = client.get_or_create_collection("text_embeddings")

# Load chunks
# NOTE: pickle.load can execute arbitrary code embedded in the file. Only load
# a text_chunks.pkl that was produced by the chunking cell of this notebook.
with open("text_chunks.pkl", "rb") as f:
    chunk_texts, chunk_metadata = pickle.load(f)

# Tokenize for BM25 (plain whitespace split; hybrid_search splits queries the same way)
tokenized_docs = [text.split() for text in chunk_texts]
bm25 = BM25Okapi(tokenized_docs)

# Store in ChromaDB.
# - Chunks are embedded and written in batches rather than one call per chunk.
# - upsert (not add) is used because the collection is persistent and the ids
#   are just positional indices: re-running this cell must overwrite the
#   existing rows instead of colliding with them.
# NOTE(review): ids left over from an earlier run with MORE chunks are not
# removed here; delete the collection first if the source PDF changed.
INDEX_BATCH_SIZE = 256
for start in range(0, len(chunk_texts), INDEX_BATCH_SIZE):
    batch_texts = chunk_texts[start:start + INDEX_BATCH_SIZE]
    batch_embeddings = embedding_model.encode(batch_texts, convert_to_numpy=True).tolist()
    db.upsert(
        ids=[str(start + offset) for offset in range(len(batch_texts))],
        embeddings=batch_embeddings,
        metadatas=[{"text": text} for text in batch_texts],
    )


# Hybrid Search Function

In [None]:
def hybrid_search(query, chroma_db, bm25, k=5, top_n=3, rrf_k=60):
    """Retrieve the most relevant chunks for ``query`` using semantic + lexical search.

    The top ``k`` hits from the Chroma collection and the top ``k`` BM25 hits
    are merged with reciprocal rank fusion (each list contributes
    ``1 / (rrf_k + rank)`` per text), so a chunk found by both retrievers
    outranks a chunk found by only one, and the result order is deterministic.

    Args:
        query: The user question as a plain string.
        chroma_db: Chroma collection whose metadatas carry the chunk under "text".
        bm25: BM25 index built over the module-level ``chunk_texts`` (same order).
        k: Number of candidates to take from each retriever.
        top_n: Maximum number of fused results to return (default 3).
        rrf_k: Smoothing constant of the reciprocal rank fusion formula.

    Returns:
        A list of at most ``top_n`` distinct chunk texts, best first.
    """
    query_embedding = embedding_model.encode(query, convert_to_numpy=True).tolist()

    # Semantic (ChromaDB): hits are used in the order the query returns them.
    semantic_hits = chroma_db.query(
        query_embeddings=[query_embedding],
        n_results=k,
    )["metadatas"][0]
    semantic_texts = [hit["text"] for hit in semantic_hits]

    # BM25 (lexical): keep only chunks that actually matched a query term;
    # a zero score means no lexical evidence, so ranking it would add noise.
    bm25_scores = bm25.get_scores(query.split())
    ranked_indices = np.argsort(bm25_scores)[::-1][:k]
    lexical_texts = [chunk_texts[i] for i in ranked_indices if bm25_scores[i] > 0]

    # Reciprocal rank fusion. A dict keeps first-seen order, and sorted() is
    # stable, so ties resolve in favour of the semantic ranking.
    fused_scores = {}
    for ranking in (semantic_texts, lexical_texts):
        counted = set()
        for rank, text in enumerate(ranking, start=1):
            if text in counted:
                # Identical chunk text listed twice by one retriever: count it once.
                continue
            counted.add(text)
            fused_scores[text] = fused_scores.get(text, 0.0) + 1.0 / (rrf_k + rank)

    ordered = sorted(fused_scores, key=fused_scores.get, reverse=True)
    return ordered[:top_n]


# Answer Generation Using LLaMA 3.2 3B

In [None]:
import os
from dotenv import load_dotenv
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from huggingface_hub import login

# Read HUGGINGFACE_TOKEN from the environment (load_dotenv picks up a local .env first)
load_dotenv()
hf_token = os.getenv("HUGGINGFACE_TOKEN")

# Log in to the Hugging Face Hub with that token before fetching the model
login(hf_token)
model_name = "meta-llama/Llama-3.2-3B"

# Tokenizer + causal LM, wrapped in a text-generation pipeline
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    token=hf_token,
)
qa_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

def generate_answer(question, context):
    """Generate an answer to ``question`` using only the supplied ``context``.

    The prompt is sent to the module-level ``qa_pipeline``. Only the newly
    generated continuation is requested (``return_full_text=False``), so text
    inside the context or question can never be mistaken for the answer.
    The continuation is then cut at the first line that starts a new ``###``
    section, because a completion-style model tends to keep going with extra
    "### Question:" / "### Answer:" blocks after the real answer.

    Args:
        question: The user question.
        context: Retrieved passages the answer must be grounded in.

    Returns:
        The answer text with surrounding whitespace stripped.
    """
    prompt = f"""
You are an AI assistant that answers questions using the given context.
ONLY use the information from the context. If the answer is not in the context, reply "I don't know."

### Context:
{context}

### Question:
{question}

### Answer:
"""
    result = qa_pipeline(
        prompt,
        max_new_tokens=150,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        return_full_text=False,
    )
    completion = result[0]["generated_text"]
    # Prefix a newline so a header at the very start of the completion is
    # matched by the same "\n###" pattern as headers further down.
    return ("\n" + completion).split("\n###", 1)[0].strip()


# Confidence Check (Entailment Verification)

In [None]:
from transformers import pipeline as hf_pipeline

# Text-classification pipeline over the roberta-large-mnli checkpoint, with truncation enabled
entailment_pipeline = hf_pipeline(
    "text-classification",
    model="roberta-large-mnli",
    truncation=True,
)

def check_answer_confidence(question, context, answer):
    """Score whether ``answer`` is supported by ``context`` with the NLI pipeline.

    The context is passed as the first sequence and the answer as the second
    (``text`` / ``text_pair``), and truncation is left to the tokenizer.
    Slicing a concatenated string to a fixed number of characters, as was done
    before, dropped the answer whenever the context was long, so the
    classifier never saw what it was supposed to judge.

    Args:
        question: The user question. Kept for interface compatibility; it is
            not part of the pair sent to the classifier.
        context: Retrieved passages.
        answer: The generated answer to verify.

    Returns:
        A ``(label, score)`` tuple taken from the pipeline's prediction.
    """
    result = entailment_pipeline(
        {"text": context, "text_pair": answer},
        truncation=True,
    )
    # Depending on the input form the pipeline may hand back a bare dict or a
    # (possibly nested) list of dicts; unwrap down to the single prediction.
    while isinstance(result, list):
        result = result[0]
    return result["label"], result["score"]


# Final Execution

In [None]:
from chromadb import PersistentClient

# Open the persisted "text_embeddings" collection
client = PersistentClient(path="./chromadb")
chroma_db = client.get_or_create_collection("text_embeddings")

question = "What recognitions and awards were announced by the Vice Chancellor?"

# Retrieve supporting passages and flatten them into one context string
retrieved_contexts = hybrid_search(question, chroma_db, bm25)
combined_context = " ".join(retrieved_contexts)

# Generate the answer, then score it against the same context
answer = generate_answer(question, combined_context)
confidence_label, confidence_score = check_answer_confidence(
    question, combined_context, answer
)

# Warn before printing when the check is not ENTAILMENT or scores below 0.75
answer_is_suspect = confidence_label != "ENTAILMENT" or confidence_score < 0.75
if answer_is_suspect:
    print("\n🤔 The answer might be unreliable. Consider checking the source manually.\n")

print("Answer:\n", answer)
