In [1]:
import logging
import os
from typing import List, Any
import hashlib
import numpy as np
from tqdm import tqdm

from langchain.docstore.document import Document as LangchainDocument
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import CrossEncoder

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

In [2]:
import langchain
print(langchain.__version__)

0.3.25


In [3]:
def initialize_embeddings(model_name: str = "sentence-transformers/all-MiniLM-L6-v2") -> HuggingFaceEmbeddings:
    """Initialize Hugging Face embeddings."""
    try:
        embeddings = HuggingFaceEmbeddings(
            model_name=model_name,
            multi_process=True,
            encode_kwargs={"normalize_embeddings": True}
        )
        logger.info("Embedding model initialized successfully.")
        return embeddings
    except Exception as e:
        logger.error(f"Error initializing embedding model: {str(e)}")
        raise

def initialize_reranker(model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2") -> CrossEncoder:
    """Initialize cross-encoder reranker."""
    try:
        reranker = CrossEncoder(model_name)
        logger.info("Reranker initialized successfully.")
        return reranker
    except Exception as e:
        logger.error(f"Error initializing reranker: {str(e)}")
        raise

# Initialize models
EMBEDDINGS = initialize_embeddings()
RERANKER = initialize_reranker()

2025-05-11 05:19:05,513 - INFO - Use pytorch device_name: cpu
2025-05-11 05:19:05,516 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
2025-05-11 05:19:24,730 - INFO - Embedding model initialized successfully.
2025-05-11 05:19:28,107 - INFO - Use pytorch device: cpu
2025-05-11 05:19:28,671 - INFO - Reranker initialized successfully.


In [4]:
def load_pdfs(directory: str = "data") -> List[LangchainDocument]:
    """Load all PDFs from a directory."""
    if not os.path.exists(directory):
        logger.error(f"Directory not found at {directory}")
        raise FileNotFoundError(f"Directory not found at {directory}")
    try:
        loader = PyPDFDirectoryLoader(directory)
        documents = loader.load()
        if not documents:
            logger.warning(f"No PDFs found in {directory}")
            raise ValueError(f"No PDFs found in {directory}")
        logger.info(f"Loaded {len(documents)} pages from PDFs in {directory}")
        return documents
    except Exception as e:
        logger.error(f"Error loading PDFs: {str(e)}")
        raise

def remove_duplicates(docs: List[LangchainDocument]) -> List[LangchainDocument]:
    """Remove duplicate documents based on content and metadata."""
    seen_hashes = set()
    unique_docs = []
    for doc in docs:
        content_hash = hashlib.md5(doc.page_content.encode('utf-8')).hexdigest()
        metadata_key = f"{doc.metadata.get('source', '')}:{doc.metadata.get('page', '')}"
        unique_key = f"{content_hash}:{metadata_key}"
        if unique_key not in seen_hashes:
            seen_hashes.add(unique_key)
            unique_docs.append(doc)
    return unique_docs

def split_documents(
    documents: List[LangchainDocument],
    chunk_size: int = 256,  # Optimized for CPU
    use_tokenizer: bool = True,
    tokenizer_name: str = "sentence-transformers/all-MiniLM-L6-v2"
) -> List[LangchainDocument]:
    """Split documents using RecursiveCharacterTextSplitter."""
    separators = [
        "\n#{1,6} ",
        "```\n",
        "\n\\*\\*\\*+\n",
        "\n---+\n",
        "\n___+\n",
        "\n\n",
        "\n",
        " ",
        "",
    ]
    try:
        if use_tokenizer:
            tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
            splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
                tokenizer,
                chunk_size=chunk_size,
                chunk_overlap=int(chunk_size / 10),
                add_start_index=True,
                strip_whitespace=True,
                separators=separators,
            )
        else:
            splitter = RecursiveCharacterTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=int(chunk_size / 10),
                add_start_index=True,
                strip_whitespace=True,
                separators=separators,
            )
        
        docs_processed = []
        for doc in tqdm(documents, desc="Splitting documents"):
            docs_processed += splitter.split_documents([doc])
        unique_docs = remove_duplicates(docs_processed)
        logger.info(f"Processed {len(unique_docs)} unique document chunks ({'tokenizer' if use_tokenizer else 'character'}-based)")
        return unique_docs
    except Exception as e:
        logger.error(f"Error splitting documents: {str(e)}")
        raise

def create_faiss_index(
    documents: List[LangchainDocument],
    embeddings: HuggingFaceEmbeddings,
    index_path: str
) -> FAISS:
    """Create or load FAISS index."""
    try:
        if os.path.exists(index_path):
            index = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)
            logger.info(f"Loaded existing FAISS index at {index_path}")
            return index
    except:
        pass
    try:
        index = FAISS.from_documents(documents, embeddings, distance_strategy="COSINE")
        os.makedirs(index_path, exist_ok=True)
        index.save_local(index_path)
        logger.info(f"Created and saved FAISS index at {index_path}")
        return index
    except Exception as e:
        logger.error(f"Error creating FAISS index: {str(e)}")
        raise

# Load and process documents
DATA_DIR = "data"
INDEX_PATH_TOKENIZER = "faiss_index_tokenizer"
INDEX_PATH_CHAR = "faiss_index_char"

RAW_DOCUMENTS = load_pdfs(DATA_DIR)
DOCS_TOKENIZER = split_documents(RAW_DOCUMENTS, use_tokenizer=True)
DOCS_CHAR = split_documents(RAW_DOCUMENTS, use_tokenizer=False)
INDEX_TOKENIZER = create_faiss_index(DOCS_TOKENIZER, EMBEDDINGS, INDEX_PATH_TOKENIZER)
INDEX_CHAR = create_faiss_index(DOCS_CHAR, EMBEDDINGS, INDEX_PATH_CHAR)

2025-05-11 05:19:34,444 - INFO - Loaded 10 pages from PDFs in data
Splitting documents: 100%|██████████| 10/10 [00:00<00:00, 26.78it/s]
2025-05-11 05:19:35,246 - INFO - Processed 19 unique document chunks (tokenizer-based)
Splitting documents: 100%|██████████| 10/10 [00:00<00:00, 1231.23it/s]
2025-05-11 05:19:35,279 - INFO - Processed 98 unique document chunks (character-based)
2025-05-11 05:19:35,429 - INFO - Loading faiss with AVX2 support.
2025-05-11 05:19:37,481 - INFO - Successfully loaded faiss with AVX2 support.
2025-05-11 05:19:37,513 - INFO - Failed to load GPU Faiss: name 'GpuIndexIVFFlat' is not defined. Will not load constructor refs for GPU indexes. This is only an error if you're trying to use GPU Faiss.
2025-05-11 05:19:37,613 - INFO - Loaded existing FAISS index at faiss_index_tokenizer
2025-05-11 05:19:37,678 - INFO - Loaded existing FAISS index at faiss_index_char


In [5]:
def initialize_llm(model_name: str = "distilgpt2") -> HuggingFacePipeline:
    """Initialize DistilGPT-2 LLM."""
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=100,
            do_sample=True,
            temperature=0.7,
            repetition_penalty=1.1,
            device=-1
        )
        llm = HuggingFacePipeline(pipeline=pipe)
        logger.info("DistilGPT-2 initialized successfully.")
        return llm
    except Exception as e:
        logger.error(f"Error initializing DistilGPT-2: {str(e)}")
        raise

def create_conversational_rag_chain(
    vectorstore: FAISS,
    reranker: CrossEncoder,
    llm: HuggingFacePipeline,
    k: int = 10,
    top_n: int = 2
) -> ConversationalRetrievalChain:
    """Create Conversational RAG chain with FAISS retriever and reranker."""
    memory = ConversationBufferMemory(
        memory_key="chat_history",
        return_messages=True,
        output_key="answer",
        max_token_limit=500
    )
    
    prompt_template = PromptTemplate.from_template(
        """You are an expert on Adama Science and Technology University (ASTU) policies. Using only the provided context from ASTU policy documents, provide a concise and accurate answer to the question. Include the source document number if relevant. If the answer cannot be found in the context, respond with "I don't know" and nothing else. Do not generate information beyond the context. Consider the chat history for context.

Chat History:
{chat_history}
Context:
{context}
---
Question: {question}"""
    )
    
    def format_docs(documents: List[LangchainDocument], query: str) -> str:
        """Rerank and format documents."""
        try:
            if not documents:
                return ""
            pairs = [[query, doc.page_content] for doc in documents]
            scores = reranker.predict(pairs, batch_size=32)
            sorted_docs = [doc for _, doc in sorted(zip(scores, documents), key=lambda x: x[0], reverse=True)]
            top_docs = sorted_docs[:top_n]
            if not top_docs:
                return ""
            return "\n".join([f"Document {i}:::\n{doc.page_content}" for i, doc in enumerate(top_docs)])
        except Exception as e:
            logger.error(f"Error in format_docs: {str(e)}")
            return ""
    
    try:
        retriever = vectorstore.as_retriever(search_kwargs={"k": k})
        
        chain = ConversationalRetrievalChain.from_llm(
            llm=llm,
            retriever=retriever,
            memory=memory,
            combine_docs_chain_kwargs={
                "prompt": prompt_template,
                "document_variable_name": "context",
                "document_prompt": PromptTemplate.from_template("Document {i}:::\n{page_content}")
            },
            return_source_documents=False,
            verbose=False,
            get_chat_history=lambda h: h,
            chain_type="stuff"
        )
        
        # Wrap combine_docs_chain to include reranking
        original_combine_docs = chain.combine_docs_chain
        chain.combine_docs_chain = lambda docs, question, **kwargs: original_combine_docs(
            docs=[LangchainDocument(page_content=format_docs(docs, question))], question=question, **kwargs
        )
        
        logger.info("Conversational RAG chain created successfully.")
        return chain
    except Exception as e:
        logger.error(f"Error creating Conversational RAG chain: {str(e)}")
        raise

# Initialize LLM and RAG chains
LLM = initialize_llm()
RAG_CHAIN_TOKENIZER = create_conversational_rag_chain(INDEX_TOKENIZER, RERANKER, LLM)
RAG_CHAIN_CHAR = create_conversational_rag_chain(INDEX_CHAR, RERANKER, LLM)

Device set to use cpu
2025-05-11 05:19:41,267 - INFO - DistilGPT-2 initialized successfully.
  memory = ConversationBufferMemory(
2025-05-11 05:19:41,272 - INFO - Conversational RAG chain created successfully.
2025-05-11 05:19:41,276 - INFO - Conversational RAG chain created successfully.


In [6]:
def test_rag_chain(rag_chain: ConversationalRetrievalChain, queries: List[str], method: str) -> None:
    """Test RAG chain with a list of queries and print results."""
    for query in queries:
        logger.info(f"Processing query: {query} with {method} splitting")
        try:
            result = rag_chain({"question": query})
            print(f"\nMethod: {method}")
            print(f"Query: {query}")
            print(f"Answer: {result['answer']}")
            print(f"Chat History: {rag_chain.memory.buffer}")
        except Exception as e:
            logger.error(f"Error processing query '{query}': {str(e)}")
            print(f"Error: {str(e)}")

# Test queries
test_queries = [
    "What are ASTU’s add and drop policies?",
    "Tell me more about the deadlines.",
    "What does Article 82 say about student conduct?",
    "What is the capital of France?"
]

# Test tokenizer-based chain
print("Testing Tokenizer-based Splitting")
test_rag_chain(RAG_CHAIN_TOKENIZER, test_queries, "Tokenizer-based")

# Reset memory
RAG_CHAIN_TOKENIZER.memory.clear()
logger.info("Tokenizer-based chat history cleared")

# Test character-based chain
print("\nTesting Character-based Splitting")
test_rag_chain(RAG_CHAIN_CHAR, test_queries, "Character-based")

# Reset memory
RAG_CHAIN_CHAR.memory.clear()
logger.info("Character-based chat history cleared")

2025-05-11 05:19:41,327 - INFO - Processing query: What are ASTU’s add and drop policies? with Tokenizer-based splitting
  result = rag_chain({"question": query})


Testing Tokenizer-based Splitting


2025-05-11 05:19:41,672 - INFO - CUDA/NPU is not available. Starting 4 CPU workers
2025-05-11 05:19:41,673 - INFO - Start multi-process pool on devices: cpu, cpu, cpu, cpu


Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-11 05:23:19,112 - ERROR - Error processing query 'What are ASTU’s add and drop policies?': 'function' object has no attribute 'run'
2025-05-11 05:23:19,113 - INFO - Processing query: Tell me more about the deadlines. with Tokenizer-based splitting
2025-05-11 05:23:19,116 - INFO - CUDA/NPU is not available. Starting 4 CPU workers
2025-05-11 05:23:19,117 - INFO - Start multi-process pool on devices: cpu, cpu, cpu, cpu


Error: 'function' object has no attribute 'run'


Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-11 05:24:16,468 - ERROR - Error processing query 'Tell me more about the deadlines.': 'function' object has no attribute 'run'
2025-05-11 05:24:16,471 - INFO - Processing query: What does Article 82 say about student conduct? with Tokenizer-based splitting
2025-05-11 05:24:16,476 - INFO - CUDA/NPU is not available. Starting 4 CPU workers
2025-05-11 05:24:16,479 - INFO - Start multi-process pool on devices: cpu, cpu, cpu, cpu


Error: 'function' object has no attribute 'run'


Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-11 05:24:53,271 - ERROR - Error processing query 'What does Article 82 say about student conduct?': 'function' object has no attribute 'run'
2025-05-11 05:24:53,273 - INFO - Processing query: What is the capital of France? with Tokenizer-based splitting
2025-05-11 05:24:53,277 - INFO - CUDA/NPU is not available. Starting 4 CPU workers
2025-05-11 05:24:53,278 - INFO - Start multi-process pool on devices: cpu, cpu, cpu, cpu


Error: 'function' object has no attribute 'run'


Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-11 05:25:36,353 - ERROR - Error processing query 'What is the capital of France?': 'function' object has no attribute 'run'
2025-05-11 05:25:36,356 - INFO - Tokenizer-based chat history cleared
2025-05-11 05:25:36,357 - INFO - Processing query: What are ASTU’s add and drop policies? with Character-based splitting
2025-05-11 05:25:36,359 - INFO - CUDA/NPU is not available. Starting 4 CPU workers
2025-05-11 05:25:36,363 - INFO - Start multi-process pool on devices: cpu, cpu, cpu, cpu


Error: 'function' object has no attribute 'run'

Testing Character-based Splitting


Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-11 05:26:14,462 - ERROR - Error processing query 'What are ASTU’s add and drop policies?': 'function' object has no attribute 'run'
2025-05-11 05:26:14,463 - INFO - Processing query: Tell me more about the deadlines. with Character-based splitting
2025-05-11 05:26:14,465 - INFO - CUDA/NPU is not available. Starting 4 CPU workers
2025-05-11 05:26:14,467 - INFO - Start multi-process pool on devices: cpu, cpu, cpu, cpu


Error: 'function' object has no attribute 'run'


Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-11 05:26:51,755 - ERROR - Error processing query 'Tell me more about the deadlines.': 'function' object has no attribute 'run'
2025-05-11 05:26:51,757 - INFO - Processing query: What does Article 82 say about student conduct? with Character-based splitting
2025-05-11 05:26:51,761 - INFO - CUDA/NPU is not available. Starting 4 CPU workers
2025-05-11 05:26:51,764 - INFO - Start multi-process pool on devices: cpu, cpu, cpu, cpu


Error: 'function' object has no attribute 'run'


Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-11 05:27:28,645 - ERROR - Error processing query 'What does Article 82 say about student conduct?': 'function' object has no attribute 'run'
2025-05-11 05:27:28,646 - INFO - Processing query: What is the capital of France? with Character-based splitting
2025-05-11 05:27:28,651 - INFO - CUDA/NPU is not available. Starting 4 CPU workers
2025-05-11 05:27:28,654 - INFO - Start multi-process pool on devices: cpu, cpu, cpu, cpu


Error: 'function' object has no attribute 'run'


Chunks:   0%|          | 0/1 [00:00<?, ?it/s]

2025-05-11 05:28:09,342 - ERROR - Error processing query 'What is the capital of France?': 'function' object has no attribute 'run'
2025-05-11 05:28:09,344 - INFO - Character-based chat history cleared


Error: 'function' object has no attribute 'run'
