In [1]:
import os
import time
import pandas as pd
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.llms import HuggingFacePipeline
import torch # For device management and dtype

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Configuration ---
VECTOR_STORE_DIR = '../data/vector_store'
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
# --- LOCAL LLM CONFIGURATION ---
# Using Zephyr-7B-Beta for local inference
# You might need to experiment with other quantized versions (e.g., GGUF via ctransformers or llama_cpp_python)
# if you face memory issues or extreme slowness on your specific hardware.
# 'HuggingFaceH4/zephyr-7b-beta' is the model ID
# For local inference, direct model loading is usually better than HuggingFaceHub for custom params
LLM_MODEL_LOCAL_NAME = "HuggingFaceH4/zephyr-7b-beta"
# --- ---


In [4]:
def get_embedding_model(model_name):
    """
    Loads the HuggingFace embedding model, ensuring GPU usage if available.
    (Copied from vector_store_indexing.py for consistency, though it's typically loaded once)
    """
    print(f"\n--- Loading embedding model: {model_name} ---")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device for embeddings: {device}")
    try:
        embeddings_model = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={'device': device}
        )
        print("Embedding model loaded successfully.")
        return embeddings_model
    except Exception as e:
        print(f"Error loading embedding model {model_name}: {e}")
        print("Please ensure you have 'sentence-transformers' and 'torch' installed.")
        return None

In [5]:
def load_vector_store(persist_directory, embedding_function):
    """
    Loads the persisted ChromaDB vector store.
    """
    print(f"\n--- Loading vector store from {persist_directory} ---")
    if not os.path.exists(persist_directory):
        print(f"Error: Vector store directory '{persist_directory}' not found. Please run Task 2 first.")
        return None
    try:
        vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding_function)
        print("Vector store loaded successfully.")
        return vectordb
    except Exception as e:
        print(f"Error loading vector store: {e}")
        return None

In [6]:
def get_local_llm_model(model_name):
    """
    Loads a local LLM using HuggingFace Transformers pipeline.
    This will attempt to use GPU if available. Quantization is applied.
    """
    print(f"\n--- Loading local LLM model: {model_name} ---")
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Load the model with 4-bit quantization for efficiency
        # requires `bitsandbytes` and `accelerate`
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16, # Use float16 for efficiency on GPU
            device_map="auto",         # Automatically places model parts on GPU/CPU
            load_in_4bit=True          # Enable 4-bit quantization
        )
        # Ensure the pad_token is set for generation, especially important for batching or some pipelines
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            print(f"Tokenizer pad_token set to eos_token: {tokenizer.pad_token}")


        # Create a text-generation pipeline
        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,  # Max tokens for the LLM to generate in response
            temperature=0.7,     # Controls randomness of output
            top_p=0.95,          # Nucleus sampling
            repetition_penalty=1.1 # Avoids repetitive text
        )

        llm = HuggingFacePipeline(pipeline=pipe)
        print(f"Local LLM '{model_name}' loaded successfully. Device map: {model.hf_device_map}")
        return llm
    except Exception as e:
        print(f"Error loading local LLM model {model_name}: {e}")
        print("Please ensure you have 'transformers', 'torch', 'accelerate', and 'bitsandbytes' installed.")
        print("If you have an NVIDIA GPU, ensure CUDA is properly set up.")
        return None

In [7]:
def implement_rag_system(vector_store, llm):
    """
    Implements the RAG system using a loaded LLM and a vector store.
    """
    print("\n--- Implementing RAG System ---")

    # Define the prompt template
    template = """Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    Use three sentences maximum and keep the answer as concise as possible.
    Always say "Thanks for asking!" at the end of the answer.

    {context}

    Question: {question}

    Helpful Answer:"""
    QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

    if vector_store is None:
        print("Error: Vector store is not loaded. Cannot implement RAG system.")
        return None
    if llm is None:
        print("Error: LLM model is not loaded. Cannot implement RAG system.")
        return None

    try:
        # Create the RetrievalQA chain
        rag_chain = RetrievalQA.from_chain_type(
            llm=llm,
            retriever=vector_store.as_retriever(search_kwargs={"k": 5}), # Retrieve top 5 relevant chunks
            return_source_documents=True, # Important for evaluation to see retrieved context
            chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
        )
        print("RAG system implemented successfully.")
        return rag_chain
    except Exception as e:
        print(f"Error implementing RAG system: {e}")
        return None


In [8]:
def qualitative_evaluation(rag_system, questions):
    """
    Performs a qualitative evaluation of the RAG system using representative questions.
    """
    print("\n--- Starting Qualitative Evaluation ---")
    if rag_system is None:
        print("RAG system not initialized. Skipping evaluation.")
        return

    for i, item in enumerate(questions):
        question = item["question"]
        product_filter = item.get("product_filter") # Optional filter
        print(f"\n--- Query {i+1}: ---")
        print(f"Question: {question}")
        if product_filter:
            print(f"Product Filter: {product_filter}")

        # The `invoke` method is preferred for chains in newer LangChain versions
        # It allows for more direct input and structured output access
        start_time = time.time()
        try:
            # If you need to add filters directly to the retriever, you'd modify how the retriever is created
            # For simplicity here, we assume the retriever works on the full vector store.
            # For product-specific filtering, you'd need to re-initialize the retriever with a query_kwargs
            # or a custom retriever that incorporates metadata filters before the similarity search.
            # Example (conceptual):
            # retriever = vector_store.as_retriever(search_kwargs={"k": 5, "filter": {"product": product_filter}})

            # For now, let's keep it simple without dynamic filtering unless your ChromaDB supports it out-of-the-box with .as_retriever()
            # If you need robust metadata filtering with Chroma, you'd define the retriever like this:
            # retriever = vector_store.as_retriever(search_type="similarity_score_threshold",
            #                                       search_kwargs={"score_threshold": 0.7, "k": 5, "filter": {"product": product_filter}})
            # However, `filter` typically expects `Eq`, `Gt`, `Lt`, etc. for comparison.
            # For basic equality, a dict can work.

            result = rag_system.invoke({"query": question})
            response_time = time.time() - start_time

            print(f"\nAnswer: {result['result']}")
            print(f"Response Time: {response_time:.2f} seconds")

            print("\n--- Retrieved Source Documents: ---")
            if result.get('source_documents'):
                for j, doc in enumerate(result['source_documents']):
                    print(f"  Source {j+1}:")
                    print(f"    Content (first 200 chars): {doc.page_content[:200]}...")
                    print(f"    Metadata: {doc.metadata}")
            else:
                print("  No source documents retrieved.")

        except Exception as e:
            print(f"  Error during query '{question}': {e}")
            print("  Skipping this question.")

    print("\n--- Qualitative Evaluation Finished ---")

In [None]:
if __name__ == "__main__":
    REPORTS_DIR = 'reports/'
    if not os.path.exists(REPORTS_DIR):
        os.makedirs(REPORTS_DIR)
        print(f"Created directory: {REPORTS_DIR}")

    print("Starting Task 3: Implement RAG System and Qualitative Evaluation")

    # 1. Load Embedding Model (Needed for Vector Store)
    embeddings = get_embedding_model(EMBEDDING_MODEL_NAME)
    if embeddings is None:
        print("Failed to load embedding model. Exiting Task 3.")
        exit()

    # 2. Load Vector Store
    vectordb = load_vector_store(VECTOR_STORE_DIR, embeddings)
    if vectordb is None:
        print("Failed to load vector store. Exiting Task 3.")
        exit()

    # 3. Load Local LLM
    llm_model = get_local_llm_model(LLM_MODEL_LOCAL_NAME)
    if llm_model is None:
        print("Failed to load local LLM model. Exiting Task 3.")
        exit()

    # 4. Implement RAG System
    rag_system = implement_rag_system(vectordb, llm_model)
    if rag_system is None:
        print("Failed to implement RAG system. Exiting Task 3.")
        exit()

    # 5. Define Representative Questions for Qualitative Evaluation
    representative_questions = [
        {"question": "What are the common issues people face with their credit cards?", "product_filter": "Credit card"},
        {"question": "Why are customers unhappy with Buy Now, Pay Later services?", "product_filter": "Buy Now, Pay Later"},
        {"question": "Summarize complaints about bank accounts.", "product_filter": "Bank account or service"},
        {"question": "What problems do consumers encounter with mortgages?", "product_filter": "Mortgage"},
        {"question": "Tell me about complaints regarding loan application rejections.", "product_filter": "Personal loan"}
    ]

    # Perform qualitative evaluation
    qualitative_evaluation(rag_system, representative_questions)

    print("\n--- Task 3 Completed ---")

Starting Task 3: Implement RAG System and Qualitative Evaluation

--- Loading embedding model: sentence-transformers/all-MiniLM-L6-v2 ---
Using device for embeddings: cpu


  embeddings_model = HuggingFaceEmbeddings(


Embedding model loaded successfully.

--- Loading vector store from ../data/vector_store ---


  vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding_function)


Vector store loaded successfully.

--- Loading local LLM model: HuggingFaceH4/zephyr-7b-beta ---


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers and GPU quantization are unavailable.
Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]