In [None]:
import os
import torch
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
# For Flan-T5, we use AutoModelForSeq2SeqLM
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
# New import for HuggingFacePipeline as per deprecation warning fix
from langchain_huggingface import HuggingFacePipeline

# --- Configuration for Task 3 ---
LLM_MODEL_LOCAL_NAME = "google/flan-t5-base"
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
VECTOR_STORE_DIR = '../data/vector_store' # Assumes Task 2 has created this

In [18]:
def get_embedding_model(model_name: str):
    """
    Loads a HuggingFace embedding model.
    Forces CPU usage as per user's system configuration (no GPU).
    """
    device = 'cpu'
    print(f"Loading embedding model '{model_name}' on device: {device}")
    try:
        embeddings_model = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={'device': device}
        )
        print(f"Embedding model '{model_name}' loaded successfully.")
        return embeddings_model
    except Exception as e:
        print(f"Error loading embedding model {model_name}: {e}")
        return None

In [19]:
def load_vector_store(persist_directory: str, embedding_function):
    """
    Loads the persisted ChromaDB vector store.
    """
    print(f"Loading vector store from '{persist_directory}'...")
    if not os.path.exists(persist_directory):
        print(f"Error: Vector store directory '{persist_directory}' not found. Please run Task 2 first.")
        return None
    try:
        vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding_function)
        print("Vector store loaded successfully.")
        return vectordb
    except Exception as e:
        print(f"Error loading vector store: {e}")
        return None

In [20]:
def get_local_llm_model(model_name: str):
    """
    Loads a local LLM using HuggingFace Transformers pipeline.
    Uses AutoModelForSeq2SeqLM for T5 models, which are CPU-friendly.
    """
    print(f"\n--- Loading local LLM model: {model_name} ---")
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        
        # For Flan-T5, we typically don't need bitsandbytes quantization.
        # It's a smaller model that fits well in CPU RAM.
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_name,
            device_map="auto" # This will automatically place the model on CPU if no GPU is available
        )

        # Flan-T5 models are typically used for 'text2text-generation' tasks
        pipe = pipeline(
            "text2text-generation", # Changed task for T5 models
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,  # Max tokens for the LLM to generate in response
            temperature=0.7,     # Controls randomness of output (may be ignored by T5 pipeline)
            top_p=0.95,          # Nucleus sampling (may be ignored by T5 pipeline)
            repetition_penalty=1.1, # Avoids repetitive text
        )

        # Use the new HuggingFacePipeline from langchain_huggingface
        llm = HuggingFacePipeline(pipeline=pipe)
        print(f"Local LLM '{model_name}' loaded successfully. Device map: {model.hf_device_map}")
        return llm
    except Exception as e:
        print(f"Error loading local LLM model {model_name}: {e}")
        print("Please ensure you have 'transformers', 'torch', and 'accelerate' installed.")
        return None

In [21]:
def implement_rag_system(vector_store, llm):
    """
    Implements the Retrieval-Augmented Generation (RAG) system.
    Combines the vector store retriever with the LLM using a specific prompt.
    """
    print("\n--- Implementing RAG system ---")
    # Updated Prompt Template as per Task 3 requirements
    template = """You are a financial analyst assistant for CrediTrust. Your task is to answer questions about customer complaints. Use the following retrieved complaint excerpts to formulate your answer. If the context doesn't contain the answer, state that you don't have enough information.

    Context: {context}
    Question: {question}
    Answer:"""
    QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

    if vector_store is None or llm is None:
        print("Error: Vector store or LLM not initialized for RAG system.")
        return None

    try:
        rag_chain = RetrievalQA.from_chain_type(
            llm=llm,
            retriever=vector_store.as_retriever(search_kwargs={"k": 5}), # Retrieve top 5 relevant documents
            return_source_documents=True, # Important for displaying sources
            chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
        )
        print("RAG system implemented successfully.")
        return rag_chain
    except Exception as e:
        print(f"Error implementing RAG system: {e}")
        return None

# --- Qualitative Evaluation Section ---
if __name__ == "__main__":
    # Ensure you have run Task 1 and Task 2 first to create 'data/processed_consumer_complaints.csv'
    # and 'vector_store/' directory.

    print("--- Starting Task 3: Building RAG Core Logic and Evaluation ---")

    # 1. Get embedding model
    embeddings = get_embedding_model(EMBEDDING_MODEL_NAME)
    if embeddings is None:
        print("Exiting Task 3 due to embedding model loading error.")
        exit()

    # 2. Load vector store
    vectordb = load_vector_store(VECTOR_STORE_DIR, embeddings)
    if vectordb is None:
        print("Exiting Task 3 due to vector store loading error. Please ensure Task 2 completed successfully.")
        exit()

    # 3. Get local LLM model
    llm_model = get_local_llm_model(LLM_MODEL_LOCAL_NAME)
    if llm_model is None:
        print("Exiting Task 3 due to LLM model loading error.")
        exit()

    # 4. Implement RAG system
    rag_chain = implement_rag_system(vectordb, llm_model)
    if rag_chain is None:
        print("Exiting Task 3 due to RAG system implementation error.")
        exit()

    print("\nRAG System Ready for Qualitative Evaluation.")
    print("\n--- Running Qualitative Evaluation ---")
    print("Please manually assess the 'Quality Score' (1-5) and 'Comments/Analysis' for your report.")

    # List of representative questions for evaluation
    evaluation_questions = [
        "What is the complaint about credit card?",
        "Describe issues related to bank accounts.",
        "Tell me about problems with mortgages.",
        "What are common issues customers face with student loans?",
        "Summarize complaints about debt collection.",
        "What are the complaints regarding vehicle loans?",
        "Tell me about issues with credit reporting.",
        "What problems do people have with prepaid cards?",
        "Describe a complaint where a customer was charged incorrectly.",
        "What is the most common product complained about?"
    ]

    evaluation_results = []

    for i, question in enumerate(evaluation_questions):
        print(f"\n--- Question {i+1}: {question} ---")
        try:
            # Invoke the RAG chain
            result = rag_chain.invoke({"query": question})
            generated_answer = result['result']
            source_documents = result.get('source_documents', [])

            print(f"Generated Answer: {generated_answer}")

            # Prepare sources for display
            sources_info = []
            if source_documents:
                print("\nRetrieved Sources (Top 2 for brevity in console):")
                for j, doc in enumerate(source_documents[:2]): # Show top 2 sources
                    source_id = doc.metadata.get('original_complaint_id', 'N/A')
                    product = doc.metadata.get('product', 'N/A')
                    content_preview = doc.page_content[:300].replace('\n', ' ') + '...' # Preview first 300 chars
                    
                    print(f"  - Source {j+1} (ID: {source_id}, Product: {product}): {content_preview}")
                    sources_info.append(f"**Source {j+1} (ID: {source_id}, Product: {product}):**\n```\n{content_preview}\n```")
            else:
                print("No relevant source documents retrieved.")
                sources_info.append("No relevant source documents retrieved.")

            # Store results for table generation (Quality Score and Comments are placeholders for manual input)
            evaluation_results.append({
                "Question": question,
                "Generated Answer": generated_answer,
                "Retrieved Sources": "\n\n".join(sources_info),
                "Quality Score (1-5)": "", # To be filled manually
                "Comments/Analysis": ""   # To be filled manually
            })

        except Exception as e:
            print(f"Error processing question '{question}': {e}")
            evaluation_results.append({
                "Question": question,
                "Generated Answer": f"Error: {e}",
                "Retrieved Sources": "N/A",
                "Quality Score (1-5)": "1",
                "Comments/Analysis": "System error during processing."
            })

    print("\n--- Qualitative Evaluation Complete ---")
    print("\nCopy the following Markdown table structure into your report and fill in the 'Quality Score' and 'Comments/Analysis' columns manually:")
    print("\n```markdown")
    print("| Question | Generated Answer | Retrieved Sources (1-2) | Quality Score (1-5) | Comments/Analysis |")
    print("|---|---|---|---|---|")
    for res in evaluation_results:
        # Escape pipe characters in content to avoid breaking Markdown table
        answer = res['Generated Answer'].replace('|', '\\|')
        sources = res['Retrieved Sources'].replace('|', '\\|')
        print(f"| {res['Question']} | {answer} | {sources} | {res['Quality Score (1-5)']} | {res['Comments/Analysis']} |")
    print("```")

--- Starting Task 3: Building RAG Core Logic and Evaluation ---
Loading embedding model 'sentence-transformers/all-MiniLM-L6-v2' on device: cpu
Embedding model 'sentence-transformers/all-MiniLM-L6-v2' loaded successfully.
Loading vector store from '../data/vector_store'...
Vector store loaded successfully.

--- Loading local LLM model: google/flan-t5-base ---


Device set to use cpu
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Local LLM 'google/flan-t5-base' loaded successfully. Device map: {'': 'cpu'}

--- Implementing RAG system ---
RAG system implemented successfully.

RAG System Ready for Qualitative Evaluation.

--- Running Qualitative Evaluation ---
Please manually assess the 'Quality Score' (1-5) and 'Comments/Analysis' for your report.

--- Question 1: What is the complaint about credit card? ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generated Answer: customer-hostile behavior and should be severely sanctioned for such incompetent and abuse behavior

Retrieved Sources (Top 2 for brevity in console):
  - Source 1 (ID: 230545, Product: Credit card): i have previously submitted this complaint. my credit card was used over the of holiday in for 1200.00. i was not in at anytime over the nor did i give my card to anyone to use. the company removed the charge for 290.00 on xxxx to the same service provider, . is well know on the internet for having ...
  - Source 2 (ID: 427754, Product: Money transfer, virtual currency, or money service): their credit card, when they are clearly at fault, is customer-hostile behavior and should be severely sanctioned for such incompetent and abuse behavior. and in this deplorable situation, it is also ....

--- Question 2: Describe issues related to bank accounts. ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generated Answer: closed account balances transferred to wrong account and cents missing from one account to the other, charges a off as more money taken than the receipts, cashing wage checks having money deposited back to business account when should have been deposited in personal accounts, business revenue deposited in personal accounts by bank employees causing accounting and audit trail problems, bankers filling in account numbers and using the incorrect accounts, bankers performing transactions without authorization of the account holder or holders, processing checks or convoluting the transaction of the check cashing process, redacting information on deposit slips and just too many similar items to list.

Retrieved Sources (Top 2 for brevity in console):
  - Source 1 (ID: 1332621, Product: Checking or savings account): multi-faceted bank errors. recent audit of accounts and transactions found closed account balances transferred to wrong account and cents missing from one accoun

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generated Answer: they get their money! all of it!! why would they need to ruin the lives of three people trying to get by. i would appreciate a prompt response to this matter

Retrieved Sources (Top 2 for brevity in console):
  - Source 1 (ID: 1231876, Product: Debt collection): i have made every mortgage payment since opening. these people are inept....
  - Source 2 (ID: 845488, Product: Credit reporting, credit repair services, or other personal consumer reports): - all mortgage loan inquiries....

--- Question 4: What are common issues customers face with student loans? ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generated Answer: interest accrual and loan payoff timelines.

Retrieved Sources (Top 2 for brevity in console):
  - Source 1 (ID: 379740, Product: Student loan): the constant mismanagement of student loan debt. i have had , among with others and now passed on to them who are doing the same...
  - Source 2 (ID: 1495947, Product: Student loan): frustrating when i just want to pay my student loans. every complaint i make falls on deaf ears and the lender does not care at all about its customers. i am also scared to death that due to their incompetence they will report this issue to a credit agency or " fix '' the account but still consider ...

--- Question 5: Summarize complaints about debt collection. ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generated Answer: debt collection for 600.00 not aware of the debt and the company it was originally owed to. this continues to harsh by multiple phone calls.

Retrieved Sources (Top 2 for brevity in console):
  - Source 1 (ID: 1095365, Product: Debt collection): debt collection for 600.00 not aware of the debt and the company it was originally owed to. this continues to harsh by multiple phone calls. this debt has been on my report for a very long time affecting the report by ....
  - Source 2 (ID: 44053, Product: Debt collection): of the debt. thank you for your attention to this matter. please let me know if additional documentation or information is required to support my complaint. i am committed to resolving this issue and ensuring accountability for debt collectors who fail to comply with federal laws....

--- Question 6: What are the complaints regarding vehicle loans? ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generated Answer: unfair business practices, unfair interest rates, and unfair pricing on vehicles. between xxxx and xxxxyear, i have called and requested the proper loan documents repeatedly. i have also attempted to pay on the vehicle and had the lending staff be rude and irrational. never took payments. also refused to send proper documentation to register the vehicle at the local dmv. professional finance services, end of the loan and refused validation letter. as the vehicle has been paid for, refused to speak of amount owed. no letters of curing the loan, no information of a repossession, illegal towing company, two weeks to find impound lot, sent letter to sell, no response from sell, now civil suit. this company has had over 200 complaints in the past 3 years and over 20 in last 12 months. this company was legally reprimanded in of 2018. all the same issues, non compliance with debt collections, harassing and adding additional charges without reference to. hindering payoff and 

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generated Answer: there are problems with my credit report

Retrieved Sources (Top 2 for brevity in console):
  - Source 1 (ID: 1141809, Product: Credit reporting, credit repair services, or other personal consumer reports): problem with credit reporting....
  - Source 2 (ID: 1141838, Product: Credit reporting, credit repair services, or other personal consumer reports): problem with credit reporting....

--- Question 8: What problems do people have with prepaid cards? ---


Token indices sequence length is longer than the specified maximum sequence length for this model (766 > 512). Running this sequence through the model will result in indexing errors
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generated Answer: debt card not working, some times. also, there might be problems that i'm not aware off. general issues with debit card

Retrieved Sources (Top 2 for brevity in console):
  - Source 1 (ID: 231182, Product: Checking or savings account): 2000.00 over not mine prepaid card...
  - Source 2 (ID: 103682, Product: Credit card): i was denied 3 different cards one of them was prepaid...

--- Question 9: Describe a complaint where a customer was charged incorrectly. ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Generated Answer: i had an error charge from a merchant. the merchant acknowledged the charge was in error but told me they would issue me a refund only when i go in to their store physically, which is not acceptable to me. i called chase to dispute this erroneous charge from a merchant. the merchant acknowledged the charge was in error but told me they would issue me a refund only when i go in to their store physically, which is not acceptable to me. i called chase to dispute this erroneous charge from a merchant. the merchant acknowledged the charge was in error but told me they would issue me a refund only when i go in to their store physically, which is not acceptable to me. i called chase to dispute this erroneous charge from a merchant. the merchant acknowledged the charge was in error but told me they would issue me a refund only when i go in to their store physically, which is not acceptable to me. i called chase to dispute this erroneous charge from a merchant. the merchant ac