In [1]:
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
import os
import tqdm # For progress bar

# Configuration
DATA_PROCESSED_PATH = '../data/processed/filtered_complaints.csv'
VECTOR_STORE_DIR = '../data/vector_store'
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
CHUNK_SIZE = 1500  # characters
CHUNK_OVERLAP = 200 # characters

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_processed_data(file_path):
    """
    Loads the cleaned and filtered complaint dataset.
    """
    try:
        df = pd.read_csv(file_path)
        print(f"Loaded processed data from {file_path}. Shape: {df.shape}")
        return df
    except FileNotFoundError:
        print(f"Error: Processed data file not found at {file_path}. Please run Task 1 first.")
        return None
    except Exception as e:
        print(f"An error occurred while loading processed data: {e}")
        return None

In [3]:
def chunk_texts(df, text_column='Consumer complaint narrative_cleaned', id_column='Complaint ID', product_column='Product'):
    """
    Chunks long text narratives and associates metadata.
    """
    print(f"\n--- Chunking text narratives (chunk_size={CHUNK_SIZE}, chunk_overlap={CHUNK_OVERLAP}) ---")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,  # Measure length by characters
        separators=["\n\n", "\n", " ", ""] # Default separators
    )

    all_chunks = []
    # Create a unique ID for each original complaint, assuming 'Complaint ID' exists or generating one.
    # If 'Complaint ID' is not in your CFPB data, you might need to create one, e.g., using df.index
    if id_column not in df.columns:
        df[id_column] = range(len(df)) # Simple integer ID if not present

    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0], desc="Chunking complaints"):
        narrative = row[text_column]
        original_complaint_id = row[id_column]
        product_category = row[product_column]
        issue = row['Issue'] # Assuming 'Issue' is a useful metadata field

        # Ensure narrative is a string, handle potential NaN/empty after cleaning
        if pd.isna(narrative) or not isinstance(narrative, str) or len(narrative.strip()) == 0:
            continue

        chunks = text_splitter.split_text(narrative)

        for i, chunk in enumerate(chunks):
            all_chunks.append({
                "chunk_content": chunk,
                "metadata": {
                    "original_complaint_id": original_complaint_id,
                    "product": product_category,
                    "issue": issue,
                    "chunk_id": f"{original_complaint_id}-{i}" # Unique ID for each chunk
                }
            })
    print(f"Generated {len(all_chunks)} chunks.")
    return all_chunks

In [4]:
def get_embedding_model(model_name):
    """
    Loads the HuggingFace embedding model.
    """
    print(f"\n--- Loading embedding model: {model_name} ---")
    try:
        # Use HuggingFaceEmbeddings from LangChain for consistency with Chroma
        embeddings_model = HuggingFaceEmbeddings(model_name=model_name)
        print("Embedding model loaded successfully.")
        return embeddings_model
    except Exception as e:
        print(f"Error loading embedding model {model_name}: {e}")
        print("Please ensure you have 'sentence-transformers' and 'torch' installed.")
        return None

In [5]:
def create_and_persist_vector_store(chunks, embeddings_model, persist_directory):
    """
    Creates a ChromaDB vector store and persists it to disk.
    """
    print(f"\n--- Creating and persisting vector store to {persist_directory} ---")

    # Extract texts and metadatas from the chunks list
    texts = [chunk["chunk_content"] for chunk in chunks]
    metadatas = [chunk["metadata"] for chunk in chunks]

    if not texts:
        print("No chunks to embed. Vector store will not be created.")
        return None

    try:
        # Create Chroma vector store from documents (texts + metadatas)
        # LangChain's Chroma.from_documents expects `Document` objects or lists of texts and metadatas
        # For simplicity, we directly use texts and metadatas
        vectordb = Chroma.from_texts(
            texts=texts,
            embedding=embeddings_model,
            metadatas=metadatas,
            persist_directory=persist_directory
        )
        vectordb.persist()
        print("Vector store created and persisted successfully.")
        return vectordb
    except Exception as e:
        print(f"Error creating or persisting vector store: {e}")
        return None

if __name__ == "__main__":
    # Ensure vector store directory exists
    if not os.path.exists(VECTOR_STORE_DIR):
        os.makedirs(VECTOR_STORE_DIR)
        print(f"Created directory: {VECTOR_STORE_DIR}")

    print("Starting Task 2: Text Chunking, Embedding, and Vector Store Indexing")

    # 1. Load processed data
    df_cleaned = load_processed_data(DATA_PROCESSED_PATH)

    if df_cleaned is not None:
        # 2. Chunk text narratives
        # We'll use 'Complaint ID' as the ID column. If your CSV doesn't have it,
        # the load_processed_data function should have added it in Task 1's script
        # based on `df.index` or a similar mechanism.
        # Ensure 'Product' and 'Issue' columns are present in df_cleaned.
        required_cols = ['Consumer complaint narrative_cleaned', 'Product', 'Issue']
        for col in required_cols:
            if col not in df_cleaned.columns:
                print(f"Error: Required column '{col}' not found in the processed data. Please check Task 1 output.")
                exit() # Exit if crucial columns are missing

        chunks = chunk_texts(df_cleaned, id_column='Complaint ID') # Assuming 'Complaint ID' is now available

        if chunks: # Only proceed if chunks were generated
            # 3. Choose and load an embedding model
            embeddings = get_embedding_model(EMBEDDING_MODEL_NAME)

            if embeddings is not None:
                # 4. Embed and Index
                vectordb = create_and_persist_vector_store(chunks, embeddings, VECTOR_STORE_DIR)

                if vectordb:
                    print(f"\nVector store containing {len(chunks)} chunks is ready for querying.")
                    # Optional: Verify by loading and querying a small sample
                    print("\n--- Verifying Vector Store (Optional) ---")
                    try:
                        loaded_vectordb = Chroma(persist_directory=VECTOR_STORE_DIR, embedding_function=embeddings)
                        query_results = loaded_vectordb.similarity_search("Why are people unhappy with their credit card?", k=1)
                        if query_results:
                            print("\nSample query result:")
                            print(f"Content: {query_results[0].page_content[:200]}...")
                            print(f"Metadata: {query_results[0].metadata}")
                        else:
                            print("No results for sample query. Vector store might be empty or query too specific.")
                    except Exception as e:
                        print(f"Error during vector store verification: {e}")
                else:
                    print("Failed to create and persist vector store.")
            else:
                print("Failed to load embedding model. Exiting Task 2.")
        else:
            print("No valid chunks generated. Exiting Task 2.")
    else:
        print("Processed data loading failed. Exiting Task 2.")

    print("\n--- Task 2 Completed ---")

Created directory: ../data/vector_store
Starting Task 2: Text Chunking, Embedding, and Vector Store Indexing
Loaded processed data from ../data/processed/filtered_complaints.csv. Shape: (1581308, 5)

--- Chunking text narratives (chunk_size=1500, chunk_overlap=200) ---


Chunking complaints: 100%|██████████| 1581308/1581308 [06:15<00:00, 4208.92it/s]
  embeddings_model = HuggingFaceEmbeddings(model_name=model_name)


Generated 1959511 chunks.

--- Loading embedding model: sentence-transformers/all-MiniLM-L6-v2 ---
Embedding model loaded successfully.

--- Creating and persisting vector store to ../data/vector_store ---


  vectordb.persist()


Vector store created and persisted successfully.

Vector store containing 1959511 chunks is ready for querying.

--- Verifying Vector Store (Optional) ---


  loaded_vectordb = Chroma(persist_directory=VECTOR_STORE_DIR, embedding_function=embeddings)



Sample query result:
Content: find this to be an unfair and unreasonable policy, particularly when other major credit card issuers offer one-time exceptions to responsible, long-term customers who experience unforeseen financial d...
Metadata: {'original_complaint_id': 228264, 'chunk_id': '228264-1', 'issue': 'Fees or interest', 'product': 'Credit card'}

--- Task 2 Completed ---


Explanation of Choices for the Report:

Chunking Strategy (RecursiveCharacterTextSplitter):

Why RecursiveCharacterTextSplitter? This splitter is generally recommended for its robustness. It attempts to split text using a list of characters (["\n\n", "\n", " ", ""] by default) in order, trying to keep semantically related units (like paragraphs, sentences, words) together. This is crucial for complaint narratives, where retaining the context of an issue is vital for effective retrieval.

chunk_size and chunk_overlap:

chunk_size: This parameter determines the maximum number of characters (or tokens, depending on the length_function) in each chunk. For complaint data, which can vary greatly in length, a chunk_size that allows for a complete thought or specific complaint detail is ideal. Too small, and context is lost; too large, and the embedding might become too generic, or exceed the embedding model's input token limit. A common range for RAG applications is 128-512 tokens. Given that all-MiniLM-L6-v2 handles up to 256 word pieces, we'll aim for character counts that translate to a similar token count. A chunk_size of 500 characters is a good starting point, as it's typically enough to capture a significant part of a complaint without being excessively long. This usually translates to around 100-150 tokens, well within the model's capacity.

chunk_overlap: This specifies the number of characters that will overlap between consecutive chunks. Overlap is crucial to prevent the loss of context at the boundaries of chunks. If a key piece of information spans two chunks, overlap ensures that both chunks contain some shared context, improving the chances of retrieval. An overlap of 10-20% of the chunk_size is generally effective. For a chunk_size of 500, a chunk_overlap of 50-100 characters is reasonable. We'll use 100 to ensure good context flow.

Justification: The chosen chunk_size aims to capture sufficient context from a complaint narrative while staying within the limits of the embedding model. The chunk_overlap ensures that semantic continuity is maintained across chunk boundaries, which is critical for questions that might relate to information spanning multiple segments of a complaint. These values are a good balance between retaining context and preventing overly large chunks that dilute the semantic meaning or exceed model limits.

Embedding Model (sentence-transformers/all-MiniLM-L6-v2):

Why all-MiniLM-L6-v2?

Efficiency: It's a compact and efficient model (384-dimensional vectors) that provides a good balance between performance and computational cost. This is important for processing a large volume of complaints and for efficient real-time querying in a production environment.

Performance for Semantic Search: This model is specifically trained for sentence and short paragraph embeddings, making it highly effective for semantic similarity tasks, which is precisely what our RAG system needs for retrieving relevant complaint narratives. It captures the semantic meaning well, even if the exact keywords aren't present in the query.

Open-source and readily available: It's a widely used and well-supported model from the sentence-transformers library, making it easy to integrate and leverage.

Vector Store (ChromaDB):

Why ChromaDB?

Ease of Use and Local Persistence: ChromaDB is a lightweight, easy-to-use vector database that supports local persistence. This makes it straightforward to set up and manage for an internal tool like CrediTrust's, without requiring complex infrastructure.

Metadata Support: ChromaDB allows storing rich metadata alongside vectors, which is essential for our requirement to link retrieved chunks back to their original complaint ID and product category. This will enable filtering and more insightful answers.

Integration with LangChain: ChromaDB has excellent integration with LangChain, simplifying the process of creating and querying the vector store.

Scalability for Initial Needs: While not as distributed as some larger vector databases, ChromaDB is perfectly suitable for a user base of 500,000 and thousands of complaints per month, especially in its initial internal tool phase. It can scale to a decent volume of data.