# Task 2: Text Chunking, Embedding, and Vector Store

## Objective
Convert cleaned customer complaint narratives into semantically searchable vector embeddings while preserving metadata.

## Status
The heavy lifting (sampling -> chunking -> embedding -> indexing) is done by `src/embedding_pipeline.py` for robustness and reproducibility. This notebook only verifies that pipeline's outputs: the sampled data, the persisted vector store, and a sample retrieval.

In [None]:
import pandas as pd
import chromadb
from sentence_transformers import SentenceTransformer
import os

# Config
# Both paths are relative, so they resolve against the notebook's working directory.
# Directory handed to chromadb.PersistentClient when the vector store is opened.
VECTOR_STORE_DIR = '../vector_store'
# CSV of sampled complaints; read with pandas to check its shape and product mix.
SAMPLE_DATA_PATH = '../data/processed/complaints_sample.csv'
# Model name passed to SentenceTransformer to embed the test query.
# NOTE(review): presumably must match the model the pipeline used to build the index - confirm.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

## 1. Verify Sampled Data

In [None]:
# Report on the sampled CSV: its shape and the relative share of each product.
# A missing file is reported rather than raised.
if not os.path.exists(SAMPLE_DATA_PATH):
    print("Sample file not found (Pipeline might still be running).")
else:
    df_sample = pd.read_csv(SAMPLE_DATA_PATH)
    print(f"Sample file found. Shape: {df_sample.shape}")
    print("Product Distribution in Sample:")
    # normalize=True yields fractions of the sample instead of raw counts.
    product_share = df_sample["product"].value_counts(normalize=True)
    print(product_share)

## 2. Load Vector Store (Persistence Check)

In [None]:
# Open the persisted ChromaDB store and look up the "complaints_rag" collection.
print(f"Loading ChromaDB from {VECTOR_STORE_DIR}...")
client = chromadb.PersistentClient(path=VECTOR_STORE_DIR)

# Reset the handle first: if the lookup fails, later cells must not silently
# reuse a stale `collection` left over from an earlier run of this cell.
collection = None
try:
    collection = client.get_collection("complaints_rag")
    print(f"Collection loaded. Count: {collection.count()}")
except Exception as e:
    # Best-effort check: report the problem instead of aborting the notebook.
    # `collection` may be None (lookup failed) or set (only count() failed).
    print(f"Error loading collection: {e}")

## 3. Sanity Retrieval Test

In [None]:
# End-to-end smoke test: embed one sample query with EMBEDDING_MODEL_NAME and
# print the three closest chunks returned by the collection.
query = "Billing issues with credit cards"
print(f"Query: {query}")

model = SentenceTransformer(EMBEDDING_MODEL_NAME)
# encode() is given a one-element list, so this holds exactly one query vector.
query_vec = model.encode([query]).tolist()

results = collection.query(
    query_embeddings=query_vec,
    n_results=3
)

# Results are nested per query; index 0 belongs to the single query sent above.
documents = results['documents'][0]
metadatas = results['metadatas'][0]

if not documents:
    print("No results returned.")

for rank, (text, meta) in enumerate(zip(documents, metadatas), start=1):
    # Chroma may return None for an entry stored without metadata or document
    # text; fall back to empty values so the report still prints.
    meta = meta or {}
    text = text or ""
    print(f"\nResult {rank}:")
    print(f"Product: {meta.get('product')}")
    print(f"Issue: {meta.get('issue')}")
    # Preview only the first 200 characters of each chunk.
    print(f"Text: {text[:200]}...")