# Task 2: Text Chunking, Embedding, and Vector Store Indexing

## Objective
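Convert the cleaned complaint narratives from Task 1 into text chunks, embed each chunk, and index the embeddings in a vector store so that they can be retrieved efficiently by semantic search.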

## Steps:
1. Implement text chunking strategy
2. Choose and justify embedding model
3. Generate embeddings for text chunks
4. Create vector store using ChromaDB
5. Store embeddings with metadata
6. Test retrieval functionality

In [3]:
!pip install onnxruntime





[notice] A new release of pip available: 22.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [4]:
# Import required libraries
import json
import os
import time
import uuid
import warnings
from typing import List, Dict, Any

import chromadb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from chromadb.config import Settings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
# Suppress every Python warning for the rest of the session to keep cell output compact.
# NOTE(review): this also hides deprecation warnings from the libraries imported above.
warnings.filterwarnings('ignore')

print("✅ Libraries imported successfully")

ValueError: The onnxruntime python package is not installed. Please install it with `pip install onnxruntime`

## 1. Load Processed Data

In [None]:
# Load the processed complaint data from Task 1
# The path is relative, so it is resolved against the notebook's working directory.
data_path = '../data/filtered_complaints.csv'

# Fail fast, pointing at the prerequisite notebook, when the Task 1 output is missing.
if not os.path.exists(data_path):
    print("❌ Processed data not found. Please run Task 1 first.")
    print("Expected file: ../data/filtered_complaints.csv")
    raise FileNotFoundError("Run notebook 01_data_exploration.ipynb first")

print("Loading processed complaint data...")
df = pd.read_csv(data_path)
print(f"✅ Loaded {len(df):,} complaint records")
print(f"📊 Columns: {list(df.columns)}")

# Display basic statistics
# These lines require the 'Product', 'cleaned_word_count' and 'Date received' columns.
# NOTE(review): ', '.join(...) raises TypeError if 'Product' holds a non-string value
# (e.g. NaN) — confirm Task 1 guarantees that it does not.
print(f"\n📈 Dataset Statistics:")
print(f"Products: {df['Product'].nunique()} unique ({', '.join(df['Product'].unique())})")
print(f"Average narrative length: {df['cleaned_word_count'].mean():.1f} words")
print(f"Median narrative length: {df['cleaned_word_count'].median():.1f} words")
print(f"Date range: {df['Date received'].min()} to {df['Date received'].max()}")

# Last expression of the cell: rendered as the cell's output.
df.head()

## 2. Text Chunking Strategy

### Why Chunking?
Long narratives are often ineffective when embedded as a single vector because:
- They may contain multiple distinct topics
- Embedding models have token limits
- Smaller chunks provide more precise retrieval

### Chunking Parameters:
- **Chunk Size**: 500 characters (balance between context and precision)
- **Overlap**: 50 characters (maintain context continuity)
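- **Strategy**: Recursive character splitting (tries paragraph, line, and sentence separators first, then falls back to word and single-character splits when a piece is still too long)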

In [None]:
# Analyze narrative lengths to determine optimal chunking strategy
print("=== NARRATIVE LENGTH ANALYSIS ===")

# Character count analysis
# .str.len() yields NaN for missing narratives, so those rows are excluded from the statistics.
char_counts = df['cleaned_narrative'].str.len()
print(f"Character count statistics:")
print(char_counts.describe())

# Visualize length distribution
# One figure with two side-by-side axes: histogram (left) and per-product box plot (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Histogram of character counts
ax1.hist(char_counts, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
ax1.set_title('Distribution of Narrative Character Counts', fontweight='bold')
ax1.set_xlabel('Character Count')
ax1.set_ylabel('Frequency')
# Vertical reference lines for the mean (red) and the median (green).
ax1.axvline(char_counts.mean(), color='red', linestyle='--', label=f'Mean: {char_counts.mean():.0f}')
ax1.axvline(char_counts.median(), color='green', linestyle='--', label=f'Median: {char_counts.median():.0f}')
ax1.legend()

# Box plot by product
# Note: this panel uses word counts, whereas the histogram above uses character counts.
df.boxplot(column='cleaned_word_count', by='Product', ax=ax2)
ax2.set_title('Narrative Length by Product', fontweight='bold')
ax2.set_xlabel('Product')
ax2.set_ylabel('Word Count')
# plt.xticks acts on the current axes — presumably ax2 here; confirm if the layout changes.
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

# Determine chunking parameters
# These two globals are passed to create_text_chunks and written to the saved config later on.
chunk_size = 500  # characters
chunk_overlap = 50  # characters

print(f"\n=== CHUNKING STRATEGY ===")
print(f"Chunk size: {chunk_size} characters")
print(f"Chunk overlap: {chunk_overlap} characters")
print(f"Rationale:")
print(f"  - Median narrative length: {char_counts.median():.0f} chars")
print(f"  - {chunk_size} chars ≈ 75-100 words (good for semantic coherence)")
print(f"  - {chunk_overlap} chars overlap maintains context continuity")
print(f"  - Recursive splitting respects sentence boundaries")

In [None]:
# Implement text chunking
def _metadata_value(value: Any, default: str = 'Unknown') -> Any:
    """Return *value*, or *default* when the value is missing (None/NaN/NA)."""
    # Blank CSV cells are loaded by pandas as NaN, so `row.get(col, 'Unknown')`
    # alone never applies the default when the column exists but the cell is empty.
    if pd.isna(value):
        return default
    return value


def create_text_chunks(df: pd.DataFrame, chunk_size: int = 500, chunk_overlap: int = 50) -> List[Dict[str, Any]]:
    """Split complaint narratives into overlapping chunks with per-chunk metadata.

    Args:
        df: Complaints table. Must contain the 'cleaned_narrative' and 'Product'
            columns; 'Issue', 'Company', 'Date received' and 'State' are optional.
        chunk_size: Maximum chunk length in characters.
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        One dict per kept chunk with the keys 'id', 'text', 'complaint_id',
        'product', 'issue', 'company', 'date_received', 'state', 'chunk_index',
        'original_length', 'chunk_length' and 'total_chunks'. Missing metadata
        values are replaced by 'Unknown'. Rows whose narrative is missing or
        blank are skipped, as are chunks of 20 characters or fewer.
    """
    print("Creating text chunks...")

    # Separators are tried in order: paragraph, line, sentence end, word, character.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", ". ", "! ", "? ", " ", ""]
    )

    chunks = []

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Processing narratives"):
        narrative = row['cleaned_narrative']

        # A blank narrative comes back from CSV as NaN (a float); the splitter
        # only accepts strings, so skip anything that is not usable text.
        if not isinstance(narrative, str) or not narrative.strip():
            continue

        # Split text into chunks
        text_chunks = text_splitter.split_text(narrative)

        for chunk_idx, chunk in enumerate(text_chunks):
            if len(chunk.strip()) > 20:  # Only keep meaningful chunks
                chunk_data = {
                    'id': f"{idx}_{chunk_idx}",
                    'text': chunk.strip(),
                    'complaint_id': idx,
                    'product': _metadata_value(row['Product']),
                    'issue': _metadata_value(row.get('Issue', 'Unknown')),
                    'company': _metadata_value(row.get('Company', 'Unknown')),
                    'date_received': str(_metadata_value(row.get('Date received', 'Unknown'))),
                    'state': _metadata_value(row.get('State', 'Unknown')),
                    'chunk_index': chunk_idx,
                    'original_length': len(narrative),
                    'chunk_length': len(chunk),
                    # Counts every chunk produced by the splitter, including
                    # short ones that are filtered out above.
                    'total_chunks': len(text_chunks)
                }
                chunks.append(chunk_data)

    return chunks

# Create chunks
# `chunks` is a module-level list reused by the embedding, storage and plotting cells below.
chunks = create_text_chunks(df, chunk_size, chunk_overlap)

# NOTE(review): the average below divides by len(df) and raises ZeroDivisionError
# when the DataFrame is empty.
print(f"\n✅ Created {len(chunks):,} text chunks from {len(df):,} complaints")
print(f"📊 Average chunks per complaint: {len(chunks)/len(df):.1f}")
print(f"📏 Average chunk length: {np.mean([c['chunk_length'] for c in chunks]):.0f} characters")

# Analyze chunking results
# One DataFrame row per chunk; its columns are the chunk-dict keys.
chunk_stats = pd.DataFrame(chunks)
print(f"\n=== CHUNKING STATISTICS ===")
print(f"Chunk length distribution:")
print(chunk_stats['chunk_length'].describe())

print(f"\nChunks per product:")
# Kept as a global because the visualisation cell plots it again.
chunks_per_product = chunk_stats['product'].value_counts()
print(chunks_per_product)

In [None]:
# Visualize chunking results
# 2x2 grid: chunk lengths, chunks per complaint, chunks per product, original vs chunk length.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

# Chunk length distribution
chunk_lengths = [c['chunk_length'] for c in chunks]
ax1.hist(chunk_lengths, bins=30, alpha=0.7, color='lightcoral', edgecolor='black')
ax1.set_title('Distribution of Chunk Lengths', fontweight='bold')
ax1.set_xlabel('Chunk Length (characters)')
ax1.set_ylabel('Frequency')
ax1.axvline(np.mean(chunk_lengths), color='red', linestyle='--', label=f'Mean: {np.mean(chunk_lengths):.0f}')
ax1.legend()

# Chunks per complaint distribution
# Counts the kept chunks per complaint_id (uses chunk_stats built in the previous cell).
chunks_per_complaint = chunk_stats.groupby('complaint_id')['chunk_index'].count()
ax2.hist(chunks_per_complaint, bins=20, alpha=0.7, color='lightgreen', edgecolor='black')
ax2.set_title('Distribution of Chunks per Complaint', fontweight='bold')
ax2.set_xlabel('Number of Chunks')
ax2.set_ylabel('Frequency')
ax2.axvline(chunks_per_complaint.mean(), color='red', linestyle='--', label=f'Mean: {chunks_per_complaint.mean():.1f}')
ax2.legend()

# Chunks by product
# Reuses chunks_per_product computed in the chunk-statistics cell.
chunks_per_product.plot(kind='bar', ax=ax3, color='gold')
ax3.set_title('Number of Chunks by Product', fontweight='bold')
ax3.set_xlabel('Product')
ax3.set_ylabel('Number of Chunks')
ax3.tick_params(axis='x', rotation=45)

# Original vs chunk length relationship
# One point per chunk: x is the length of the narrative it came from, y is the chunk's own length.
original_lengths = [c['original_length'] for c in chunks]
ax4.scatter(original_lengths, chunk_lengths, alpha=0.5, color='purple')
ax4.set_title('Original vs Chunk Length', fontweight='bold')
ax4.set_xlabel('Original Narrative Length')
ax4.set_ylabel('Chunk Length')

plt.tight_layout()
plt.show()

# Show example chunks
# Prints up to three chunks; each text preview is truncated to 200 characters.
print("\n=== EXAMPLE CHUNKS ===")
for i in range(min(3, len(chunks))):
    chunk = chunks[i]
    print(f"\nChunk {i+1} (Product: {chunk['product']}, Issue: {chunk['issue']}):")
    print(f"Length: {chunk['chunk_length']} chars")
    print(f"Text: {chunk['text'][:200]}{'...' if len(chunk['text']) > 200 else ''}")

## 3. Embedding Model Selection

### Model Choice: `sentence-transformers/all-MiniLM-L6-v2`

**Rationale:**
- **Performance**: Good balance of quality and speed
- **Size**: Lightweight (80MB) for efficient deployment
- **Domain**: Trained on diverse text, suitable for financial complaints
- **Dimensions**: 384-dimensional embeddings (manageable size)
- **Language**: Optimized for English text
- **Popularity**: Well-tested and widely used in production

In [None]:
# Initialize embedding model
# model_name is also written to the saved configuration and the final summary.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'

print(f"=== EMBEDDING MODEL INITIALIZATION ===")
print(f"Model: {model_name}")
print(f"Loading model...")

# Time the model construction (presumably includes a download on first use — confirm).
start_time = time.time()
embedding_model = SentenceTransformer(model_name)
load_time = time.time() - start_time

print(f"✅ Model loaded in {load_time:.2f} seconds")

# Get model information
print(f"\n📋 Model Information:")
print(f"Model name: {model_name}")
print(f"Max sequence length: {embedding_model.max_seq_length}")
print(f"Embedding dimension: {embedding_model.get_sentence_embedding_dimension()}")

# Test embedding generation
# Smoke test: encode a single sentence and show the result's shape and first five values.
test_text = "This is a test complaint about billing issues with my credit card."
test_embedding = embedding_model.encode([test_text])
print(f"\n🧪 Test embedding:")
print(f"Input text: {test_text}")
print(f"Embedding shape: {test_embedding.shape}")
print(f"Embedding sample: {test_embedding[0][:5]}...")

## 4. Generate Embeddings

In [None]:
# Generate embeddings for all chunks
def create_embeddings(chunks: List[Dict[str, Any]], model: SentenceTransformer, batch_size: int = 32) -> np.ndarray:
    """Encode the 'text' of every chunk into an L2-normalised embedding matrix.

    Args:
        chunks: Chunk dicts; only the 'text' key is read.
        model: Embedding model providing `encode` and
            `get_sentence_embedding_dimension`.
        batch_size: Number of texts passed to `model.encode` per call.

    Returns:
        Array with one row per chunk, in input order. For empty input the
        result has shape (0, embedding_dimension) instead of failing.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
        Exception: Any error raised while encoding is printed and re-raised.
    """
    print("Generating embeddings...")

    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    texts = [chunk['text'] for chunk in chunks]

    # np.vstack([]) raises ValueError, so return a correctly shaped empty matrix
    # when there is nothing to encode.
    if not texts:
        return np.empty((0, model.get_sentence_embedding_dimension()), dtype=np.float32)

    try:
        # Generate embeddings in batches to manage memory
        embeddings = []

        for i in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
            batch_texts = texts[i:i + batch_size]
            batch_embeddings = model.encode(
                batch_texts,
                show_progress_bar=False,
                convert_to_numpy=True,
                normalize_embeddings=True  # Normalize for cosine similarity
            )
            embeddings.append(batch_embeddings)

        return np.vstack(embeddings)

    except Exception as e:
        # Surface the failure in the notebook output, then let it propagate.
        print(f"❌ Error generating embeddings: {e}")
        raise

# Generate embeddings
# Wall-clock timing around the whole encoding run; `embeddings` is reused by later cells.
start_time = time.time()
embeddings = create_embeddings(chunks, embedding_model, batch_size=32)
embedding_time = time.time() - start_time

print(f"\n✅ Generated {len(embeddings):,} embeddings")
print(f"📊 Embedding shape: {embeddings.shape}")
print(f"⏱️  Generation time: {embedding_time:.2f} seconds")
print(f"🚀 Speed: {len(embeddings)/embedding_time:.1f} embeddings/second")
# nbytes is the size of the array's data buffer, reported here in MiB.
print(f"💾 Memory usage: {embeddings.nbytes / 1024**2:.2f} MB")

# Analyze embedding statistics
# The row-wise L2 norms are computed twice below, once for the mean and once for the std.
print(f"\n=== EMBEDDING STATISTICS ===")
print(f"Mean embedding norm: {np.linalg.norm(embeddings, axis=1).mean():.4f}")
print(f"Std embedding norm: {np.linalg.norm(embeddings, axis=1).std():.4f}")
print(f"Min embedding value: {embeddings.min():.4f}")
print(f"Max embedding value: {embeddings.max():.4f}")

In [None]:
# Visualize embedding properties
# 2x2 grid: norm histogram, value histogram, PCA scatter by product, similarity heatmap.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

# Embedding norms distribution
norms = np.linalg.norm(embeddings, axis=1)
ax1.hist(norms, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
ax1.set_title('Distribution of Embedding Norms', fontweight='bold')
ax1.set_xlabel('L2 Norm')
ax1.set_ylabel('Frequency')
ax1.axvline(norms.mean(), color='red', linestyle='--', label=f'Mean: {norms.mean():.3f}')
ax1.legend()

# Embedding values distribution
# flatten() copies the whole matrix into one 1-D array before binning.
ax2.hist(embeddings.flatten(), bins=50, alpha=0.7, color='lightcoral', edgecolor='black')
ax2.set_title('Distribution of Embedding Values', fontweight='bold')
ax2.set_xlabel('Embedding Value')
ax2.set_ylabel('Frequency')

# PCA visualization (first 2 components)
# NOTE(review): StandardScaler is imported but not referenced anywhere in this cell.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Sample embeddings for visualization (to avoid overcrowding)
# NOTE(review): no random seed is set here, so the sample (and the plot) differs between runs.
sample_size = min(1000, len(embeddings))
sample_indices = np.random.choice(len(embeddings), sample_size, replace=False)
sample_embeddings = embeddings[sample_indices]
sample_chunks = [chunks[i] for i in sample_indices]

# Project the sampled embeddings onto their first two principal components.
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(sample_embeddings)

# Color by product
# set() ordering is arbitrary, so the product-to-colour assignment may vary between runs.
products = [chunk['product'] for chunk in sample_chunks]
unique_products = list(set(products))
colors = plt.cm.Set3(np.linspace(0, 1, len(unique_products)))
product_colors = {product: colors[i] for i, product in enumerate(unique_products)}

# One scatter call per product so that each product gets its own legend entry.
for product in unique_products:
    mask = [p == product for p in products]
    ax3.scatter(embeddings_2d[mask, 0], embeddings_2d[mask, 1], 
               c=[product_colors[product]], label=product, alpha=0.6, s=20)

ax3.set_title('PCA Visualization of Embeddings (by Product)', fontweight='bold')
ax3.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
ax3.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
ax3.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# Similarity heatmap (sample)
# Pairwise dot products of the first 20 sampled embeddings; these equal cosine
# similarities only if the rows are unit-normalised.
sample_similarity = np.dot(sample_embeddings[:20], sample_embeddings[:20].T)
im = ax4.imshow(sample_similarity, cmap='viridis', aspect='auto')
ax4.set_title('Similarity Matrix (Sample)', fontweight='bold')
ax4.set_xlabel('Chunk Index')
ax4.set_ylabel('Chunk Index')
plt.colorbar(im, ax=ax4)

plt.tight_layout()
plt.show()

print(f"\n📊 PCA Analysis:")
print(f"PC1 explains {pca.explained_variance_ratio_[0]:.1%} of variance")
print(f"PC2 explains {pca.explained_variance_ratio_[1]:.1%} of variance")
print(f"Total variance explained: {pca.explained_variance_ratio_.sum():.1%}")

## 5. Create Vector Store with ChromaDB

In [None]:
# Initialize ChromaDB
# The path is relative, so it is resolved against the notebook's working directory.
vector_store_path = '../vector_store'

print(f"=== VECTOR STORE CREATION ===")
print(f"Vector store path: {vector_store_path}")

# Create vector store directory
os.makedirs(vector_store_path, exist_ok=True)

# Initialize ChromaDB client
chroma_client = chromadb.PersistentClient(path=vector_store_path)
print(f"✅ ChromaDB client initialized")

# Create or get collection
collection_name = "complaint_embeddings"

# Delete existing collection if it exists
# Best effort: a missing collection is expected on the first run. Catch Exception
# rather than using a bare `except`, so KeyboardInterrupt/SystemExit still propagate,
# and report what was skipped instead of hiding it.
try:
    chroma_client.delete_collection(name=collection_name)
    print(f"🗑️  Deleted existing collection: {collection_name}")
except Exception as exc:
    print(f"ℹ️  No existing collection deleted ({type(exc).__name__}: {exc})")

# Create new collection
# The embeddings are L2-normalised for cosine similarity, and the retrieval test
# converts distances with `1 - distance`. That conversion is only valid for cosine
# distance, so select it explicitly instead of relying on Chroma's default (L2).
collection = chroma_client.create_collection(
    name=collection_name,
    metadata={
        "description": "Financial complaint embeddings for RAG system",
        "hnsw:space": "cosine",
    }
)
print(f"✅ Created collection: {collection_name}")

In [None]:
# Prepare data for ChromaDB
def prepare_chroma_data(chunks: List[Dict[str, Any]], embeddings: np.ndarray):
    """Convert chunk dicts and their embedding matrix into ChromaDB insertion inputs.

    Args:
        chunks: Chunk dicts as produced by the chunking step.
        embeddings: Embedding matrix; converted to nested Python lists.

    Returns:
        Tuple `(ids, documents, metadatas, embeddings_as_lists)`. Each metadata
        dict carries 'complaint_id' as a string plus the remaining descriptive
        fields copied unchanged from the chunk.
    """
    # Fields copied verbatim from each chunk, in the order they appear in the metadata.
    copied_fields = (
        'product',
        'issue',
        'company',
        'date_received',
        'state',
        'chunk_index',
        'original_length',
        'chunk_length',
        'total_chunks',
    )

    ids = []
    documents = []
    metadatas = []

    for record in chunks:
        ids.append(record['id'])
        documents.append(record['text'])

        # complaint_id is stringified; everything else keeps its original type.
        entry = {'complaint_id': str(record['complaint_id'])}
        for field in copied_fields:
            entry[field] = record[field]
        metadatas.append(entry)

    vectors = embeddings.tolist()
    return ids, documents, metadatas, vectors

# Prepare data
# Produces four parallel sequences (ids, documents, metadatas, embeddings) for insertion.
print("Preparing data for ChromaDB...")
ids, documents, metadatas, embedding_list = prepare_chroma_data(chunks, embeddings)

print(f"✅ Prepared {len(ids):,} records for insertion")
# NOTE(review): metadatas[0] raises IndexError when no chunks were produced.
print(f"📊 Sample metadata: {metadatas[0]}")

In [None]:
# Insert data into ChromaDB in batches
def insert_to_chroma(collection, ids, documents, metadatas, embeddings, batch_size=100):
    """Insert parallel id/document/metadata/embedding sequences into a collection in batches.

    Args:
        collection: Target collection; its `add` method is called once per batch.
        ids: Record ids.
        documents: Chunk texts, parallel to `ids`.
        metadatas: Metadata dicts, parallel to `ids`.
        embeddings: Embedding vectors, parallel to `ids`.
        batch_size: Maximum number of records per `add` call.

    Raises:
        ValueError: If `batch_size` is smaller than 1 or the four sequences
            differ in length.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # The four sequences are sliced with the same indices, so a length mismatch
    # would pair records with the wrong data or fail midway through the insert.
    sizes = (len(ids), len(documents), len(metadatas), len(embeddings))
    if len(set(sizes)) != 1:
        raise ValueError(
            "ids, documents, metadatas and embeddings must have the same length, "
            f"got {sizes}"
        )

    for start in tqdm(range(0, len(ids), batch_size), desc="Inserting to ChromaDB"):
        # Slicing clamps at the end of the sequence, so the last batch may be shorter.
        stop = start + batch_size

        collection.add(
            ids=ids[start:stop],
            embeddings=embeddings[start:stop],
            documents=documents[start:stop],
            metadatas=metadatas[start:stop]
        )

# Insert data
# Wall-clock timing around the whole batched insertion.
start_time = time.time()
insert_to_chroma(collection, ids, documents, metadatas, embedding_list, batch_size=100)
insert_time = time.time() - start_time

print(f"\n✅ Successfully inserted {len(ids):,} embeddings into ChromaDB")
print(f"⏱️  Insertion time: {insert_time:.2f} seconds")
print(f"🚀 Speed: {len(ids)/insert_time:.1f} insertions/second")

# Verify insertion
# Compare the collection's reported record count with the number of ids that were sent.
collection_count = collection.count()
print(f"📊 Collection count: {collection_count:,}")

if collection_count != len(ids):
    print(f"⚠️  Warning: Expected {len(ids)} but got {collection_count}")
else:
    print(f"✅ All records inserted successfully")

## 6. Save Configuration and Test Retrieval

In [None]:
# Save configuration
# Records the parameters used to build the store next to the store itself,
# presumably so that later tasks can reload them — confirm against the consumer.
config = {
    'model_name': model_name,
    'chunk_size': chunk_size,
    'chunk_overlap': chunk_overlap,
    'vector_store_path': vector_store_path,
    'collection_name': collection_name,
    'embedding_dimension': embedding_model.get_sentence_embedding_dimension(),
    'total_chunks': len(chunks),
    'total_complaints': len(df),
    # Local time with no timezone information.
    'creation_timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
}

# Written as <vector_store_path>/config.json, overwriting any existing file.
config_path = os.path.join(vector_store_path, 'config.json')
with open(config_path, 'w') as f:
    json.dump(config, f, indent=2)

print(f"✅ Configuration saved to: {config_path}")
print(f"📋 Configuration:")
for key, value in config.items():
    print(f"  {key}: {value}")

In [None]:
# Test retrieval functionality
def test_retrieval(collection, embedding_model, query: str, n_results: int = 5):
    """Run one semantic search: embed `query` and look up its nearest chunks.

    Args:
        collection: Collection to search; its `query` method is called once.
        embedding_model: Model whose `encode` turns the query into a vector.
        query: Free-text search string.
        n_results: Number of nearest chunks to request.

    Returns:
        Whatever `collection.query` returns, requested with documents,
        metadatas and distances included.
    """
    # Encode as a one-item batch with normalisation, then take the single resulting vector.
    encoded_batch = embedding_model.encode([query], normalize_embeddings=True)
    query_vector = encoded_batch[0].tolist()

    requested_fields = ['documents', 'metadatas', 'distances']
    return collection.query(
        query_embeddings=[query_vector],
        n_results=n_results,
        include=requested_fields,
    )

print("=== TESTING RETRIEVAL FUNCTIONALITY ===")

# Representative natural-language queries used as a smoke test of the index.
test_queries = [
    "billing issues with credit cards",
    "unauthorized transactions and fraud",
    "customer service problems",
    "payment failures and delays",
    "account access issues"
]

for i, query in enumerate(test_queries, 1):
    print(f"\n🔍 Test Query {i}: '{query}'")

    # The timing covers both query embedding and the vector search.
    start_time = time.time()
    results = test_retrieval(collection, embedding_model, query, n_results=3)
    search_time = time.time() - start_time

    print(f"⏱️  Search time: {search_time*1000:.1f}ms")

    # Results are nested per query; index 0 selects the single query that was sent.
    if results['documents'][0]:
        print(f"📊 Found {len(results['documents'][0])} results:")

        for j, (doc, metadata, distance) in enumerate(zip(
            results['documents'][0], 
            results['metadatas'][0], 
            results['distances'][0]
        )):
            # NOTE(review): `1 - distance` is a cosine similarity only when the collection
            # uses cosine distance — confirm the collection's distance setting.
            similarity = 1 - distance  # Convert distance to similarity
            print(f"\n  Result {j+1}:")
            print(f"    Product: {metadata['product']}")
            print(f"    Issue: {metadata['issue']}")
            print(f"    Similarity: {similarity:.3f}")
            print(f"    Text: {doc[:150]}{'...' if len(doc) > 150 else ''}")
    else:
        print("❌ No results found")

# Printed unconditionally, even when some queries returned no results.
print(f"\n✅ Retrieval testing completed successfully!")

## 7. Vector Store Statistics and Analysis

In [None]:
# Get comprehensive vector store statistics
def get_vector_store_stats(collection):
    """Summarise a vector store from its total count and a sample of its metadata.

    Args:
        collection: Collection exposing `count()` and
            `get(limit=..., include=['metadatas'])`.

    Returns:
        Dict of statistics. 'total_chunks' is the exact collection size. Every
        other value is computed from a sample of at most 1000 records, whose
        actual size is reported as 'sample_size'; this includes
        'unique_complaints' and 'avg_chunks_per_complaint'. When the collection
        is empty or yields no metadata, only 'total_chunks' is returned.
    """
    # Get total count
    total_count = collection.count()
    if total_count == 0:
        return {'total_chunks': total_count}

    # Get sample of metadata for analysis
    sample_results = collection.get(
        limit=min(1000, total_count),
        include=['metadatas']
    )
    sampled_metadata = sample_results['metadatas']
    if not sampled_metadata:
        return {'total_chunks': total_count}

    metadata_df = pd.DataFrame(sampled_metadata)
    # Report the number of rows actually returned, not the number requested.
    sample_size = len(metadata_df)
    unique_complaints = int(metadata_df['complaint_id'].nunique())

    # Both numerator and denominator must come from the sample: dividing the
    # full-collection count by the sample's unique complaints inflates the
    # average as soon as the collection is larger than the sample.
    if unique_complaints:
        avg_chunks_per_complaint = sample_size / unique_complaints
    else:
        avg_chunks_per_complaint = 0.0

    return {
        'total_chunks': total_count,
        'sample_size': sample_size,
        'product_distribution': metadata_df['product'].value_counts().to_dict(),
        'issue_distribution': metadata_df['issue'].value_counts().head(10).to_dict(),
        'state_distribution': metadata_df['state'].value_counts().head(10).to_dict(),
        'avg_chunk_length': float(metadata_df['chunk_length'].astype(int).mean()),
        'avg_original_length': float(metadata_df['original_length'].astype(int).mean()),
        'unique_complaints': unique_complaints,
        'avg_chunks_per_complaint': avg_chunks_per_complaint
    }

# Get statistics
print("=== VECTOR STORE STATISTICS ===")
stats = get_vector_store_stats(collection)

print(f"📊 Total chunks: {stats['total_chunks']:,}")
# The ',' format spec is only valid for numbers, so the 'Unknown' fallback must be
# printed separately; otherwise an empty store raises ValueError here.
unique_complaints = stats.get('unique_complaints')
if unique_complaints is None:
    print("📝 Unique complaints: Unknown")
else:
    print(f"📝 Unique complaints: {unique_complaints:,}")
print(f"📈 Average chunks per complaint: {stats.get('avg_chunks_per_complaint', 0):.1f}")
print(f"📏 Average chunk length: {stats.get('avg_chunk_length', 0):.0f} characters")
print(f"📄 Average original length: {stats.get('avg_original_length', 0):.0f} characters")

if 'product_distribution' in stats:
    print(f"\n🏷️  Product distribution:")
    # The counts come from a metadata sample, so percentages are taken relative to
    # the sampled total. Dividing by the full collection size would understate every
    # share once the store holds more records than the sample.
    sampled_total = sum(stats['product_distribution'].values())
    for product, count in stats['product_distribution'].items():
        percentage = (count / sampled_total) * 100 if sampled_total else 0.0
        print(f"  - {product}: {count:,} chunks ({percentage:.1f}%)")

if 'issue_distribution' in stats:
    print(f"\n🎯 Top issues:")
    for issue, count in list(stats['issue_distribution'].items())[:5]:
        print(f"  - {issue}: {count:,} chunks")

# Save statistics
# default=str stringifies any value that json cannot serialise natively.
stats_path = os.path.join(vector_store_path, 'statistics.json')
with open(stats_path, 'w') as f:
    json.dump(stats, f, indent=2, default=str)

print(f"\n✅ Statistics saved to: {stats_path}")

In [None]:
# Create final visualization
# The figure is skipped when the statistics hold no product breakdown.
if 'product_distribution' in stats:
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))
    
    # Product distribution in vector store
    products = list(stats['product_distribution'].keys())
    counts = list(stats['product_distribution'].values())
    
    ax1.pie(counts, labels=products, autopct='%1.1f%%', startangle=90)
    ax1.set_title('Chunk Distribution by Product', fontweight='bold')
    
    # Top issues
    # At most eight issues are shown, in the order stored in the statistics dict.
    issues = list(stats['issue_distribution'].keys())[:8]
    issue_counts = list(stats['issue_distribution'].values())[:8]
    
    ax2.barh(issues, issue_counts, color='lightcoral')
    ax2.set_title('Top Issues in Vector Store', fontweight='bold')
    ax2.set_xlabel('Number of Chunks')
    
    # Chunk length distribution (from original data)
    # Uses the in-memory `chunks` list rather than data read back from the collection.
    chunk_lengths = [c['chunk_length'] for c in chunks]
    ax3.hist(chunk_lengths, bins=30, alpha=0.7, color='lightgreen', edgecolor='black')
    ax3.set_title('Chunk Length Distribution', fontweight='bold')
    ax3.set_xlabel('Chunk Length (characters)')
    ax3.set_ylabel('Frequency')
    
    # Chunks per complaint distribution
    # value_counts() gives the number of kept chunks for each complaint_id.
    chunks_per_complaint = pd.Series([c['complaint_id'] for c in chunks]).value_counts()
    ax4.hist(chunks_per_complaint.values, bins=20, alpha=0.7, color='gold', edgecolor='black')
    ax4.set_title('Chunks per Complaint Distribution', fontweight='bold')
    ax4.set_xlabel('Number of Chunks')
    ax4.set_ylabel('Number of Complaints')
    
    plt.suptitle('Vector Store Analysis Summary', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

# Closing banner: printed unconditionally, whatever the earlier cells reported.
print("\n" + "="*60)
print("🎉 TASK 2 COMPLETED SUCCESSFULLY!")
print("="*60)
print(f"✅ Created {stats['total_chunks']:,} text chunks")
print(f"✅ Generated {stats['total_chunks']:,} embeddings using {model_name}")
print(f"✅ Stored embeddings in ChromaDB vector store")
print(f"✅ Tested retrieval functionality successfully")
print(f"📁 Vector store location: {vector_store_path}")
print(f"⚙️  Configuration saved with chunking and embedding parameters")
print("\n🚀 Ready for Task 3: Building RAG Core Logic and Evaluation")