In [1]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
from chromadb.config import Settings


####  Load cleaned data

In [2]:
# Read the filtered complaints CSV. low_memory=False makes pandas infer column
# dtypes from the whole file rather than piecewise, avoiding mixed-type columns.
df = pd.read_csv(
    '../data/filtered_complaints.csv',
    low_memory=False,
)


#### Set chunking parameters

In [None]:
# Text-splitter settings, both counted in characters: the target length of a
# chunk, and how many characters consecutive chunks share.
chunk_size, chunk_overlap = 300, 50

#### Initialize text splitter

In [None]:
# Build the splitter from the chunk_size / chunk_overlap values configured in
# the parameters cell.
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

#### Split narratives into chunks and collect per-chunk metadata

In [6]:
# Split every complaint narrative into text chunks and record, for each chunk,
# which complaint it came from. `chunks` and `metadatas` are parallel lists:
# metadatas[k] describes chunks[k].
chunks = []
metadatas = []

for idx, row in df.iterrows():
    narrative = row['clean_narrative']
    # A missing narrative is loaded by pandas as NaN, and str(NaN) is the
    # literal text "nan". Without this guard that placeholder would be chunked,
    # embedded and indexed as if it were a real complaint.
    if pd.isna(narrative):
        continue
    text = str(narrative)
    # Fall back to the row index when the frame has no 'Complaint ID' column.
    complaint_id = row.get('Complaint ID', idx)
    product = row.get('Product', '')
    # Split text into chunks
    for chunk in splitter.split_text(text):
        chunks.append(chunk)
        metadatas.append({
            'complaint_id': complaint_id,
            'product': product,
            'row_idx': idx
        })


#### Load embedding model

In [None]:
# Load the sentence-embedding model used to vectorise the chunks.
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

#### Generate embeddings for all chunks

In [None]:
# Encode every chunk in one call; the result is requested as a NumPy array and
# a progress bar is shown while encoding runs.
embeddings = model.encode(
    chunks,
    convert_to_numpy=True,
    show_progress_bar=True,
)

#### Create ChromaDB client and collection

In [8]:
# Ensure the storage directory exists, open a persistent Chroma client on it,
# and fetch the "complaints" collection (creating it on first use).
os.makedirs('../vector_store', exist_ok=True)
chroma_client = chromadb.PersistentClient(path="../vector_store")
collection = chroma_client.get_or_create_collection(name="complaints")


#### Add chunks and embeddings to ChromaDB

In [None]:
# Write chunks, embeddings and metadata to the collection in fixed-size batches.
# Record IDs are the chunk's position in `chunks`, rendered as a string.
#
# upsert (rather than add) keeps this cell idempotent: the collection is
# persisted on disk and obtained with get_or_create_collection, so on a re-run
# the IDs already exist. upsert overwrites those records with the current
# embeddings/documents instead of leaving stale ones behind.
batch_size = 5000  # or 5461, or less (presumably Chroma's per-call limit — confirm for the installed version)

# The three sequences are sliced in lockstep below; a length mismatch (e.g. a
# stale `embeddings` from an earlier kernel state) would index misaligned data.
assert len(chunks) == len(embeddings) == len(metadatas), \
    "chunks, embeddings and metadatas must have the same length"

for i in range(0, len(chunks), batch_size):
    batch_end = min(i + batch_size, len(chunks))
    collection.upsert(
        ids=[str(j) for j in range(i, batch_end)],
        embeddings=embeddings[i:batch_end].tolist(),
        documents=chunks[i:batch_end],
        metadatas=metadatas[i:batch_end],
    )


#### Check number of chunks

In [7]:
# Report how many chunks were produced and where the vector store lives.
print(
    f"Indexed {len(chunks)} chunks. Vector store saved in '../vector_store/'."
)


Indexed 378888 chunks. Vector store saved in '../vector_store/'.
