In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

In [3]:
# Load cleaned data
df = pd.read_parquet('../data/processed/filtered_complaints.parquet')

In [4]:
# Initialize text splitter
splitter = RecursiveCharacterTextSplitter(
    chunk_size = 300,
    chunk_overlap = 50,
    separators = ['\n\n', '\n', '.', ' ']
)

In [5]:
# Apply chunking
documents = []
for idx, row in df.iterrows():
    chunks = splitter.split_text(row['cleaned_narrative'])
    for chunk in chunks:
        documents.append({
            'text': chunk,
            'product': row['Product'],
            'complaint_id': row.get('Complaint ID', idx)
        })

print(f"✅ Total chunks created: {len(documents)}")

✅ Total chunks created: 766564


In [7]:
# Load model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed chunks
texts = [doc['text'] for doc in documents]
embeddings = model.encode(texts, show_progress_bar=True)

Batches:  49%|████▊     | 11677/23956 [1:09:04<1:12:38,  2.82it/s]


KeyboardInterrupt: 