In [16]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

In [2]:
# Read the cleaned complaints table from its parquet file.
complaints_path = '../data/processed/filtered_complaints.parquet'
df = pd.read_parquet(complaints_path)

In [3]:
# Configure the recursive text splitter used to cut narratives into chunks.
# Separators are listed from coarsest to finest: blank line, newline,
# period, single space.
splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', '\n', '.', ' '],
    chunk_overlap=50,
    chunk_size=300,
)

In [4]:
# Split every narrative into chunks. Each chunk becomes one record that keeps
# the product label and complaint id of the row it came from; when the row has
# no 'Complaint ID' entry, the DataFrame index label is used instead.
documents = [
    {
        'text': piece,
        'product': record['Product'],
        'complaint_id': record.get('Complaint ID', row_label),
    }
    for row_label, record in df.iterrows()
    for piece in splitter.split_text(record['cleaned_narrative'])
]

print(f"✅ Total chunks created: {len(documents)}")

✅ Total chunks created: 766564


In [5]:
# Sanity check: show the first ten chunk records.
first_records = documents[:10]
print(first_records)

[{'text': 'xxxx xxxx card opened name fraudster received notice xxxx account opened name reached xxxx xxxx state activity unauthorized xxxx xxxx confirmed fraudulent immediately closed card however failed remove three credit agency fraud impacting credit score based hard credit pull done xxxx xxxx done', 'product': 'Credit card', 'complaint_id': 14069121}, {'text': 'score based hard credit pull done xxxx xxxx done fraudster', 'product': 'Credit card', 'complaint_id': 14069121}, {'text': 'dear cfpb secured credit card citibank changed unsecured card calling n waitting check release secured amount 250000 rhat credit card told wait today told someone cashed check system doesnt info happen requested case number wont send confirmation whatsoever report police told wait form might sent', 'product': 'Credit card', 'complaint_id': 14047085}, {'text': 'report police told wait form might sent via mail within 14 day fill called customer service second time today manager told thing info help pleas

In [14]:
# Report the properties of GPU 0 when CUDA is usable. Querying device
# properties without a CUDA device raises, so guard the call to keep the
# notebook runnable on CPU-only machines.
if torch.cuda.is_available():
    print(torch.cuda.get_device_properties(0))
else:
    print("CUDA is not available on this machine.")

_CudaDeviceProperties(name='NVIDIA GeForce 940MX', major=5, minor=0, total_memory=2047MB, multi_processor_count=3, uuid=bf11a784-7c04-f58b-d95d-4fd7280432f4, L2_cache_size=1MB)


In [15]:
# Load model
# Prefer the GPU, but fall back to the CPU so this cell does not fail on
# machines without a usable CUDA device.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Embed chunks
texts = [doc['text'] for doc in documents]
# NOTE(review): with convert_to_tensor=True the result is a single torch
# tensor that presumably stays on `device`; for several hundred thousand
# chunks this is sizeable on a small GPU -- confirm memory headroom.
embeddings = model.encode(texts, show_progress_bar=True, batch_size=32, convert_to_tensor=True)

Batches: 100%|██████████| 23956/23956 [1:18:08<00:00,  5.11it/s]


In [None]:
# Attach embeddings to documents
# Bring the whole embedding matrix to host memory in one transfer. Calling
# .tolist() on each row of a GPU tensor would trigger one device-to-host copy
# per chunk.
if torch.is_tensor(embeddings):
    embedding_rows = embeddings.detach().cpu().numpy()
else:
    embedding_rows = np.asarray(embeddings)

# Fail loudly on a length mismatch instead of attaching to only some records.
if len(embedding_rows) != len(documents):
    raise ValueError(
        f"Got {len(embedding_rows)} embeddings for {len(documents)} documents"
    )

for doc, vector in zip(documents, embedding_rows):
    # Store plain Python floats so each record is a self-contained dict.
    doc['embedding'] = vector.tolist()

In [18]:
# Save to disk
# Opening a file for writing does not create missing parent directories, so
# make sure the vector store folder exists before dumping.
output_path = Path('../vector_store/chunks_with_embeddings.pkl')
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open('wb') as f:
    pickle.dump(documents, f)