# Task 2: Text Chunking, Embedding, and Vector Store Indexing

In [1]:
import sys
import os
import pandas as pd
from glob import glob

In [2]:
from pathlib import Path
from importlib import reload
# Make the repository root (the parent of the current working directory) importable,
# so that `src.*` modules can be imported from this notebook.
project_root = Path("..").resolve()
if all(entry != str(project_root) for entry in sys.path):
    # Prepend so the project's packages take precedence over same-named installed ones.
    sys.path.insert(0, str(project_root))

### Load the dataset

In [3]:
# Import the data-loading module, refresh it, and only then bind `DataLoader`.
# `importlib.reload` re-executes the module but does not rebind names that were
# previously imported with `from ... import ...`; importing the class first would
# leave `DataLoader` pointing at the stale, pre-reload class object.
import src.data_loader
reload(src.data_loader)
# Ending the cell with an import (not the `reload(...)` expression) also avoids
# echoing the module's absolute local path as cell output.
from src.data_loader import DataLoader

<module 'src.data_loader' from '/home/chalasimon/Documents/10academy/week 6/challenge/CrediTrust-RAG-Chatbot/src/data_loader.py'>

In [4]:
# Location of the filtered complaints CSV, relative to this notebook
COMPLAINTS_CSV = '../data/processed/filtered_complaints.csv'

# Read the complaints through the project's DataLoader and report the row count
data_loader = DataLoader(COMPLAINTS_CSV)
df = data_loader.load_data()
print(f"Loaded {len(df)} cleaned complaints")

Loaded 478818 cleaned complaints


In [5]:
# Preview the first five rows to sanity-check the loaded columns
df.head(n=5)

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,...,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,narrative_length,MappedProduct,clean_text
0,2025-06-13,Credit card,Store credit card,Getting a credit card,Card opened without my consent or knowledge,A XXXX XXXX card was opened under my name by a...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,78230,...,Consent provided,Web,2025-06-13,Closed with non-monetary relief,Yes,,14069121,91,Credit card,a xxxx xxxx card was opened under my name by a...
1,2025-06-13,Checking or savings account,Checking account,Managing an account,Deposits and withdrawals,I made the mistake of using my wellsfargo debi...,Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,ID,83815,...,Consent provided,Web,2025-06-13,Closed with explanation,Yes,,14061897,109,Saving account,i made the mistake of using my wellsfargo debi...
2,2025-06-12,Credit card,General-purpose credit card or charge card,"Other features, terms, or problems",Other problem,"Dear CFPB, I have a secured credit card with c...",Company has responded to the consumer and the ...,"CITIBANK, N.A.",NY,11220,...,Consent provided,Web,2025-06-13,Closed with monetary relief,Yes,,14047085,156,Credit card,"dear cfpb, i have a secured credit card with c..."
3,2025-06-12,Credit card,General-purpose credit card or charge card,Incorrect information on your report,Account information incorrect,I have a Citi rewards cards. The credit balanc...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",IL,60067,...,Consent provided,Web,2025-06-12,Closed with explanation,Yes,,14040217,233,Credit card,i have a citi rewards cards. the credit balanc...
4,2025-06-09,Credit card,General-purpose credit card or charge card,Problem with a purchase shown on your statement,Credit card company isn't resolving a dispute ...,b'I am writing to dispute the following charge...,Company has responded to the consumer and the ...,"CITIBANK, N.A.",TX,78413,...,Consent provided,Web,2025-06-09,Closed with monetary relief,Yes,,13968411,454,Credit card,bi am writing to dispute the following charges...


### Text Chunking

In [6]:
# Build the text splitter through the project's chunking helper.
# NOTE(review): chunk size and overlap come from get_text_splitter()'s defaults,
# which are not visible in this notebook — confirm them in src/embedding/chunking.py.
from src.embedding.chunking import get_text_splitter
text_splitter = get_text_splitter()

In [7]:
from tqdm import tqdm

# Split every cleaned narrative into chunks. `docs` and `metadata` are parallel
# lists: metadata[i] identifies the complaint and product that docs[i] came from.
docs = []
metadata = []

# Iterate over just the three needed columns. `df.iterrows()` builds a full Series
# for every row, which is needlessly slow on a frame with this many rows.
complaint_rows = zip(df['Complaint ID'], df['MappedProduct'], df['clean_text'])

for complaint_id, product, text in tqdm(complaint_rows, total=len(df), desc="Chunking complaints"):
    # Missing narratives surface as NaN (a float) in pandas; skip anything that is
    # not a string rather than handing it to the splitter.
    if not isinstance(text, str):
        continue
    for chunk in text_splitter.split_text(text):
        docs.append(chunk)
        metadata.append({
            "complaint_id": complaint_id,
            "product": product
        })

# The two lists are indexed together downstream, so they must stay aligned.
assert len(docs) == len(metadata), "docs and metadata are out of sync"
print(f"Total text chunks created: {len(docs)}")

Total text chunks created: 5289405


### Text Embedding

In [None]:
# Instantiate the project's Embedder wrapper with its default settings.
# NOTE(review): the underlying embedding model is chosen inside Embedder and is not
# visible in this notebook — confirm it in src/embedding/embedder.py.
from src.embedding.embedder import Embedder

embedder = Embedder()

In [None]:
# Report which model the embedder wraps and the size of the vectors it produces
print(
    f"Embedding model loaded: {embedder.model}",
    f"Embedding dimension: {embedder.get_embedding_dimension()}",
    sep="\n",
)

In [None]:
# Embed the chunks in fixed-size batches so only one batch of text is passed to the
# embedder at a time, then stack the per-batch results into a single array.
import numpy as np

batch_size = 100
# Keep the per-batch results under their own name so `embeddings` only ever
# refers to the final stacked array (never to a Python list).
embedding_batches = []

for start in tqdm(range(0, len(docs), batch_size), desc="Embedding chunks"):
    batch = docs[start:start + batch_size]
    embedding_batches.append(embedder.embed_text(batch))

if embedding_batches:
    embeddings = np.concatenate(embedding_batches)
else:
    # np.concatenate raises ValueError on an empty sequence; with no chunks, fall
    # back to an empty (0, dim) array.
    # NOTE(review): assumes get_embedding_dimension() returns an int — confirm.
    embeddings = np.empty((0, embedder.get_embedding_dimension()))

# Row i of `embeddings` must correspond to docs[i] (and to its metadata entry).
assert len(embeddings) == len(docs), "number of embeddings does not match number of chunks"
print(f"Embeddings shape: {embeddings.shape}")  # Should be (num_chunks, embedding_dim)