Chunking


In [None]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

# --- Load the cleaned complaints ---
# Path is relative to the directory the notebook is run from.
cleaned_data_file_path = '../data/processed/filtered_complaints.csv'
narrative_column = 'Consumer complaint narrative_cleaned'

print(f"Loading cleaned data from: {cleaned_data_file_path}")

try:
    df_cleaned = pd.read_csv(cleaned_data_file_path)
except FileNotFoundError:
    print(f"Error: The cleaned data file was not found at {cleaned_data_file_path}.")
    print("Please ensure 'filtered_complaints.csv' is in the data/processed/ directory.")
    # Everything below needs df_cleaned; stop here instead of failing later
    # with an unrelated NameError.
    raise
except Exception as e:
    print(f"An unexpected error occurred while loading the data: {e}")
    raise
else:
    print("Cleaned dataset loaded successfully!")
    print(f"Shape of the cleaned dataset: {df_cleaned.shape}")
    print("\nFirst 5 rows of the cleaned dataset (showing relevant columns):")

    print(df_cleaned[['Complaint ID', 'Product', narrative_column]].head())

# --- Text Chunking Strategy ---
# Sizes are measured in characters because length_function is len.
chunk_size = 1000
chunk_overlap = 100

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
    add_start_index=True,  # records each chunk's offset within its narrative
)

# Collect every non-empty narrative together with the metadata that should
# travel with each of its chunks. Columns are iterated directly rather than
# via iterrows(), which builds a Series object per row.
documents_to_chunk = []
for index, complaint_id, product, narrative in zip(
    df_cleaned.index,
    df_cleaned['Complaint ID'],
    df_cleaned['Product'],
    df_cleaned[narrative_column],
):
    # A missing narrative is read as NaN; str(NaN) would be the text 'nan'
    # and would otherwise be chunked and indexed as if it were a complaint.
    if pd.isna(narrative):
        continue

    narrative = str(narrative)
    if not narrative.strip():
        continue

    documents_to_chunk.append({
        'page_content': narrative,
        'metadata': {
            'complaint_id': complaint_id,
            'product': product,
            'original_index': index,
        },
    })

print(f"\nProcessing {len(documents_to_chunk)} narratives for chunking...")

# chunking: one batched call; the splitter pairs texts[i] with metadatas[i]
# and copies that metadata onto every chunk produced from the text.
all_chunks = text_splitter.create_documents(
    [doc['page_content'] for doc in documents_to_chunk],
    metadatas=[doc['metadata'] for doc in documents_to_chunk],
)

print(f"Total number of chunks created: {len(all_chunks)}")
print("\nSample of a chunk:")
if all_chunks:
    print(all_chunks[0])
    print(f"Content length of first chunk: {len(all_chunks[0].page_content.split())} words")

# Optional: uncomment to preview the first few chunks and their metadata.
# for i in range(5):
#     if i < len(all_chunks):
#         print(f"\nChunk {i+1} content:\n{all_chunks[i].page_content[:200]}...") 
#         print(f"Chunk {i+1} metadata: {all_chunks[i].metadata}")

Loading cleaned data from: ../data/processed/filtered_complaints.csv
Cleaned dataset loaded successfully!
Shape of the cleaned dataset: (82164, 3)

First 5 rows of the cleaned dataset (showing relevant columns):
   Complaint ID      Product  \
0      14069121  Credit card   
1      14047085  Credit card   
2      14040217  Credit card   
3      13968411  Credit card   
4      13965746  Credit card   

                Consumer complaint narrative_cleaned  
0  a xxxx xxxx card was opened under my name by a...  
1  dear cfpb i have a secured credit card with ci...  
2  i have a citi rewards cards the credit balance...  
3  bi am writing to dispute the following charges...  
4  although the account had been deemed closed i ...  

Processing 82164 narratives for chunking...
Total number of chunks created: 136540

Sample of a chunk:
page_content='a xxxx xxxx card was opened under my name by a fraudster i received a notice from xxxx that an account was just opened under my name i reached out 

 Embedding Model Selection 

In [None]:

# HuggingFaceEmbeddings is the generic sentence-transformers wrapper.
# HuggingFaceBgeEmbeddings is specific to BGE models: it prepends a retrieval
# instruction to every query, which all-MiniLM-L6-v2 was not trained with and
# which would make query vectors inconsistent with the stored chunk vectors.
from langchain_community.embeddings import HuggingFaceEmbeddings

model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}

print(f"\nInitializing embedding model: {model_name} (using {model_kwargs['device']})...")

try:
    # Initialize the HuggingFaceEmbeddings object
    embeddings_model = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs
    )
    print("Embedding model initialized successfully!")

    # Smoke test: embed one sentence and report the vector size.
    sample_text = "This is a test sentence for embedding."
    sample_embedding = embeddings_model.embed_query(sample_text)
    print(f"Sample embedding generated (first 5 values): {sample_embedding[:5]}...")
    print(f"Sample embedding dimension: {len(sample_embedding)}")

except Exception as e:
    print(f"Error initializing or testing embedding model: {e}")
    # The vector store below cannot be built without the model; stop here
    # rather than failing later with a NameError on embeddings_model.
    raise


from langchain_community.vectorstores import FAISS

# Creating the FAISS vector store
print("\nCreating FAISS vector store with embeddings...")

# An empty chunk list cannot be indexed; fail with a message that points at
# the real cause instead of an error from inside the vector store.
if not all_chunks:
    raise ValueError("No chunks available to index; run the chunking step and check its output.")

try:
    # Embeds every chunk with embeddings_model and builds the index in memory.
    vector_store = FAISS.from_documents(all_chunks, embeddings_model)
    print("FAISS vector store created successfully!")
    print(f"Number of vectors in FAISS index: {vector_store.index.ntotal}")

    # --- Persist the Vector Store ---
    vector_store_path = "../vector_store"
    os.makedirs(vector_store_path, exist_ok=True)

    vector_store.save_local(vector_store_path)
    print(f"Vector store saved locally to: {vector_store_path}")

except ImportError as e:
    # Only a failed import indicates that the faiss package is missing.
    print(f"Error creating or saving FAISS vector store: {e}")
    print("Please ensure 'faiss-cpu' is installed.")
    raise
except Exception as e:
    print(f"Error creating or saving FAISS vector store: {e}")
    raise


Initializing embedding model: sentence-transformers/all-MiniLM-L6-v2 (using cpu)...


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding model initialized successfully!
Sample embedding generated (first 5 values): [-0.028859706595540047, 0.06519326567649841, 0.06909038126468658, 0.027181899175047874, 0.04393984377384186]...
Sample embedding dimension: 384

Creating FAISS vector store with embeddings...
FAISS vector store created successfully!
Number of vectors in FAISS index: 136540
Vector store saved locally to: ../vector_store
