In [7]:
%pip install pandas matplotlib seaborn sentence-transformers faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp312-cp312-win_amd64.whl.metadata (5.0 kB)
Downloading faiss_cpu-1.11.0-cp312-cp312-win_amd64.whl (15.0 MB)
   ---------------------------------------- 0.0/15.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/15.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/15.0 MB ? eta -:--:--
    --------------------------------------- 0.3/15.0 MB ? eta -:--:--
   - -------------------------------------- 0.5/15.0 MB 1.5 MB/s eta 0:00:10
   -- ------------------------------------- 1.0/15.0 MB 1.6 MB/s eta 0:00:09
   --- ------------------------------------ 1.3/15.0 MB 1.6 MB/s eta 0:00:09
   ---- ----------------------------------- 1.6/15.0 MB 1.6 MB/s eta 0:00:09
   ---- ----------------------------------- 1.8/15.0 MB 1.5 MB/s eta 0:00:09
   ----- ---------------------------------- 2.1/15.0 MB 1.5 MB/s eta 0:00:09
   ------ --------------------------------- 2.4/15.0 MB 1.4 MB/s eta 0:00:09
   ------ ----------


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# File: notebooks/embed_and_index.ipynb
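# Companion notebook to the text chunking, embedding, and indexing step for the Consumer Complaints dataset:
# it loads the FAISS index persisted by 'src/embed_and_index.py' and runs a sample semantic search against it.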
# Import necessary libraries
import pandas as pd
import os
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document # Not strictly needed for loading, but good to have if showing document structure

# --- 0. Set up Paths ---
# The project root is inferred from the directory the kernel was started in:
# when that directory's path ends in '<sep>notebooks' (compared in lower case) its
# parent is used as the root; otherwise the directory itself is taken to be the root.

current_working_dir = os.getcwd()

notebooks_suffix = os.path.sep + 'notebooks'
project_root = (
    os.path.abspath(os.path.join(current_working_dir, os.pardir))
    if current_working_dir.lower().endswith(notebooks_suffix)
    else current_working_dir
)

# Location of the persisted FAISS vector store, relative to the project root
vector_store_path = os.path.join(project_root, 'vector_store', 'complaint_faiss_index')

print(
    f"Detected Project Root: {project_root}",
    f"Attempting to load vector store from: {vector_store_path}",
    sep="\n",
)

# --- 1. Load the Embedding Model (must be the same as used for indexing) ---
# Any failure is reported on stdout and leaves `embeddings` set to None, which the
# vector-store loading step checks before it tries to load anything.
try:
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    print(f"\nEmbedding model '{embeddings.model_name}' loaded successfully.")
except Exception as load_error:
    if isinstance(load_error, ImportError):
        # Import failures get a dedicated message with an install hint.
        print("Error: 'sentence-transformers' library not found.")
        print("Please install it using: pip install sentence-transformers")
    else:
        print(f"Error loading embedding model: {load_error}")
    embeddings = None

# --- 2. Load the Persisted FAISS Vector Store ---
# `db` stays None unless the index is loaded successfully; the sample-search step
# checks it before querying.
db = None
if embeddings is None:
    # Explicit None check: the embedding step signals failure with None, and the
    # truthiness of an arbitrary model object is not a reliable "loaded" signal.
    print("\nSkipping vector store loading as embedding model could not be initialized.")
elif not os.path.isdir(vector_store_path):
    # isdir (not exists): a stray regular file at this path is not a saved index directory.
    print(f"\nError: Vector store directory '{vector_store_path}' not found.")
    print("Please run 'src/embed_and_index.py' first to generate and persist the vector store.")
else:
    try:
        # SECURITY NOTE(review): allow_dangerous_deserialization=True makes load_local
        # unpickle data stored alongside the index, and unpickling can execute arbitrary
        # code. Only load index directories produced by this project; never point
        # vector_store_path at files from an untrusted source.
        db = FAISS.load_local(vector_store_path, embeddings, allow_dangerous_deserialization=True)
        print("FAISS vector store loaded successfully!")
    except Exception as e:
        print(f"Error loading FAISS vector store from '{vector_store_path}': {e}")
        print("Ensure 'faiss-cpu' is installed and the index was saved correctly by 'src/embed_and_index.py'.")

# --- 3. Perform a Sample Semantic Search (if DB loaded) ---
# Runs one fixed query against the loaded vector store and prints the best-matching
# chunks. The step is skipped (with a message) when `db` is None.
top_k = 3            # number of chunks to retrieve
preview_chars = 200  # maximum number of characters of each chunk to print

if db is not None:
    print("\n--- Performing a Sample Semantic Search ---")
    query = "unauthorized transactions on my account"
    print(f"Search Query: '{query}'")

    try:
        # Each result is a (document, score) pair. Per the LangChain FAISS docs the
        # score is a distance (L2 by default), so LOWER means MORE similar.
        # NOTE(review): assumes the index was built with the default distance
        # strategy - confirm against 'src/embed_and_index.py'.
        docs_with_scores = db.similarity_search_with_score(query, k=top_k)

        print("\nRetrieved Documents (Chunks):")
        for i, (doc, score) in enumerate(docs_with_scores):
            # Add an ellipsis only when the chunk was actually truncated.
            preview = doc.page_content[:preview_chars]
            if len(doc.page_content) > preview_chars:
                preview += "..."

            print(f"\n--- Result {i+1} (Distance Score: {score:.4f}, lower = more similar) ---")
            print(f"Complaint ID: {doc.metadata.get('complaint_id', 'N/A')}")
            print(f"Product: {doc.metadata.get('product', 'N/A')}")
            print(f"Chunk Content (first {preview_chars} chars): '{preview}'")

    except Exception as e:
        print(f"Error during similarity search: {e}")
else:
    print("\nSkipping sample search as the vector store could not be loaded.")

print("\n--- Embed and Index Notebook Execution Complete ---")

Detected Project Root: d:\Kifiya AI Master Training Program 5 6 &7\week-6\intelligent-complaint-analysis
Attempting to load vector store from: d:\Kifiya AI Master Training Program 5 6 &7\week-6\intelligent-complaint-analysis\vector_store\complaint_faiss_index

Embedding model 'all-MiniLM-L6-v2' loaded successfully.

Error: Vector store directory 'd:\Kifiya AI Master Training Program 5 6 &7\week-6\intelligent-complaint-analysis\vector_store\complaint_faiss_index' not found.
Please run 'src/embed_and_index.py' first to generate and persist the vector store.

Skipping sample search as the vector store could not be loaded.

--- Embed and Index Notebook Execution Complete ---


: 