# 🧠 GenAI Embedding Pipeline with FAISS
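This notebook reads customer data from an Excel file, generates per-customer JSON, embeds it using `BAAI/bge-small-en`, and indexes it in FAISS for retrieval. It then indexes banking products and offers the same way and uses Gemini to generate personalized recommendations from the retrieved context.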

In [None]:
# ✅ Step 1: Install dependencies
!pip install -U sentence-transformers faiss-cpu openpyxl tqdm

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_6

In [None]:
# ✅ Step 2: Import libraries
import pandas as pd
import json
import numpy as np
import faiss
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import os

In [None]:
# Open the workbook once and reuse the handle for every sheet read below.
xls = pd.ExcelFile("customer_data_fully_enhanced.xlsx")

# Read the customer-centric sheets; the targets are unpacked in the same order
# as the sheet titles listed in the tuple.
(
    df_profile,
    df_feedback,
    df_sentiment,
    df_transactions,
    df_bank_statement,
) = (
    pd.read_excel(xls, sheet_name=sheet_title)
    for sheet_title in (
        "Customer Individual",
        "Customer Feedback",
        "Customer Sentiment Data",
        "Customer Transaction",
        "Bank Statement",
    )
)

In [None]:
# Prepare a list to store JSON records
customer_json_records = []


def _split_tags(value):
    """Split a ", "-separated cell into a list; a missing (NaN/None) cell gives []."""
    if pd.isnull(value):
        # str(NaN) would otherwise produce the bogus tag ['nan'].
        return []
    return str(value).split(", ")


def _records_by_customer(frame, id_column, columns, renames=None):
    """Map each customer id found in `frame` to its rows as a list of dicts.

    Args:
        frame: DataFrame holding one row per event for many customers.
        id_column: name of the column carrying the customer id.
        columns: columns to keep in each record, in output order.
        renames: optional {old_name: new_name} mapping applied to the kept columns.

    Returns:
        dict of customer id -> list of row dicts (original row order preserved).

    Raises:
        KeyError: if `id_column` or any of `columns` is missing from `frame`.
    """
    # Selecting up front validates the column names even when the sheet is empty.
    subset = frame[[id_column, *columns]]
    grouped = {}
    for key, rows in subset.groupby(id_column, sort=False):
        part = rows[columns]
        if renames:
            part = part.rename(columns=renames)
        grouped[key] = part.to_dict(orient="records")
    return grouped


# Group every child sheet once instead of re-scanning the whole sheet with a
# boolean mask for each customer in the profile loop.
feedback_by_customer = _records_by_customer(
    df_feedback,
    "customer_id",
    ["Feedback_id", "date", "rating", "comment"],
    renames={"Feedback_id": "feedback_id"},
)
sentiment_by_customer = _records_by_customer(
    df_sentiment,
    "customer_id",
    [
        "platform", "content", "timestamp", "sentiment_score", "intent", "emotion",
        "sentiment_category", "mentions", "Hashtags", "day_of_week", "time_of_day",
    ],
)
transactions_by_customer = _records_by_customer(
    df_transactions,
    "Customer Id",
    [
        "Product Name", "Transaction Type", "Category", "Amount (in dollars)",
        "Purchased Date", "Payment Mode", "Account Impact",
    ],
    renames={
        "Product Name": "product_name",
        "Transaction Type": "transaction_type",
        "Category": "category",
        "Amount (in dollars)": "amount",
        "Purchased Date": "date",
        "Payment Mode": "payment_mode",
        "Account Impact": "account_impact",
    },
)
bank_statement_by_customer = _records_by_customer(
    df_bank_statement,
    "Customer Id",
    ["Date", "Transaction Type", "Description", "Category", "Amount", "Balance Impact"],
    renames={
        "Date": "date",
        "Transaction Type": "transaction_type",
        "Description": "description",
        "Category": "category",
        "Amount": "amount",
        "Balance Impact": "balance_impact",
    },
)

# Iterate through each customer in the profile sheet
for _, row in df_profile.iterrows():
    customer_id = row["Customer Id"]

    customer_dict = {
        "customer_id": customer_id,
        "customer_profile": {
            "name": row["Customer Name"],
            "age": row["Age"],
            "gender": row["Gender"],
            "location": row["Location"],
            "income": row["Income per year in $"],
            "education": row["Education"],
            "occupation": row["Occupation"],
            "interests": _split_tags(row["Interests"]),
            "preferences": _split_tags(row["Preferences"]),
            "marital_status": row["Marital Status"],
            "dependents": row["Dependents Count"],
            "home_ownership": row["Home Ownership"],
            "lifestyle": row["Lifestyle"],
            "region": row["Region"],
            "urban_rural": row["Urban/Rural"],
            "preferred_communication": row["Preferred Communication"],
            "email_open_rate": row["Email Open Rate"],
            "click_through_rate": row["Click Through Rate"],
            "social_media_engagement": row["Social Media Engagement"]
        },
        # Customers with no rows in a child sheet get an empty list, as before.
        "feedback_history": feedback_by_customer.get(customer_id, []),
        "sentiment_posts": sentiment_by_customer.get(customer_id, []),
        "transactions": transactions_by_customer.get(customer_id, []),
        "bank_statement": bank_statement_by_customer.get(customer_id, []),
    }

    customer_json_records.append(customer_dict)

In [None]:
# Inspect the first assembled record to sanity-check the nested structure.
customer_json_records[0]

{'customer_id': 'CUST2025001',
 'customer_profile': {'name': 'Daniel Kelly',
  'age': 18,
  'gender': 'Female',
  'location': 'Utah',
  'income': 91362,
  'education': 'Graduate',
  'occupation': 'Marketing Manager',
  'interests': ['Luxury Shopping', 'Travel', 'Dining'],
  'preferences': ['Discounts', 'New Arrivals'],
  'marital_status': 'Married',
  'dependents': 2,
  'home_ownership': 'Rent',
  'lifestyle': 'Sedentary',
  'region': 'North America',
  'urban_rural': 'Rural',
  'preferred_communication': 'Whatsapp',
  'email_open_rate': '90%',
  'click_through_rate': '91%',
  'social_media_engagement': 'High'},
 'feedback_history': [{'feedback_id': 'FB00001',
   'date': Timestamp('2025-03-17 17:15:16.060000'),
   'rating': 5,
   'comment': 'Highly recommended.'}],
 'sentiment_posts': [{'platform': 'Twitter',
   'content': 'Not satisfied at all. Expected much better.',
   'timestamp': Timestamp('2025-02-28 16:58:02.790000'),
   'sentiment_score': -0.9012449415738586,
   'intent': 'Prai

In [None]:
# ✅ Step 5: Load embedding model
# NOTE: the name `model` is re-bound to the Gemini client in a later cell, so
# the embedding cells must be run before that cell (or this cell re-run first).
model = SentenceTransformer('BAAI/bge-small-en')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
from sentence_transformers import SentenceTransformer
import json
import numpy as np
from tqdm import tqdm


# Define a function to convert JSON to embedding-friendly text
def prepare_for_embedding(customer_json):
    """Serialize one customer record into the text that gets embedded.

    The record is first normalised into plain JSON types, then dumped with
    ``indent=2`` and prefixed with the retrieval instruction string.

    Args:
        customer_json: nested dict/list structure describing one customer.

    Returns:
        str: "Represent this customer for retrieval: " followed by the JSON text.

    Raises:
        TypeError: if a leaf value is still not JSON-serializable after conversion.
    """
    def convert(obj):
        # Containers: recurse. Tuples are handled like lists so that values
        # nested inside them are normalised too.
        if isinstance(obj, dict):
            return {k: convert(v) for k, v in obj.items()}
        if isinstance(obj, (list, tuple)):
            return [convert(i) for i in obj]
        if isinstance(obj, np.ndarray):
            return convert(obj.tolist())
        # pd.Timestamp, datetime.datetime, datetime.date, ... -> ISO string.
        if hasattr(obj, 'isoformat'):
            return obj.isoformat()
        # np.datetime64 has no isoformat(); its str() is already ISO-like.
        if isinstance(obj, np.datetime64):
            return str(obj)
        # Other NumPy scalars (np.int64, np.bool_, ...) are not JSON-serializable;
        # unwrap them to the equivalent built-in Python value.
        if isinstance(obj, np.generic):
            return obj.item()
        return obj

    return "Represent this customer for retrieval: " + json.dumps(convert(customer_json), indent=2)

# Store embeddings, customer IDs, and raw text
customer_ids = [record["customer_id"] for record in customer_json_records]
documents = [prepare_for_embedding(record) for record in customer_json_records]

# Generate embeddings for each customer
# Encode all documents in one batched call instead of one `encode` call per
# document; the encoder's own progress bar replaces the tqdm loop.
# NOTE(review): the documents are long JSON dumps; the encoder presumably
# truncates anything past its maximum sequence length - confirm this is acceptable.
embeddings = list(
    model.encode(
        documents,
        batch_size=32,
        show_progress_bar=True,
        normalize_embeddings=True,
    )
)

100%|██████████| 1000/1000 [10:41<00:00,  1.56it/s]


In [None]:
# ✅ Step 7: Create FAISS index
# Stack the per-customer vectors into a single float32 matrix, one row per customer.
embedding_matrix = np.array(embeddings, dtype="float32")
dimension = embedding_matrix.shape[1]

# HNSW graph index over the raw vectors, built with 32 as its second argument.
index = faiss.IndexHNSWFlat(dimension, 32)
index.add(embedding_matrix)

# Save index and metadata
faiss.write_index(index, "customer_faiss_index.index")

# Row i of the CSV describes vector i of the index.
customer_metadata_frame = pd.DataFrame(
    {
        "customer_id": customer_ids,
        "document": documents,
    }
)
customer_metadata_frame.to_csv("customer_vector_metadata.csv", index=False)

print("✅ Index and metadata saved.")

✅ Index and metadata saved.


In [None]:
# Load the two catalogue sheets from the workbook handle `xls` opened earlier.
df_products = pd.read_excel(xls, sheet_name="Banking Products & Rewards")
df_offers = pd.read_excel(xls, sheet_name="Offers & Campaigns")

# Combine both into one list of documents
# Entries appended below are dicts with a "type" and a "text" key.
combined_docs = []

def row_to_text(row, source):
    """Render one catalogue row as retrieval text.

    Null fields are dropped, values exposing ``isoformat()`` are converted to
    ISO strings, and the remaining fields are dumped as indented JSON behind
    an instruction prefix.

    Args:
        row: mapping-like object with ``.items()`` (e.g. a DataFrame row).
        source: "Product" selects the banking-product prefix; any other value
            selects the campaign-offer prefix.

    Returns:
        str: the prefix followed by the JSON text of the non-null fields.
    """
    payload = {}
    for field, value in row.items():
        if not pd.notnull(value):
            continue  # skip empty cells entirely
        payload[field] = value.isoformat() if hasattr(value, 'isoformat') else value

    body = json.dumps(payload, indent=2)
    if source == "Product":
        return "Represent this banking product for retrieval: " + body
    return "Represent this campaign offer for retrieval: " + body

# Products are appended first, then offers; this order is the row order of the
# texts that get embedded below.
for catalog_frame, doc_kind, prompt_source in (
    (df_products, "product", "Product"),
    (df_offers, "offer", "Offer"),
):
    for _, row in catalog_frame.iterrows():
        entry = {"type": doc_kind, "text": row_to_text(row, source=prompt_source)}
        combined_docs.append(entry)

# Generate embeddings
texts_offer = [doc["text"] for doc in combined_docs]
embeddings_offer = model.encode(texts_offer, normalize_embeddings=True)



In [None]:
# Show only the first few documents; dumping the whole list floods the output.
texts_offer[:3]

['Represent this banking product for retrieval: {\n  "Product Name": "Platinum Credit Card",\n  "Category": "Credit Card",\n  "Eligibility Criteria": "Income > $100K",\n  "Benefits": "5x travel points, lounge access",\n  "Ideal Customer Profile": "High-spending professionals",\n  "Interest Tags": "Travel, Luxury",\n  "Reference Doc": "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"\n}',
 'Represent this banking product for retrieval: {\n  "Product Name": "Gold Credit Card",\n  "Category": "Credit Card",\n  "Eligibility Criteria": "Income > $50K",\n  "Benefits": "Cashback on groceries & fuel",\n  "Ideal Customer Profile": "Mid-income customers",\n  "Interest Tags": "Daily Expenses, Cashback",\n  "Reference Doc": "https://www.adobe.com/content/dam/cc/us/en/creative-cloud/photography/discover/sample-photography.pdf"\n}',
 'Represent this banking product for retrieval: {\n  "Product Name": "Personal Loan",\n  "Category": "Loan",\n  "Eligibility Criteria": "Income >

In [None]:
# Create FAISS index
# Same recipe as the customer index: float32 matrix -> HNSW flat index.
embedding_offer_matrix = np.array(embeddings_offer, dtype="float32")
dimension_offer = embedding_offer_matrix.shape[1]
index_offer = faiss.IndexHNSWFlat(dimension_offer, 32)
index_offer.add(embedding_offer_matrix)

# Save index and metadata
faiss.write_index(index_offer, "product_offer_faiss.index")

# Row i of the CSV describes vector i of the index.
offer_doc_types = [doc["type"] for doc in combined_docs]
product_offer_metadata_frame = pd.DataFrame(
    {
        "type": offer_doc_types,
        "text": texts_offer,
    }
)
product_offer_metadata_frame.to_csv("product_offer_metadata.csv", index=False)

print("✅ Product & Offer FAISS index created successfully.")

✅ Product & Offer FAISS index created successfully.


In [None]:
!pip install google-generativeai



In [None]:
#Configure Gemini Flash 2.0
import os

import google.generativeai as genai

# Set your Gemini API key
# Prefer the GEMINI_API_KEY environment variable so the secret never has to be
# pasted into (and saved with) the notebook; the placeholder is only a fallback.
genai.configure(api_key=os.environ.get("GEMINI_API_KEY", "Your gemini-2.0 key"))

# Initialize Gemini Flash 2.0 model
# `gemini_model` is the unambiguous handle for the LLM client.
gemini_model = genai.GenerativeModel(model_name="models/gemini-2.0-flash")

# Backward-compatible alias: later cells call `model.generate_content(...)`.
# NOTE: this re-binds `model`, which held the SentenceTransformer until now.
model = gemini_model


In [None]:
#Define Retrieval + Prompt Code

import faiss
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

# Load your model and indexes
# A dedicated name is used for the encoder so it cannot be confused with the LLM client.
embedding_model = SentenceTransformer("BAAI/bge-small-en")
customer_index, product_index = (
    faiss.read_index(index_path)
    for index_path in ("customer_faiss_index.index", "product_offer_faiss.index")
)

# Load metadata
# The code below looks rows up by position (.iloc) using the ids FAISS returns.
customer_meta, product_meta = (
    pd.read_csv(metadata_path)
    for metadata_path in ("customer_vector_metadata.csv", "product_offer_metadata.csv")
)


In [None]:
#Search & Build RAG Prompt
def search_faiss(index, query_vec, k=3):
    """Return the ids of up to `k` nearest neighbours of `query_vec` in `index`.

    Args:
        index: object exposing ``search(queries, k) -> (distances, labels)``.
        query_vec: a single query vector, either 1-D or shaped (1, dim).
        k: maximum number of neighbours to return.

    Returns:
        1-D integer array of neighbour ids, best match first. It can hold fewer
        than `k` entries: FAISS pads missing results with -1, and those are
        removed here so callers never feed -1 into ``.iloc`` (which would
        silently select the last row).
    """
    # FAISS expects a C-contiguous float32 matrix of shape (n_queries, dim).
    query = np.ascontiguousarray(np.atleast_2d(query_vec), dtype="float32")
    _distances, labels = index.search(query, k)
    hits = np.asarray(labels)[0]
    return hits[hits >= 0]

def get_embedding(text):
    """Embed `text` with `embedding_model` and return it as a single-row 2-D array."""
    vector = embedding_model.encode(text, normalize_embeddings=True)
    # reshape(1, -1): one query row, as passed to index.search by search_faiss.
    return vector.reshape(1, -1)

def _lookup_customer_position(customer_query):
    """Return the row position in `customer_meta` that best matches `customer_query`."""
    # An exact customer-ID match wins: a bare ID string carries almost no
    # semantic signal, so embedding search alone can land on another customer.
    wanted = str(customer_query).strip()
    id_hits = np.flatnonzero(
        (customer_meta["customer_id"].astype(str) == wanted).to_numpy()
    )
    if len(id_hits):
        return int(id_hits[0])

    # Otherwise treat the query as free text and fall back to semantic search.
    customer_vec = get_embedding(f"Represent this customer for retrieval: {customer_query}")
    return search_faiss(customer_index, customer_vec, k=1)[0]


def generate_personalization(customer_query):
    """Recommend products/offers for the customer identified by `customer_query`.

    Args:
        customer_query: a customer ID present in `customer_meta`, or free text
            describing the customer (resolved by nearest-neighbour search).

    Returns:
        str: the text of the LLM response.

    Raises:
        IndexError: if the semantic fallback finds no customer at all.
    """
    # Step 1: Resolve the customer and fetch the stored profile document
    customer_idx = _lookup_customer_position(customer_query)
    customer_context = customer_meta.iloc[customer_idx]["document"]

    # Step 2: Use the customer context to search for relevant products
    product_vec = get_embedding(customer_context)
    product_indices = search_faiss(product_index, product_vec, k=3)
    matched_products = "\n\n".join(product_meta.iloc[i]["text"] for i in product_indices)

    # Step 3: Build Gemini prompt
    prompt = f"""
You are an AI personalization assistant for a bank.

Based on the customer profile and behavior below, suggest the best matching banking products or offers.
Explain *why* each recommendation is relevant.

Customer Profile:
{customer_context}

Top Matching Products:
{matched_products}

Respond with a clear list of personalized recommendations.
"""

    # Step 4: Send to Gemini (`model` is the Gemini client at this point in the notebook)
    response = model.generate_content(prompt)
    return response.text


In [None]:
#Test Prompt
# Smoke-test the full retrieve-then-generate flow for a single customer ID.
print(generate_personalization("CUST2025001"))


Here are the personalized banking product/offer recommendations for Michael Johnson, based on his profile and behavior:

**1. Travel Discount Credit Card:**

*   **Why:** Michael has a history of "High-End Travel Bookings" at Ritz-Carlton, and Private Jet Charters, indicating he is a frequent traveler. The "Travel Discount" credit card offers a 20% discount on flight & hotel bookings, which aligns perfectly with his travel habits and preferences.

**2. E-commerce Cashback Credit Card:**

*   **Why:** Michael is interested in Tech Gadgets, and subscriptions, indicating he does online shopping. The "E-commerce Cashback" credit card offer provides 10% cashback on purchases at partner e-commerce sites, which would be appealing and beneficial for his online spending habits.

**3. Premium Rewards Credit Card (Not in provided list, but highly relevant):**

*   **Why:** Given Michael's high income, "Graduate" education, and transactions on luxury goods (Gucci shoes, Rolex watch), he would like

In [None]:
#Prompt for a non existing customer

# Hand-written profile for a customer that is not part of the indexed data.
# NOTE(review): the keys only partly match the indexed `customer_profile` fields
# (e.g. "communication" here vs "preferred_communication" there) - confirm intended.
new_customer_profile = {
    "name": "John Doe",
    "age": 35,
    "income": 95000,
    "region": "North America",
    "interests": ["Travel", "Dining"],
    "preferences": ["Cashback", "Digital Banking"],
    "marital_status": "Married",
    "communication": "Email",
    "lifestyle": "Active"
}


In [None]:
#Format it for retrieval-style embedding:
import json

def create_virtual_customer_prompt(profile_dict):
    """Return the retrieval text for an ad-hoc profile.

    The profile is dumped as 2-space-indented JSON and placed after the same
    instruction prefix used for the indexed customer documents.
    """
    serialized_profile = json.dumps(profile_dict, indent=2)
    return f"Represent this customer for retrieval: {serialized_profile}"

# Build the retrieval text for the hand-written profile, then embed it as the query vector.
new_cust_text = create_virtual_customer_prompt(new_customer_profile)
new_cust_vec = get_embedding(new_cust_text)

In [None]:
# Retrieve matching products:
# Nearest product/offer documents for the new customer's query vector.
matched_product_indices = search_faiss(product_index, new_cust_vec, k=3)
matched_product_texts = [
    product_meta.iloc[position]["text"] for position in matched_product_indices
]
# Blank line between documents keeps them visually separate inside the prompt.
matched_products = "\n\n".join(matched_product_texts)


In [None]:
#RAG prompt

# The prompt embeds the new customer's retrieval text and the retrieved product
# documents computed in the previous cells.
rag_prompt = f"""
You are an AI assistant that helps recommend personalized banking products and offers.

This is a new customer. Based on the following profile, suggest the top 3 suitable products or offers.

New Customer Profile:
{new_cust_text}

Top Matching Products:
{matched_products}

Explain your recommendations clearly.
"""
# `model` is the Gemini client configured earlier in the notebook.
response = model.generate_content(rag_prompt)
print(response.text)


Okay, based on John Doe's profile, here are my top 3 recommended banking products/offers, along with explanations:

**1. Travel Discount Credit Card:**

*   **Reasoning:** John's profile explicitly states an interest in "Travel". The "Travel Discount" credit card offer directly caters to this interest by providing a significant discount (20%) on flight and hotel bookings. This aligns perfectly with his lifestyle and could incentivize him to use our services for his travel plans.
*   **Value Proposition:** This card will help John save money on his travels, a direct benefit that resonates with his stated interests.

**2. E-commerce Cashback Credit Card:**

*   **Reasoning:** While not explicitly stated, individuals in North America, with income $95,000 and are active are highly likely to engage in online shopping. The "E-commerce Cashback" credit card offers 10% cashback on purchases at partner e-commerce sites, which can translate to significant savings for John, especially given his p

In [None]:
#Temporal trend pattern checking

# NOTE: this is a plain string with no customer data interpolated, so nothing is
# actually "retrieved" for the model to analyse; the saved output shows the model
# asking for the data instead of producing an analysis.
prompt = """
From the retrieved customer profiles and their feedback history, identify if any customers have shown a decline in satisfaction or sentiment over time.

Look at feedback ratings, comments, and sentiment scores over the last few months.

For each such customer:
1. Summarize the trend in their feedback or sentiment.
2. Identify any red flags or recurring complaints.
3. Suggest a personalized product, service, or engagement action to prevent churn.
"""

# `model` is the Gemini client configured earlier in the notebook.
response = model.generate_content(prompt)
print(response.text)


Okay, I'm ready to analyze customer data and identify declining satisfaction. To do this effectively, I need you to **provide the data** in a format that I can understand.  Ideally, this would be a structured format like a CSV file, a JSON object, or even a well-formatted table in your prompt.

**Here's what I need to see for each customer you want me to analyze:**

*   **Customer ID:** (e.g., 12345) - A unique identifier for each customer.
*   **Date:** (e.g., 2023-10-26) - Date of feedback or interaction.
*   **Feedback Rating:** (e.g., 5, 4, 3, 2, 1) - A numerical rating scale. (If applicable)
*   **Comment:** (e.g., "The product was great!") - Free-text feedback from the customer.
*   **Sentiment Score:** (e.g., 0.8, -0.2) - A numerical score representing the sentiment of the comment (positive, negative, neutral). If you don't have sentiment scores pre-calculated, I can attempt to analyze the comments using basic keyword analysis.
*   **Product/Service Interaction:** (e.g., "Purcha

In [None]:
import pandas as pd
import faiss
import numpy as np
import google.generativeai as genai
from sentence_transformers import SentenceTransformer


# Load customer FAISS index + metadata
# These are the same files loaded in the earlier retrieval setup cell; presumably
# reloaded so this section can be run independently. The function below reads
# only `customer_meta`.
customer_index = faiss.read_index("customer_faiss_index.index")
customer_meta = pd.read_csv("customer_vector_metadata.csv")

# ---- FUNCTION ----
def find_customers_with_declining_sentiment(top_k=10, llm=None):
    """Ask the LLM which customers show declining feedback over time.

    Customers are eligible when their stored document mentions "feedback_id"
    at least twice (i.e. has more than one feedback entry). Up to `top_k`
    eligible customers are drawn at random and sent to the LLM as context.

    Args:
        top_k: maximum number of customers to include in the prompt.
        llm: object exposing ``generate_content(prompt)`` whose result has a
            ``.text`` attribute. Defaults to the notebook-level `model`
            (the Gemini client).

    Returns:
        str: the LLM's answer, or a fixed message when no customer has enough
        feedback entries to evaluate a trend.
    """
    print("🔍 Checking for customers with declining feedback...")

    # Step 1: Pre-filter ALL customers for multiple feedback entries, so the
    # "no customers found" message reflects the whole dataset rather than a
    # random handful of rows.
    feedback_counts = customer_meta["document"].astype(str).str.count("feedback_id")
    eligible_positions = np.flatnonzero((feedback_counts >= 2).to_numpy())

    # Step 2: Randomly sample at most top_k of the eligible customers
    sample_size = max(0, min(top_k, len(eligible_positions)))
    sampled_indices = np.random.choice(eligible_positions, size=sample_size, replace=False)

    filtered_contexts = []
    for idx in sampled_indices:
        record = customer_meta.iloc[idx]
        customer_doc = record["document"]
        customer_id = record["customer_id"]
        filtered_contexts.append(f"Customer ID: {customer_id}\n{customer_doc}")

    if not filtered_contexts:
        return "No customers found with enough feedback data to evaluate trends."

    # Step 3: Build prompt
    rag_context = "\n\n---\n\n".join(filtered_contexts)
    prompt = f"""
You are a customer insights assistant.

Below are real customer profiles including feedback history.
ONLY use the data shown to determine if the customer sentiment is declining over time.

DO NOT make up or assume any customer IDs or feedback.

Instructions:
- For each customer with a declining trend, output their Customer ID.
- Summarize the trend (e.g., 'rating dropped from 5 to 2 in 3 months').
- Suggest a personalized retention strategy or offer.

Context:
{rag_context}
"""

    # Step 4: Call Gemini
    # The LLM handle is resolved only now, so the early return above never needs
    # a configured client. The previous code used an undefined name here.
    if llm is None:
        llm = model
    response = llm.generate_content(prompt)
    return response.text


In [None]:
# Run the check; prints either the model's analysis or the "not enough feedback" message.
print(find_customers_with_declining_sentiment(top_k=10))

🔍 Checking for customers with declining feedback...
No customers found with enough feedback data to evaluate trends.
