## Importing Libraries

In [7]:
import os
import re
import json
from dotenv import load_dotenv
import chromadb
from sentence_transformers import SentenceTransformer
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
import google.generativeai as genai

## Loading environment variables

In [2]:
# Populate the process environment via python-dotenv, then read the Gemini key.
load_dotenv()
# None when the variable is not set.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

## Set up embeddings using Sentence Transformers

In [3]:
# Embedding function handed to the Chroma collection below; it is configured
# with the "all-MiniLM-L6-v2" sentence-transformers model.
embedding_fn = SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2",
)

## Set up ChromaDB

In [4]:
# Chroma client plus the collection that stores RCA patterns (as documents/IDs)
# and their checklists (as metadata).
# NOTE(review): chromadb.Client() is presumably non-persistent, so the collection
# would start empty on every fresh kernel — confirm whether persistence is expected.
client = chromadb.Client()
collection = client.get_or_create_collection(
    name="rca_patterns",
    embedding_function=embedding_fn,
)

## Set up Gemini 1.5 Flash

In [5]:
# Hand the API key to the Gemini SDK, then build the model handle that the
# checklist generator calls.
genai.configure(
    api_key=GEMINI_API_KEY,
)
gemini_model = genai.GenerativeModel(
    "gemini-1.5-flash",
)

## Load data/examples.json into ChromaDB (if DB is empty)

In [8]:
def preprocess_text(text: str) -> str:
    """Normalize text: lowercase it, trim both ends, and collapse whitespace runs to one space."""
    lowered = text.lower()
    # str.split() with no separator discards leading/trailing whitespace and
    # splits on whitespace runs, so re-joining with a single space trims and
    # collapses in one step.
    return " ".join(lowered.split())

# Seed the collection from the bundled examples file, but only when it is empty,
# so re-running this cell does not try to add the same IDs again.
if not collection.count():
    examples_path = "data/examples.json"
    # Explicit UTF-8 so the file decodes the same way regardless of platform locale.
    with open(examples_path, encoding="utf-8") as f:
        data = json.load(f)
    # The file handle is released before the inserts start.
    for item in data:
        # The normalized pattern is both the embedded document and the record ID,
        # which is what lets later lookups find exact matches by ID.
        normalized_pattern = preprocess_text(item["pattern"])
        collection.add(
            documents=[normalized_pattern],
            metadatas=[{"checklist": item["checklist"], "source": "json"}],
            ids=[normalized_pattern]
        )
    # Report the file that was actually read (the old message named "data.json").
    print(f"✅ Loaded initial data from {examples_path}")
else:
    print("ℹ️ ChromaDB already initialized with data")

✅ Loaded initial data from data.json


  return forward_call(*args, **kwargs)


## RCA agent logic

In [9]:
def retrieve_or_generate_checklist(pattern: str, similarity_threshold=0.3, max_results=10):
    """
    Return an RCA checklist for ``pattern``, reusing stored knowledge where possible.

    Lookup order:
      1. Exact match: if the preprocessed pattern is already an ID in the
         collection, the checklist stored in its metadata is returned and the
         model is not called.
      2. Similar matches: otherwise the collection is queried, and every hit whose
         distance is below ``1 - similarity_threshold`` contributes its pattern
         and stored checklist to the prompt used to generate a tailored checklist.
      3. No matches: the checklist is generated from the pattern alone.

    A newly generated checklist is stored under the preprocessed pattern, so a
    later identical query takes path 1.

    Args:
        pattern: Free-text description of the observed issue.
        similarity_threshold: Hits are kept when ``distance < 1 - similarity_threshold``.
            NOTE(review): this conversion presumes a cosine-style distance; the
            recorded runs show distances above 1, so confirm that the collection's
            distance space matches this assumption.
        max_results: Maximum number of neighbours requested from the collection.

    Returns:
        The checklist (stored or freshly generated).

    Raises:
        ValueError: If ``pattern`` is empty after preprocessing.
    """
    # Preprocess the query pattern
    pattern = preprocess_text(pattern)
    if not pattern:
        # An empty pattern would spend a model call and store a meaningless record.
        raise ValueError("pattern must contain at least one non-whitespace character")
    print("Query pattern:", pattern)

    # Exact match: record IDs are the preprocessed pattern text.
    existing_data = collection.get(ids=[pattern], include=["documents", "metadatas"])
    if existing_data["ids"]:
        print(f"📎 Exact match found for pattern: {pattern}")
        # The checklist lives in the metadata; the document is only the pattern text.
        return existing_data["metadatas"][0]["checklist"]

    # Query similar patterns, never asking for more neighbours than are stored.
    relevant_examples = []
    stored_count = collection.count()
    if stored_count:
        results = collection.query(
            query_texts=[pattern],
            n_results=min(max_results, stored_count),
            include=["documents", "metadatas", "distances"]
        )
        documents = results["documents"][0]
        metadatas = results["metadatas"][0]
        distances = results["distances"][0]
        print("Query results:", results["documents"], results["distances"])

        # Keep (pattern, checklist) pairs that are close enough and actually
        # carry a checklist to learn from.
        max_distance = 1 - similarity_threshold
        for doc, meta, dist in zip(documents, metadatas, distances):
            past_checklist = (meta or {}).get("checklist")
            if doc and past_checklist and dist < max_distance:
                relevant_examples.append((doc, past_checklist))
    print("Relevant examples:", [doc for doc, _ in relevant_examples])

    if relevant_examples:
        print(f"📎 Found {len(relevant_examples)} relevant examples. Merging checklists for new pattern.")
        # Show the model each past issue together with its checklist.
        checklists_text = "\n\n".join(
            f"Past issue {i+1}: {doc}\nChecklist {i+1}:\n{ckl}"
            for i, (doc, ckl) in enumerate(relevant_examples)
        )
        prompt = f"""
You are an FMCG root cause analysis expert.

A new pattern was observed: "{pattern}".

Here are several similar past issues and their checklists:
{checklists_text}

Based on these, generate a new checklist tailored to the new pattern. Output just 5 crisp subtasks, one per line.
"""
    else:
        print("🚀 No relevant examples found. Generating from scratch.")
        prompt = f"""
You are an expert in root cause analysis for FMCG products.
Given the issue: "{pattern}", list 5 crisp subtasks to investigate the root causes. Only one point per line.
"""
    checklist = gemini_model.generate_content(prompt).text.strip()

    # Store new pattern + checklist (pattern is already preprocessed).
    collection.add(
        documents=[pattern],
        metadatas=[{"checklist": checklist, "source": "generated"}],
        ids=[pattern]
    )

    return checklist

## 🎯 Example test

In [10]:
# Example: a pattern with no close stored match. In the recorded run below,
# both returned patterns are outside the distance cutoff, so the checklist is
# generated from scratch.
pattern = "Increased sales on monday mornings"
checklist = retrieve_or_generate_checklist(pattern)
print("\n📋 Checklist:\n", checklist)

Query pattern: increased sales on monday mornings
Query results: [['sales dip during festival season', 'year-on-year growth < 3%']] [[1.113682746887207, 1.5396499633789062]]
Relevant examples: []
🚀 No relevant examples found. Generating from scratch.

📋 Checklist:
 1. Analyze sales data for all Mondays over the past year, comparing them to other weekdays, to identify consistent patterns and magnitude of the increase.  Include segmentation by product, location, and customer type (if available).

2. Investigate potential promotional activities or marketing campaigns running on or immediately preceding Mondays, including in-store displays, digital ads, and loyalty programs.

3. Examine supply chain processes to identify potential bottlenecks or unusual efficiencies that might lead to higher availability on Mondays compared to other days.

4. Explore consumer behavior patterns through surveys, focus groups, or social media analysis to understand Monday-specific purchasing motivations (e.g.

In [13]:
# Example: a longer variant of a stored pattern. In the recorded run below, one
# stored pattern ("sales dip during festival season") passes the distance cutoff
# and is used as a relevant example.
pattern = "sales dip during festival season like diwali and pongal"
checklist = retrieve_or_generate_checklist(pattern)
print("\n📋 Checklist:\n", checklist)

Query pattern: sales dip during festival season like diwali and pongal
Query results: [['sales dip during festival season', 'increased sales on monday mornings', 'protein bar sales spike in tier-2 cities', 'year-on-year growth < 3%']] [[0.3110525906085968, 1.2025566101074219, 1.4031076431274414, 1.5327401161193848]]
Relevant examples: ['sales dip during festival season']
📎 Found 1 relevant examples. Merging checklists for new pattern.

📋 Checklist:
 Investigate competitor promotional activity during festivals.
Analyze distribution channel effectiveness during peak festival periods.
Assess product stock levels and supply chain disruptions.
Evaluate pricing strategy and promotional offers' impact.
Review consumer sentiment and preference shifts during festivals.


In [12]:
# Example: another pattern with no close stored match in the recorded run below,
# so the checklist is generated from scratch.
pattern = "Protein bar sales spike in tier-2 cities"
checklist = retrieve_or_generate_checklist(pattern)
print("\n📋 Checklist:\n", checklist)

Query pattern: protein bar sales spike in tier-2 cities
Query results: [['sales dip during festival season', 'increased sales on monday mornings', 'year-on-year growth < 3%']] [[1.268486738204956, 1.343087911605835, 1.4514671564102173]]
Relevant examples: []
🚀 No relevant examples found. Generating from scratch.

📋 Checklist:
 1. Analyze Tier-2 city market penetration & competitive landscape changes before and during the sales spike.
2. Investigate marketing and promotional activities targeting Tier-2 cities during the relevant period.
3.  Assess distribution network effectiveness and changes in Tier-2 cities, including retailer relationships and stock levels.
4. Identify any specific product variations or pricing strategies deployed in Tier-2 cities that might explain the increase.
5. Conduct consumer research in Tier-2 cities to understand purchase drivers, brand perception, and reasons for increased consumption.
