Import libraries

In [1]:
import os
import re
import json
import sqlite3
import chromadb
from chromadb.utils import embedding_functions
from dotenv import load_dotenv
import google.generativeai as genai


  from .autonotebook import tqdm as notebook_tqdm


Load API key from .env


In [2]:
load_dotenv()
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

Defining Paths

In [3]:
EXAMPLES_PATH = "data/examples.json"
DB_PATH = "data/SuperStoreOrders.db" 

Load examples.json file

In [4]:
with open(EXAMPLES_PATH, "r") as f:
    examples = json.load(f)

Chroma DB for RAG

In [5]:
embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
client = chromadb.Client()
collection = client.create_collection(name="patterns", embedding_function=embedding_fn)

# Add examples to vector DB (store checklist as JSON string)

for idx, ex in enumerate(examples):
    collection.add(
        ids=[str(idx)],
        documents=[ex["pattern"]],
        metadatas=[{"checklist": json.dumps(ex["checklist"])}]  # store as string
    )


  return forward_call(*args, **kwargs)


Function to retrieve similar patterns

In [6]:
def retrieve_similar_patterns(user_pattern, top_k=2):
    results = collection.query(query_texts=[user_pattern], n_results=top_k)
    # Parse checklist string back to list
    parsed_results = []
    for meta in results["metadatas"]:
        checklist = json.loads(meta[0]["checklist"]) if meta[0].get("checklist") else []
        parsed_results.append({"checklist": checklist})
    return parsed_results

Function to generate checklist, given new pattern

In [7]:
def generate_checklist(user_pattern, retrieved_patterns):
    prompt = f"""
You are a root cause analysis assistant.
Given the following historical patterns and their checklists:
{retrieved_patterns}

Now, create a final checklist (only 5 points in checklist) for this new pattern:
'{user_pattern}'
Checklist should be step-by-step, concise, and relevant to metrics columns:
There are 4 metrics tables:
1. metrics/monthly_metrics.csv : [month_start,monthly_sales,pct_change_month]
2. metrics/weekly_metrics.csv : [week_start,weekly_sales,pct_change_week]
3. metrics/nth_week_monthly_change.csv: [year,month,nth_week,sales,pct_change_mom]
4. metrics/weekly_product_share.csv: [order_date,category,sub_category,product_name,sales,subcat_sales,pct_share]
    """
    resp = genai.GenerativeModel("gemini-1.5-flash").generate_content(prompt)
    return [item.strip("- ").strip() for item in resp.text.split("\n") if item.strip()]


Function to generate SQL query

In [12]:
def generate_pandas_code(checklist_item):
    prompt = f"""
You are an expert Pandas code generator.
Write ONE valid pandas code (no multiple statements, no comments, no placeholders).
1. metrics/monthly_metrics.csv : [month_start,monthly_sales,pct_change_month]
2. metrics/weekly_metrics.csv : [week_start,weekly_sales,pct_change_week]
3. metrics/nth_week_monthly_change.csv: [year,month,nth_week,sales,pct_change_mom]
4. metrics/weekly_product_share.csv: [order_date,category,sub_category,product_name,sales,subcat_sales,pct_share]
    

The request is:
"{checklist_item}"

Return ONLY the pandas code, no explanation, no comments, no markdown.
"""
    resp = genai.GenerativeModel("gemini-1.5-flash").generate_content(prompt)
    return resp.text.strip()


Function to run SQL query

In [9]:

def clean_sql(sql):
    # Remove SQL comments
    sql = re.sub(r"--.*", "", sql)
    # Remove extra spaces
    sql = sql.strip()
    # Remove code block markers from Gemini output
    sql = sql.replace("```sql", "").replace("```", "").strip()
    # Ensure only one statement by splitting on semicolon
    if ";" in sql:
        sql = sql.split(";")[0]
    return sql

def run_sql(query):
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    try:
        cleaned_query = clean_sql(query)
        cursor.execute(cleaned_query)
        results = cursor.fetchall()
        col_names = [desc[0] for desc in cursor.description]
        conn.close()
        return col_names, results
    except Exception as e:
        conn.close()
        return None, str(e)


Testing with new real life pattern

In [13]:
user_input_pattern = "market share of Ikea is decreasing in the furniture category"

# Step 1: Retrieve similar patterns
similar = retrieve_similar_patterns(user_input_pattern)

# Step 2: Generate checklist
checklist = generate_checklist(user_input_pattern, similar)
print("\nGenerated Checklist:")
for i, step in enumerate(checklist, 1):
    print(f"{i}. {step}")

# Step 3: Generate and run SQL for each checklist item
for step in checklist:
    sql = generate_pandas_code(step)
    print(f"\nChecklist Item: {step}")
    print(f"Pandas code:\n{sql}")
    


Generated Checklist:
1. **Final Checklist (5 points) for Decreasing IKEA Market Share in Furniture:**
2. 1. **Quantify the decline:** Analyze `metrics/monthly_metrics.csv` and `metrics/nth_week_monthly_change.csv` to determine the magnitude and duration of IKEA's market share decrease in the furniture category.  Calculate the overall percentage drop and identify periods of most significant decline.
3. 2. **Identify affected product areas:** Using `metrics/weekly_product_share.csv`, pinpoint specific sub-categories and products within the furniture category where IKEA's `pct_share` has decreased most dramatically.
4. 3. **Analyze competitor performance:** Investigate whether the overall furniture category sales are declining (`metrics/monthly_metrics.csv` and `metrics/weekly_metrics.csv`) or if IKEA's share loss is due to competitor gains.  If possible, compare IKEA's performance against key competitors.
5. 4. **Assess pricing and promotions:** Compare IKEA's discount and promotion str

In [11]:
user_input_pattern = "More profit in Canada than EU"

# Step 1: Retrieve similar patterns
similar = retrieve_similar_patterns(user_input_pattern)

# Step 2: Generate checklist
checklist = generate_checklist(user_input_pattern, similar)
print("\nGenerated Checklist:")
for i, step in enumerate(checklist, 1):
    print(f"{i}. {step}")

# Step 3: Generate and run SQL for each checklist item
for step in checklist:
    sql = generate_sql(step)
    print(f"\nChecklist Item: {step}")
    print(f"SQL:\n{sql}")
    cols, res = run_sql(sql)
    if cols:
        print("Results:", res[:5])  
    else:
        print("Error:", res)


Generated Checklist:
1. **Final Checklist: Investigating Higher Profit in Canada than EU**
2. 1. **Compare Overall Profitability:** Calculate total profit for Canada and EU using `monthly_metrics.csv` (sum `monthly_sales` for each region, adjusting for known costs).  This establishes the overall difference.
3. 2. **Analyze Monthly Sales Trends:** Using `monthly_metrics.csv`, compare month-over-month `pct_change_month` in sales for Canada and EU to identify any significant divergence in growth patterns.
4. 3. **Investigate Weekly Performance:**  Use `weekly_metrics.csv` to perform a similar analysis as step 2, looking for any weeks with significant divergences in `pct_change_week` between Canada and EU. This provides finer-grained insight.
5. 4. **Drill Down by Product Category:** Utilize `weekly_product_share.csv` to identify specific product categories or sub-categories driving the profit differential.  Compare `sales` and `pct_share` for Canada and EU across categories.
6. 5. **Exam

NameError: name 'generate_sql' is not defined