In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

def create_mock_analytics_data(n_customers=1000, n_products=200, n_purchases=5000, seed=42):
    """Build three related mock DataFrames (customers, products, purchases).

    Every attribute is sampled independently, so combinations are not
    guaranteed to be realistic: a customer's ``city`` and ``state`` need not
    match, a product's ``cost`` may exceed its ``price``, and ``income`` is an
    unclipped normal draw that can occasionally be negative.

    Parameters
    ----------
    n_customers : int, default 1000
        Number of customer rows; ``customer_id`` runs from 1 to ``n_customers``.
    n_products : int, default 200
        Number of product rows; ``product_id`` runs from 1 to ``n_products``.
    n_purchases : int, default 5000
        Number of purchase rows; ``purchase_id`` runs from 1 to ``n_purchases``.
    seed : int or None, default 42
        Seed applied to NumPy's global RNG and to the stdlib ``random`` module.
        The default reproduces the data this function has always generated.
        ``None`` reseeds both generators non-deterministically.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(customers, products, purchases)``. ``purchases`` references the
        other two frames through ``customer_id`` / ``product_id`` and carries
        the product ``price`` plus a derived ``total_amount`` column.

    Notes
    -----
    Calling this function reseeds the *global* NumPy and ``random`` generators
    as a side effect.
    """
    # Seed both global generators so repeated calls with the same seed agree.
    np.random.seed(seed)
    random.seed(seed)

    # Customer DataFrame
    customers = pd.DataFrame({
        'customer_id': range(1, n_customers + 1),
        'age': np.random.randint(18, 80, n_customers),  # upper bound exclusive: 18-79
        'gender': np.random.choice(['M', 'F'], n_customers),
        'city': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'], n_customers),
        'state': np.random.choice(['CA', 'TX', 'NY', 'FL', 'IL'], n_customers),
        'income': np.random.normal(65000, 25000, n_customers).astype(int),
        # Evenly spaced timestamps across the signup window, not random dates.
        'signup_date': pd.date_range('2020-01-01', '2024-01-01', periods=n_customers),
        'customer_segment': np.random.choice(['Premium', 'Standard', 'Basic'], n_customers),
        'preferred_channel': np.random.choice(['Online', 'Store', 'Mobile'], n_customers),
        'loyalty_score': np.random.randint(1, 101, n_customers)  # 1-100 inclusive
    })

    # Product DataFrame
    products = pd.DataFrame({
        'product_id': range(1, n_products + 1),
        'category': np.random.choice(['Electronics', 'Clothing', 'Home', 'Sports'], n_products),
        'brand': np.random.choice(['Apple', 'Samsung', 'Nike', 'Adidas', 'IKEA'], n_products),
        # Exponential draws shifted by a floor: price >= 10, cost >= 5.
        'price': np.round(np.random.exponential(50, n_products) + 10, 2),
        'cost': np.round(np.random.exponential(30, n_products) + 5, 2),
        'rating': np.round(np.random.uniform(1, 5, n_products), 2),
        'stock_quantity': np.random.randint(0, 1000, n_products)
    })

    # Purchase DataFrame
    purchases = pd.DataFrame({
        'purchase_id': range(1, n_purchases + 1),
        'customer_id': np.random.choice(customers['customer_id'], n_purchases),
        'product_id': np.random.choice(products['product_id'], n_purchases),
        # Evenly spaced timestamps across the purchase window.
        'purchase_date': pd.date_range('2020-01-01', '2024-12-01', periods=n_purchases),
        'quantity': np.random.poisson(2, n_purchases) + 1,  # always at least 1
        'channel': np.random.choice(['Online', 'Store', 'Mobile'], n_purchases),
        'discount_applied': np.round(np.random.uniform(0, 0.3, n_purchases), 2),
        'returned': np.random.choice([True, False], n_purchases, p=[0.08, 0.92])
    })

    # Attach the unit price. A left join keeps every purchase row in its
    # original order, and `validate` fails loudly if product_id were ever
    # duplicated instead of silently multiplying purchase rows.
    purchases = purchases.merge(
        products[['product_id', 'price']],
        on='product_id',
        how='left',
        validate='many_to_one',
    )
    # Line total after the fractional discount.
    purchases['total_amount'] = np.round(
        purchases['quantity'] * purchases['price'] * (1 - purchases['discount_applied']), 2
    )

    return customers, products, purchases

In [3]:
# Generate the three mock tables with the generator's default sizes; the
# generator seeds its RNGs, so re-running this cell yields the same frames.
customers_df, products_df, purchases_df = create_mock_analytics_data()

In [4]:
# Display the customers table (one row per customer_id).
customers_df

Unnamed: 0,customer_id,age,gender,city,state,income,signup_date,customer_segment,preferred_channel,loyalty_score
0,1,56,M,Phoenix,IL,43243,2020-01-01 00:00:00.000000000,Basic,Store,98
1,2,69,M,New York,FL,59657,2020-01-02 11:05:56.756756756,Basic,Mobile,77
2,3,46,F,Chicago,IL,79007,2020-01-03 22:11:53.513513513,Basic,Store,58
3,4,32,M,Los Angeles,TX,49609,2020-01-05 09:17:50.270270270,Basic,Online,78
4,5,60,M,Chicago,FL,82346,2020-01-06 20:23:47.027027027,Basic,Store,75
...,...,...,...,...,...,...,...,...,...,...
995,996,18,M,New York,TX,103389,2023-12-26 03:36:12.972972960,Standard,Store,44
996,997,35,F,Phoenix,TX,78468,2023-12-27 14:42:09.729729728,Premium,Store,64
997,998,49,F,Chicago,NY,59718,2023-12-29 01:48:06.486486480,Premium,Mobile,31
998,999,64,M,Chicago,CA,54000,2023-12-30 12:54:03.243243232,Premium,Online,47


In [5]:
# Display the product catalog (one row per product_id).
products_df

Unnamed: 0,product_id,category,brand,price,cost,rating,stock_quantity
0,1,Sports,Apple,28.20,84.09,4.30,269
1,2,Clothing,Nike,19.25,7.04,3.59,291
2,3,Electronics,Nike,40.87,28.92,4.88,452
3,4,Clothing,Samsung,28.34,31.81,3.86,714
4,5,Electronics,Apple,11.79,36.73,4.70,23
...,...,...,...,...,...,...,...
195,196,Sports,Adidas,209.29,28.17,2.33,707
196,197,Home,Samsung,15.12,7.19,3.98,445
197,198,Clothing,Samsung,28.05,86.56,1.61,207
198,199,Home,IKEA,60.84,48.62,1.52,469


In [6]:
# Display the purchases table, including the merged-in price and the derived total_amount.
purchases_df

Unnamed: 0,purchase_id,customer_id,product_id,purchase_date,quantity,channel,discount_applied,returned,price,total_amount
0,1,976,82,2020-01-01 00:00:00.000000000,4,Mobile,0.14,False,12.58,43.28
1,2,819,131,2020-01-01 08:37:21.088217643,3,Store,0.27,False,35.75,78.29
2,3,104,118,2020-01-01 17:14:42.176435287,5,Online,0.17,False,17.29,71.75
3,4,742,81,2020-01-02 01:52:03.264652930,2,Mobile,0.04,False,48.85,93.79
4,5,426,70,2020-01-02 10:29:24.352870574,4,Store,0.16,False,41.40,139.10
...,...,...,...,...,...,...,...,...,...,...
4995,4996,91,182,2024-11-29 13:30:35.647129408,1,Store,0.13,False,30.59,26.61
4996,4997,854,16,2024-11-29 22:07:56.735347072,3,Online,0.11,False,81.26,216.96
4997,4998,272,183,2024-11-30 06:45:17.823564704,3,Store,0.07,False,10.91,30.44
4998,4999,834,94,2024-11-30 15:22:38.911782336,2,Mobile,0.10,True,24.70,44.46


In [7]:
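# One-time setup, kept commented out: presumably these calls generated the
# metadata that MetadataLoader("metadata") reads in the next cell (the table
# descriptions match its output). Uncomment to regenerate.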
# from broinsight.metadata.metadata_generator import DataFrameMetadataGenerator
# generator = DataFrameMetadataGenerator()
# generator.generate_metadata(customers_df, "customers_df", "Customer information with loyalty score")
# generator.generate_metadata(products_df, "products_df", "Product catalog with pricing and inventory")
# generator.generate_metadata(purchases_df, "purchases_df", "Customer purchases and return status")


In [8]:
from broinsight.metadata.metadata_loader import MetadataLoader

# Create a loader over the "metadata" location (presumably a directory of per-table metadata files — TODO confirm).
loader = MetadataLoader("metadata")

# Build the prompt context with no table filter (all tables).
full_context = loader.construct_prompt_context()

# Build the prompt context restricted to the listed tables.
selected_context = loader.construct_prompt_context(["customers_df", "products_df"])


In [9]:
# Print the loader's summary prompt: one "Table ... - Description ..." line per table.
print(loader.get_summary_prompt())

Table: customers_df - Description: Customer information with loyalty score
Table: products_df - Description: Product catalog with pricing and inventory
Table: purchases_df - Description: Customer purchases and return status


In [10]:
# Print the prompt context that was built without a table filter.
print(full_context)

TABLE: customers_df
DESCRIPTION: Customer information with loyalty score
FIELDS:
  - age (INTEGER): Customer age in years (18-79)
  - city (VARCHAR): Customer residential city [Values: Chicago, Houston, Los Angeles, New York, Phoenix]
  - customer_id (INTEGER): Unique customer identifier
  - customer_segment (VARCHAR): Customer tier classification (Basic, Standard, Premium) [Values: Basic, Premium, Standard]
  - gender (VARCHAR): Customer gender (M/F) [Values: F, M]
  - income (INTEGER): Annual household income in USD
  - loyalty_score (INTEGER): Customer loyalty rating (1-100 scale)
  - preferred_channel (VARCHAR): Customer preferred shopping channel (Online, Store, Mobile) [Values: Mobile, Online, Store]
  - signup_date (TIMESTAMP): Date when customer registered account
  - state (VARCHAR): Customer residential state (US state code) [Values: CA, FL, IL, NY, TX]

TABLE: products_df
DESCRIPTION: Product catalog with pricing and inventory
FIELDS:
  - brand (VARCHAR): Product brand name 

In [11]:
# Print the prompt context that was built for customers_df and products_df only.
print(selected_context)

TABLE: customers_df
DESCRIPTION: Customer information with loyalty score
FIELDS:
  - age (INTEGER): Customer age in years (18-79)
  - city (VARCHAR): Customer residential city [Values: Chicago, Houston, Los Angeles, New York, Phoenix]
  - customer_id (INTEGER): Unique customer identifier
  - customer_segment (VARCHAR): Customer tier classification (Basic, Standard, Premium) [Values: Basic, Premium, Standard]
  - gender (VARCHAR): Customer gender (M/F) [Values: F, M]
  - income (INTEGER): Annual household income in USD
  - loyalty_score (INTEGER): Customer loyalty rating (1-100 scale)
  - preferred_channel (VARCHAR): Customer preferred shopping channel (Online, Store, Mobile) [Values: Mobile, Online, Store]
  - signup_date (TIMESTAMP): Date when customer registered account
  - state (VARCHAR): Customer residential state (US state code) [Values: CA, FL, IL, NY, TX]

TABLE: products_df
DESCRIPTION: Product catalog with pricing and inventory
FIELDS:
  - brand (VARCHAR): Product brand name 

In [12]:
from brollm import BedrockChat

class BaseOpenAI(BedrockChat):
    """BedrockChat subclass that sends requests through a ``converse`` call.

    The client object comes from the inherited ``get_model()`` and the system
    prompt is wrapped with the inherited ``SystemMessage`` helper; both are
    defined by ``BedrockChat`` and are not visible here.
    """

    def __init__(self, model_name):
        """Forward ``model_name`` to ``BedrockChat`` as a keyword argument."""
        super().__init__(model_name=model_name)

    def run(self, system_prompt, messages):
        """Send one ``converse`` request and return the reply text.

        Parameters
        ----------
        system_prompt : str
            System instruction, passed through ``self.SystemMessage(text=...)``.
        messages : list
            Conversation messages, forwarded unchanged as ``messages``.

        Returns
        -------
        str
            The ``text`` of the last content block that carries text.

        Raises
        ------
        KeyError
            If the response lacks ``output``/``message``/``content``, or no
            content block carries ``text``.
        IndexError
            If the content list is empty.
        """
        model = self.get_model()
        response = model.converse(
            modelId=self.model_name,
            messages=messages,
            system=self.SystemMessage(text=system_prompt),
            inferenceConfig={
                # "maxTokens": 150,
                "temperature": 0.7,
                # "topP": 0.9
            },
        )
        content_blocks = response['output']['message']['content']
        # The reply can contain non-text blocks (e.g. reasoning or tool-use
        # blocks), so take the last block that actually has text rather than
        # assuming the final block is the text one.
        for block in reversed(content_blocks):
            if 'text' in block:
                return block['text']
        # No text block found: use the original lookup so callers still see
        # the same IndexError (empty content) or KeyError (no 'text').
        return content_blocks[-1]['text']

class OpenAI20b(BaseOpenAI):
    """BaseOpenAI preconfigured with the ``openai.gpt-oss-20b-1:0`` model id."""

    def __init__(self):
        """Initialise the parent with the fixed model id; takes no arguments."""
        model_id = "openai.gpt-oss-20b-1:0"
        super().__init__(model_name=model_id)

In [15]:
import duckdb

from broflow import state
from broinsight.actions import Shared
from broinsight.flow import get_flow
from broinsight.metadata.metadata_loader import MetadataLoader

# Switch the broflow "debug" state flag off before the flow is built.
state.set("debug", False)

# Build the flow around the model wrapper and write its Mermaid output to flow.md.
flow = get_flow(model=OpenAI20b())
flow.save_mermaid("flow.md")

loader = MetadataLoader("metadata")

# Register each pandas frame with DuckDB under the table name used in the metadata.
conn = duckdb.connect()
for table_name, frame in (
    ("customers_df", customers_df),
    ("products_df", products_df),
    ("purchases_df", purchases_df),
):
    conn.register(table_name, frame)

# Bundle the connection and the metadata loader into the object the flow runs on.
shared = Shared(db=conn, metadata_loader=loader)

flow.run(shared)

Welcome to BroInsight!
AI: Below is a quick snapshot of the **top‑5 products by quantity sold** for each year that appears in your data set.  
(If a year had fewer than five products, we simply list all that were available.)

| Year | Rank | Product ID | Total Quantity | Total Sales |
|------|------|------------|----------------|-------------|
| **2024** | 1 | 190 | 33.0 | $481.27 |
| | 2 | 36 | 31.0 | $1,213.57 |
| | 3 | 7 | 29.0 | $1,237.93 |
| | 4 | 133 | 28.0 | $1,411.93 |
| | 5 | 95 | 27.0 | $963.54 |
| **2023** | 1 | 32 | 42.0 | $1,023.34 |
| | 2 | 29 | 33.0 | $3,287.95 |
| | 3 | 176 | 33.0 | $568.05 |
| | 4 | 52 | 33.0 | $275.09 |
| | 5 | 143 | 31.0 | $771.41 |
| **2022** | 1 | 15 | 35.0 | $1,180.25 |
| | 2 | 122 | 33.0 | $2,399.11 |
| | 3 | 172 | 33.0 | $1,405.49 |
| | 4 | 96 | 33.0 | $1,088.41 |
| | 5 | 186 | 32.0 | $1,541.39 |
| **2021** | 1 | 72 | 49.0 | $703.57 |
| | 2 | 38 | 43.0 | $756.55 |
| | 3 | 167 | 42.0 | $2,235.71 |
| | 4 | 27 | 36.0 | $3,080.40 |
| | 5 | 39 | 32.0

'default'

In [16]:
# Re-run the SQL held in shared.sql_query (presumably set by the flow run) on the DuckDB connection and show it as a DataFrame.
conn.execute(shared.sql_query).df()

Unnamed: 0,yr,product_id,brand,category,total_quantity,total_sales
0,2020,48,Nike,Sports,39.0,815.09
1,2021,72,Apple,Electronics,49.0,703.57
2,2022,15,Nike,Clothing,35.0,1180.25
3,2023,32,Samsung,Home,42.0,1023.34
4,2024,190,Nike,Clothing,33.0,481.27


In [65]:
# Print the SQL text currently held in shared.sql_query.
print(shared.sql_query)


WITH buyer_counts AS (
    SELECT
        customer_id,
        COUNT(*) AS purchase_count
    FROM purchases_df
    GROUP BY customer_id
),
top_buyer AS (
    SELECT
        customer_id,
        purchase_count
    FROM buyer_counts
    ORDER BY purchase_count DESC
    LIMIT 1
),
category_counts AS (
    SELECT
        p.customer_id,
        prod.category,
        SUM(p.quantity) AS total_quantity
    FROM purchases_df p
    JOIN products_df prod ON p.product_id = prod.product_id
    WHERE p.customer_id = (SELECT customer_id FROM top_buyer)
    GROUP BY p.customer_id, prod.category
)
SELECT
    tb.customer_id,
    tb.purchase_count,
    cc.category,
    cc.total_quantity
FROM top_buyer tb
CROSS JOIN (
    SELECT
        category,
        total_quantity
    FROM category_counts
    ORDER BY total_quantity DESC
    LIMIT 1
) cc;



In [64]:
# Execute the SQL held in shared.sql_query and show the result as a DataFrame.
# NOTE(review): the recorded execution counts around here (65, 64, 66) are out of
# order — confirm this output still matches after Restart & Run All.
conn.execute(shared.sql_query).df()

Unnamed: 0,customer_id,purchase_count,category,total_quantity
0,464,15,Clothing,16.0


In [66]:
# Show the table names held in shared.selected_metadata (presumably the tables the flow picked as context — verify).
shared.selected_metadata

['purchases_df', 'products_df', 'customers_df']