In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from brollm import BaseLLM
from typing import Dict, List, Any
import httpx
class OllamaChat(BaseLLM):
    """Chat client for an Ollama server, exposed through the ``BaseLLM`` interface.

    Messages are plain ``{"role": ..., "content": ...}`` dicts, which is the
    payload shape sent to the server's ``/api/chat`` endpoint by :meth:`run`.
    """

    def __init__(
            self,
            model_name: str = "qwen3:8b",
            temperature: float = 0,
            base_url: str = "http://localhost:11434"
    ):
        """Store the connection settings; no request is made here.

        Args:
            model_name: Model tag sent as ``"model"`` in every request.
            temperature: Sampling temperature sent under ``"options"``.
            base_url: Root URL of the Ollama server (a trailing slash is tolerated).
        """
        self.model_name = model_name
        self.temperature = temperature
        self.base_url = base_url

    def UserMessage(self, text: str, **kwargs) -> Dict[str, Any]:
        """Build a user-role message. Extra keyword arguments are accepted and ignored."""
        return {"role": "user", "content": text}

    def AIMessage(self, text: str) -> Dict[str, Any]:
        """Build an assistant-role message."""
        return {"role": "assistant", "content": text}

    def SystemMessage(self, text: str) -> Dict[str, Any]:
        """Build a system-role message."""
        return {"role": "system", "content": text}

    def run(self, system_prompt: str, messages: List[Dict[str, Any]]) -> str:
        """Send one non-streaming chat request and return the reply text.

        Args:
            system_prompt: Text sent as the first (system) message.
            messages: Conversation turns, as built by the ``*Message`` helpers.

        Returns:
            The ``content`` of the ``message`` object in the JSON response.

        Raises:
            httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
            httpx.HTTPError: For transport-level failures (connection, timeout).
        """
        # Unpacking accepts any sequence of messages, not only a list.
        all_messages = [self.SystemMessage(system_prompt), *messages]

        response = httpx.post(
            # Strip a trailing slash so "http://host:11434/" does not produce "//api/chat".
            f"{self.base_url.rstrip('/')}/api/chat",
            json={
                "model": self.model_name,
                "messages": all_messages,
                "stream": False,
                "options": {"temperature": self.temperature},
            },
            timeout=300,
        )
        # Fail with the real HTTP error instead of an opaque KeyError on the
        # missing 'message' field when the request was rejected.
        response.raise_for_status()

        return response.json()['message']['content']

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

def create_mock_analytics_data(n_customers=1000, n_products=200, n_purchases=5000, seed=42):
    """Generate three related mock DataFrames for analytics experiments.

    Args:
        n_customers: Number of rows in the customers table.
        n_products: Number of rows in the products table.
        n_purchases: Number of rows in the purchases table.
        seed: Value used to seed both NumPy's global generator and the
            ``random`` module. The default of 42 reproduces the data this
            function has always generated; pass another value for a different
            but still reproducible data set.

    Returns:
        A ``(customers, products, purchases)`` tuple of DataFrames. ``purchases``
        references the other two via ``customer_id`` / ``product_id`` and carries
        the product ``price`` plus a computed ``total_amount``.
    """
    # The legacy global seeding is kept (rather than a local Generator) so the
    # default call keeps producing exactly the same values as before.
    np.random.seed(seed)
    random.seed(seed)

    # Customer DataFrame
    customers = pd.DataFrame({
        'customer_id': range(1, n_customers + 1),
        'age': np.random.randint(18, 80, n_customers),
        'gender': np.random.choice(['M', 'F'], n_customers),
        'city': np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'], n_customers),
        'state': np.random.choice(['CA', 'TX', 'NY', 'FL', 'IL'], n_customers),
        'income': np.random.normal(65000, 25000, n_customers).astype(int),
        'signup_date': pd.date_range('2020-01-01', '2024-01-01', periods=n_customers),
        'customer_segment': np.random.choice(['Premium', 'Standard', 'Basic'], n_customers),
        'preferred_channel': np.random.choice(['Online', 'Store', 'Mobile'], n_customers),
        'loyalty_score': np.random.randint(1, 101, n_customers)
    })

    # Product DataFrame
    products = pd.DataFrame({
        'product_id': range(1, n_products + 1),
        'category': np.random.choice(['Electronics', 'Clothing', 'Home', 'Sports'], n_products),
        'brand': np.random.choice(['Apple', 'Samsung', 'Nike', 'Adidas', 'IKEA'], n_products),
        'price': np.round(np.random.exponential(50, n_products) + 10, 2),
        'cost': np.round(np.random.exponential(30, n_products) + 5, 2),
        'rating': np.round(np.random.uniform(1, 5, n_products), 2),
        'stock_quantity': np.random.randint(0, 1000, n_products)
    })

    # Purchase DataFrame: customer and product ids are sampled from the tables
    # above, so every purchase refers to an existing customer and product.
    purchases = pd.DataFrame({
        'purchase_id': range(1, n_purchases + 1),
        'customer_id': np.random.choice(customers['customer_id'], n_purchases),
        'product_id': np.random.choice(products['product_id'], n_purchases),
        'purchase_date': pd.date_range('2020-01-01', '2024-12-01', periods=n_purchases),
        'quantity': np.random.poisson(2, n_purchases) + 1,
        'channel': np.random.choice(['Online', 'Store', 'Mobile'], n_purchases),
        'discount_applied': np.round(np.random.uniform(0, 0.3, n_purchases), 2),
        'returned': np.random.choice([True, False], n_purchases, p=[0.08, 0.92])
    })

    # Bring in the unit price, then compute the discounted line total.
    purchases = purchases.merge(products[['product_id', 'price']], on='product_id')
    purchases['total_amount'] = np.round(purchases['quantity'] * purchases['price'] * (1 - purchases['discount_applied']), 2)

    return customers, products, purchases

In [4]:
# Build the three mock tables using the generator's default arguments.
customers_df, products_df, purchases_df = create_mock_analytics_data()

In [5]:
# Display the customers table (the notebook renders a truncated view).
customers_df

Unnamed: 0,customer_id,age,gender,city,state,income,signup_date,customer_segment,preferred_channel,loyalty_score
0,1,56,M,Phoenix,IL,43243,2020-01-01 00:00:00.000000000,Basic,Store,98
1,2,69,M,New York,FL,59657,2020-01-02 11:05:56.756756756,Basic,Mobile,77
2,3,46,F,Chicago,IL,79007,2020-01-03 22:11:53.513513513,Basic,Store,58
3,4,32,M,Los Angeles,TX,49609,2020-01-05 09:17:50.270270270,Basic,Online,78
4,5,60,M,Chicago,FL,82346,2020-01-06 20:23:47.027027027,Basic,Store,75
...,...,...,...,...,...,...,...,...,...,...
995,996,18,M,New York,TX,103389,2023-12-26 03:36:12.972972960,Standard,Store,44
996,997,35,F,Phoenix,TX,78468,2023-12-27 14:42:09.729729728,Premium,Store,64
997,998,49,F,Chicago,NY,59718,2023-12-29 01:48:06.486486480,Premium,Mobile,31
998,999,64,M,Chicago,CA,54000,2023-12-30 12:54:03.243243232,Premium,Online,47


In [6]:
# Display the products table (the notebook renders a truncated view).
products_df

Unnamed: 0,product_id,category,brand,price,cost,rating,stock_quantity
0,1,Sports,Apple,28.20,84.09,4.30,269
1,2,Clothing,Nike,19.25,7.04,3.59,291
2,3,Electronics,Nike,40.87,28.92,4.88,452
3,4,Clothing,Samsung,28.34,31.81,3.86,714
4,5,Electronics,Apple,11.79,36.73,4.70,23
...,...,...,...,...,...,...,...
195,196,Sports,Adidas,209.29,28.17,2.33,707
196,197,Home,Samsung,15.12,7.19,3.98,445
197,198,Clothing,Samsung,28.05,86.56,1.61,207
198,199,Home,IKEA,60.84,48.62,1.52,469


In [7]:
# Display the purchases table, including the merged `price` and computed `total_amount`.
purchases_df

Unnamed: 0,purchase_id,customer_id,product_id,purchase_date,quantity,channel,discount_applied,returned,price,total_amount
0,1,976,82,2020-01-01 00:00:00.000000000,4,Mobile,0.14,False,12.58,43.28
1,2,819,131,2020-01-01 08:37:21.088217643,3,Store,0.27,False,35.75,78.29
2,3,104,118,2020-01-01 17:14:42.176435287,5,Online,0.17,False,17.29,71.75
3,4,742,81,2020-01-02 01:52:03.264652930,2,Mobile,0.04,False,48.85,93.79
4,5,426,70,2020-01-02 10:29:24.352870574,4,Store,0.16,False,41.40,139.10
...,...,...,...,...,...,...,...,...,...,...
4995,4996,91,182,2024-11-29 13:30:35.647129408,1,Store,0.13,False,30.59,26.61
4996,4997,854,16,2024-11-29 22:07:56.735347072,3,Online,0.11,False,81.26,216.96
4997,4998,272,183,2024-11-30 06:45:17.823564704,3,Store,0.07,False,10.91,30.44
4998,4999,834,94,2024-11-30 15:22:38.911782336,2,Mobile,0.10,True,24.70,44.46


In [106]:
# Usage
# from broinsight.metadata.metadata_generator import DataFrameMetadataGenerator
# generator = DataFrameMetadataGenerator()
# generator.generate_metadata(customers_df, "customers_df", "Customer information with loyalty score")
# generator.generate_metadata(products_df, "products_df", "Product catalog with pricing and inventory")
# generator.generate_metadata(purchases_df, "purchases_df", "Customer purchases and return status")


In [4]:
from broinsight.metadata.metadata_loader import MetadataLoader

# Create a loader over "metadata" (presumably the folder of per-table YAML files — confirm).
loader = MetadataLoader("metadata")

# Prompt context covering all tables (no table list is passed).
full_context = loader.construct_prompt_context()

# Prompt context restricted to the named tables only.
selected_context = loader.construct_prompt_context(["customers_df", "products_df"])


In [108]:
# Print the summary prompt produced by the loader.
print(loader.get_summary_prompt())

Table: purchases_df - Description: **Table: Purchases and Returns**

This table records every customer order and whether the order was returned. It includes the date and time of the sale, the product bought, the quantity, the price, any discount applied, the total amount paid, the channel through which the purchase was made (online, store, mobile), and the customer who made the purchase.  

**Primary purpose / use cases**  
- **Sales performance** – view total revenue, average sale size, and top‑selling products.  
- **Channel analysis** – compare how many sales occur in each channel and how channel mix affects revenue.  
- **Return monitoring** – calculate return rates, identify high‑return products, and assess the impact of returns on cash flow.  
- **Discount effectiveness** – see how discounts influence the quantity sold and the final revenue.  
- **Revenue forecasting** – feed into budgeting and planning by linking past sales, discounts, and returns to future projections.  

**Key

In [109]:
# Print the context built for all tables.
print(full_context)

TABLE: purchases_df
DESCRIPTION: **Table: Purchases and Returns**

This table records every customer order and whether the order was returned. It includes the date and time of the sale, the product bought, the quantity, the price, any discount applied, the total amount paid, the channel through which the purchase was made (online, store, mobile), and the customer who made the purchase.  

**Primary purpose / use cases**  
- **Sales performance** – view total revenue, average sale size, and top‑selling products.  
- **Channel analysis** – compare how many sales occur in each channel and how channel mix affects revenue.  
- **Return monitoring** – calculate return rates, identify high‑return products, and assess the impact of returns on cash flow.  
- **Discount effectiveness** – see how discounts influence the quantity sold and the final revenue.  
- **Revenue forecasting** – feed into budgeting and planning by linking past sales, discounts, and returns to future projections.  

**Key r

In [110]:
# Print the context built for the selected tables only.
print(selected_context)

TABLE: customers_df
DESCRIPTION: 
This table holds the key details about each customer, including who they are, where they live, how much they earn, and how loyal they are to the brand. It is used by marketing, sales, and product teams to segment customers, target promotions, and measure loyalty. The data links directly to customer‑service interactions, purchase history, and campaign results, helping the company tailor offers, forecast revenue, and improve retention. Typical users are marketing analysts, loyalty program managers, and senior executives who need a clear view of customer profiles and engagement levels.
FIELDS:
  - age (INTEGER): Customer age in years (18-79)
  - city (VARCHAR): Customer residential city [Values: Chicago, Houston, Los Angeles, New York, Phoenix]
  - customer_id (INTEGER): Unique customer identifier
  - customer_segment (VARCHAR): Customer tier classification (Basic, Standard, Premium) [Values: Basic, Premium, Standard]
  - gender (VARCHAR): Customer gender

In [111]:
from broinsight.metadata.table_descriptor import TableDescriptor
from broprompt import Prompt

In [112]:
# Build the table describer from the markdown prompt template and a model instance.
# NOTE(review): OpenAI20b is defined in a later cell of this notebook, so on a
# fresh kernel that cell must be executed before this one.
td = TableDescriptor(
    Prompt.from_markdown("./broinsight/prompt_hub/table_descriptor.md").str, 
    OpenAI20b()
)

In [113]:
# Try the describer on one table: pass the string form of its metadata entry.
metadata = loader.get_metadata_dict()['customers_df']
print(td.run(f"{metadata}"))


The customers_df table stores the core profile information for every customer in the business.  
It includes who the customer is (ID, gender, age), where they live (city, state), how much they earn (annual income), how loyal they are (loyalty score), what tier they belong to (Basic, Standard, Premium), what channel they prefer for shopping, and when they first signed up.  

This data is used by marketing, sales, and product teams to:
- Segment the customer base for targeted campaigns and promotions
- Personalize offers and communications based on demographics, income, and loyalty
- Forecast revenue and assess the impact of marketing activities
- Measure and improve customer retention and satisfaction  

The table connects to other business processes such as customer‑service interactions, purchase history, and campaign analytics, allowing a holistic view of each customer’s journey.  

Typical users are marketing analysts, loyalty program managers, product managers, and senior executive

In [114]:
# Regenerate the description of every table and store it back on its metadata entry.
metadata_dict = loader.get_metadata_dict()
for table_name, detail in metadata_dict.items():
    # `detail` is the same object as metadata_dict[table_name], so writing to it
    # updates the entry in place.
    description = td.run(f"{detail}")
    detail['description'] = description

In [115]:
import yaml

# Write each table's metadata to metadata/<table_name>.yaml, overwriting any existing file.
for table_name, detail in metadata_dict.items():
    with open(f"metadata/{table_name}.yaml", "w") as yaml_file:
        # Record the table name inside the entry itself before serialising it.
        detail["table_name"] = table_name
        yaml.dump(detail, stream=yaml_file)

In [8]:
from brollm import BedrockChat, BaseLLM

class BaseOpenAI(BedrockChat):
    """``BedrockChat`` variant whose ``run`` calls ``converse`` with a fixed temperature."""

    def __init__(self, model_name):
        """Forward ``model_name`` to ``BedrockChat``."""
        super().__init__(model_name=model_name)

    def run(self, system_prompt, messages):
        """Send ``messages`` with ``system_prompt`` and return the reply text.

        The messages are passed through unchanged; only ``temperature`` (0.7)
        is set in the inference config.
        """
        client = self.get_model()
        request = {
            "modelId": self.model_name,
            "messages": messages,
            "system": self.SystemMessage(text=system_prompt),
            "inferenceConfig": {"temperature": 0.7},
        }
        reply = client.converse(**request)
        # The text is read from the LAST content block of the reply
        # (presumably earlier blocks can hold non-text content — confirm).
        content_blocks = reply['output']['message']['content']
        return content_blocks[-1]['text']

class OpenAI20b(BaseOpenAI):
    """``BaseOpenAI`` preconfigured with the ``openai.gpt-oss-20b-1:0`` model id."""

    def __init__(self):
        model_id = "openai.gpt-oss-20b-1:0"
        super().__init__(model_name=model_id)

In [11]:
from broinsight.metadata.metadata_db import DuckConnector
# Create the connector and register each mock DataFrame under its variable name.
connector = DuckConnector()
for frame_name, frame in (
    ("customers_df", customers_df),
    ("products_df", products_df),
    ("purchases_df", purchases_df),
):
    connector.register_dataframe(frame, frame_name)
# Attach the table metadata stored as YAML under metadata/.
connector.register_metadata_from_yaml("metadata/")

In [10]:
# connector.close()
# NOTE: intentionally disabled. In a top-to-bottom run this cell sits directly
# after the connector is created and before the cells that use it, so closing
# here would presumably break every later cell that touches `connector`.
# Call connector.close() manually once you are finished with the notebook.

In [12]:
# Print the prompt context as built by the connector (no table list passed).
print(connector.construct_prompt_context())

TABLE: purchases_df
DESCRIPTION: **Table: Purchases and Returns**

This table logs every customer order, including the product bought, quantity, price, any discount, the final amount paid, the sales channel (online, store, or mobile), the customer who made the purchase, and whether the order was returned.

**Why it matters**

- **Track sales performance** – see total revenue, average sale size, and which products sell best.  
- **Compare channels** – understand how online, in‑store, and mobile sales differ and how each channel contributes to revenue.  
- **Monitor returns** – calculate return rates, spot high‑return products, and assess the cash‑flow impact of returns.  
- **Evaluate discounts** – measure how discounts affect quantity sold and net revenue.  
- **Support forecasting** – feed past sales, discounts, and return data into budgeting and revenue projections.

**Key connections**

- *Customer* – links each transaction to a customer profile for segmentation.  
- *Product* – tie

In [42]:
from broinsight import BroInsight
from broflow import state

# Set the "debug" entry of broflow's shared state to False.
state.set("debug", False)
# Bedrock-backed model defined in this notebook; the Ollama client below is an alternative.
model = OpenAI20b()
# model = OllamaChat(model_name="gpt-oss:20b")

# The same connector object is passed as both the metadata source and the database handle.
agent = BroInsight(model, metadata_loader=connector, db=connector)


In [31]:
# Ask one question; the returned object is kept in `result` for later inspection.
result = agent.ask("Who is the most frequent buyer?")

Welcome to BroInsight!
Thank you for using BroInsight!
AI: The most frequent buyer in the data you provided is:

| customer_id | purchase_count |
|-------------|----------------|
| **464**     | **15**         |

Since there’s only one record, customer 464 is the top buyer by purchase count. If you have more data and want to compare multiple customers, just let me know!


In [43]:
# Start the agent's chat mode. Note: this rebinds `result`, replacing the value
# returned by agent.ask() above.
result = agent.chat()

Welcome to BroInsight!
AI: **Here’s a quick snapshot of the data we have**

| Table | What it holds | Key fields you can use |
|-------|---------------|------------------------|
| **purchases_df** | Every order (and return) – product, price, discount, channel, customer, date, quantity, total amount | `channel`, `customer_id`, `product_id`, `purchase_date`, `quantity`, `discount_applied`, `total_amount`, `returned` |
| **customers_df** | The master customer profile – demographics, loyalty, signup info, preferred channel | `customer_id`, `age`, `gender`, `city`, `state`, `income`, `loyalty_score`, `customer_segment`, `preferred_channel`, `signup_date` |
| **products_df** | The product catalogue – brand, category, cost, price, stock, rating | `product_id`, `brand`, `category`, `price`, `cost`, `rating`, `stock_quantity` |

All three tables link through the `customer_id` and `product_id` fields, so you can combine them to explore sales performance, customer behavior, and product profitabil

In [27]:
# Run the SQL stored on `result` directly against the connector.
# NOTE(review): `result` is assigned by both agent.ask() and agent.chat() above;
# in a top-to-bottom run this uses the chat() result — confirm that is intended.
connector.execute_query(result.sql_query)

Unnamed: 0,customer_id,purchase_count,product_id,brand,category,product_purchase_count
0,464,15,131,IKEA,Electronics,3


In [80]:
# Smoke-test the model wrapper directly with a single user turn.
model.run("you're a helpful assistant", messages=[
    model.UserMessage(text="Hi")
])

'Hello! How can I help you today?'

In [99]:
# Inspect the message dict this model class builds for a user turn.
model.UserMessage(text="Hi")

{'role': 'user', 'content': [{'text': 'Hi'}]}

In [None]:
import duckdb

with duckdb.connect("metadata")