In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Test Metadata

In [24]:
import seaborn as sns
import duckdb

# Load the seaborn "tips" sample frame, expose it to an in-memory DuckDB
# connection under the table name "tips", and display the full table.
tips_df = sns.load_dataset("tips")

conn = duckdb.connect()
conn.register("tips", tips_df)

conn.execute("SELECT * FROM tips;").df()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [None]:
from broinsight.utils.data_spec import Metadata, TableSpec, FieldSpec, create_field_specs_from_profile, FieldDescription, FieldDescriptions
from broinsight.data_quality.sql_profile import sql_field_profile, sql_table_profile

# Profile the registered "tips" table at table level and at field level.
# `field_prof` is reused later in the notebook for the data-quality assessment.
tbl_prof = sql_table_profile(conn, "tips")
field_prof = sql_field_profile(conn, "tips")

# Build the spec objects from the raw profiles: the table profile is unpacked
# as keyword arguments, the field profile goes through the helper.
table_spec = TableSpec(**tbl_prof)
field_specs = create_field_specs_from_profile(field_prof)

# Assemble the Metadata object for the table.
metadata = Metadata(
    table_name="tips", # must match the name the data is registered under
    table_description="Restaurant tips dataset", # should be a human-readable summary
    table_spec=table_spec, # mandatory
    field_spec=field_specs # mandatory
)

In [None]:
# Field descriptions matter: they are later converted into LLM context by
# DataCatalog and passed on to the BroInsight methods, so they must describe
# the columns accurately and quote categorical values exactly as stored.

descriptions = dict(
    total_bill="the amount of paid bill of the meal",
    tip="the amount of tip that customers paid",
    sex="the gender of customers",
    smoker="it indicates that a customer is a smoker or not. if No means a customer is a non-smoker, Yes means a customer is a smoker.",
    # The data stores abbreviated day names (e.g. "Thur", not "Thu"); list the
    # stored values so generated SQL filters match the data.
    day="the day of the week when the customer had the meal. Values are abbreviated as stored in the data: Thur, Fri, Sat, Sun",
    time="the time of the meal. it can be either Dinner or Lunch",
    # `size` is the party size (number of people at the table), not a dish count.
    size="the number of people in the dining party",
)

# One FieldDescription per column, in the order declared above.
field_descriptions = [
    FieldDescription(field_name=k, description=v) for k, v in descriptions.items()
]

# Attach the descriptions to the metadata built earlier in the notebook.
metadata.add_field_descriptions(field_descriptions=FieldDescriptions(descriptions=field_descriptions))

In [6]:
from broinsight.utils.data_spec import create_data_quality_assessment

# Build a data-quality assessment from the field-level profile computed
# earlier (`field_prof`) and attach it to the metadata object.
quality_assessment = create_data_quality_assessment(field_prof)
metadata.add_data_quality_assessment(quality_assessment)

In [7]:
# Display the assembled Metadata object (its repr) for a visual check.
metadata

Metadata(table_name='tips', table_description='Restaurant tips dataset', table_spec=TableSpec(rows=244, columns=7, duplicates=1, evidences={0: {'total_bill': 13.0, 'tip': 2.0, 'sex': 'Female', 'smoker': 'Yes', 'day': 'Thur', 'time': 'Lunch', 'size': 2, 'dup_count': 2}}), field_spec=[FieldSpec(field_name='total_bill', data_type='float', missing_values=0, missing_values_pct=0.0, unique_values=229, unique_values_pct=0.94, most_frequent={13.42: 3, 7.25: 2, 20.29: 2, 10.34: 2, 15.69: 2}, statistics={'min': 3.07, 'max': 50.81, 'mean': 19.79, 'median': 17.8, 'std': 8.9, 'var': 79.25, 'skew': 1.13, 'kurt': 1.22, 'iqr': 10.78, 'cv': 0.45, 'lower_bound': -2.82, 'upper_bound': 40.3}, description='the amount of paid bill of the meal'), FieldSpec(field_name='tip', data_type='float', missing_values=0, missing_values_pct=0.0, unique_values=123, unique_values_pct=0.5, most_frequent={2.0: 33, 3.0: 23, 4.0: 12, 5.0: 10, 2.5: 10}, statistics={'min': 1.0, 'max': 10.0, 'mean': 3.0, 'median': 2.9, 'std': 1.

In [25]:
# List the attribute names stored on the Metadata instance.
vars(metadata).keys()

dict_keys(['table_name', 'table_description', 'table_spec', 'field_spec', 'field_descriptions', 'data_quality'])

In [26]:
# Table name as passed to Metadata(...).
metadata.table_name

'tips'

In [27]:
# Human-readable table summary as passed to Metadata(...).
metadata.table_description

'Restaurant tips dataset'

In [29]:
# List the attribute names stored on the table-level spec.
vars(metadata.table_spec).keys()

dict_keys(['rows', 'columns', 'duplicates', 'evidences'])

In [31]:
# Display the per-field specs held by the metadata object.
metadata.field_spec

[FieldSpec(field_name='total_bill', data_type='float', missing_values=0, missing_values_pct=0.0, unique_values=229, unique_values_pct=0.94, most_frequent={13.42: 3, 7.25: 2, 20.29: 2, 10.34: 2, 15.69: 2}, statistics={'min': 3.07, 'max': 50.81, 'mean': 19.79, 'median': 17.8, 'std': 8.9, 'var': 79.25, 'skew': 1.13, 'kurt': 1.22, 'iqr': 10.78, 'cv': 0.45, 'lower_bound': -2.82, 'upper_bound': 40.3}, description='the amount of paid bill of the meal'),
 FieldSpec(field_name='tip', data_type='float', missing_values=0, missing_values_pct=0.0, unique_values=123, unique_values_pct=0.5, most_frequent={2.0: 33, 3.0: 23, 4.0: 12, 5.0: 10, 2.5: 10}, statistics={'min': 1.0, 'max': 10.0, 'mean': 3.0, 'median': 2.9, 'std': 1.38, 'var': 1.91, 'skew': 1.47, 'kurt': 3.65, 'iqr': 1.56, 'cv': 0.46, 'lower_bound': -0.34, 'upper_bound': 5.91}, description='the amount of tip that customers paid'),
 FieldSpec(field_name='sex', data_type='string', missing_values=0, missing_values_pct=0.0, unique_values=2, unique

In [32]:
# List the attribute names stored on the first field spec.
vars(metadata.field_spec[0]).keys()

dict_keys(['field_name', 'data_type', 'missing_values', 'missing_values_pct', 'unique_values', 'unique_values_pct', 'most_frequent', 'statistics', 'description'])

# Test with AWS Bedrock

## One pandas table

In [8]:
from broinsight.experiment.bedrock import AWSConfig, BedrockOpenAI
from broinsight.experiment.ollama import LocalOpenAI
from broinsight.broinsight import BroInsight
import boto3

from broinsight.utils.data_catalog import DataCatalog
import seaborn as sns

# Column name -> description pairs for the tips table, wrapped into the
# FieldDescriptions container expected by the catalog.
field_descriptions = FieldDescriptions(descriptions=[
    FieldDescription(field_name=column, description=meaning)
    for column, meaning in (
        ("total_bill", "Total bill amount in dollars"),
        ("tip", "Tip amount in dollars"),
        ("sex", "Customer gender"),
        ("smoker", "Whether customer smokes"),
        ("day", "Day of the week"),
        ("time", "Meal time (Lunch/Dinner)"),
        ("size", "Party size"),
    )
])

# Fresh catalog holding a single pandas-backed table named "tips".
catalog = DataCatalog()
catalog.register("tips", sns.load_dataset("tips"), "Restaurant tips dataset")

# Profile first, then attach the human-written descriptions.
catalog.profile_tables(["tips"])
catalog.add_field_descriptions("tips", field_descriptions)

# Resolve AWS credentials through boto3's default session.
session = boto3.Session()
credentials = session.get_credentials()
if credentials is None:
    # get_credentials() returns None when nothing is configured; fail with an
    # actionable message instead of an AttributeError on `.access_key`.
    raise RuntimeError(
        "No AWS credentials found. Configure them (environment variables, "
        "shared credentials file, or SSO) before running this cell."
    )

# Take one consistent snapshot of key/secret/token. Reading the attributes
# one by one on refreshable credentials can mix values from two refreshes.
frozen_credentials = credentials.get_frozen_credentials()

aws_configs = AWSConfig(
    aws_access_key_id=frozen_credentials.access_key,
    aws_secret_access_key=frozen_credentials.secret_key,
    # Long-lived keys have no session token; pass None in that case.
    aws_session_token=frozen_credentials.token or None,
    region_name='us-west-2'
)

# Model served through Bedrock, built with the AWS config above.
model_id="openai.gpt-oss-20b-1:0"
bedrock = BedrockOpenAI(
    model_id=model_id,
    aws_configs=aws_configs
)

# Alternative local backend, kept for reference:
# broinsight = BroInsight(model=LocalOpenAI())
broinsight = BroInsight(model=bedrock)

In [None]:
# Ask the model for a data-quality verdict, using the catalog's
# data-quality profile of "tips" as context.
response = broinsight.assess_data_quality(
    message="Do we have any concern about this one?",
    context=catalog.to_dq_profile("tips"),
)
print(response["content"])

**Data Quality Assessment:** READY  
**Overall Status:** The dataset is in good shape—no missing values, only one duplicate row, and the overall quality rating is “GOOD.”  

**Minor Issues Identified:**  
- **total_bill, tip, size** – each shows moderate right‑skew (skew ≈ 1.1–1.5).  

**Impact on Analysis:**  
- For descriptive statistics and non‑parametric tests, skewness is usually fine.  
- If you plan to use linear regression, ANOVA, or other parametric models that assume normality of residuals, the skewness could affect model fit and inference.  
- Skewed variables can also influence clustering or distance‑based methods.

**Recommended Actions (if needed):**  
1. **Transform Skewed Variables** – log, square‑root, or Box‑Cox transformations can reduce skewness.  
2. **Robust Modeling** – use models that are less sensitive to non‑normality (e.g., tree‑based methods, generalized linear models with appropriate link functions).  
3. **Diagnostics** – after transformation or modeling, 

In [None]:
# Ask for question suggestions grounded in one catalog table; the context is
# the catalog's guide metadata for "tips".
response = broinsight.suggest_questions(
    message="I'm a new manager and I wanna make a promotion for the shop but I don't know how to start.",
    context=catalog.to_guide_metadata("tips"),
)
print(response["content"])

Based on your role and goals, here are some areas you might want to explore:

Customer Spending Patterns  
- What is the average total_bill for each day?  
- Which day has the highest average tip?  
- How does the average total_bill differ between Lunch and Dinner?

Party Size Insights  
- What is the average party size for each day?  
- Which party size group spends the most on average?  
- How does tip amount vary with party size?

Customer Demographics  
- What is the average tip for Male vs Female customers?  
- Do smokers tip more or less than non-smokers on average?  
- Which gender has the highest average total_bill?

Promotion Timing  
- Which day/time combination has the lowest average total_bill?  
- How many customers dine during Lunch vs Dinner?  
- What is the average tip amount for each time of day?  

Just ask me any of these questions and I'll analyze your data to get the answers!


In [None]:
# Turn a natural-language question into SQL, using the catalog's SQL metadata
# for "tips" as context. `question` and `response` are reused by later cells.
question = "What is the average total_bill for size = 1 compared to size = 4 or more?"
response = broinsight.generate_sql(
    message=question,
    context=catalog.to_sql_metadata("tips"),
)
print(response["content"])

```sql
SELECT
    AVG(CASE WHEN size = 1 THEN total_bill END)   AS avg_total_bill_size1,
    AVG(CASE WHEN size >= 4 THEN total_bill END) AS avg_total_bill_size4plus
FROM tips;
```


In [None]:
# Parse the SQL out of the model reply and run it through the catalog.
# The reply is expected to wrap the query in a ```sql fence, but the fence tag
# can vary in case or be missing, so match leniently:
#   1. prefer blocks tagged sql (case-insensitive),
#   2. otherwise accept any fenced block,
#   3. otherwise fall back to the raw reply.
import re

reply = response['content']
sql_blocks = re.findall(r"```sql\b(.*?)```", reply, flags=re.DOTALL | re.IGNORECASE)
fenced_blocks = sql_blocks or re.findall(r"```(.*?)```", reply, flags=re.DOTALL)
# As before, the last matching block wins when the reply contains several.
sql_query = (fenced_blocks[-1] if fenced_blocks else reply).strip()
if not sql_query:
    raise ValueError("The model reply did not contain a SQL query.")

result = catalog.query(sql_query)
result

Unnamed: 0,avg_total_bill_size1,avg_total_bill_size4plus
0,7.2425,29.312174


In [None]:
# ask_data answers the question from the query result alone: the result frame
# is rendered to text and supplied as the context.
response = broinsight.ask_data(
    message=question,
    context=result.to_string(),
)
print(response["content"])

For a single‑person table the average total bill is **about $7.24**.  
When the party is four people or larger, the average climbs to **about $29.31**.

That’s roughly a four‑fold jump in spend when you move from a solo diner to a group of four or more. It suggests that larger parties are a much higher‑value segment for the restaurant—each additional guest brings a significant bump in revenue.  

**What to do next?**  
- Consider targeted promotions or upsell strategies for larger groups (e.g., group menus, special offers).  
- Look at how seating capacity or table layout might accommodate more 4+ parties.  
- Check if the higher spend is driven by specific menu items or service levels, and see if those can be highlighted in marketing.  

Let me know if you’d like to dig deeper into the drivers behind that difference!


In [None]:
# Same flow as the previous cell, except the answer comes back as a chart
# rather than text; the query result frame is passed directly.
response = broinsight.create_chart(
    message=question,
    query_result=result,
)
response["chart"]
# print(response['fig'])

## Two s3 tables

The example below follows almost the same flow as the one above, except that it needs more than one table to do the job.

In [15]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Dataset 1: Customers
# Synthetic table of 100 customers. The seed is fixed and the random draws
# happen in column order (age, city, signup_date, status), so the generated
# values are reproducible.
np.random.seed(42)
customer_ids = range(1, 101)
signup_window_start = datetime(2023, 1, 1)
customers_data = dict(
    customer_id=customer_ids,
    name=[f'Customer_{cid}' for cid in customer_ids],
    email=[f'customer{cid}@email.com' for cid in customer_ids],
    age=np.random.randint(18, 75, 100),
    city=np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'], 100),
    # One scalar draw per customer: a day offset within 2023.
    signup_date=[
        signup_window_start + timedelta(days=np.random.randint(0, 365))
        for _ in customer_ids
    ],
    status=np.random.choice(['Active', 'Inactive'], 100, p=[0.8, 0.2]),
)
customers_df = pd.DataFrame(customers_data)

# Dataset 2: Orders (with foreign key to customers)
# Synthetic table of 250 orders. The seed is reset to the same value used for
# the customers table, and the random draws happen in column order, so the
# generated values are reproducible.
np.random.seed(42)
order_ids = range(1, 251)
order_window_start = datetime(2023, 1, 1)
orders_data = dict(
    order_id=order_ids,
    # Foreign key to customers: ids drawn from 1..100.
    customer_id=np.random.choice(range(1, 101), 250),
    # One scalar draw per order: a day offset within 2023.
    order_date=[
        order_window_start + timedelta(days=np.random.randint(0, 365))
        for _ in order_ids
    ],
    product_category=np.random.choice(['Electronics', 'Clothing', 'Books', 'Home', 'Sports'], 250),
    order_amount=np.round(np.random.uniform(10.0, 500.0, 250), 2),
    shipping_cost=np.round(np.random.uniform(0.0, 25.0, 250), 2),
    order_status=np.random.choice(['Completed', 'Pending', 'Cancelled'], 250, p=[0.7, 0.2, 0.1]),
    payment_method=np.random.choice(['Credit Card', 'PayPal', 'Bank Transfer'], 250),
)
orders_df = pd.DataFrame(orders_data)

# Field descriptions for customers: (column, description) pairs wrapped into
# the FieldDescriptions container.
customers_descriptions = FieldDescriptions(descriptions=[
    FieldDescription(field_name=column, description=meaning)
    for column, meaning in (
        ("customer_id", "Unique customer identifier"),
        ("name", "Customer full name"),
        ("email", "Customer email address"),
        ("age", "Customer age in years"),
        ("city", "Customer's city of residence"),
        ("signup_date", "Date when customer registered"),
        ("status", "Current customer account status"),
    )
])

# Field descriptions for orders
orders_descriptions = FieldDescriptions(descriptions=[
    FieldDescription(field_name=column, description=meaning)
    for column, meaning in (
        ("order_id", "Unique order identifier"),
        ("customer_id", "Links to customer who placed the order"),
        ("order_date", "Date when order was placed"),
        ("product_category", "Category of products ordered"),
        ("order_amount", "Total order value in dollars"),
        ("shipping_cost", "Shipping fee charged"),
        ("order_status", "Current status of the order"),
        ("payment_method", "Payment method used"),
    )
])

In [16]:
from broinsight.utils.data_catalog import DataCatalog
import boto3

# Upload both synthetic frames to S3 as CSV, then build a catalog that reads
# them back by S3 URI instead of from the in-memory DataFrames.
s3_client = boto3.client('s3')
bucket='dev-broinsight'

# Rebinds `aws_configs` with a different region; it reuses the `credentials`
# object resolved in the Bedrock setup cell.
aws_configs = AWSConfig(
    region_name='ap-southeast-1',
    aws_access_key_id=credentials.access_key,
    aws_secret_access_key=credentials.secret_key,
    aws_session_token=credentials.token or None,
)

s3_client.put_object(Bucket=bucket, Key='customers.csv', Body=customers_df.to_csv(index=False))
s3_client.put_object(Bucket=bucket, Key='orders.csv', Body=orders_df.to_csv(index=False))

s3_customers = f"s3://{bucket}/customers.csv"
s3_orders = f"s3://{bucket}/orders.csv"

catalog = DataCatalog(aws_configs=aws_configs)

# Register the S3 objects. Alternative kept for reference, registering the
# DataFrames directly:
# catalog.register("customers", customers_df, "Customer information and demographics")
# catalog.register("orders", orders_df, "Customer order transactions and details")
catalog.register("customers", s3_customers, "Customer information and demographics")
catalog.register("orders", s3_orders, "Customer order transactions and details")

# Profile tables
catalog.profile_tables(["customers", "orders"])

# Add field descriptions
catalog.add_field_descriptions("customers", customers_descriptions)
catalog.add_field_descriptions("orders", orders_descriptions)

# Add relationship: orders.customer_id references customers.customer_id.
# Positional form kept for reference:
# catalog.add_relationship("orders", "customer_id", "customers", "customer_id")
catalog.add_relationship(
    foreign_table="orders",
    foreign_key="customer_id",
    primary_table="customers",
    primary_key="customer_id",
)

# Test SQL metadata generation
sql_metadata = catalog.to_sql_metadata(["customers", "orders"])
print(sql_metadata)

METADATAS:

Table: customers
Columns:
- customer_id (integer): Unique customer identifier, NOT NULL
  Range: 1 - 100, Average: 50.50
- name (string): Customer full name, NOT NULL
  Examples: "Customer_3", "Customer_5", "Customer_32"
- email (string): Customer email address, NOT NULL
  Examples: "customer3@email.com", "customer17@email.com", "customer33@email.com"
- age (integer): Customer age in years, NOT NULL
  Range: 19 - 74, Average: 45.19
- city (string): Customer's city of residence, NOT NULL
  Examples: "New York", "Houston", "Chicago"
- signup_date (unknown): Date when customer registered, NOT NULL
- status (string): Current customer account status, NOT NULL
  Examples: "Active", "Inactive"

Table: orders
Relationships:
  customer_id -> customers.customer_id
Columns:
- order_id (integer): Unique order identifier, NOT NULL
  Range: 1 - 250, Average: 125.50
- customer_id (integer): Links to customer who placed the order, NOT NULL
  Range: 1 - 100, Average: 50.34
- order_date (unk

In [17]:
# Ask for question suggestions across both tables; the context is the
# catalog's guide metadata for "customers" and "orders".
response = broinsight.suggest_questions(
    message="I'm a new manager here. I don't know what to look to know more about our business.",
    context=catalog.to_guide_metadata(["customers", "orders"]),
)
print(response["content"])

Based on your role and goals, here are some areas you might want to explore:

Customer Profile & Retention
- How many customers are Active versus Inactive?
- What is the average age of Active customers?
- Which city has the highest number of Active customers?

Sales & Order Performance
- What is the average order_amount for Completed orders?
- How many orders are Pending, Completed, or Cancelled?
- Which product_category has the highest total order_amount?

Payment & Shipping Insights
- What is the average shipping_cost for orders paid with PayPal, Credit Card, and Bank Transfer?
- How many orders use each payment_method?
- What is the average order_amount by payment_method?

Product Category Trends
- Which product_category has the highest average order_amount?
- How many orders were placed for each product_category?
- What is the average shipping_cost for each product_category?

Just ask me any of these questions and I'll analyze your data to get the answers!


In [18]:
# Alternative questions tried earlier, kept for reference:
# question = "How many orders were paid using each payment_method in the orders table?"
# question = "Who are the top 5 customers payting with PayPal method? I wanna know thier ages and names"
question = "Who are the top 5 spenders? I wanna know thier ages and names and their most payment method"

# Generate SQL over both tables; `question` and `response` are reused by the
# following cells.
response = broinsight.generate_sql(
    message=question,
    context=catalog.to_sql_metadata(["customers", "orders"]),
)
print(response["content"])

```sql
WITH total_spent AS (
    SELECT
        o.customer_id,
        SUM(o.order_amount) AS total_amount
    FROM orders o
    GROUP BY o.customer_id
),
ranked_customers AS (
    SELECT
        ts.customer_id,
        ts.total_amount,
        ROW_NUMBER() OVER (ORDER BY ts.total_amount DESC) AS rn
    FROM total_spent ts
),
most_used_payment AS (
    SELECT
        o.customer_id,
        o.payment_method,
        COUNT(*) AS method_count,
        ROW_NUMBER() OVER (
            PARTITION BY o.customer_id
            ORDER BY COUNT(*) DESC
        ) AS rn
    FROM orders o
    GROUP BY o.customer_id, o.payment_method
)
SELECT
    c.name,
    c.age,
    mup.payment_method
FROM ranked_customers rc
JOIN customers c
    ON rc.customer_id = c.customer_id
JOIN most_used_payment mup
    ON mup.customer_id = rc.customer_id
    AND mup.rn = 1
WHERE rc.rn <= 5
ORDER BY rc.total_amount DESC;
```


In [19]:
# Parse the SQL out of the model reply and run it through the catalog.
# The reply is expected to wrap the query in a ```sql fence, but the fence tag
# can vary in case or be missing, so match leniently:
#   1. prefer blocks tagged sql (case-insensitive),
#   2. otherwise accept any fenced block,
#   3. otherwise fall back to the raw reply.
import re

reply = response['content']
sql_blocks = re.findall(r"```sql\b(.*?)```", reply, flags=re.DOTALL | re.IGNORECASE)
fenced_blocks = sql_blocks or re.findall(r"```(.*?)```", reply, flags=re.DOTALL)
# As before, the last matching block wins when the reply contains several.
sql_query = (fenced_blocks[-1] if fenced_blocks else reply).strip()
if not sql_query:
    raise ValueError("The model reply did not contain a SQL query.")

result = catalog.query(sql_query)

In [20]:
# Display the frame returned by catalog.query(...) for the generated SQL.
result

Unnamed: 0,name,age,payment_method
0,Customer_62,45,Bank Transfer
1,Customer_2,69,Credit Card
2,Customer_89,48,PayPal
3,Customer_44,74,Bank Transfer
4,Customer_88,31,Credit Card


In [21]:
# Answer the question from the query result alone: the result frame is
# rendered to text and supplied as the context.
response = broinsight.ask_data(
    message=question,
    context=result.to_string(),
)
print(response["content"])

I’m sorry, but the data you shared only lists each customer’s name, age, and payment method. It doesn’t include any spending amounts or rankings, so I can’t identify who the top 5 spenders are. If you can provide the spend figures or a column that indicates total purchases, I’d be happy to help you sort them and pull out the ages, names, and preferred payment methods.


In [23]:
# Request a chart for the same question, passing the query result frame
# directly, then display the returned chart.
response = broinsight.create_chart(
    message=question,
    query_result=result,
)
response["chart"]