In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# (e.g. the library exercised below) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Test Metadata

In [2]:
import seaborn as sns
import duckdb

# Load the seaborn "tips" sample, expose it to DuckDB under the name "tips",
# and read it back through SQL as a DataFrame for display.
tips_df = sns.load_dataset("tips")

conn = duckdb.connect()
conn.register("tips", tips_df)

conn.execute("SELECT * FROM tips;").df()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [3]:
from broinsight.utils.data_spec import Metadata, TableSpec, FieldSpec, create_field_specs_from_profile, FieldDescription, FieldDescriptions
from broinsight.data_quality.sql_profile import sql_field_profile, sql_table_profile

# Step 1 - profile the registered "tips" table (table level and field level).
tbl_prof = sql_table_profile(conn, "tips")
field_prof = sql_field_profile(conn, "tips")

# Step 2 - convert the raw profiles into spec objects.
table_spec = TableSpec(**tbl_prof)
field_specs = create_field_specs_from_profile(field_prof)

# Step 3 - assemble the metadata from the pieces above.
metadata_kwargs = dict(
    table_name="tips",  # must match the name the data is registered under
    table_description="Restaurant tips dataset",  # human-readable summary
    table_spec=table_spec,  # mandatory
    field_spec=field_specs,  # mandatory
)
metadata = Metadata(**metadata_kwargs)

In [4]:
# Field descriptions matter: they are attached to the metadata and, per the
# original author's note, are later converted into LLM context by DataCatalog
# and passed to the BroInsight methods. Each text therefore has to describe the
# column as it really is, otherwise the model is given wrong facts.

descriptions = dict(
    total_bill="the amount of paid bill of the meal",
    tip="the amount of tip that customers paid",
    sex="the gender of customers",
    smoker="it indicates that a customer is a smoker or not. if No means a customer is a non-smoker, Yes means a customer is a smoker.",
    # List only the spellings stored in the data ("Thur", not "Thu"); a wrong
    # spelling here would lead to generated filters that match nothing.
    day="this is a day of the week when a customer having a meal here. the values are Thur, Fri, Sat, Sun",
    time="the time of the meal. it can be either Dinner or Lunch",
    # `size` is the party size (people at the table), not a count of dishes.
    size="the number of people in the dining party",
)

# One FieldDescription per column, in the order declared above.
field_descriptions = [
    FieldDescription(field_name=k, description=v) for k, v in descriptions.items()
]

# Attach the descriptions to the metadata object built earlier.
metadata.add_field_descriptions(field_descriptions=FieldDescriptions(descriptions=field_descriptions))

In [5]:
from broinsight.utils.data_spec import create_data_quality_assessment

# Derive a data-quality assessment from the field-level profile computed earlier
# (`field_prof`) and attach it to the metadata object.
quality_assessment = create_data_quality_assessment(field_prof)
metadata.add_data_quality_assessment(quality_assessment)

In [6]:
# Display the assembled metadata object (specs, descriptions and quality assessment attached above).
metadata

Metadata(table_name='tips', table_description='Restaurant tips dataset', table_spec=TableSpec(rows=244, columns=7, duplicates=1, evidences={0: {'total_bill': 13.0, 'tip': 2.0, 'sex': 'Female', 'smoker': 'Yes', 'day': 'Thur', 'time': 'Lunch', 'size': 2, 'dup_count': 2}}), field_spec=[FieldSpec(field_name='total_bill', data_type='float', missing_values=0, missing_values_pct=0.0, unique_values=229, unique_values_pct=0.94, most_frequent={13.42: 3, 15.98: 2, 10.07: 2, 20.69: 2, 13.81: 2}, statistics={'min': 3.07, 'max': 50.81, 'mean': 19.79, 'median': 17.8, 'std': 8.9, 'var': 79.25, 'skew': 1.13, 'kurt': 1.22, 'iqr': 10.78, 'cv': 0.45, 'lower_bound': -2.82, 'upper_bound': 40.3}, description='the amount of paid bill of the meal'), FieldSpec(field_name='tip', data_type='float', missing_values=0, missing_values_pct=0.0, unique_values=123, unique_values_pct=0.5, most_frequent={2.0: 33, 3.0: 23, 4.0: 12, 2.5: 10, 5.0: 10}, statistics={'min': 1.0, 'max': 10.0, 'mean': 3.0, 'median': 2.9, 'std': 1

In [7]:
# Names of the attributes stored on the metadata object.
vars(metadata).keys()

dict_keys(['table_name', 'table_description', 'table_spec', 'field_spec', 'field_descriptions', 'data_quality'])

In [8]:
# Table name stored on the metadata; it was set to "tips" when the object was created.
metadata.table_name

'tips'

In [9]:
# Human-readable table summary passed in when the metadata was created.
metadata.table_description

'Restaurant tips dataset'

In [10]:
# Names of the attributes stored on the table-level spec.
vars(metadata.table_spec).keys()

dict_keys(['rows', 'columns', 'duplicates', 'evidences'])

In [11]:
# Field-level specs held by the metadata (a list of FieldSpec objects, as the output shows).
metadata.field_spec

[FieldSpec(field_name='total_bill', data_type='float', missing_values=0, missing_values_pct=0.0, unique_values=229, unique_values_pct=0.94, most_frequent={13.42: 3, 15.98: 2, 10.07: 2, 20.69: 2, 13.81: 2}, statistics={'min': 3.07, 'max': 50.81, 'mean': 19.79, 'median': 17.8, 'std': 8.9, 'var': 79.25, 'skew': 1.13, 'kurt': 1.22, 'iqr': 10.78, 'cv': 0.45, 'lower_bound': -2.82, 'upper_bound': 40.3}, description='the amount of paid bill of the meal'),
 FieldSpec(field_name='tip', data_type='float', missing_values=0, missing_values_pct=0.0, unique_values=123, unique_values_pct=0.5, most_frequent={2.0: 33, 3.0: 23, 4.0: 12, 2.5: 10, 5.0: 10}, statistics={'min': 1.0, 'max': 10.0, 'mean': 3.0, 'median': 2.9, 'std': 1.38, 'var': 1.91, 'skew': 1.47, 'kurt': 3.65, 'iqr': 1.56, 'cv': 0.46, 'lower_bound': -0.34, 'upper_bound': 5.91}, description='the amount of tip that customers paid'),
 FieldSpec(field_name='sex', data_type='string', missing_values=0, missing_values_pct=0.0, unique_values=2, uniqu

In [12]:
# Names of the attributes stored on a single field spec (the first one is used as a sample).
vars(metadata.field_spec[0]).keys()

dict_keys(['field_name', 'data_type', 'missing_values', 'missing_values_pct', 'unique_values', 'unique_values_pct', 'most_frequent', 'statistics', 'description'])

# Test with AWS Bedrock

## One pandas table

In [13]:
from broinsight.experiment.bedrock import AWSConfig, BedrockOpenAI
from broinsight.experiment.ollama import LocalOpenAI
from broinsight.broinsight import BroInsight
import boto3

from broinsight.utils.data_catalog import DataCatalog
import seaborn as sns

# Human-readable descriptions for every column of the tips table.
field_descriptions = FieldDescriptions(descriptions=[
    FieldDescription(field_name="total_bill", description="Total bill amount in dollars"),
    FieldDescription(field_name="tip", description="Tip amount in dollars"),
    FieldDescription(field_name="sex", description="Customer gender"),
    FieldDescription(field_name="smoker", description="Whether customer smokes"),
    FieldDescription(field_name="day", description="Day of the week"),
    FieldDescription(field_name="time", description="Meal time (Lunch/Dinner)"),
    FieldDescription(field_name="size", description="Party size")
])

# Create catalog and register data
catalog = DataCatalog()
catalog.register("tips", sns.load_dataset("tips"), "Restaurant tips dataset")

# Profile the table
catalog.profile_tables(["tips"])

# Add field descriptions
catalog.add_field_descriptions("tips", field_descriptions)

# Resolve AWS credentials through boto3's default lookup.
session = boto3.Session()
credentials = session.get_credentials()
if credentials is None:
    # get_credentials() returns None when nothing is configured; fail here with
    # a clear message instead of an AttributeError on `.access_key` below.
    raise RuntimeError(
        "No AWS credentials found. Configure them (environment variables, "
        "shared credentials file, SSO, ...) before running this cell."
    )
# Take one immutable snapshot so that key, secret and token always belong to
# the same credential set, even when the underlying credentials are refreshable.
credentials = credentials.get_frozen_credentials()

aws_configs = AWSConfig(
    aws_access_key_id=credentials.access_key,
    aws_secret_access_key=credentials.secret_key,
    aws_session_token=credentials.token or None,  # long-lived keys have no session token
    region_name='us-west-2'
)

model_id="openai.gpt-oss-20b-1:0"
bedrock = BedrockOpenAI(
    model_id=model_id,
    aws_configs=aws_configs
)

# Alternative backend kept for reference (uses the LocalOpenAI import above):
# broinsight = BroInsight(model=LocalOpenAI())
broinsight = BroInsight(model=bedrock)

In [14]:
# Data-quality check: give the model the catalog's DQ profile of "tips" and ask
# whether anything looks concerning.
dq_context = catalog.to_dq_profile("tips")
response = broinsight.assess_data_quality(
    message="Do we have any concern about this one?",
    context=dq_context,
)
print(response['content'])

**Data Quality Assessment:** READY  
**Critical Issues Found:** None  

**Recommended Actions (Minor Issues):**  
1. **Skewed Variables** – `total_bill`, `tip`, and `size` show moderate right‑skew (skew ≈ 1.1–1.5).  
   - *If you plan to use models that assume normality (e.g., linear regression, ANOVA), consider applying a log or square‑root transformation to these columns.*  
   - *For non‑parametric or tree‑based methods (e.g., random forests, gradient boosting), the skewness is usually not a problem.*  

2. **Duplicate Row** – Only one duplicate exists.  
   - *Verify that the duplicate is truly identical; if so, you can safely drop it. If it contains slightly different values, investigate why it was duplicated.*

**Next Steps:**  
- Proceed with your analysis.  
- If you encounter model diagnostics indicating non‑normal residuals or heteroscedasticity, revisit the transformations above.  

**Overall Recommendation:** The dataset is in good shape for most analytical tasks. Minor ske

In [15]:
# Ask for suggested questions, using the guide metadata of the "tips" table
# from the data catalog as context.
guide_context = catalog.to_guide_metadata("tips")
response = broinsight.suggest_questions(
    message="I'm a new manager and I wanna make a promotion for the shop but I don't know how to start.",
    context=guide_context,
)
print(response['content'])

Based on your role and goals, here are some areas you might want to explore:

Customer Segmentation  
- How many customers are Male versus Female in the dataset?  
- How many customers are Smoker versus No in the dataset?  
- Which day of the week has the highest average total_bill?  

Spending Patterns  
- What is the average total_bill for each party size (size)?  
- What is the average tip amount for each day of the week (day)?  
- Which day has the highest average tip percentage (tip / total_bill)?  

Promotion Targeting  
- Which day and time combination (day + time) has the lowest average tip amount?  
- Which day and time combination has the highest average total_bill?  
- Which party size group (size) has the highest average tip amount?  

Tip Behavior  
- What is the average tip amount for Smokers versus Non-smokers?  
- What is the average tip percentage for Male versus Female customers?  
- Which day of the week has the highest average tip percentage?  

Just ask me any of t

In [16]:
# Generate a SQL query for the question, with the SQL metadata of the selected
# table ("tips") as context.
question = "What is the average total_bill for size = 1 compared to size = 4 or more?"
sql_context = catalog.to_sql_metadata("tips")
response = broinsight.generate_sql(
    message=question,
    context=sql_context,
)
print(response['content'])

```sql
SELECT
    CASE
        WHEN size = 1 THEN 'size = 1'
        WHEN size >= 4 THEN 'size >= 4'
    END AS size_category,
    AVG(total_bill) AS avg_total_bill
FROM tips
WHERE size = 1 OR size >= 4
GROUP BY size_category
ORDER BY size_category;
```


In [17]:
# Extract the SQL statement from the model's reply and run it against the catalog.
import re

llm_reply = response['content']

# Prefer fenced blocks tagged `sql` (any letter case); fall back to untagged
# fences. A missing closing fence is tolerated. The previous split-based parsing
# returned the prose *before* the fence whenever the tag was absent or differed
# in case, which produced an unusable query.
fenced_sql = re.findall(
    r"```sql\b\s*(.*?)(?:```|\Z)", llm_reply, flags=re.DOTALL | re.IGNORECASE
) or re.findall(r"```\s*(.*?)(?:```|\Z)", llm_reply, flags=re.DOTALL)

# Use the last block (as before); with no fence at all, treat the whole reply as SQL.
sql_query = (fenced_sql[-1] if fenced_sql else llm_reply).strip()

result = catalog.query(sql_query)
result

Unnamed: 0,size_category,avg_total_bill
0,size = 1,7.2425
1,size >= 4,29.312174


In [18]:
# ask_data answers the question from data: the SQL result is rendered as plain
# text and passed to the model as context.
result_text = result.to_string()
response = broinsight.ask_data(
    message=question,
    context=result_text,
)
print(response['content'])

For groups that dine alone (size = 1), the average bill is **$7.24**.  
For larger parties (size ≥ 4), the average bill jumps to **$29.31**.

That’s a difference of roughly $22, meaning larger groups spend far more per visit.  

**What this tells us:**

- **Higher spend per table**: Bigger parties are a more lucrative segment.  
- **Potential upsell opportunities**: Consider targeted promotions or menu bundles for larger groups to boost that already high average.  
- **Capacity planning**: If you’re looking to increase revenue, focus on attracting more groups of four or more—perhaps by highlighting family‑friendly seating or group‑discount specials.

**Next steps to explore:**

1. Look at the frequency of each group size to see how many visits come from each segment.  
2. Check if the higher average for larger groups is driven by higher‑priced items or simply more items per person.  
3. Test a small pilot of a group‑discount menu to see if it can further lift the average for size ≥ 4.


In [19]:
# Nearly the same flow as the text answer above, except that the query result
# is turned into a chart instead of text.
response = broinsight.create_chart(
    message=question,
    query_result=result,
)
response['chart']

## Two S3 tables

The example below follows almost the same flow as the one above, except that it needs more than one table to do the job.

In [20]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Shared settings for the synthetic demo data.
N_CUSTOMERS = 100
N_ORDERS = 250
FIRST_DAY = datetime(2023, 1, 1)

# Dataset 1: Customers
# The random draws must stay in this order (age, city, signup_date, status):
# after seeding, each draw consumes the generator state in sequence.
np.random.seed(42)
customer_ids = range(1, N_CUSTOMERS + 1)
customer_ages = np.random.randint(18, 75, N_CUSTOMERS)
customer_cities = np.random.choice(
    ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'], N_CUSTOMERS
)
customer_signups = [
    FIRST_DAY + timedelta(days=np.random.randint(0, 365)) for _ in customer_ids
]
customer_statuses = np.random.choice(['Active', 'Inactive'], N_CUSTOMERS, p=[0.8, 0.2])

customers_data = {
    'customer_id': customer_ids,
    'name': [f'Customer_{i}' for i in customer_ids],
    'email': [f'customer{i}@email.com' for i in customer_ids],
    'age': customer_ages,
    'city': customer_cities,
    'signup_date': customer_signups,
    'status': customer_statuses,
}
customers_df = pd.DataFrame(customers_data)

# Dataset 2: Orders (with foreign key to customers)
# The generator is re-seeded, and the draws again follow the column order.
np.random.seed(42)
order_customer_ids = np.random.choice(range(1, 101), N_ORDERS)  # Foreign key to customers
order_dates = [
    FIRST_DAY + timedelta(days=np.random.randint(0, 365)) for _ in range(N_ORDERS)
]
order_categories = np.random.choice(
    ['Electronics', 'Clothing', 'Books', 'Home', 'Sports'], N_ORDERS
)
order_amounts = np.round(np.random.uniform(10.0, 500.0, N_ORDERS), 2)
shipping_costs = np.round(np.random.uniform(0.0, 25.0, N_ORDERS), 2)
order_statuses = np.random.choice(
    ['Completed', 'Pending', 'Cancelled'], N_ORDERS, p=[0.7, 0.2, 0.1]
)
payment_methods = np.random.choice(['Credit Card', 'PayPal', 'Bank Transfer'], N_ORDERS)

orders_data = {
    'order_id': range(1, N_ORDERS + 1),
    'customer_id': order_customer_ids,
    'order_date': order_dates,
    'product_category': order_categories,
    'order_amount': order_amounts,
    'shipping_cost': shipping_costs,
    'order_status': order_statuses,
    'payment_method': payment_methods,
}
orders_df = pd.DataFrame(orders_data)

# Field descriptions for customers, written as a column -> text mapping and
# converted to FieldDescription objects in declaration order.
customers_descriptions = FieldDescriptions(descriptions=[
    FieldDescription(field_name=column, description=text)
    for column, text in {
        "customer_id": "Unique customer identifier",
        "name": "Customer full name",
        "email": "Customer email address",
        "age": "Customer age in years",
        "city": "Customer's city of residence",
        "signup_date": "Date when customer registered",
        "status": "Current customer account status",
    }.items()
])

# Field descriptions for orders, built the same way.
orders_descriptions = FieldDescriptions(descriptions=[
    FieldDescription(field_name=column, description=text)
    for column, text in {
        "order_id": "Unique order identifier",
        "customer_id": "Links to customer who placed the order",
        "order_date": "Date when order was placed",
        "product_category": "Category of products ordered",
        "order_amount": "Total order value in dollars",
        "shipping_cost": "Shipping fee charged",
        "order_status": "Current status of the order",
        "payment_method": "Payment method used",
    }.items()
])

In [21]:
from broinsight.utils.data_catalog import DataCatalog
import boto3

# S3 client and target bucket for the two demo CSV files.
s3_client = boto3.client('s3')
bucket='dev-broinsight'

# AWS settings handed to the catalog; `credentials` comes from the Bedrock
# setup cell earlier in the notebook.
aws_configs = AWSConfig(
    region_name='ap-southeast-1',
    aws_access_key_id=credentials.access_key,
    aws_secret_access_key=credentials.secret_key,
    aws_session_token=credentials.token or None,
)

# Upload both frames as index-less CSV objects (customers first, then orders).
for object_key, frame in (("customers.csv", customers_df), ("orders.csv", orders_df)):
    s3_client.put_object(Bucket=bucket, Key=object_key, Body=frame.to_csv(index=False))

s3_customers = f"s3://{bucket}/customers.csv"
s3_orders = f"s3://{bucket}/orders.csv"

# Register the S3 objects in the catalog by their s3:// paths.
catalog = DataCatalog(aws_configs=aws_configs)
catalog.register("customers", s3_customers, "Customer information and demographics")
catalog.register("orders", s3_orders, "Customer order transactions and details")

# Profile tables
catalog.profile_tables(["customers", "orders"])

# Add field descriptions
catalog.add_field_descriptions("customers", customers_descriptions)
catalog.add_field_descriptions("orders", orders_descriptions)

# Add relationship: orders.customer_id refers to customers.customer_id
catalog.add_relationship(
    foreign_table="orders",
    foreign_key="customer_id",
    primary_table="customers",
    primary_key="customer_id",
)

# Test SQL metadata generation
sql_metadata = catalog.to_sql_metadata(["customers", "orders"])
print(sql_metadata)

METADATAS:

Table: customers
Columns:
- customer_id (integer): Unique customer identifier, NOT NULL
  Range: 1 - 100, Average: 50.50
- name (string): Customer full name, NOT NULL
  Examples: "Customer_10", "Customer_23", "Customer_30"
- email (string): Customer email address, NOT NULL
  Examples: "customer20@email.com", "customer52@email.com", "customer56@email.com"
- age (integer): Customer age in years, NOT NULL
  Range: 19 - 74, Average: 45.19
- city (string): Customer's city of residence, NOT NULL
  Examples: "New York", "Houston", "Chicago"
- signup_date (datetime): Date when customer registered, NOT NULL
  Range: 2023-01-02 00:00:00 to 2023-12-26 00:00:00, 77 unique days
- status (string): Current customer account status, NOT NULL
  Examples: "Active", "Inactive"

Table: orders
Relationships:
  customer_id -> customers.customer_id
Columns:
- order_id (integer): Unique order identifier, NOT NULL
  Range: 1 - 250, Average: 125.50
- customer_id (integer): Links to customer who place

In [22]:
# Ask for suggested questions across both tables, using their guide metadata as context.
guide_context = catalog.to_guide_metadata(["customers", "orders"])
response = broinsight.suggest_questions(
    message="I'm a new manager here. I don't know what to look to know more about our business.",
    context=guide_context,
)
print(response['content'])

Based on your role and goals, here are some areas you might want to explore:

Customer Overview
- How many customers are Active versus Inactive in customers.status?
- What is the average age of customers in each city from customers.city and customers.age?

Order Performance
- What is the total and average order_amount for each product_category in orders.product_category?
- How many orders are Completed, Pending, or Cancelled in orders.order_status?

Revenue Insights
- What is the total revenue (sum of orders.order_amount) for each month in orders.order_date?
- What is the average shipping_cost for orders in each order_status category?

Customer Acquisition
- How many new customers signed up each month from customers.signup_date?
- What is the distribution of payment_method used in orders.payment_method?

Just ask me any of these questions and I'll analyze your data to get the answers!


In [23]:
# Other questions to try (swap one in for `question` below):
#   "How many orders were paid using each payment_method in the orders table?"
#   "Who are the top 5 customers paying with the PayPal method? I wanna know their ages and names"
#   "Who are the top 5 spenders? I wanna know their ages and names and their most payment method"
question = "Show me monthly sales trends of each year"

# Generate SQL with the metadata of both tables as context.
sql_context = catalog.to_sql_metadata(["customers", "orders"])
response = broinsight.generate_sql(
    message=question,
    context=sql_context,
)
print(response['content'])

```sql
SELECT
    EXTRACT(YEAR FROM o.order_date)  AS year,
    EXTRACT(MONTH FROM o.order_date) AS month,
    SUM(o.order_amount)              AS total_sales,
    COUNT(*)                         AS order_count
FROM orders o
WHERE o.order_status = 'Completed'
GROUP BY year, month
ORDER BY year, month;
```


In [24]:
# Extract the SQL statement from the model's reply and run it against the catalog.
import re

llm_reply = response['content']

# Prefer fenced blocks tagged `sql` (any letter case); fall back to untagged
# fences. A missing closing fence is tolerated. The previous split-based parsing
# returned the prose *before* the fence whenever the tag was absent or differed
# in case, which produced an unusable query.
fenced_sql = re.findall(
    r"```sql\b\s*(.*?)(?:```|\Z)", llm_reply, flags=re.DOTALL | re.IGNORECASE
) or re.findall(r"```\s*(.*?)(?:```|\Z)", llm_reply, flags=re.DOTALL)

# Use the last block (as before); with no fence at all, treat the whole reply as SQL.
sql_query = (fenced_sql[-1] if fenced_sql else llm_reply).strip()

result = catalog.query(sql_query)

In [25]:
# Display the rows returned by the generated SQL query.
result

Unnamed: 0,year,month,total_sales,order_count
0,2023,1,2427.12,8
1,2023,2,3037.51,15
2,2023,3,1549.73,6
3,2023,4,3798.1,16
4,2023,5,5520.59,23
5,2023,6,5578.09,18
6,2023,7,2987.18,13
7,2023,8,2729.77,11
8,2023,9,4984.97,15
9,2023,10,3243.12,14


In [26]:
# Let the model answer the question from the query result, passed as plain text.
result_text = result.to_string()
response = broinsight.ask_data(
    message=question,
    context=result_text,
)
print(response['content'])

Here’s a quick look at how sales moved month‑to‑month in 2023:

| Month | Total Sales | Order Count |
|-------|-------------|-------------|
| Jan   | $2,427.12   | 8 |
| Feb   | $3,037.51   | 15 |
| Mar   | $1,549.73   | 6 |
| Apr   | $3,798.10   | 16 |
| May   | $5,520.59   | 23 |
| Jun   | $5,578.09   | 18 |
| Jul   | $2,987.18   | 13 |
| Aug   | $2,729.77   | 11 |
| Sep   | $4,984.97   | 15 |
| Oct   | $3,243.12   | 14 |
| Nov   | $3,134.15   | 12 |
| Dec   | $3,930.74   | 15 |

**What the numbers tell us**

- **Peak performance**: May and June were the strongest months, with sales above $5,500 each. Those months also saw the highest order counts (23 in May, 18 in June).
- **Lowest point**: March dipped to just $1,549 in sales and only 6 orders, suggesting a seasonal lull or a promotional gap.
- **Mid‑year rebound**: After the March low, sales climbed steadily through April, peaking again in May and June before tapering off in July and August.
- **Consistent end‑of‑year**: September

In [27]:
# Turn the query result into a chart for the same question and display it.
response = broinsight.create_chart(
    message=question,
    query_result=result,
)
response['chart']

In [28]:
# Names of the tables currently registered in the catalog.
catalog.list_tables()

['customers', 'orders']

In [29]:
# Preview the first five rows of the customers table through the catalog's SQL interface.
catalog.query("SELECT * FROM customers LIMIT 5")

Unnamed: 0,customer_id,name,email,age,city,signup_date,status
0,1,Customer_1,customer1@email.com,56,Phoenix,2023-09-25,Active
1,2,Customer_2,customer2@email.com,69,Los Angeles,2023-10-22,Active
2,3,Customer_3,customer3@email.com,46,New York,2023-04-23,Active
3,4,Customer_4,customer4@email.com,32,Houston,2023-04-11,Active
4,5,Customer_5,customer5@email.com,60,Houston,2023-04-23,Active


In [30]:
# Inspect the column dtypes of the returned frame (e.g. how signup_date was parsed).
catalog.query("SELECT * FROM customers LIMIT 5").info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   customer_id  5 non-null      int64         
 1   name         5 non-null      object        
 2   email        5 non-null      object        
 3   age          5 non-null      int64         
 4   city         5 non-null      object        
 5   signup_date  5 non-null      datetime64[us]
 6   status       5 non-null      object        
dtypes: datetime64[us](1), int64(2), object(4)
memory usage: 412.0+ bytes


In [31]:
# Print the data-quality profile text the catalog produces for the customers table.
print(catalog.to_dq_profile("customers"))

DATASET: customers
DESCRIPTION: Customer information and demographics
ROWS: 100
COLUMNS: 7
DUPLICATES: 0
OVERALL QUALITY: GOOD
TOTAL ISSUES: 0

FIELD PROFILES:

customer_id (integer):
  Missing: 0 (0.0%)
  Unique: 100 (1.0%)
  Description: Unique customer identifier

name (string):
  Missing: 0 (0.0%)
  Unique: 100 (1.0%)
  Description: Customer full name

email (string):
  Missing: 0 (0.0%)
  Unique: 100 (1.0%)
  Description: Customer email address

age (integer):
  Missing: 0 (0.0%)
  Unique: 50 (0.5%)
  Description: Customer age in years

city (string):
  Missing: 0 (0.0%)
  Unique: 5 (0.1%)
  Description: Customer's city of residence

signup_date (datetime):
  Missing: 0 (0.0%)
  Unique: 77 (0.8%)
  Description: Date when customer registered

status (string):
  Missing: 0 (0.0%)
  Unique: 2 (0.0%)
  Description: Current customer account status



In [32]:
# Print the guide metadata text for a single table (customers).
print(catalog.to_guide_metadata("customers"))

METADATA:

Table: customers (Customer information and demographics)
Fields:
- customer_id (integer): Unique customer identifier
  Range: 1 - 100, Average: 50.50
- name (string): Customer full name
  Values: Customer_10 (1), Customer_23 (1), Customer_30 (1)
- email (string): Customer email address
  Values: customer20@email.com (1), customer52@email.com (1), customer56@email.com (1)
- age (integer): Customer age in years
  Range: 19 - 74, Average: 45.19
- city (string): Customer's city of residence
  Values: New York (27), Houston (22), Chicago (21)
- signup_date (datetime): Date when customer registered
  Range: 2023-01-02 00:00:00 to 2023-12-26 00:00:00, 77 unique days
- status (string): Current customer account status
  Values: Active (82), Inactive (18)


In [33]:
# Print the guide metadata text for several tables at once (a list of names is accepted).
print(catalog.to_guide_metadata(table_names=["customers", "orders"]))

METADATA:

Table: customers (Customer information and demographics)
Fields:
- customer_id (integer): Unique customer identifier
  Range: 1 - 100, Average: 50.50
- name (string): Customer full name
  Values: Customer_10 (1), Customer_23 (1), Customer_30 (1)
- email (string): Customer email address
  Values: customer20@email.com (1), customer52@email.com (1), customer56@email.com (1)
- age (integer): Customer age in years
  Range: 19 - 74, Average: 45.19
- city (string): Customer's city of residence
  Values: New York (27), Houston (22), Chicago (21)
- signup_date (datetime): Date when customer registered
  Range: 2023-01-02 00:00:00 to 2023-12-26 00:00:00, 77 unique days
- status (string): Current customer account status
  Values: Active (82), Inactive (18)

Table: orders (Customer order transactions and details)
Fields:
- order_id (integer): Unique order identifier
  Range: 1 - 250, Average: 125.50
- customer_id (integer): Links to customer who placed the order
  Range: 1 - 100, Averag

In [34]:
# Print the SQL metadata text for both tables; the output includes the orders -> customers relationship.
print(catalog.to_sql_metadata(table_names=["customers", "orders"]))

METADATAS:

Table: customers
Columns:
- customer_id (integer): Unique customer identifier, NOT NULL
  Range: 1 - 100, Average: 50.50
- name (string): Customer full name, NOT NULL
  Examples: "Customer_10", "Customer_23", "Customer_30"
- email (string): Customer email address, NOT NULL
  Examples: "customer20@email.com", "customer52@email.com", "customer56@email.com"
- age (integer): Customer age in years, NOT NULL
  Range: 19 - 74, Average: 45.19
- city (string): Customer's city of residence, NOT NULL
  Examples: "New York", "Houston", "Chicago"
- signup_date (datetime): Date when customer registered, NOT NULL
  Range: 2023-01-02 00:00:00 to 2023-12-26 00:00:00, 77 unique days
- status (string): Current customer account status, NOT NULL
  Examples: "Active", "Inactive"

Table: orders
Relationships:
  customer_id -> customers.customer_id
Columns:
- order_id (integer): Unique order identifier, NOT NULL
  Range: 1 - 250, Average: 125.50
- customer_id (integer): Links to customer who place