In [12]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
from broinsight.utils.data_spec import Metadata, TableSpec, FieldSpec
from broinsight.data_quality.sql_profile import sql_field_profile, sql_table_profile
import seaborn as sns
import pandas as pd
import duckdb

conn = duckdb.connect()
conn.register("tips", sns.load_dataset("tips"))

<duckdb.duckdb.DuckDBPyConnection at 0x117ac9f30>

In [14]:
conn.execute("SELECT * FROM tips;").df()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [5]:
from broinsight.utils.data_spec import Metadata, TableSpec, FieldSpec, create_field_specs_from_profile, FieldDescription, FieldDescriptions

# Get profiles
tbl_prof = sql_table_profile(conn, "tips")
field_prof = sql_field_profile(conn, "tips")

# Create specs
table_spec = TableSpec(**tbl_prof)
field_specs = create_field_specs_from_profile(field_prof)

# Create metadata (this should work now)
metadata = Metadata(
    table_name="tips",
    table_description="Restaurant tips dataset",
    table_spec=table_spec,
    field_spec=field_specs
)

In [9]:
metadata

Metadata(table_name='tips', table_description='Restaurant tips dataset', table_spec=TableSpec(rows=244, columns=7, duplicates=1, evidences={0: {'total_bill': 13.0, 'tip': 2.0, 'sex': 'Female', 'smoker': 'Yes', 'day': 'Thur', 'time': 'Lunch', 'size': 2, 'dup_count': 2}}), field_spec=[FieldSpec(field_name='total_bill', data_type='float', missing_values=0, missing_values_pct=0.0, unique_values=229, unique_values_pct=0.94, most_frequent={13.42: 3, 15.98: 2, 20.29: 2, 10.33: 2, 7.25: 2}, statistics={'min': 3.07, 'max': 50.81, 'mean': 19.79, 'median': 17.8, 'std': 8.9, 'var': 79.25, 'skew': 1.13, 'kurt': 1.22, 'iqr': 10.78, 'cv': 0.45, 'lower_bound': -2.82, 'upper_bound': 40.3}, description=None), FieldSpec(field_name='tip', data_type='float', missing_values=0, missing_values_pct=0.0, unique_values=123, unique_values_pct=0.5, most_frequent={2.0: 33, 3.0: 23, 4.0: 12, 2.5: 10, 5.0: 10}, statistics={'min': 1.0, 'max': 10.0, 'mean': 3.0, 'median': 2.9, 'std': 1.38, 'var': 1.91, 'skew': 1.47, 'k

In [10]:
# Human-written column descriptions for the tips table, keyed by column name.
# Each text is attached to the matching field spec in the metadata, so any literal
# values quoted here must match what is actually stored in the column.
descriptions = dict(
    total_bill="the amount of paid bill of the meal",
    tip="the amount of tip that customers paid",
    sex="the gender of customers",
    smoker="it indicates that a customer is a smoker or not. if No means a customer is a non-smoker, Yes means a customer is a smoker.",
    # The column has 4 distinct values spelled Thur / Fri / Sat / Sun (not Mon..Sun, not "Thu").
    day="this is a day of the week when a customer having a meal here. it can be Thur, Fri, Sat or Sun",
    time="the time of the meal. it can be either Dinner or Lunch",
    # `size` is the party size (number of people at the table), not a dish count.
    size="the number of people in the party",
)

# Build one FieldDescription per (column name, description text) pair.
field_descriptions = [
    FieldDescription(field_name=column, description=text)
    for column, text in descriptions.items()
]

# Wrap the list in a FieldDescriptions container and attach it to the metadata object.
metadata.add_field_descriptions(
    field_descriptions=FieldDescriptions(descriptions=field_descriptions)
)

In [11]:
metadata

Metadata(table_name='tips', table_description='Restaurant tips dataset', table_spec=TableSpec(rows=244, columns=7, duplicates=1, evidences={0: {'total_bill': 13.0, 'tip': 2.0, 'sex': 'Female', 'smoker': 'Yes', 'day': 'Thur', 'time': 'Lunch', 'size': 2, 'dup_count': 2}}), field_spec=[FieldSpec(field_name='total_bill', data_type='float', missing_values=0, missing_values_pct=0.0, unique_values=229, unique_values_pct=0.94, most_frequent={13.42: 3, 15.98: 2, 20.29: 2, 10.33: 2, 7.25: 2}, statistics={'min': 3.07, 'max': 50.81, 'mean': 19.79, 'median': 17.8, 'std': 8.9, 'var': 79.25, 'skew': 1.13, 'kurt': 1.22, 'iqr': 10.78, 'cv': 0.45, 'lower_bound': -2.82, 'upper_bound': 40.3}, description='the amount of paid bill of the meal'), FieldSpec(field_name='tip', data_type='float', missing_values=0, missing_values_pct=0.0, unique_values=123, unique_values_pct=0.5, most_frequent={2.0: 33, 3.0: 23, 4.0: 12, 2.5: 10, 5.0: 10}, statistics={'min': 1.0, 'max': 10.0, 'mean': 3.0, 'median': 2.9, 'std': 1.

In [None]:
from broinsight.utils.data_spec import (
    Metadata, TableSpec, create_field_specs_from_profile, 
    create_data_quality_assessment
)

# Get profiles
tbl_prof = sql_table_profile(conn, "tips")
field_prof = sql_field_profile(conn, "tips")

# Create all specs
table_spec = TableSpec(**tbl_prof)
field_specs = create_field_specs_from_profile(field_prof)
quality_assessment = create_data_quality_assessment(field_prof)

# Create complete metadata
metadata = Metadata(
    table_name="tips",
    table_description="Restaurant tips dataset",
    table_spec=table_spec,
    field_spec=field_specs,
    data_quality=quality_assessment
)


In [16]:
metadata.data_quality

DataQualityAssessment(field_assessments=[FieldQualityAssessment(field_name='total_bill', quality='good', issues=[QualityIssue(type='moderate_skewness', severity='minor', description='Moderately skewed distribution (skew: 1.13)')], issue_count=1), FieldQualityAssessment(field_name='tip', quality='good', issues=[QualityIssue(type='moderate_skewness', severity='minor', description='Moderately skewed distribution (skew: 1.47)')], issue_count=1), FieldQualityAssessment(field_name='sex', quality='good', issues=[], issue_count=0), FieldQualityAssessment(field_name='smoker', quality='good', issues=[], issue_count=0), FieldQualityAssessment(field_name='day', quality='good', issues=[], issue_count=0), FieldQualityAssessment(field_name='time', quality='good', issues=[], issue_count=0), FieldQualityAssessment(field_name='size', quality='good', issues=[QualityIssue(type='moderate_skewness', severity='minor', description='Moderately skewed distribution (skew: 1.45)')], issue_count=1)], overall_quali

In [8]:
# For data quality assessment
dq_context = metadata.to_data_quality_context()
print("DATA QUALITY CONTEXT:")
print(dq_context)

# For SQL generation  
sql_context = metadata.to_sql_context()
print("\nSQL CONTEXT:")
print(sql_context)

# For suggested questions
question_context = metadata.to_question_context()
print("\nQUESTION CONTEXT:")
print(question_context)

# For answering based on results
answer_context = metadata.to_answer_context()
print("\nANSWER CONTEXT:")
print(answer_context)


DATA QUALITY CONTEXT:
TABLE: tips
DESCRIPTION: Restaurant tips dataset
ROWS: 244, COLUMNS: 7

OVERALL QUALITY: good
TOTAL ISSUES: 3

total_bill (good):
  - minor: Moderately skewed distribution (skew: 1.13)

tip (good):
  - minor: Moderately skewed distribution (skew: 1.47)

size (good):
  - minor: Moderately skewed distribution (skew: 1.45)

SQL CONTEXT:
TABLE: tips
total_bill: float (range: 3.07-50.81, avg: 19.79)
tip: float (range: 1.0-10.0, avg: 3.0)
sex: string (values: Male, Female)
smoker: string (values: No, Yes)
day: string (values: Sat, Sun, Thur)
time: string (values: Dinner, Lunch)
size: integer (range: 1-6, avg: 2.57)

QUESTION CONTEXT:
TABLE: tips - Restaurant tips dataset
SIZE: 244 rows, 7 columns

FIELDS:
- total_bill (float) [min: 3.07, max: 50.81, avg: 19.79]
- tip (float) [min: 1.0, max: 10.0, avg: 3.00]
- sex (string) [2 unique values]
- smoker (string) [2 unique values]
- day (string) [4 unique values]
- time (string) [2 unique values]
- size (integer) [min: 1, max

In [11]:
# Test complete metadata with quality assessment
from broinsight.utils.data_spec import create_data_quality_assessment

# Add quality assessment to metadata
quality_assessment = create_data_quality_assessment(field_prof)
metadata.add_data_quality_assessment(quality_assessment)

# Test all formatters
print("=== 1. DATA QUALITY PROFILE ===")
dq_profile = metadata.to_dq_profile()
print(dq_profile)

print("\n=== 2. SQL METADATA ===")
sql_metadata = metadata.to_sql_metadata()
print(sql_metadata)

print("\n=== 3. GUIDE METADATA ===")
guide_metadata = metadata.to_guide_metadata()
print(guide_metadata)

# Test how BroInsight would use these
print("\n=== 4. BROINSIGHT INPUT FORMATS ===")

# For data quality assessment
print("DQ Assessment Input:")
dq_input = f"PROFILE:\n\n{dq_profile}\n\nUSER_INPUT:\n\nWhat data quality issues should I be aware of?\n\n"
print(dq_input[:200] + "...")

# For SQL generation
print("\nSQL Generation Input:")
sql_input = f"METADATA:\n\n{sql_metadata}\n\nUSER_INPUT:\n\nWhat's the average tip amount?"
print(sql_input[:200] + "...")

# For question suggestions
print("\nQuestion Suggestion Input:")
guide_input = f"METADATA:\n{guide_metadata}\n\nUSER_CONTEXT:\nI'm a restaurant manager looking to understand tipping patterns\n\n"
print(guide_input[:200] + "...")


=== 1. DATA QUALITY PROFILE ===
# Dataset Overview
**Size:** 244 rows × 7 columns
**Duplicates:** 1 duplicate record(s) found

# Fields
## total_bill
**Type:** float
**Missing:** 0 (0.0%)
**Unique:** 229 (94.0%)
**Quality:** good
**Issues:**
  - Moderately skewed distribution (skew: 1.13) (minor)
**Stats:** min=3.07, max=50.81, mean=19.79, skew=1.13

## tip
**Type:** float
**Missing:** 0 (0.0%)
**Unique:** 123 (50.0%)
**Quality:** good
**Issues:**
  - Moderately skewed distribution (skew: 1.47) (minor)
**Stats:** min=1.0, max=10.0, mean=3.0, skew=1.47

## sex
**Type:** string
**Missing:** 0 (0.0%)
**Unique:** 2 (1.0%)
**Quality:** good
**Stats:** mode=Male, avg_length=4.71

## smoker
**Type:** string
**Missing:** 0 (0.0%)
**Unique:** 2 (1.0%)
**Quality:** good
**Stats:** mode=No, avg_length=2.38

## day
**Type:** string
**Missing:** 0 (0.0%)
**Unique:** 4 (2.0%)
**Quality:** good
**Stats:** mode=Sat, avg_length=3.25

## time
**Type:** string
**Missing:** 0 (0.0%)
**Unique:** 2 (1.0%)
*

In [17]:
# Cell 1: Setup
%load_ext autoreload
%autoreload 2

import seaborn as sns
import pandas as pd
from broinsight.utils.data_catalog import DataCatalog
from broinsight.utils.data_spec import FieldDescriptions, FieldDescription

# Cell 2: Create DataCatalog and register data
catalog = DataCatalog()

# Register tips dataset
tips_df = sns.load_dataset("tips")
catalog.register(
    name="tips", 
    source=tips_df, 
    table_description="Restaurant tips dataset with customer information"
)

print("Registered tables:", catalog.list_tables())

# Cell 3: Profile the table
catalog.profile_tables(["tips"])
print("✅ Profiling completed!")

# Cell 4: Check the metadata object
metadata = catalog._tables["tips"]["metadata"]
print("Table name:", metadata.table_name)
print("Description:", metadata.table_description)
print("Rows:", metadata.table_spec.rows)
print("Columns:", metadata.table_spec.columns)
print("Overall quality:", metadata.data_quality.overall_quality)
print("Total issues:", metadata.data_quality.total_issues)

# Cell 5: Add field descriptions
field_descriptions = FieldDescriptions(descriptions=[
    FieldDescription(field_name="total_bill", description="Total bill amount in dollars"),
    FieldDescription(field_name="tip", description="Tip amount in dollars"),
    FieldDescription(field_name="sex", description="Customer gender"),
    FieldDescription(field_name="smoker", description="Whether customer smokes"),
    FieldDescription(field_name="day", description="Day of the week"),
    FieldDescription(field_name="time", description="Meal time (Lunch/Dinner)"),
    FieldDescription(field_name="size", description="Party size")
])

catalog.add_field_descriptions("tips", field_descriptions)
print("✅ Field descriptions added!")

# Cell 6: Verify descriptions were added
for field in metadata.field_spec:
    print(f"{field.field_name}: {field.description}")

# Cell 7: Test SQL query
result = catalog.query("SELECT AVG(tip) as avg_tip, AVG(total_bill) as avg_bill FROM tips")
print("Query result:")
print(result)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Registered tables: ['tips']
✅ Profiling completed!
Table name: tips
Description: Restaurant tips dataset with customer information
Rows: 244
Columns: 7
Overall quality: good
Total issues: 3
✅ Field descriptions added!
total_bill: Total bill amount in dollars
tip: Tip amount in dollars
sex: Customer gender
smoker: Whether customer smokes
day: Day of the week
time: Meal time (Lunch/Dinner)
size: Party size
Query result:
    avg_tip   avg_bill
0  2.998279  19.785943


In [20]:
catalog._tables['tips']['metadata'].field_spec

[FieldSpec(field_name='total_bill', data_type='float', missing_values=0, missing_values_pct=0.0, unique_values=229, unique_values_pct=0.94, most_frequent={13.42: 3, 15.98: 2, 17.92: 2, 13.0: 2, 13.81: 2}, statistics={'min': 3.07, 'max': 50.81, 'mean': 19.79, 'median': 17.8, 'std': 8.9, 'var': 79.25, 'skew': 1.13, 'kurt': 1.22, 'iqr': 10.78, 'cv': 0.45, 'lower_bound': -2.82, 'upper_bound': 40.3}, description='Total bill amount in dollars'),
 FieldSpec(field_name='tip', data_type='float', missing_values=0, missing_values_pct=0.0, unique_values=123, unique_values_pct=0.5, most_frequent={2.0: 33, 3.0: 23, 4.0: 12, 5.0: 10, 2.5: 10}, statistics={'min': 1.0, 'max': 10.0, 'mean': 3.0, 'median': 2.9, 'std': 1.38, 'var': 1.91, 'skew': 1.47, 'kurt': 3.65, 'iqr': 1.56, 'cv': 0.46, 'lower_bound': -0.34, 'upper_bound': 5.91}, description='Tip amount in dollars'),
 FieldSpec(field_name='sex', data_type='string', missing_values=0, missing_values_pct=0.0, unique_values=2, unique_values_pct=0.01, most_

In [21]:
catalog = DataCatalog()
catalog.register("tips", sns.load_dataset("tips"), "information of customers' tips of each bill")
catalog.profile_tables(["tips"])
profile_text = catalog.to_dq_profile("tips")
print(profile_text)

DATASET: tips
DESCRIPTION: information of customers' tips of each bill
ROWS: 244
COLUMNS: 7
DUPLICATES: 1
OVERALL QUALITY: GOOD
TOTAL ISSUES: 3

FIELD PROFILES:

total_bill (float):
  Missing: 0 (0.0%)
  Unique: 229 (0.9%)

tip (float):
  Missing: 0 (0.0%)
  Unique: 123 (0.5%)

sex (string):
  Missing: 0 (0.0%)
  Unique: 2 (0.0%)

smoker (string):
  Missing: 0 (0.0%)
  Unique: 2 (0.0%)

day (string):
  Missing: 0 (0.0%)
  Unique: 4 (0.0%)

time (string):
  Missing: 0 (0.0%)
  Unique: 2 (0.0%)

size (integer):
  Missing: 0 (0.0%)
  Unique: 6 (0.0%)

MINOR ISSUES:
- total_bill: Moderately skewed distribution (skew: 1.13)
- tip: Moderately skewed distribution (skew: 1.47)
- size: Moderately skewed distribution (skew: 1.45)



In [28]:
# Test DataCatalog workflow
from broinsight.utils.data_catalog import DataCatalog
import seaborn as sns

field_descriptions = FieldDescriptions(descriptions=[
    FieldDescription(field_name="total_bill", description="Total bill amount in dollars"),
    FieldDescription(field_name="tip", description="Tip amount in dollars"),
    FieldDescription(field_name="sex", description="Customer gender"),
    FieldDescription(field_name="smoker", description="Whether customer smokes"),
    FieldDescription(field_name="day", description="Day of the week"),
    FieldDescription(field_name="time", description="Meal time (Lunch/Dinner)"),
    FieldDescription(field_name="size", description="Party size")
])

# Create catalog and register data
catalog = DataCatalog()
catalog.register("tips", sns.load_dataset("tips"), "Restaurant tips dataset")

# Profile the table
catalog.profile_tables(["tips"])

# Add field descriptions
catalog.add_field_descriptions("tips", field_descriptions)

# Generate DQ profile
profile_text = catalog.to_dq_profile("tips")
print(profile_text)


DATASET: tips
DESCRIPTION: Restaurant tips dataset
ROWS: 244
COLUMNS: 7
DUPLICATES: 1
OVERALL QUALITY: GOOD
TOTAL ISSUES: 3

FIELD PROFILES:

total_bill (float):
  Missing: 0 (0.0%)
  Unique: 229 (0.9%)
  Description: Total bill amount in dollars

tip (float):
  Missing: 0 (0.0%)
  Unique: 123 (0.5%)
  Description: Tip amount in dollars

sex (string):
  Missing: 0 (0.0%)
  Unique: 2 (0.0%)
  Description: Customer gender

smoker (string):
  Missing: 0 (0.0%)
  Unique: 2 (0.0%)
  Description: Whether customer smokes

day (string):
  Missing: 0 (0.0%)
  Unique: 4 (0.0%)
  Description: Day of the week

time (string):
  Missing: 0 (0.0%)
  Unique: 2 (0.0%)
  Description: Meal time (Lunch/Dinner)

size (integer):
  Missing: 0 (0.0%)
  Unique: 6 (0.0%)
  Description: Party size

MINOR ISSUES:
- total_bill: Moderately skewed distribution (skew: 1.13)
- tip: Moderately skewed distribution (skew: 1.47)
- size: Moderately skewed distribution (skew: 1.45)



In [23]:
from broinsight.experiment.ollama import LocalOpenAI
from broprompt import Prompt

# Client class imported from broinsight.experiment.ollama; reused by later cells as `model`.
model = LocalOpenAI()

# System prompt for the data-quality suggestion task.
prompt = Prompt.from_markdown("broinsight/prompt_hub/dq_suggestion.md")

question = "Any data transformation needed for machine learning project here?"

# User turn: the DQ profile text first, then the question.
user_input = "PROFILE:\n\n{profile}\n\nUSER_INPUT:\n\n{question}\n\n".format(
    profile=profile_text,
    question=question,
)

response = model.run(
    system_prompt=prompt.str,
    messages=[model.UserMessage(text=user_input)],
)
response

{'content': "**Data Quality Assessment: READY**  \nThe dataset has no missing values, only one duplicate row, and an overall “GOOD” quality rating. The only notable issue is moderate skewness in three numeric fields.\n\n---\n\n### Minor Issues & Impact\n| Field | Skewness | Impact on ML |\n|-------|----------|--------------|\n| total_bill | 1.13 | Skewed features can bias algorithms that assume normality (e.g., linear regression, Gaussian Naïve Bayes) and may reduce predictive performance. |\n| tip | 1.47 | As the target variable, skewness can lead to high variance predictions; log‑transforming the target often improves model fit. |\n| size | 1.45 | Skewness in party size can affect tree‑based models less, but still useful to transform for consistency across algorithms. |\n\n---\n\n### Recommended Actions (Prioritized)\n\n1. **Remove the Duplicate Row**  \n   - *Why*: Even one duplicate can slightly distort feature distributions and model evaluation.  \n   - *How*: `df.drop_duplicates(

In [24]:
print(response['content'])

**Data Quality Assessment: READY**  
The dataset has no missing values, only one duplicate row, and an overall “GOOD” quality rating. The only notable issue is moderate skewness in three numeric fields.

---

### Minor Issues & Impact
| Field | Skewness | Impact on ML |
|-------|----------|--------------|
| total_bill | 1.13 | Skewed features can bias algorithms that assume normality (e.g., linear regression, Gaussian Naïve Bayes) and may reduce predictive performance. |
| tip | 1.47 | As the target variable, skewness can lead to high variance predictions; log‑transforming the target often improves model fit. |
| size | 1.45 | Skewness in party size can affect tree‑based models less, but still useful to transform for consistency across algorithms. |

---

### Recommended Actions (Prioritized)

1. **Remove the Duplicate Row**  
   - *Why*: Even one duplicate can slightly distort feature distributions and model evaluation.  
   - *How*: `df.drop_duplicates(inplace=True)`.

2. **Transform

In [None]:
# Guide-style metadata text for the tips table, used as context for question suggestions.
metadata_text = catalog.to_guide_metadata(["tips"])

# System prompt for the question-suggestion task.
prompt = Prompt.from_markdown("broinsight/prompt_hub/guide_question.md")

question = "I'm a new manager and I wanna make a new promotion for the shop. What should I do?"

# User turn: metadata first, then the question.
user_input = "METADATA:\n\n{context}\n\nUSER_INPUT:\n\n{question}\n\n".format(
    context=metadata_text,
    question=question,
)

response = model.run(
    system_prompt=prompt.str,
    messages=[model.UserMessage(text=user_input)],
)
response

{'content': "Based on your role and goals, here are some areas you might want to explore:\n\n**Customer demographics and preferences**\n- What is the average tip amount for Male versus Female customers (tip where sex = 'Male' vs sex = 'Female')?\n- Do smokers tip more on average than non-smokers (average tip where smoker = 'Yes' versus smoker = 'No')?\n\n**Timing of visits**\n- Which day of the week has the highest average total bill (average total_bill where day = 'Sat', 'Sun', or 'Thur')?\n- Is there a difference in average tip percentage between Lunch and Dinner (average tip/total_bill where time = 'Lunch' versus time = 'Dinner')?\n\n**Group size and spending**\n- How does the average total bill change with party size (average total_bill where size = 1, 2, 3, etc.)?\n- What is the average tip per person for different party sizes (average tip/size where size = 1, 2, 3, etc.)?\n\n**Promotion targeting**\n- Which combination of day and time yields the highest average total bill (averag

In [27]:
print(response["content"])

Based on your role and goals, here are some areas you might want to explore:

**Customer demographics and preferences**
- What is the average tip amount for Male versus Female customers (tip where sex = 'Male' vs sex = 'Female')?
- Do smokers tip more on average than non-smokers (average tip where smoker = 'Yes' versus smoker = 'No')?

**Timing of visits**
- Which day of the week has the highest average total bill (average total_bill where day = 'Sat', 'Sun', or 'Thur')?
- Is there a difference in average tip percentage between Lunch and Dinner (average tip/total_bill where time = 'Lunch' versus time = 'Dinner')?

**Group size and spending**
- How does the average total bill change with party size (average total_bill where size = 1, 2, 3, etc.)?
- What is the average tip per person for different party sizes (average tip/size where size = 1, 2, 3, etc.)?

**Promotion targeting**
- Which combination of day and time yields the highest average total bill (average total_bill where day = 'Sa

In [31]:
# SQL-oriented metadata text for the tips table, used as context for query generation.
metadata_text = catalog.to_sql_metadata(["tips"])

# System prompt for the SQL-generation task.
prompt = Prompt.from_markdown("broinsight/prompt_hub/generate_sql.md")

question = "Which day of the week has the highest average total bill (average total_bill where day = 'Sat', 'Sun', or 'Thur')?"

# User turn: metadata first, then the question.
user_input = "METADATA:\n\n{context}\n\nUSER_INPUT:\n\n{question}\n\n".format(
    context=metadata_text,
    question=question,
)

response = model.run(
    system_prompt=prompt.str,
    messages=[model.UserMessage(text=user_input)],
)
response

{'content': "```sql\nSELECT\n    day,\n    AVG(total_bill) AS avg_total_bill\nFROM\n    tips\nWHERE\n    day IN ('Sat', 'Sun', 'Thur')\nGROUP BY\n    day\nORDER BY\n    avg_total_bill DESC\nLIMIT 1;\n```",
 'model_name': 'gpt-oss:latest',
 'input_token': 0,
 'output_token': 0}

In [33]:
# Extract the query from the reply: keep what follows the last "```sql" marker,
# then cut at the next "```". If a marker is absent, that step leaves the text unchanged.
sql_query = response['content'].rpartition("```sql")[2].partition("```")[0]
print(sql_query)


SELECT
    day,
    AVG(total_bill) AS avg_total_bill
FROM
    tips
WHERE
    day IN ('Sat', 'Sun', 'Thur')
GROUP BY
    day
ORDER BY
    avg_total_bill DESC
LIMIT 1;



In [34]:
catalog.query(sql_query)

Unnamed: 0,day,avg_total_bill
0,Sun,21.41


In [None]:
class BroInsight:
    """Interface sketch for a BroInsight facade.

    Every method below is an empty placeholder: it accepts its arguments,
    performs no work, and returns ``None``.
    """

    def __init__(self, llm):
        """Placeholder constructor; ``llm`` is accepted but not stored."""

    def assess_data_quality(self, context, message):
        """Placeholder for a data-quality assessment over ``context`` for ``message``."""

    def suggest_questions(self, context, message):
        """Placeholder for suggesting questions from ``context`` for ``message``."""

    def generate_sql(self, context, message):
        """Placeholder for generating SQL from ``context`` for ``message``."""

    def create_chart(self, context, message):
        """Placeholder for creating a chart from ``context`` for ``message``."""

    def ask_data(self, context, message):
        """Placeholder for answering ``message`` using ``context``."""

In [35]:
catalog

<broinsight.utils.data_catalog.DataCatalog at 0x116f8c260>

# Test with AWS Bedrock

In [167]:
from broinsight.experiment.bedrock import AWSConfig, BedrockOpenAI
from broinsight.broinsight import BroInsight
import boto3

from broinsight.utils.data_catalog import DataCatalog
import seaborn as sns

field_descriptions = FieldDescriptions(descriptions=[
    FieldDescription(field_name="total_bill", description="Total bill amount in dollars"),
    FieldDescription(field_name="tip", description="Tip amount in dollars"),
    FieldDescription(field_name="sex", description="Customer gender"),
    FieldDescription(field_name="smoker", description="Whether customer smokes"),
    FieldDescription(field_name="day", description="Day of the week"),
    FieldDescription(field_name="time", description="Meal time (Lunch/Dinner)"),
    FieldDescription(field_name="size", description="Party size")
])

# Create catalog and register data
catalog = DataCatalog()
catalog.register("tips", sns.load_dataset("tips"), "Restaurant tips dataset")

# Profile the table
catalog.profile_tables(["tips"])

# Add field descriptions
catalog.add_field_descriptions("tips", field_descriptions)

# Resolve AWS credentials through boto3's default provider chain.
session = boto3.Session()
credentials = session.get_credentials()
if credentials is None:
    # get_credentials() returns None when no provider yields credentials; fail here with a
    # clear message instead of an AttributeError on `None.access_key` below.
    raise RuntimeError(
        "No AWS credentials found. Configure them (environment variables, shared "
        "credentials file, SSO, ...) before running this cell."
    )

# Take a single immutable snapshot so access key, secret key and session token all come
# from the same credential set, even if the underlying credentials are refreshable.
frozen_credentials = credentials.get_frozen_credentials()

aws_configs = AWSConfig(
    aws_access_key_id=frozen_credentials.access_key,
    aws_secret_access_key=frozen_credentials.secret_key,
    # Long-lived keys have no session token; normalise any falsy value to None.
    aws_session_token=frozen_credentials.token or None,
    region_name='us-west-2'
)

model_id="openai.gpt-oss-20b-1:0"
bedrock = BedrockOpenAI(
    model_id=model_id,
    aws_configs=aws_configs
)



In [168]:
bedrock.run("assistant", [bedrock.UserMessage("hi")])

{'content': 'Hello! How can I help you today?',
 'model': 'openai.gpt-oss-20b-1:0',
 'processed': 1.4361209869384766,
 'tokens': {'input': 75, 'output': 60}}

In [169]:
from broinsight.broinsight import BroInsight

# broinsight = BroInsight(model=LocalOpenAI())
broinsight = BroInsight(model=bedrock)

In [170]:
response = broinsight.assess_data_quality(
    context=catalog.to_dq_profile("tips"),
    message="Do we have any concern about this one?"
)
print(response['content'])

**Data Quality Assessment:** READY  
**Overall Quality:** Good – the dataset is clean, with no missing values, only one duplicate row, and all fields fully populated.  

**Critical Issues Found:** None.  

**Recommended Actions (minor‑issue level):**  
1. **Skewed numeric fields** – `total_bill`, `tip`, and `size` all show moderate right‑skew (skew ≈ 1.1–1.5).  
   - *If you plan to use mean‑based statistics or linear models, consider a log or square‑root transform to reduce skewness.*  
   - *Alternatively, use median‑based summaries or non‑parametric tests that are robust to skew.*  
2. **Duplicate row** – only one duplicate exists.  
   - *Remove it if you want a strictly unique set of observations, though it’s unlikely to bias most analyses.*  

**Next Steps:**  
- Proceed with your analysis.  
- Keep the skewness in mind when interpreting mean‑based results or fitting models that assume normality.  
- If you encounter issues with model convergence or residual diagnostics, revisit 

In [171]:
response = broinsight.suggest_questions(
    context=catalog.to_guide_metadata("tips"),
    message="I'm a new manager and I wanna make a promotion for the shop but I don't know how to start."
)
print(response['content'])

Based on your role and goals, here are some areas you might want to explore:

Customer Spending Patterns  
- What is the average total_bill for each day of the week (day)?  
- How does the average tip vary between Lunch and Dinner (time)?  

Party Size and Revenue  
- What is the average total_bill for each party size (size)?  
- How many customers are in parties of 4 or more, and what is their average tip?  

Customer Demographics and Tipping  
- What is the average tip for Male versus Female customers (sex)?  
- How does tip amount differ between smokers and non-smokers (smoker)?  

Peak Hours and Promotion Timing  
- Which day and time combination has the highest average total_bill?  
- What is the average party size during the busiest time slots?  

Just ask me any of these questions and I'll analyze your data to get the answers!


In [172]:
question = "What is the average total_bill for each day of the week?"
response = broinsight.generate_sql(
    context=catalog.to_sql_metadata("tips"),
    message=question
)
print(response['content'])

```sql
SELECT
    day,
    AVG(total_bill) AS avg_total_bill
FROM
    tips
GROUP BY
    day
ORDER BY
    day;
```


In [173]:
# Extract the query from the reply: keep what follows the last "```sql" marker,
# then cut at the next "```"; run the extracted text through the catalog.
sql_query = response['content'].rpartition("```sql")[2].partition("```")[0]
result = catalog.query(sql_query)

In [174]:
response = broinsight.ask_data(
    context=result.to_string(),
    message=question
)
print(response['content'])

Here’s the average total bill broken down by day of the week:

- **Thursday:** $17.68  
- **Friday:** $17.15  
- **Saturday:** $20.44  
- **Sunday:** $21.41  

**What this tells us**

- **Weekend lift:** Saturday and Sunday are noticeably higher than the weekdays, with Sunday topping the chart. This suggests that customers tend to spend more when they’re off work, especially on Sunday.
- **Thursday‑Friday dip:** The averages for Thursday and Friday are fairly close and lower than the weekend figures, indicating a modest drop in spending as the week progresses toward the weekend.

**Practical next steps**

1. **Targeted promotions:** Consider running special offers or happy‑hour deals on Thursday and Friday to boost spending and bring those days closer to the weekend average.
2. **Staffing & inventory:** Since Saturday and Sunday see the highest spend, ensure you have enough staff and inventory to handle the increased demand.
3. **Further analysis:** Look at the distribution of bills (e

In [175]:
response = broinsight.create_chart(
    query_result=result,
    message=question
)
# print(response['fig'])

In [176]:
response['chart']

In [53]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Dataset 1: Customers
# Synthetic table of 100 customers. The seed is set first so every random column is
# reproducible; columns are built in a fixed order because each one consumes draws
# from the same global NumPy random stream.
np.random.seed(42)
customers_data = dict(
    customer_id=range(1, 101),
    name=[f'Customer_{i}' for i in range(1, 101)],
    email=[f'customer{i}@email.com' for i in range(1, 101)],
    age=np.random.randint(18, 75, size=100),  # upper bound is exclusive -> ages 18..74
    city=np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'], size=100),
    # One random day offset within 2023 per customer.
    signup_date=[
        datetime(2023, 1, 1) + timedelta(days=np.random.randint(0, 365))
        for _ in range(100)
    ],
    status=np.random.choice(['Active', 'Inactive'], size=100, p=[0.8, 0.2]),
)
customers_df = pd.DataFrame(customers_data)

# Dataset 2: Orders (with foreign key to customers)
# Synthetic table of 250 orders. The seed is reset so this table is reproducible on its
# own; columns are built in a fixed order because each one consumes draws from the same
# global NumPy random stream.
np.random.seed(42)
orders_data = dict(
    order_id=range(1, 251),
    customer_id=np.random.choice(range(1, 101), size=250),  # Foreign key to customers
    # One random day offset within 2023 per order.
    order_date=[
        datetime(2023, 1, 1) + timedelta(days=np.random.randint(0, 365))
        for _ in range(250)
    ],
    product_category=np.random.choice(['Electronics', 'Clothing', 'Books', 'Home', 'Sports'], size=250),
    order_amount=np.random.uniform(10.0, 500.0, size=250).round(2),
    shipping_cost=np.random.uniform(0.0, 25.0, size=250).round(2),
    order_status=np.random.choice(['Completed', 'Pending', 'Cancelled'], size=250, p=[0.7, 0.2, 0.1]),
    payment_method=np.random.choice(['Credit Card', 'PayPal', 'Bank Transfer'], size=250),
)
orders_df = pd.DataFrame(orders_data)

# Field descriptions for customers
customers_descriptions = FieldDescriptions(descriptions=[
    FieldDescription(field_name="customer_id", description="Unique customer identifier"),
    FieldDescription(field_name="name", description="Customer full name"),
    FieldDescription(field_name="email", description="Customer email address"),
    FieldDescription(field_name="age", description="Customer age in years"),
    FieldDescription(field_name="city", description="Customer's city of residence"),
    FieldDescription(field_name="signup_date", description="Date when customer registered"),
    FieldDescription(field_name="status", description="Current customer account status")
])

# Field descriptions for orders
orders_descriptions = FieldDescriptions(descriptions=[
    FieldDescription(field_name="order_id", description="Unique order identifier"),
    FieldDescription(field_name="customer_id", description="Links to customer who placed the order"),
    FieldDescription(field_name="order_date", description="Date when order was placed"),
    FieldDescription(field_name="product_category", description="Category of products ordered"),
    FieldDescription(field_name="order_amount", description="Total order value in dollars"),
    FieldDescription(field_name="shipping_cost", description="Shipping fee charged"),
    FieldDescription(field_name="order_status", description="Current status of the order"),
    FieldDescription(field_name="payment_method", description="Payment method used")
])

# Setup with DataCatalog
# End-to-end catalog demo: register both tables, profile them, attach the
# field descriptions, declare the join key, then render the SQL-oriented
# metadata text. The steps below run in that order.
from broinsight.utils.data_catalog import DataCatalog

catalog = DataCatalog()
# Each table is registered under a name together with a one-line description.
catalog.register("customers", customers_df, "Customer information and demographics")
catalog.register("orders", orders_df, "Customer order transactions and details")

# Profile tables
catalog.profile_tables(["customers", "orders"])

# Add field descriptions
catalog.add_field_descriptions("customers", customers_descriptions)
catalog.add_field_descriptions("orders", orders_descriptions)

# Add relationship
# Rendered in the printed metadata as "customer_id -> customers.customer_id"
# under the orders table.
catalog.add_relationship("orders", "customer_id", "customers", "customer_id")

# Test SQL metadata generation
# The same text is later passed as `context` to broinsight.generate_sql.
sql_metadata = catalog.to_sql_metadata(["customers", "orders"])
print(sql_metadata)

METADATAS:

Table: customers
Columns:
- customer_id (integer): Unique customer identifier, NOT NULL
  Range: 1 - 100, Average: 50.50
- name (string): Customer full name, NOT NULL
  Examples: "Customer_30", "Customer_76", "Customer_94"
- email (string): Customer email address, NOT NULL
  Examples: "customer15@email.com", "customer26@email.com", "customer40@email.com"
- age (integer): Customer age in years, NOT NULL
  Range: 19 - 74, Average: 45.19
- city (string): Customer's city of residence, NOT NULL
  Examples: "New York", "Houston", "Chicago"
- signup_date (unknown): Date when customer registered, NOT NULL
- status (string): Current customer account status, NOT NULL
  Examples: "Active", "Inactive"

Table: orders
Relationships:
  customer_id -> customers.customer_id
Columns:
- order_id (integer): Unique order identifier, NOT NULL
  Range: 1 - 250, Average: 125.50
- customer_id (integer): Links to customer who placed the order, NOT NULL
  Range: 1 - 100, Average: 50.34
- order_date (

In [54]:
# Ask for starter questions, using the guide-style metadata of both tables
# as context for the model.
response = broinsight.suggest_questions(
    context=catalog.to_guide_metadata(["customers", "orders"]),
    message="I'm a new manager here. I don't know what to look to know more about our business.",
)
print(response["content"])

Based on your role and goals, here are some areas you might want to explore:

Customer Overview
- How many customers are Active versus Inactive in the customers table?
- What is the average age of customers in the customers table?
- Which city has the highest number of customers?

Sales Performance
- What is the total order_amount for each product_category in the orders table?
- How many orders were Completed, Pending, or Cancelled in the orders table?
- What is the average order_amount per order in the orders table?

Order Fulfillment
- What is the average shipping_cost across all orders in the orders table?
- How many orders were placed by customers from each city (join customers.city with orders.customer_id)?

Payment Methods
- How many orders were paid using each payment_method in the orders table?
- What is the total order_amount for orders paid with PayPal versus Credit Card versus Bank Transfer?

Just ask me any of these questions and I'll analyze your data to get the answers!


In [66]:
# Alternative questions tried in this cell, kept for reference:
# question = "How many orders were paid using each payment_method in the orders table?"
# question = "Who are the top 5 customers payting with PayPal method? I wanna know thier ages and names"
question = "Who are the top 5 spenders? I wanna know thier ages and names and their most payment method"

# Translate the question into SQL, using the SQL-oriented metadata of both
# tables as context; the reply is printed as-is (a fenced ```sql block).
response = broinsight.generate_sql(
    context=catalog.to_sql_metadata(["customers", "orders"]),
    message=question,
)
print(response["content"])

```sql
WITH total_spend AS (
    SELECT
        customer_id,
        SUM(order_amount) AS total_amount
    FROM orders
    GROUP BY customer_id
),
most_payment AS (
    SELECT
        customer_id,
        payment_method
    FROM (
        SELECT
            customer_id,
            payment_method,
            ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY COUNT(*) DESC) AS rn
        FROM orders
        GROUP BY customer_id, payment_method
    ) t
    WHERE rn = 1
)
SELECT
    c.name,
    c.age,
    mp.payment_method,
    ts.total_amount
FROM customers c
JOIN total_spend ts ON c.customer_id = ts.customer_id
JOIN most_payment mp ON c.customer_id = mp.customer_id
ORDER BY ts.total_amount DESC
LIMIT 5;
```


In [67]:
# Pull the SQL out of the model reply: keep what follows the last "```sql"
# marker, then cut at the next "```". When a marker is absent the text is
# left untouched on that side.
sql_query = response["content"].rpartition("```sql")[2].partition("```")[0]

# Run the extracted statement against the tables registered in the catalog.
result = catalog.query(sql_query)

In [68]:
# Display what catalog.query returned for the generated SQL.
result

Unnamed: 0,name,age,payment_method,total_amount
0,Customer_62,45,Bank Transfer,2797.0
1,Customer_2,69,Credit Card,2119.07
2,Customer_89,48,PayPal,1923.45
3,Customer_44,74,Bank Transfer,1919.8
4,Customer_88,31,Credit Card,1553.44


In [69]:
# Have the model answer the original question from the query result, which
# is handed over as plain text via to_string().
response = broinsight.ask_data(
    context=result.to_string(),
    message=question,
)
print(response["content"])

Here are the five customers who spent the most, in order from highest to lowest:

| Rank | Customer | Age | Payment Method |
|------|----------|-----|----------------|
| 1 | Customer_62 | 45 | Bank Transfer |
| 2 | Customer_2 | 69 | Credit Card |
| 3 | Customer_89 | 48 | PayPal |
| 4 | Customer_44 | 74 | Bank Transfer |
| 5 | Customer_88 | 31 | Credit Card |

**What this tells us**

- **Bank Transfer** is the most common payment method among the top spenders (appearing twice).  
- The top spender, Customer_62, is a 45‑year‑old, which might suggest that mid‑age customers are high‑value.  
- Credit Card and PayPal also appear, indicating a mix of payment preferences.

**Next steps you might consider**

1. **Targeted offers** – send personalized promotions to these high‑spending customers, especially using the payment method they already prefer.  
2. **Retention focus** – since they’re already spending a lot, a loyalty program could keep them engaged.  
3. **Cross‑sell analysis** – check 

In [70]:
# Request a chart for the same question, passing the query result object
# itself rather than its text rendering.
response = broinsight.create_chart(
    query_result=result,
    message=question,
)
# Last expression of the cell, so the notebook renders the chart.
response["chart"]

In [71]:
import boto3
# Resolve credentials through the session's default lookup; `credentials`
# is kept as a notebook variable because later cells read its
# access_key / secret_key / token attributes.
session = boto3.Session()
credentials = session.get_credentials()
if credentials is None:
    # get_credentials() returns None when nothing is configured; stop here
    # with a clear message instead of an AttributeError on `.access_key`.
    raise RuntimeError(
        "No AWS credentials found; configure them before running this notebook."
    )

# Create the client from the session itself rather than copying the keys out
# of it: the session already carries the credentials (and session token), so
# temporary credentials are not frozen into the client as static strings.
bedrock = session.client(
    service_name='bedrock-runtime',
    region_name='us-west-2'
)


In [154]:
import boto3
import time

# Keyword arguments that BedrockOpenAI.get_model unpacks into boto3.client().
aws_config = {
    "aws_access_key_id": credentials.access_key,
    "aws_secret_access_key": credentials.secret_key,
    # A falsy token (e.g. empty string) is normalised to None.
    "aws_session_token": credentials.token or None,
    "region_name": "us-west-2",
}
class BaseLLM:
    """Common interface for LLM backends.

    Every method here is a no-op returning ``None``; subclasses override the
    message builders and ``run`` with backend-specific behaviour.
    """

    def __init__(self, *args, **kwargs):
        """Accept and ignore any positional or keyword arguments."""
        return None

    def UserMessage(self, text):
        """Placeholder for building a user-turn message from ``text``."""
        return None

    def AIMessage(self, text):
        """Placeholder for building a model-turn message from ``text``."""
        return None

    def SystemMessage(self, text):
        """Placeholder for building the system prompt payload from ``text``."""
        return None

    def OutputMessage(self, text):
        """Placeholder for normalising a backend reply."""
        return None

    def run(self, system_prompt, messages):
        """Placeholder for sending ``messages`` under ``system_prompt``."""
        return None

class BedrockOpenAI(BaseLLM):
    """LLM backend that talks to a model through the Bedrock ``converse`` call.

    Args:
        model_id: Identifier passed as ``modelId`` to ``converse``.
        aws_configs: Mapping of keyword arguments unpacked into
            ``boto3.client`` (credentials, region, ...).
        temperature: Sampling temperature sent in ``inferenceConfig``.
    """

    def __init__(self, model_id, aws_configs, temperature=0.1):
        self.model_id = model_id
        self.aws_configs = aws_configs
        self.temperature = temperature

    def get_model(self):
        """Create and return a new ``bedrock-runtime`` client from ``aws_configs``."""
        model = boto3.client(
            service_name='bedrock-runtime',
            **self.aws_configs
        )
        return model

    def SystemMessage(self, text):
        """Wrap ``text`` in the list-of-blocks form used for ``system``."""
        return [{"text": text}]

    def UserMessage(self, text):
        """Build a user-turn message containing a single text block."""
        return {"role": "user", "content": [{"text": text}]}

    def AIMessage(self, text):
        """Build a model-turn message containing a single text block.

        The role is ``"assistant"``: the Converse API distinguishes ``user``
        and ``assistant`` turns, so labelling a model reply as ``user`` would
        misrepresent the conversation history.
        """
        return {"role": "assistant", "content": [{"text": text}]}

    def OutputMessage(self, response):
        """Reduce a ``converse`` response to a small summary dict.

        Args:
            response: The ``converse`` response, extended by ``run`` with a
                ``'processed'`` key holding the elapsed seconds.

        Returns:
            dict with ``content`` (text of the last content block that has a
            ``'text'`` key), ``model``, ``processed`` and ``tokens``
            (``input`` / ``output`` counts taken from ``usage``).

        Raises:
            IndexError: If no content block carries a ``'text'`` key.
        """
        text_blocks = [
            block['text']
            for block in response['output']['message']['content']
            if 'text' in block
        ]
        return dict(
            # Blocks without 'text' are skipped; the last text block is used.
            content=text_blocks[-1],
            model=self.model_id,
            processed=response['processed'],
            tokens=dict(
                input=response['usage']["inputTokens"],
                output=response['usage']["outputTokens"]
            )
        )

    def run(self, system_prompt, messages):
        """Send ``messages`` with ``system_prompt`` and return the summary dict.

        Args:
            system_prompt: Plain text; wrapped via ``SystemMessage``.
            messages: List of message dicts as produced by ``UserMessage`` /
                ``AIMessage``.

        Returns:
            The dict built by ``OutputMessage``.
        """
        start = time.time()
        # A new client is created on every call.
        model = self.get_model()
        response = model.converse(
            modelId=self.model_id,
            system=self.SystemMessage(text=system_prompt),
            messages=messages,
            inferenceConfig=dict(
                temperature=self.temperature
            )
        )
        # Wall-clock seconds, measured from before client creation.
        response['processed'] = time.time() - start
        response = self.OutputMessage(response=response)
        return response

In [158]:
# Instantiate the backend with the credential/region kwargs in `aws_config`.
model_id = "openai.gpt-oss-20b-1:0"
bedrock = BedrockOpenAI(model_id=model_id, aws_configs=aws_config)

# Single-turn smoke test: one user greeting under a generic system prompt.
response = bedrock.run(
    system_prompt="You are a helpful assistant",
    messages=[bedrock.UserMessage("Hello, how are you?")],
)

In [157]:
# Display the summary dict from bedrock.run (content, model, processed, tokens).
response

{'content': 'Hello! I’m doing great—thanks for asking. How about you? Anything on your mind today?',
 'model': 'openai.gpt-oss-20b-1:0',
 'processed': 1.3433969020843506,
 'tokens': {'input': 84, 'output': 61}}

In [138]:
from broinsight.experiment.bedrock import AWSConfig

# Build the project's AWSConfig object from the session credentials.
aws_configs = AWSConfig(
    aws_access_key_id=credentials.access_key,
    aws_secret_access_key=credentials.secret_key,
    aws_session_token=credentials.token or None,
    region_name='us-west-2'
)

model_id="openai.gpt-oss-20b-1:0"
# NOTE(review): `aws_configs` (the AWSConfig built above) is never used in
# this cell; the client below receives `aws_config`, a plain dict defined in
# another cell. This looks like a typo, but the notebook-local BedrockOpenAI
# unpacks its argument with `**`, so confirm AWSConfig is mapping-compatible
# before switching the argument.
bedrock = BedrockOpenAI(
    model_id=model_id,
    aws_configs=aws_config
)

# Single-turn smoke test: one user greeting under a generic system prompt.
response = bedrock.run(
    system_prompt="You are a helpful assistant",
    messages=[
        bedrock.UserMessage("Hello, how are you?")
    ]
)
# Last expression of the cell, so the returned value is displayed.
response

{'content': 'Hello! I’m doing great—thanks for asking. How about you? Anything on your mind today?',
 'model': 'openai.gpt-oss-20b-1:0',
 'processed': 1.4777162075042725,
 'tokens': {'input': 84, 'output': 61}}