In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import duckdb
import pandas as pd
import seaborn as sns

# Load seaborn's bundled "tips" example dataset as a DataFrame.
tips = sns.load_dataset("tips")

# Show the column names that are available for profiling.
list(tips.columns)

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

In [3]:
# Preview the first rows to sanity-check the loaded data.
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
from broinsight.data_quality.field_profile import field_profile

# Profile every column of `tips`; the result is a dict keyed by column name.
tips_metadata = field_profile(tips)

# Tabulate the profile with one row per column so it is easier to scan.
profile_table = pd.DataFrame.from_dict(tips_metadata, orient="index")
profile_table

Unnamed: 0,data_types,missing_values,missing_values_pct,unique_values,unique_values_pct,most_frequent,statistics
total_bill,float,0,0.0,229,0.94,"{13.42: 3, 13.81: 2, 15.98: 2, 17.92: 2, 10.07...","{'min': 3.07, 'max': 50.81, 'mean': 19.79, 'me..."
tip,float,0,0.0,123,0.5,"{2.0: 33, 3.0: 23, 4.0: 12, 5.0: 10, 2.5: 10}","{'min': 1.0, 'max': 10.0, 'mean': 3.0, 'median..."
sex,string,0,0.0,2,0.01,"{'Male': 157, 'Female': 87}","{'mode': 'Male', 'avg_length': 4.71, 'min_leng..."
smoker,string,0,0.0,2,0.01,"{'No': 151, 'Yes': 93}","{'mode': 'No', 'avg_length': 2.38, 'min_length..."
day,string,0,0.0,4,0.02,"{'Sat': 87, 'Sun': 76, 'Thur': 62, 'Fri': 19}","{'mode': 'Sat', 'avg_length': 3.25, 'min_lengt..."
time,string,0,0.0,2,0.01,"{'Dinner': 176, 'Lunch': 68}","{'mode': 'Dinner', 'avg_length': 5.72, 'min_le..."
size,integer,0,0.0,6,0.02,"{2: 156, 3: 38, 4: 37, 5: 5, 1: 4}","{'min': 1.0, 'max': 6.0, 'mean': 2.57, 'median..."


In [5]:
# Show the raw profile dict (the untruncated form of the table above).
tips_metadata

{'total_bill': {'data_types': 'float',
  'missing_values': 0,
  'missing_values_pct': 0.0,
  'unique_values': 229,
  'unique_values_pct': 0.94,
  'most_frequent': {13.42: 3, 13.81: 2, 15.98: 2, 17.92: 2, 10.07: 2},
  'statistics': {'min': 3.07,
   'max': 50.81,
   'mean': 19.79,
   'median': 17.8,
   'std': 8.9,
   'var': 79.25,
   'skew': 1.13,
   'kurt': 1.22,
   'iqr': 10.78,
   'cv': 0.45,
   'lower_bound': -2.82,
   'upper_bound': 40.3}},
 'tip': {'data_types': 'float',
  'missing_values': 0,
  'missing_values_pct': 0.0,
  'unique_values': 123,
  'unique_values_pct': 0.5,
  'most_frequent': {2.0: 33, 3.0: 23, 4.0: 12, 5.0: 10, 2.5: 10},
  'statistics': {'min': 1.0,
   'max': 10.0,
   'mean': 3.0,
   'median': 2.9,
   'std': 1.38,
   'var': 1.91,
   'skew': 1.47,
   'kurt': 3.65,
   'iqr': 1.56,
   'cv': 0.46,
   'lower_bound': -0.34,
   'upper_bound': 5.91}},
 'sex': {'data_types': 'string',
  'missing_values': 0,
  'missing_values_pct': 0.0,
  'unique_values': 2,
  'unique_values

In [6]:
# Human-written, per-column descriptions. They are merged into the profiling
# metadata and sent to the LLM, so they must agree with the actual data.
# Keys must match the column names of `tips`.
descriptions = dict(
    total_bill="the amount of paid bill of the meal",
    tip="the amount of tip that customers paid",
    sex="the gender of customers",
    smoker="it indicates that a customer is a smoker or not. if No means a customer is a non-smoker, Yes means a customer is a smoker.",
    # Use the abbreviations exactly as they appear in the data ("Thur", not
    # "Thu"); a generated SQL filter on a misspelled day would match nothing.
    day="this is a day of the week when a customer having a meal here. the values are abbreviated, e.g. Thur, Fri, Sat, Sun",
    time="the time of the meal. it can be either Dinner or Lunch",
    # `size` is the size of the party at the table, not a count of dishes.
    size="the number of people in the customer's party",
)

In [7]:
# Attach the human-written description to each column's profile entry.
# This mutates `tips_metadata` in place, so a plain loop is used instead of a
# list comprehension: no throwaway list of `None` is built or displayed.
# Re-running the cell is safe because it only overwrites the same key.
# A column without an entry in `descriptions` raises KeyError.
for feat in tips.columns.tolist():
    tips_metadata[feat]["description"] = descriptions[feat]

[None, None, None, None, None, None, None]

In [8]:
from broinsight.experiment.ollama import LocalOpenAI
# Default-configured client; every LLM call below goes through `model.run`.
# NOTE(review): presumably targets a locally served (Ollama) model — confirm
# the endpoint/model defaults in broinsight.experiment.ollama.
model = LocalOpenAI()

In [9]:
from broprompt import Prompt

# System prompt for the "guide question" step.
prompt = Prompt.from_markdown("broinsight/prompt_hub/guide_question.md")

# Render the column profiles as one "<field>: <detail>" line per column.
field_lines = [
    "{field}: {detail}".format(field=name, detail=profile)
    for name, profile in tips_metadata.items()
]
metadata = "METADATA:\n\n{metadata}\n\n".format(metadata="\n".join(field_lines))
user_input = "USER_INPUT:\n\nWhat data do we have?"

# Single-turn request: the metadata block followed by the user's question.
guide_message = model.UserMessage(text=metadata + user_input)
response = model.run(system_prompt=prompt.str, messages=[guide_message])

print(response['content'])

**Here’s a quick snapshot of what the dataset contains**

| Column | Data type | Key stats (min‑max, mean, etc.) | Typical values | What it tells you |
|--------|-----------|---------------------------------|----------------|-------------------|
| **total_bill** | float | 3.07 – 50.81 USD, mean ≈ 19.79 | Amount customers paid for the meal | The size of each bill |
| **tip** | float | 1.0 – 10.0 USD, mean ≈ 3.0 | Amount customers tipped | How generous customers are |
| **sex** | string | Male / Female | 157 Male, 87 Female | Gender distribution |
| **smoker** | string | No / Yes | 151 Non‑smokers, 93 Smokers | Smoking status |
| **day** | string | Sat, Sun, Thur, Fri | 87 Sat, 76 Sun, 62 Thur, 19 Fri | Day of the week the meal was served |
| **time** | string | Lunch / Dinner | 176 Dinner, 68 Lunch | Meal time |
| **size** | integer | 1 – 6 people, mean ≈ 2.57 | 2 (156), 3 (38), 4 (37) | Party size |

- **Rows:** 222 (no missing values)
- **Columns:** 7 (all clean, no nulls)
- **Typical

In [10]:
# System prompt that asks the model for a business-level table description.
prompt = Prompt.from_markdown("broinsight/prompt_hub/table_descriptor.md")

# Rebuild the metadata block here so this cell does not depend on the
# guide-question cell having been run.
field_lines = [
    "{field}: {detail}".format(field=name, detail=profile)
    for name, profile in tips_metadata.items()
]
metadata = "METADATA:\n\n{metadata}\n\n".format(metadata="\n".join(field_lines))

# Only the metadata is sent; there is no user question for this step.
descriptor_message = model.UserMessage(text=metadata)
response = model.run(system_prompt=prompt.str, messages=[descriptor_message])

print(response["content"])

```text
This table records every meal a customer orders at a restaurant.  
Each row shows how much the bill was, how much tip the customer gave, when the meal happened (day of the week and lunch or dinner), how many people were in the party, and simple demographic details (gender and whether they smoke).  

The data lets the restaurant:  
• Understand tipping patterns and average spend by day, time, party size, and customer type.  
• Forecast revenue and staffing needs for busy periods (e.g., weekends or dinner service).  
• Evaluate the impact of promotions or menu changes on sales and tips.  
• Identify customer segments that generate higher tips or spend more, helping target marketing or loyalty programs.  

Typical users are restaurant managers, finance analysts, marketing teams, and operations planners who need quick, business‑friendly insights into sales performance and customer behavior.


In [11]:
table_name = "tips"

# Take the text inside the last ```text fence of the table-descriptor reply.
# `response` must still hold that reply, so run this cell right after it.
descriptor_reply = response['content']
table_description = descriptor_reply.rpartition("```text")[2].partition("```")[0]

# Table-level entries come first, followed by one entry per column. A column
# whose name equals one of the table-level keys would overwrite it.
table_metadata = {
    "table_name": table_name,
    "table_description": table_description,
    **tips_metadata,
}

In [20]:
# Question handed to the SQL generator.
question = "I wanna see the relationship between average total bill and average tips of segments of sex and smoker."

# Alternative questions kept for manual experiments; uncomment exactly one to
# override the question above.
# question = "I wanna know how many people visit my shop compare to global population and tell me the source of information."
# question = "Does smoker tip differently than non-smoker breaking down by time of day?"
# question = "Which sex gives the better tips?"
# question = "Which sex gives the better tips than one another?"
# question = "I wanna know that weekend have a size bigger than weekday or not? If so how many?"
# Thai-language sample; in English: "On which day do more people eat, Monday
# or Friday, and by how many?"
# question = "อยากรู้ว่า คนกินข้าวเยอะ ในวันไหนมากกว่ากัน ระหว่าง วันจันทร์ และ วันศุกร์ เป็นจำนวนเท่าไหร่?"
# question = "I wanna know how many people are in between smoker and non-smoker groups. Also, do you know which brand of ciggaratte do the smoker prefer in our restaurant?"
# question = "Which sex has the most dinner?"
# question = "Based on sex, smoker type and meal of the day, which customer group paid most in tips?"
# question = "Based on sex, smoker type and meal of the day, which customer group paid most in tips? I wanna see all comparisons."

# System prompt for the text-to-SQL step.
prompt = Prompt.from_markdown("broinsight/prompt_hub/generate_sql.md")

# Unlike the earlier cells, this block is built from `table_metadata`, so it
# also carries the table name and the table description.
metadata_lines = [
    "{field}: {detail}".format(field=name, detail=info)
    for name, info in table_metadata.items()
]
metadata = "METADATA:\n\n{metadata}\n\n".format(metadata="\n".join(metadata_lines))
user_input = "USER_INPUT:\n\n{question}".format(question=question)

sql_message = model.UserMessage(text=metadata + user_input)
response = model.run(system_prompt=prompt.str, messages=[sql_message])

print(response["content"])

```sql
SELECT
    sex,
    smoker,
    AVG(total_bill) AS avg_total_bill,
    AVG(tip)       AS avg_tip
FROM
    tips
GROUP BY
    sex,
    smoker
ORDER BY
    avg_tip DESC;
```


In [21]:
# Keep only the text inside the last ```sql fence of the LLM reply. If the
# reply has no fence, the whole reply is used unchanged.
sql_reply = response['content']
sql_query = sql_reply.rpartition("```sql")[2].partition("```")[0]
print(sql_query)


SELECT
    sex,
    smoker,
    AVG(total_bill) AS avg_total_bill,
    AVG(tip)       AS avg_tip
FROM
    tips
GROUP BY
    sex,
    smoker
ORDER BY
    avg_tip DESC;



In [22]:
import duckdb
# DuckDB connection (default, in-process) used to run the LLM-generated SQL.
conn = duckdb.connect()

# Expose the DataFrame under the same name that was given to the SQL
# generator (`table_name`). Hard-coding "tips" here would let the registered
# view and the generated query drift apart if the table is ever renamed.
conn.register(table_name, tips)

# Run the generated query and materialise the result as a pandas DataFrame.
# NOTE(review): `sql_query` is model output; inspect it before executing it
# against anything other than this throwaway in-memory database.
query_result = conn.execute(sql_query).df()
query_result

Unnamed: 0,sex,smoker,avg_total_bill,avg_tip
0,Male,No,19.791237,3.113402
1,Male,Yes,22.2845,3.051167
2,Female,Yes,17.977879,2.931515
3,Female,No,18.105185,2.773519


In [23]:
# System prompt for the conversational answer step.
prompt = Prompt.from_markdown("broinsight/prompt_hub/chat.md")

# The query result is passed to the model as plain text.
# Fix: the template used to end in "n\n" (a missing backslash), which appended
# a stray "n" to the last row of the result and dropped one of the two
# newlines separating CONTEXT from USER_INPUT.
context = "CONTEXT:\n\n{context}\n\n".format(context=query_result.to_string())
user_input = "USER_INPUT:\n\n{question}".format(question=question)
response = model.run(system_prompt=prompt.str, messages=[model.UserMessage(text=context+user_input)])

print(response["content"])

Sure thing!  
Let’s break the problem into a few easy‑to‑follow steps:

| What you want | How to do it | What you’ll learn |
|---------------|--------------|-------------------|
| **Visualize** the relationship between `avg_total_bill` and `avg_tip` for each segment | Scatter‑plot (or line‑plot) with `sex` as the hue and `smoker` as the style (or a faceted grid) | Do the tips rise with the bill? Does that trend differ for males vs. females or smokers vs. non‑smokers? |
| **Quantify** the trend | Compute the Pearson correlation (or Spearman if the relationship looks non‑linear) inside each segment | How strong is the relationship? |
| **Model** the trend | Fit a simple linear regression for each segment | Can we predict the tip from the bill? |

Below is a quick Python notebook snippet that does all of the above using `pandas`, `seaborn` and `scipy`. Feel free to copy‑paste it into a Jupyter notebook or your favourite IDE.

```python
# ---------------------------------------------------

In [24]:
# 1. LLM generates function code
prompt = Prompt.from_markdown("broinsight/prompt_hub/chart_builder.md")
data = "DATA:\n\n{data}\n\n".format(data=query_result.to_string())
user_input = "USER_INPUT:\n\n{question}".format(question=question)
response = model.run(system_prompt=prompt.str, messages=[
    model.UserMessage(text=data+user_input)
])

In [25]:
# Show the generated chart code so it can be reviewed before it is executed.
print(response['content'])

```python
def create_chart(data):
    import plotly.express as px
    import pandas as pd
    
    # Ensure column names are in the DataFrame
    cols = [c.lower() for c in data.columns]
    # Identify numeric and categorical columns
    numeric_cols = data.select_dtypes(include='number').columns.tolist()
    categorical_cols = data.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
    
    # Define expected columns
    exp_numeric = {'avg_total_bill', 'avg_tip'}
    exp_categorical = {'sex', 'smoker'}
    
    # Check if required columns exist
    if exp_numeric.issubset(set(cols)) and exp_categorical.issubset(set(cols)):
        # Use original column names
        x_col = data.columns[[c.lower() for c in data.columns].index('avg_total_bill')]
        y_col = data.columns[[c.lower() for c in data.columns].index('avg_tip')]
        color_col = data.columns[[c.lower() for c in data.columns].index('sex')]
        symbol_col = data.columns[[c.lower() for c in data.col

In [26]:
# Keep only the text inside the last ```python fence of the LLM reply.
function_code = response['content'].split("```python")[-1].split("```")[0]

# 2. Execute to create function
# SECURITY: exec() runs arbitrary model-generated code with the full
# privileges of this kernel. Review `function_code` (printed in the previous
# cell) before running this cell.
# The code runs in its own namespace rather than the notebook globals, so a
# reply that fails to define `create_chart` raises an error instead of silently
# reusing a stale `create_chart` left over from an earlier run. The generated
# code must therefore import whatever it needs itself.
chart_namespace = {}
exec(function_code, chart_namespace)
if not callable(chart_namespace.get("create_chart")):
    raise ValueError("LLM response did not define a callable create_chart(data) function")
create_chart = chart_namespace["create_chart"]  # keep the name available to later cells

# 3. Call with actual data
fig = create_chart(query_result)

In [27]:
# Render the figure returned by the generated create_chart function.
fig.show()

In [55]:
# 4. Display with fallback
try:
    fig.show()
except ValueError:
    fig.show(renderer="browser")  # Fallback to browser
