In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import duckdb
import pandas as pd
import seaborn as sns

tips = sns.load_dataset("tips")

tips.columns.tolist()

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

In [3]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
from broinsight.data_quality.field_profile import field_profile

tips_metadata = field_profile(tips)
pd.DataFrame.from_dict(tips_metadata, orient="index")

Unnamed: 0,data_types,missing_values,missing_values_pct,unique_values,unique_values_pct,most_frequent,statistics
total_bill,float,0,0.0,229,0.94,"{13.42: 3, 13.81: 2, 15.98: 2, 17.92: 2, 10.07...","{'min': 3.07, 'max': 50.81, 'mean': 19.79, 'me..."
tip,float,0,0.0,123,0.5,"{2.0: 33, 3.0: 23, 4.0: 12, 5.0: 10, 2.5: 10}","{'min': 1.0, 'max': 10.0, 'mean': 3.0, 'median..."
sex,string,0,0.0,2,0.01,"{'Male': 157, 'Female': 87}","{'mode': 'Male', 'avg_length': 4.71, 'min_leng..."
smoker,string,0,0.0,2,0.01,"{'No': 151, 'Yes': 93}","{'mode': 'No', 'avg_length': 2.38, 'min_length..."
day,string,0,0.0,4,0.02,"{'Sat': 87, 'Sun': 76, 'Thur': 62, 'Fri': 19}","{'mode': 'Sat', 'avg_length': 3.25, 'min_lengt..."
time,string,0,0.0,2,0.01,"{'Dinner': 176, 'Lunch': 68}","{'mode': 'Dinner', 'avg_length': 5.72, 'min_le..."
size,integer,0,0.0,6,0.02,"{2: 156, 3: 38, 4: 37, 5: 5, 1: 4}","{'min': 1.0, 'max': 6.0, 'mean': 2.57, 'median..."


In [5]:
descriptions = dict(
    total_bill="the amount of paid bill of the meal",
    tip="the amount of tip that customers paid",
    sex="the gender of customers",
    smoker="it indicates that a customer is a smoker or not. if No means a customer is a non-smoker, Yes means a customer is a smoker.",
    day="this is a day of the week when a customer having a meal here. i.e. Mon, Tue, Wed, Thu, Fri, Sat, Sun",
    time="the time of the meal. it can be either Dinner or Lunch",
    size="the number of dishes that customers have",
)

In [6]:
[tips_metadata[feat].update(dict(description=descriptions[feat])) for feat in tips.columns.tolist()]

[None, None, None, None, None, None, None]

In [7]:
from broinsight.experiment.ollama import LocalOpenAI
model = LocalOpenAI()

In [8]:
from broprompt import Prompt

prompt = Prompt.from_markdown("broinsight/prompt_hub/guide_question.md")
metadata = "METADATA:\n\n{metadata}\n\n".format(metadata="\n".join(["{field}: {detail}".format(field=field, detail=detail) for field, detail in tips_metadata.items()]))
user_input = "USER_INPUT:\n\nWhat data do we have?"
response = model.run(system_prompt=prompt.str, messages=[model.UserMessage(text=metadata+user_input)])

print(response['content'])

Hi there!  
From the metadata you shared, we have a tidy dataset that captures a few key aspects of each dining experience. Here’s a quick snapshot of what each column represents:

| Column | Type | What it tells you |
|--------|------|-------------------|
| **total_bill** | float | The full amount paid for the meal (USD) |
| **tip** | float | The tip given by the customer (USD) |
| **sex** | string | Customer gender (Male/Female) |
| **smoker** | string | Whether the customer smokes (Yes/No) |
| **day** | string | Day of the week the meal was served |
| **time** | string | Meal time (Lunch/Dinner) |
| **size** | integer | Number of people in the party |

All columns are complete (no missing values) and the dataset contains 239 individual records.  

---

## Ready to dig deeper?  
Here are **five specific, actionable questions** you could explore next. Each one taps into a different angle of the data and can uncover insights that are useful for operations, marketing, or customer experi

In [9]:
prompt = Prompt.from_markdown("broinsight/prompt_hub/table_descriptor.md")
metadata = "METADATA:\n\n{metadata}\n\n".format(metadata="\n".join(["{field}: {detail}".format(field=field, detail=detail) for field, detail in tips_metadata.items()]))
response = model.run(system_prompt=prompt.str, messages=[model.UserMessage(text=metadata)])

print(response["content"])

```text
This table records every dining visit at the restaurant. For each meal it shows how much the customer paid (total bill), how much tip was left, the day of the week, whether it was lunch or dinner, how many people were in the party, and a few basic customer details such as gender and smoking status.  

The data is used to understand revenue patterns, tip behavior, and customer traffic. It helps managers decide when to schedule staff, how to price menus, and which days or meal types bring in the most money. Marketing teams can use it to target promotions to specific groups (e.g., families, smokers, weekday diners). Finance uses it to track sales and forecast cash flow.  

Typical users include restaurant managers, operations planners, finance and accounting staff, marketing analysts, and data analysts who turn these numbers into reports and dashboards.


In [10]:
table_name = "tips"
table_description = response['content'].split("```text")[-1].split("```")[0]

table_metadata = dict(
    table_name=table_name,
    table_description=table_description,
)
table_metadata.update(tips_metadata)

In [None]:
# question = "Does smoker tip differently than non-smoker breaking down by time of day?"
# question = "Which sex gives the better tips?"
question = "Which sex gives the better tips than one another?"
# question = "Which sex has the most dinner?"
# question = "Based on sex, smoker type and meal of the day, which customer group paid most in tips?"
# question = "Based on sex, smoker type and meal of the day, which customer group paid most in tips? I wanna see all comparisons."

prompt = Prompt.from_markdown("broinsight/prompt_hub/generate_sql.md")
metadata = "METADATA:\n\n{metadata}\n\n".format(metadata="\n".join(["{field}: {detail}".format(field=field, detail=detail) for field, detail in table_metadata.items()]))
user_input = "USER_INPUT:\n\n{question}".format(question=question)
response = model.run(system_prompt=prompt.str, messages=[model.UserMessage(text=metadata+user_input)])

print(response["content"])

In [None]:
sql_query = response['content'].split("```sql")[-1].split("```")[0]
print(sql_query)


SELECT 
    sex,
    AVG(tip) AS avg_tip
FROM 
    tips
GROUP BY 
    sex
ORDER BY 
    avg_tip DESC
LIMIT 1;



In [None]:
import duckdb
conn = duckdb.connect()
conn.register("tips", tips)
query_result = conn.execute(sql_query).df()
query_result

Unnamed: 0,sex,avg_tip
0,Male,3.089618


In [None]:
prompt = Prompt.from_markdown("broinsight/prompt_hub/chat.md")
context = "CONTEXT:\n\n{context}n\n".format(context=query_result.to_string())
user_input = "USER_INPUT:\n\n{question}".format(question=question)
response = model.run(system_prompt=prompt.str, messages=[model.UserMessage(text=context+user_input)])

print(response["content"])

It looks like the snippet you shared contains only one observation—**Male** with an average tip of **$3.09** (the “n” at the end probably indicates the number of observations for that group).  
Because we don’t have the corresponding average for **Female** (or any other sex categories), I can’t say which group tips better based on this data alone.

If you have the full dataset, you could compute the average tip for each sex and compare them. For example, in Python/pandas you could do:

```python
tips.groupby('sex')['tip'].mean()
```

That would give you a clear comparison. Let me know if you’d like help running that analysis or interpreting the results!


In [None]:
# 1. LLM generates function code
prompt = Prompt.from_markdown("broinsight/prompt_hub/chart_builder.md")
data = "DATA:\n\n{data}\n\n".format(data=query_result.to_string())
user_input = "USER_INPUT:\n\n{question}".format(question=question)
response = model.run(system_prompt=prompt.str, messages=[
    model.UserMessage(text=data+user_input)
])

In [None]:
print(response['content'])

```python
def create_chart(data):
    import plotly.express as px
    import pandas as pd

    # Ensure the DataFrame contains the required columns
    if not {'sex', 'avg_tip'}.issubset(data.columns):
        raise ValueError("Data must contain 'sex' and 'avg_tip' columns.")

    # Drop rows with missing values in relevant columns
    df_clean = data.dropna(subset=['sex', 'avg_tip']).copy()

    # Sort by avg_tip for better visual ordering
    df_clean = df_clean.sort_values('avg_tip', ascending=False)

    fig = px.bar(
        df_clean,
        x='sex',
        y='avg_tip',
        color='sex',
        text='avg_tip',
        title='Average Tip by Sex',
        labels={
            'sex': 'Sex',
            'avg_tip': 'Average Tip ($)'
        },
        color_discrete_map={'Male': '#1f77b4', 'Female': '#ff7f0e'}
    )

    # Improve layout
    fig.update_traces(textposition='outside')
    fig.update_layout(
        yaxis=dict(tickprefix="$", title_font=dict(size=12)),
        xaxis

In [None]:
function_code = response['content'].split("```python")[-1].split("```")[0]

# 2. Execute to create function
exec(function_code)  # Now create_chart function exists

# 3. Call with actual data
fig = create_chart(query_result)

In [None]:
fig.show()

In [55]:
# 4. Display with fallback
try:
    fig.show()
except ValueError:
    fig.show(renderer="browser")  # Fallback to browser
