In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import seaborn as sns
from broinsight.utils.data_catalog import DataCatalog
from broinsight.broinsight import BroInsight
from broinsight.experiment.ollama import LocalOpenAI

tips_metadata = dict(
    total_bill="the amount of paid bill of the meal",
    tip="the amount of tip that customers paid",
    sex="the gender of customers",
    smoker="it indicates that a customer is a smoker or not. if No means a customer is a non-smoker, Yes means a customer is a smoker.",
    day="this is a day of the week when a customer having a meal here. i.e. Mon, Tue, Wed, Thu, Fri, Sat, Sun",
    time="the time of the meal. it can be either Dinner or Lunch",
    size="the number of dishes that customers have",
)
table_description = "Restaurant visit records showing bills, tips, customer info..."
catalog = DataCatalog()
catalog.create_table("tips", sns.load_dataset("tips"), metadata=dict(table_description=table_description, description=tips_metadata))
broinsight = BroInsight(llm=LocalOpenAI(), catalog=catalog)

In [3]:
catalog._tables

{'tips': {'type': 'pandas',
  'path': None,
  'metadata': {'table_description': 'Restaurant visit records showing bills, tips, customer info...',
   'description': {'total_bill': 'the amount of paid bill of the meal',
    'tip': 'the amount of tip that customers paid',
    'sex': 'the gender of customers',
    'smoker': 'it indicates that a customer is a smoker or not. if No means a customer is a non-smoker, Yes means a customer is a smoker.',
    'day': 'this is a day of the week when a customer having a meal here. i.e. Mon, Tue, Wed, Thu, Fri, Sat, Sun',
    'time': 'the time of the meal. it can be either Dinner or Lunch',
    'size': 'the number of dishes that customers have'}}}}

In [4]:
catalog.query("SELECT * FROM tips LIMIT 5")

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [5]:
response = broinsight.assess_data_quality("tips", "Which data transformation methods do I need to do for machine learning project?")
print(response['content'])

**Data Quality Assessment:** **NEEDS ATTENTION** – overall the data is clean, but a few transformations will improve model performance.

---

### 1. Duplicate Removal
| Issue | Severity | Action |
|-------|----------|--------|
| 1 duplicate record (0.41 % of the dataset) | Minor | Remove the duplicate with `df.drop_duplicates()` before any further processing. |

---

### 2. Numeric Features (total_bill, tip, size)

| Feature | Skewness | Transformation | Rationale |
|---------|----------|----------------|-----------|
| total_bill | 1.13 | **Log‑1‑Plus** (`log1p`) or **Box‑Cox** | Reduces right‑skew, stabilizes variance, improves linearity for many models. |
| tip | 1.47 | **Log‑1‑Plus** (`log1p`) | Helps models that assume normality of residuals (e.g., linear regression). |
| size | 1.45 | **Optional** – treat as categorical (one‑hot) or keep numeric if model can handle integer counts. | Small cardinality; encoding as one‑hot captures non‑linear relationships better. |

**Scaling (reco

In [6]:
suggestions = broinsight.suggest_questions(
    message="As a new manager, which day should I plan to have more staff to work?",
    tables=["tips"]
)

In [7]:
print(suggestions['content'])

Based on your role and goals, here are some areas you might want to explore:

Staffing Needs by Day
- Which day has the highest average group size (size) and should therefore have more staff?
- On which day do customers spend the most on total_bill on average, indicating higher traffic and need for more staff?

Customer Traffic Patterns
- How many customers visit on each day of the week (day) on average?
- Which day has the highest average number of customers (size) combined with high total_bill?

Revenue and Tips
- Which day generates the highest total revenue (sum of total_bill)?
- Which day has the highest average tip amount (tip) per customer?

Just ask me any of these questions and I'll analyze your data to get the answers!


In [15]:
# question = "I wanna know how many people visit my shop compare to global population and tell me the source of information."
# question = "I wanna see the relationship between average total bill and average tips of segments of sex and smoker."
# question = "Does smoker tip differently than non-smoker breaking down by time of day?"
# question = "Which sex gives the better tips?"
# question = "Which sex gives the better tips than one another?"
# question = "I wanna know that weekend have a size bigger than weekday or not? If so how many?"
# question = "อยากรู้ว่า คนกินข้าวเยอะ ในวันไหนมากกว่ากัน ระหว่าง วันจันทร์ และ วันศุกร์ เป็นจำนวนเท่าไหร่?"
# question = "I wanna know how many people are in between smoker and non-smoker groups. Also, do you know which brand of ciggaratte do the smoker prefer in our restaurant?"
# question = "Which sex has the most dinner?"
# question = "Based on sex, smoker type and meal of the day, which customer group paid most in tips?"
question = "Based on sex, smoker type and meal of the day, which customer group paid most in tips? I wanna see all comparisons."
# question = "What is the average `tip` given the `size` of the party?"
# question = "Is there a difference in average tip between Lunch and Dinner using the time and tip fields?"
# question = "What are the average tip amounts of lunch and dinner? Also which one is higher and by percentage?"
# question = "How many customers order with a party size of 5 or larger?"
# question = "Is there a significant difference in total_bill between weekends (Sat, Sun) and weekdays (Thur)?"
# question = "Which day has the greatest total number of transactions (count of rows)?"
# question = "Which day has the greatest total number of transactions (count of rows)? Also I wanna see everyday."

sql = broinsight.generate_sql(question, tables=["tips"])
sql_query = sql['content'].split("```sql")[-1].split("```")[0]

In [16]:
print(sql_query)


SELECT
    sex,
    smoker,
    time,
    SUM(tip)   AS total_tip,
    AVG(tip)   AS avg_tip
FROM tips
GROUP BY sex, smoker, time
ORDER BY total_tip DESC;



In [17]:
catalog.query(sql_query)

Unnamed: 0,sex,smoker,time,total_tip,avg_tip
0,Male,No,Dinner,243.17,3.158052
1,Male,Yes,Dinner,146.79,3.123191
2,Female,No,Dinner,88.28,3.044138
3,Female,Yes,Dinner,67.83,2.94913
4,Female,No,Lunch,61.49,2.4596
5,Male,No,Lunch,58.83,2.9415
6,Male,Yes,Lunch,36.28,2.790769
7,Female,Yes,Lunch,28.91,2.891


In [18]:
answer = broinsight.ask_data(message=f"{question}. Create a chart that is easy to read and understand.", visualize=True)
print(answer['content'])

**Which customer group tipped the most?**

From the data you provided, the single group that brought in the highest total tip amount is:

| Sex | Smoker | Day | **Total Tip** | Avg Tip | Visits |
|-----|--------|-----|---------------|---------|--------|
| Male | No | Sun | **133.96** | 3.12 | 43 |

That’s the top spot by far. The next highest groups are:

| Sex | Smoker | Day | Total Tip |
|-----|--------|-----|-----------|
| Male | No | Sat | 104.21 |
| Male | Yes | Sat | 77.74 |
| Female | No | Thur | 61.49 |
| Male | No | Thur | 58.83 |
| Male | Yes | Sun | 52.82 |
| Female | No | Sun | 46.61 |
| … | … | … | … |

*(The full list is sorted from highest to lowest total_tip.)*

---

### What does this mean for the business?

| Insight | Why it matters | Suggested action |
|---------|----------------|------------------|
| **Male, non‑smokers on Sunday** dominate tip revenue | Sunday is a prime day for high‑spending male diners who don’t smoke. | Consider targeted promotions (e.g., “Sund





In [19]:
answer['chart']