In [20]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
import seaborn as sns
# from broinsight.utils.data_catalog import DataCatalog
# from broinsight.broinsight import BroInsight
from broinsight import DataCatalog, BroInsight
from broinsight.experiment.ollama import LocalOpenAI

tips_metadata = dict(
    total_bill="the amount of paid bill of the meal",
    tip="the amount of tip that customers paid",
    sex="the gender of customers",
    smoker="it indicates that a customer is a smoker or not. if No means a customer is a non-smoker, Yes means a customer is a smoker.",
    day="this is a day of the week when a customer having a meal here. i.e. Mon, Tue, Wed, Thu, Fri, Sat, Sun",
    time="the time of the meal. it can be either Dinner or Lunch",
    size="the number of dishes that customers have",
)
table_description = "Restaurant visit records showing bills, tips, customer info..."
catalog = DataCatalog()
catalog.create_table("tips", sns.load_dataset("tips"), metadata=dict(table_description=table_description, description=tips_metadata))
broinsight = BroInsight(llm=LocalOpenAI(), catalog=catalog)

In [22]:
catalog._tables

{'tips': {'type': 'pandas',
  'path': None,
  'metadata': {'table_description': 'Restaurant visit records showing bills, tips, customer info...',
   'description': {'total_bill': 'the amount of paid bill of the meal',
    'tip': 'the amount of tip that customers paid',
    'sex': 'the gender of customers',
    'smoker': 'it indicates that a customer is a smoker or not. if No means a customer is a non-smoker, Yes means a customer is a smoker.',
    'day': 'this is a day of the week when a customer having a meal here. i.e. Mon, Tue, Wed, Thu, Fri, Sat, Sun',
    'time': 'the time of the meal. it can be either Dinner or Lunch',
    'size': 'the number of dishes that customers have'}}}}

In [23]:
catalog.query("SELECT * FROM tips LIMIT 5")

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [24]:
response = broinsight.assess_data_quality("tips", "Which data transformation methods do I need to do for machine learning project?")
print(response['content'])

**Data Transformation Recommendations for Your Machine‑Learning Pipeline**

| Field | Current Status | Why Transform? | Suggested Transformation |
|-------|----------------|----------------|---------------------------|
| **total_bill** | Float, moderately skewed (skew = 1.13) | Skewed targets can bias distance‑based models (KNN, SVM) and tree‑based models may over‑fit to long tails. | • Log‑transform (`log1p`) or Box‑Cox (if all values > 0). <br>• After transform, apply **StandardScaler** (zero‑mean, unit‑variance). |
| **tip** | Float, moderately skewed (skew = 1.47) | Same rationale as above. | • Log‑transform (`log1p`) or Box‑Cox. <br>• Scale with **StandardScaler**. |
| **size** | Integer, moderately skewed (skew = 1.45) | Small range but skewness can affect linear models and clustering. | • Log‑transform or **PowerTransformer** (Yeo‑Johnson works with zeros). <br>• Scale thereafter. |
| **sex, smoker, day, time** | Categorical (2–4 unique values) | Machine‑learning algorithms need

In [25]:
suggestions = broinsight.suggest_questions(
    message="As a new manager, which day should I plan to have more staff to work?",
    tables=["tips"]
)

In [26]:
print(suggestions['content'])

Based on your role and goals, here are some areas you might want to explore:

**Staffing Needs by Day**
- Which day has the highest average total_bill?  
- Which day has the largest average size of parties?  
- Which day has the most orders (highest record count)?

**Revenue and Tip Patterns by Day**
- Which day has the highest average tip?  
- Which day has the highest total tip revenue (sum of tip)?

**Customer Traffic by Day**
- Which day has the highest number of customers (sum of size)?  
- Which day has the highest average number of customers per order (average size)?

Just ask me any of these questions and I'll analyze your data to get the answers!


In [27]:
# question = "I wanna know how many people visit my shop compare to global population and tell me the source of information."
# question = "I wanna see the relationship between average total bill and average tips of segments of sex and smoker."
# question = "Does smoker tip differently than non-smoker breaking down by time of day?"
# question = "Which sex gives the better tips?"
# question = "Which sex gives the better tips than one another?"
# question = "I wanna know that weekend have a size bigger than weekday or not? If so how many?"
# question = "อยากรู้ว่า คนกินข้าวเยอะ ในวันไหนมากกว่ากัน ระหว่าง วันจันทร์ และ วันศุกร์ เป็นจำนวนเท่าไหร่?"
# question = "I wanna know how many people are in between smoker and non-smoker groups. Also, do you know which brand of ciggaratte do the smoker prefer in our restaurant?"
# question = "Which sex has the most dinner?"
# question = "Based on sex, smoker type and meal of the day, which customer group paid most in tips?"
question = "Based on sex, smoker type and meal of the day, which customer group paid most in tips? I wanna see all comparisons."
# question = "What is the average `tip` given the `size` of the party?"
# question = "Is there a difference in average tip between Lunch and Dinner using the time and tip fields?"
# question = "What are the average tip amounts of lunch and dinner? Also which one is higher and by percentage?"
# question = "How many customers order with a party size of 5 or larger?"
# question = "Is there a significant difference in total_bill between weekends (Sat, Sun) and weekdays (Thur)?"
# question = "Which day has the greatest total number of transactions (count of rows)?"
# question = "Which day has the greatest total number of transactions (count of rows)? Also I wanna see everyday."

sql = broinsight.generate_sql(question, tables=["tips"])
sql_query = sql['content'].split("```sql")[-1].split("```")[0]

In [28]:
print(sql_query)


WITH tip_stats AS (
    SELECT
        sex,
        smoker,
        time,
        COUNT(*)          AS visits,
        SUM(tip)          AS total_tip,
        AVG(tip)          AS avg_tip
    FROM tips
    GROUP BY sex, smoker, time
)
SELECT
    sex,
    smoker,
    time,
    visits,
    total_tip,
    avg_tip,
    DENSE_RANK() OVER (ORDER BY avg_tip DESC) AS tip_rank
FROM tip_stats
ORDER BY tip_rank, sex, smoker, time;



In [29]:
catalog.query(sql_query)

Unnamed: 0,sex,smoker,time,visits,total_tip,avg_tip,tip_rank
0,Male,No,Dinner,77,243.17,3.158052,1
1,Male,Yes,Dinner,47,146.79,3.123191,2
2,Female,No,Dinner,29,88.28,3.044138,3
3,Female,Yes,Dinner,23,67.83,2.94913,4
4,Male,No,Lunch,20,58.83,2.9415,5
5,Female,Yes,Lunch,10,28.91,2.891,6
6,Male,Yes,Lunch,13,36.28,2.790769,7
7,Female,No,Lunch,25,61.49,2.4596,8


In [30]:
answer = broinsight.ask_data(message=f"{question}. Create a chart that is easy to read and understand.", visualize=True)
print(answer['content'])

**Who tipped the most?**  
The biggest tip‑gathering group is **Male, Non‑Smoker, Dinner** – they brought in **$243.17** in total tips.  

Below is a quick comparison of all the groups in the data set, ranked by the amount of tips they generated.  

| Rank | Sex    | Smoker | Meal  | Total Tips | Avg Tip per Visit | Visits |
|------|--------|--------|-------|------------|-------------------|--------|
| 1    | Male   | No     | Dinner| **$243.17**| 3.16 | 77 |
| 2    | Male   | Yes    | Dinner| $146.79 | 3.12 | 47 |
| 3    | Female | No     | Dinner| $88.28  | 3.04 | 29 |
| 4    | Female | Yes    | Dinner| $67.83  | 2.95 | 23 |
| 5    | Female | No     | Lunch | $61.49  | 2.46 | 25 |
| 6    | Male   | No     | Lunch | $58.83  | 2.94 | 20 |
| 7    | Male   | Yes    | Lunch | $36.28  | 2.79 | 13 |
| 8    | Female | Yes    | Lunch | $28.91  | 2.89 | 10 |

### What does this mean for the business?

- **Dinner is the tip‑heavy meal**: Every dinner group outpaces its lunch counterpart, with a

In [31]:
answer['chart']