In [12]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell executes, so edits to library code on disk are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# DataCatalog

In [24]:
# from broinsight.utils.data_catalog import DataCatalog
from broinsight import DataCatalog
import seaborn as sns
import boto3

# Seed the bucket with seaborn's sample datasets, one CSV object per dataset,
# so the catalog has S3-hosted files to register later on.
bucket = 'dev-broinsight'
s3_client = boto3.client('s3')

for dataset_name in ("tips", "iris"):
    df = sns.load_dataset(dataset_name)
    # index=False keeps the pandas row index out of the uploaded CSV.
    csv_body = df.to_csv(index=False)
    s3_client.put_object(
        Bucket=bucket,
        Key=f'{dataset_name}.csv',
        Body=csv_body,
    )

In [25]:
# Resolve AWS credentials through boto3's default provider chain and hand them
# to the DataCatalog so it can read objects from S3.
session = boto3.Session()
credentials = session.get_credentials()

# get_credentials() returns None when nothing in the provider chain resolves;
# fail here with a readable message instead of an AttributeError further down.
if credentials is None:
    raise RuntimeError(
        "No AWS credentials could be resolved by boto3; configure credentials "
        "before creating the DataCatalog."
    )

# Take a single consistent snapshot: reading access_key and secret_key one by
# one from refreshable credentials could straddle a refresh.
frozen_credentials = credentials.get_frozen_credentials()

# NOTE(review): temporary credentials also carry frozen_credentials.token, which
# is not forwarded here -- confirm whether DataCatalog's s3_config accepts one.
catalog = DataCatalog(s3_config={
    'key_id': frozen_credentials.access_key,
    'secret': frozen_credentials.secret_key,
    'region': session.region_name,
})

In [26]:
# Register the demo tables. 'tips' comes from an in-memory DataFrame and 'iris'
# from the CSV uploaded to S3, which exercises both kinds of source.
# (To read tips from S3 as well, use f's3://{bucket}/tips.csv' as its source.)
table_sources = {
    'tips': sns.load_dataset('tips'),
    'iris': f's3://{bucket}/iris.csv',
}
for table_name, table_source in table_sources.items():
    catalog.create_table(table_name, table_source)

In [27]:
# Sanity-check the 'tips' table: run a SQL query through the catalog and show
# only the first rows of the result.
catalog.query('SELECT * FROM tips').head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [28]:
# Show the names of the tables currently registered in the catalog.
catalog.list_tables()

['tips', 'iris']

In [29]:
# Sanity-check the S3-backed 'iris' table the same way: query it and show only
# the first rows of the result.
catalog.query('SELECT * FROM iris').head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [30]:
# Table-level profile of 'iris': the returned dict reports row and column
# counts, the number of duplicated records, and the duplicated rows as evidence.
catalog.get_table_profile("iris")

{'rows': 150,
 'columns': 5,
 'duplicates': 1,
 'evidences': {0: {'sepal_length': 5.8,
   'sepal_width': 2.7,
   'petal_length': 5.1,
   'petal_width': 1.9,
   'species': 'virginica',
   'dup_count': 2}}}

In [31]:
# Field-level profile of 'iris': one entry per column with its data type,
# missing/unique counts, most frequent values and summary statistics.
catalog.get_field_profile("iris")

{'sepal_length': {'data_types': 'float',
  'missing_values': 0,
  'missing_values_pct': 0.0,
  'unique_values': 35,
  'unique_values_pct': 0.23,
  'most_frequent': {5.0: 10, 5.1: 9, 6.3: 9, 5.7: 8, 6.7: 8},
  'statistics': {'min': 4.3,
   'max': 7.9,
   'mean': 5.84,
   'median': 5.8,
   'std': 0.83,
   'var': 0.69,
   'skew': 0.31,
   'kurt': -0.55,
   'iqr': 1.3,
   'cv': 0.14,
   'lower_bound': 3.15,
   'upper_bound': 8.35}},
 'sepal_width': {'data_types': 'float',
  'missing_values': 0,
  'missing_values_pct': 0.0,
  'unique_values': 23,
  'unique_values_pct': 0.15,
  'most_frequent': {3.0: 26, 2.8: 14, 3.2: 13, 3.4: 12, 3.1: 11},
  'statistics': {'min': 2.0,
   'max': 4.4,
   'mean': 3.06,
   'median': 3.0,
   'std': 0.44,
   'var': 0.19,
   'skew': 0.32,
   'kurt': 0.23,
   'iqr': 0.5,
   'cv': 0.14,
   'lower_bound': 2.05,
   'upper_bound': 4.05}},
 'petal_length': {'data_types': 'float',
  'missing_values': 0,
  'missing_values_pct': 0.0,
  'unique_values': 43,
  'unique_values

In [32]:
# Build the Markdown-formatted text version of the 'iris' profile. It is kept in
# profile_str because the LLM prompt further down embeds it verbatim.
profile_str = catalog.get_formatted_profile("iris")
print(profile_str)

# Dataset Overview
**Size:** 150 rows × 5 columns
**Duplicates:** 1 duplicate record(s) found

# Fields
## sepal_length
**Type:** float
**Missing:** 0 (0.0%)
**Unique:** 35 (23.0%)
**Quality:** good
**Stats:** min=4.3, max=7.9, mean=5.84, skew=0.31

## sepal_width
**Type:** float
**Missing:** 0 (0.0%)
**Unique:** 23 (15.0%)
**Quality:** good
**Stats:** min=2.0, max=4.4, mean=3.06, skew=0.32

## petal_length
**Type:** float
**Missing:** 0 (0.0%)
**Unique:** 43 (29.0%)
**Quality:** good
**Stats:** min=1.0, max=6.9, mean=3.76, skew=-0.27

## petal_width
**Type:** float
**Missing:** 0 (0.0%)
**Unique:** 22 (15.0%)
**Quality:** good
**Stats:** min=0.1, max=2.5, mean=1.2, skew=-0.1

## species
**Type:** string
**Missing:** 0 (0.0%)
**Unique:** 3 (2.0%)
**Quality:** good
**Stats:** mode=virginica, avg_length=8.33



In [33]:
from broinsight.experiment.ollama import LocalOpenAI
from broprompt import Prompt

# Ask the model for data-quality / transformation advice: the system prompt is
# loaded from the prompt hub, and the user message carries the formatted
# profile followed by the question.
model = LocalOpenAI()
prompt = Prompt.from_markdown("broinsight/prompt_hub/dq_suggestion.md")

# Alternative question to try: "Do we have any concern about this one?"
question = "Any data transformation needed for machine learning project here?"

user_input_template = "PROFILE:\n\n{profile}\n\nUSER_INPUT:\n\n{question}\n\n"
user_input = user_input_template.format(profile=profile_str, question=question)

system_prompt = prompt.str
user_message = model.UserMessage(text=user_input)
response = model.run(system_prompt=system_prompt, messages=[user_message])
response

{'content': "**Data Quality Assessment:** READY  \nThe dataset is complete, with no missing values and only a single duplicate record. All numeric fields are of type `float`, and the target (`species`) is a clean string with no missing entries. The statistics show reasonable ranges and low skewness, indicating no obvious outliers or extreme values.\n\n---\n\n### Recommended Transformations for Machine‑Learning Use\n\n| # | Transformation | Why It Matters | Practical Steps |\n|---|----------------|----------------|-----------------|\n| 1 | **Remove duplicate record** | Duplicate can bias model training (especially if the duplicate is a repeated instance). | Use `drop_duplicates()` in pandas or equivalent. |\n| 2 | **Encode target variable** | Most algorithms expect numeric labels. | Map `species` to integers (e.g., `{'setosa':0, 'versicolor':1, 'virginica':2}`) or use one‑hot encoding if required. |\n| 3 | **Scale/normalize numeric features** | Algorithms that use distance metrics (k‑NN

In [34]:
# Print the reply text so its newlines render as line breaks; the raw dict repr
# shown for the previous cell displays them as escaped "\n".
print(response['content'])

**Data Quality Assessment:** READY  
The dataset is complete, with no missing values and only a single duplicate record. All numeric fields are of type `float`, and the target (`species`) is a clean string with no missing entries. The statistics show reasonable ranges and low skewness, indicating no obvious outliers or extreme values.

---

### Recommended Transformations for Machine‑Learning Use

| # | Transformation | Why It Matters | Practical Steps |
|---|----------------|----------------|-----------------|
| 1 | **Remove duplicate record** | Duplicate can bias model training (especially if the duplicate is a repeated instance). | Use `drop_duplicates()` in pandas or equivalent. |
| 2 | **Encode target variable** | Most algorithms expect numeric labels. | Map `species` to integers (e.g., `{'setosa':0, 'versicolor':1, 'virginica':2}`) or use one‑hot encoding if required. |
| 3 | **Scale/normalize numeric features** | Algorithms that use distance metrics (k‑NN, SVM, neural nets) or r