In [1]:
%load_ext autoreload
%autoreload 2

# Using a pandas DataFrame for profiling data quality

In [2]:
from broinsight.data_quality.create_profile import create_profile, format_profile
import seaborn as sns

# Load the seaborn "tips" sample dataset, profile it, and render the profile as text.
df = sns.load_dataset("tips")

profile_dict = create_profile(df)
profile_str = format_profile(profile_dict)

In [3]:
from broinsight.experiment.ollama import LocalOpenAI
from broprompt import Prompt

# Send the formatted profile plus a question to the LocalOpenAI model,
# using the data-quality suggestion prompt as the system prompt.
model = LocalOpenAI()
prompt = Prompt.from_markdown("broinsight/prompt_hub/dq_suggestion.md")

question = "Do we have any concern about this one?"
user_input_template = "PROFILE:\n\n{profile}\n\nUSER_INPUT:\n\n{question}\n\n"
user_input = user_input_template.format(profile=profile_str, question=question)

messages = [model.UserMessage(text=user_input)]
response = model.run(system_prompt=prompt.str, messages=messages)
response

{'content': '**Data Quality Assessment: READY**  \n\n**Critical Issues Found:** None – all fields have complete data and no severe inconsistencies.  \n\n**Recommended Actions:**  \n1. **Duplicate Check** – There is one duplicate record (rows\u202f198\u202f&\u202f202).  \n   - *Option A:* Remove the duplicate to avoid any minor bias.  \n   - *Option B:* Keep it if the duplicate represents a legitimate repeat visit and you want to preserve frequency.  \n2. **Skewness Awareness** – The `total_bill`, `tip`, and `size` columns show moderate right‑skewness (skew\u202f≈\u202f1.1–1.5).  \n   - If you plan to use algorithms sensitive to distribution shape (e.g., linear regression, k‑means), consider applying a log or Box‑Cox transform.  \n   - For tree‑based models (random forest, gradient boosting), the skewness is usually not problematic.  \n3. **Documentation** – Note the duplicate and skewness in your data dictionary so downstream users are aware.  \n\n**Next Steps:**  \n- Proceed with expl

In [4]:
# Print the 'content' field so the escaped newlines in the dict repr render as real line breaks.
print(response["content"])

**Data Quality Assessment: READY**  

**Critical Issues Found:** None – all fields have complete data and no severe inconsistencies.  

**Recommended Actions:**  
1. **Duplicate Check** – There is one duplicate record (rows 198 & 202).  
   - *Option A:* Remove the duplicate to avoid any minor bias.  
   - *Option B:* Keep it if the duplicate represents a legitimate repeat visit and you want to preserve frequency.  
2. **Skewness Awareness** – The `total_bill`, `tip`, and `size` columns show moderate right‑skewness (skew ≈ 1.1–1.5).  
   - If you plan to use algorithms sensitive to distribution shape (e.g., linear regression, k‑means), consider applying a log or Box‑Cox transform.  
   - For tree‑based models (random forest, gradient boosting), the skewness is usually not problematic.  
3. **Documentation** – Note the duplicate and skewness in your data dictionary so downstream users are aware.  

**Next Steps:**  
- Proceed with exploratory data analysis and modeling.  
- If you decid

# Using DuckDB for profiling data quality

In [None]:
import pandas as pd
import seaborn as sns
import duckdb
from broinsight.data_quality.sql_profile import sql_table_profile, sql_field_profile
from broinsight.data_quality.create_profile import format_profile
from broinsight.data_quality.criteria import assess_data_quality
# Load the sample data and register it on a new DuckDB connection under the name "tips".
df = sns.load_dataset("tips")
conn = duckdb.connect()
table_name = "tips"
conn.register(table_name, df)

# Run the SQL-based field and table profilers, then assess quality from the field profile.
profile_dict = sql_field_profile(conn, table_name)
dataset_summary = sql_table_profile(conn, table_name)
dq_summary = assess_data_quality(profile_dict)

In [6]:
# One row per profiled field; to_string() prints the full table without truncating columns.
profile_df = pd.DataFrame.from_dict(profile_dict, orient="index")
profile_table_text = profile_df.to_string()
print(profile_table_text)

           data_types  missing_values  missing_values_pct  unique_values  unique_values_pct                                     most_frequent                                                                                                                                                                             statistics
total_bill      float               0                 0.0            229               0.94  {13.42: 3, 20.29: 2, 13.0: 2, 7.25: 2, 10.07: 2}  {'min': 3.07, 'max': 50.81, 'mean': 19.79, 'median': 17.8, 'std': 8.9, 'var': 79.25, 'skew': 1.13, 'kurt': 1.22, 'iqr': 10.78, 'cv': 0.45, 'lower_bound': -2.82, 'upper_bound': 40.3}
tip             float               0                 0.0            123               0.50     {2.0: 33, 3.0: 23, 4.0: 12, 5.0: 10, 2.5: 10}        {'min': 1.0, 'max': 10.0, 'mean': 3.0, 'median': 2.9, 'std': 1.38, 'var': 1.91, 'skew': 1.47, 'kurt': 3.65, 'iqr': 1.56, 'cv': 0.46, 'lower_bound': -0.34, 'upper_bound': 5.91}
sex            string    

In [7]:
# Pair every field's profile with its quality summary, attach the dataset-level
# summary, and render the combined structure as text.
profile = {
    "dataset_summary": dataset_summary,
    "fields": {
        field_name: {
            "profile": field_profile,
            "quality": dq_summary[field_name]["summary"],
        }
        for field_name, field_profile in profile_dict.items()
    },
}
profile_str = format_profile(profile)
print(profile_str)

# Dataset Overview
**Size:** 244 rows × 7 columns
**Duplicates:** 1 duplicate record(s) found

# Fields
## total_bill
**Type:** float
**Missing:** 0 (0.0%)
**Unique:** 229 (94.0%)
**Quality:** good
**Issues:**
  - Moderately skewed distribution (skew: 1.13) (minor)
**Stats:** min=3.07, max=50.81, mean=19.79, skew=1.13

## tip
**Type:** float
**Missing:** 0 (0.0%)
**Unique:** 123 (50.0%)
**Quality:** good
**Issues:**
  - Moderately skewed distribution (skew: 1.47) (minor)
**Stats:** min=1.0, max=10.0, mean=3.0, skew=1.47

## sex
**Type:** string
**Missing:** 0 (0.0%)
**Unique:** 2 (1.0%)
**Quality:** good
**Stats:** mode=Male, avg_length=4.71

## smoker
**Type:** string
**Missing:** 0 (0.0%)
**Unique:** 2 (1.0%)
**Quality:** good
**Stats:** mode=No, avg_length=2.38

## day
**Type:** string
**Missing:** 0 (0.0%)
**Unique:** 4 (2.0%)
**Quality:** good
**Stats:** mode=Sat, avg_length=3.25

## time
**Type:** string
**Missing:** 0 (0.0%)
**Unique:** 2 (1.0%)
**Quality:** good
**Stats:** mode

In [8]:
from broinsight.experiment.ollama import LocalOpenAI
from broprompt import Prompt

# Ask the same question again, this time about the profile text built from the DuckDB profilers.
model = LocalOpenAI()
prompt = Prompt.from_markdown("broinsight/prompt_hub/dq_suggestion.md")

question = "Do we have any concern about this one?"
user_input_template = "PROFILE:\n\n{profile}\n\nUSER_INPUT:\n\n{question}\n\n"
user_input = user_input_template.format(profile=profile_str, question=question)

messages = [model.UserMessage(text=user_input)]
response = model.run(system_prompt=prompt.str, messages=messages)
response

{'content': '**Data Quality Assessment:** READY  \n**Critical Issues Found:** None  \n\n**Recommended Actions:**  \n1. **Duplicate Record** – One duplicate exists (≈0.4\u202f% of rows).  \n   - *Action:* Decide whether to keep it (e.g., if it represents a genuine repeat visit) or drop it for a cleaner dataset.  \n2. **Minor Skewness** – All numeric fields (`total_bill`, `tip`, `size`) show moderate positive skew (1.13–1.47).  \n   - *Action:* If you plan to use models sensitive to distribution shape (e.g., linear regression, Gaussian‑based clustering), consider a log or Box‑Cox transformation. Otherwise, the skew is acceptable for most machine‑learning or descriptive tasks.  \n\n**Next Steps:**  \n- Proceed with exploratory data analysis and model building.  \n- If you encounter any modeling issues that hint at distributional assumptions, revisit the skewness handling step.  \n\n**Overall Recommendation:**  \nThe dataset is in good shape for analysis. A single duplicate and minor skewn

In [9]:
# Print the 'content' field so the markdown reply is shown with real line breaks.
print(response["content"])

**Data Quality Assessment:** READY  
**Critical Issues Found:** None  

**Recommended Actions:**  
1. **Duplicate Record** – One duplicate exists (≈0.4 % of rows).  
   - *Action:* Decide whether to keep it (e.g., if it represents a genuine repeat visit) or drop it for a cleaner dataset.  
2. **Minor Skewness** – All numeric fields (`total_bill`, `tip`, `size`) show moderate positive skew (1.13–1.47).  
   - *Action:* If you plan to use models sensitive to distribution shape (e.g., linear regression, Gaussian‑based clustering), consider a log or Box‑Cox transformation. Otherwise, the skew is acceptable for most machine‑learning or descriptive tasks.  

**Next Steps:**  
- Proceed with exploratory data analysis and model building.  
- If you encounter any modeling issues that hint at distributional assumptions, revisit the skewness handling step.  

**Overall Recommendation:**  
The dataset is in good shape for analysis. A single duplicate and minor skewness do not impede typical analys

# Using DuckDB with S3 for data quality profiling

In [24]:
def connect_s3_duckdb(key_id=None, secret=None, region='ap-southeast-1', endpoint=None, session_token=None):
    """Connect DuckDB to S3 with authentication.

    Opens a new DuckDB connection, installs and loads the ``httpfs``
    extension, and creates (or replaces) a secret named ``s3_secret``.

    When both ``key_id`` and ``secret`` are provided, the secret is built from
    those explicit values. Otherwise DuckDB's ``credential_chain`` provider is
    used and ``region``, ``endpoint`` and ``session_token`` are not applied.

    Args:
        key_id: AWS access key ID.
        secret: AWS secret access key.
        region: AWS region. Omitted from the secret when empty or None, so a
            missing region is never sent as the literal string 'None'.
        endpoint: Custom S3 endpoint (optional).
        session_token: Session token for temporary credentials (optional).

    Returns:
        duckdb.DuckDBPyConnection: Configured connection
    """
    import duckdb

    def _sql_literal(value):
        # Double embedded single quotes so the value stays a single SQL string literal.
        return "'" + str(value).replace("'", "''") + "'"

    conn = duckdb.connect()
    conn.execute("INSTALL httpfs")
    conn.execute("LOAD httpfs")

    if key_id and secret:
        # Manual credentials: assemble the option list explicitly rather than
        # patching the SQL text, so values containing ")" cannot corrupt it.
        options = [
            "TYPE s3",
            "PROVIDER config",
            f"KEY_ID {_sql_literal(key_id)}",
            f"SECRET {_sql_literal(secret)}",
        ]
        if session_token:
            options.append(f"SESSION_TOKEN {_sql_literal(session_token)}")
        if region:
            options.append(f"REGION {_sql_literal(region)}")
        if endpoint:
            options.append(f"ENDPOINT {_sql_literal(endpoint)}")

        secret_sql = (
            "CREATE OR REPLACE SECRET s3_secret (\n    "
            + ",\n    ".join(options)
            + "\n)"
        )
        conn.execute(secret_sql)
    else:
        # Use credential chain (AWS CLI, env vars, etc.)
        conn.execute("""
        CREATE OR REPLACE SECRET s3_secret (
            TYPE s3,
            PROVIDER credential_chain
        )""")

    return conn

In [35]:
import seaborn as sns
import boto3

bucket = 'dev-broinsight'

# Serialise the tips dataset to CSV text (without the index column) and upload it as tips.csv.
df = sns.load_dataset('tips')
s3_client = boto3.client('s3')
csv_body = df.to_csv(index=False)
s3_client.put_object(Bucket=bucket, Key='tips.csv', Body=csv_body)

{'ResponseMetadata': {'RequestId': 'VS1HE1EKZT7VAZTB',
  'HostId': 'APi1+VwjKBovUxhte2P98newhgzhjM3/+eaEUXH3mr2Z/FVtAm5Mxvsw2jxVaex5LLjvKdm3cgpaij6tC37mt+P0yo8OksJf',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'APi1+VwjKBovUxhte2P98newhgzhjM3/+eaEUXH3mr2Z/FVtAm5Mxvsw2jxVaex5LLjvKdm3cgpaij6tC37mt+P0yo8OksJf',
   'x-amz-request-id': 'VS1HE1EKZT7VAZTB',
   'date': 'Wed, 08 Oct 2025 16:26:22 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"21a1749bc760e771c7a011b17e17d17b"',
   'x-amz-checksum-crc32': 'NUN6ig==',
   'x-amz-checksum-type': 'FULL_OBJECT',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"21a1749bc760e771c7a011b17e17d17b"',
 'ChecksumCRC32': 'NUN6ig==',
 'ChecksumType': 'FULL_OBJECT',
 'ServerSideEncryption': 'AES256'}

In [43]:
# Reuse the credentials and region resolved by the default boto3 session for DuckDB.
# NOTE(review): only the access key and secret key are forwarded here; if the session
# resolves temporary credentials, their token is not passed along — confirm this is intended.
session = boto3.Session()
credentials = session.get_credentials()
conn = connect_s3_duckdb(
    key_id=credentials.access_key,
    secret=credentials.secret_key,
    region=session.region_name,
)

In [44]:
# Read the uploaded CSV straight from S3 through DuckDB and show it as a DataFrame.
tips_query = "SELECT * from 's3://{bucket}/tips.csv'".format(bucket=bucket)
conn.execute(tips_query).df()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,False,Sun,Dinner,2
1,10.34,1.66,Male,False,Sun,Dinner,3
2,21.01,3.50,Male,False,Sun,Dinner,3
3,23.68,3.31,Male,False,Sun,Dinner,2
4,24.59,3.61,Female,False,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,False,Sat,Dinner,3
240,27.18,2.00,Female,True,Sat,Dinner,2
241,22.67,2.00,Male,True,Sat,Dinner,2
242,17.82,1.75,Male,False,Sat,Dinner,2


In [None]:
from broinsight.data_quality.sql_profile import sql_table_profile, sql_field_profile
from broinsight.data_quality.criteria import assess_data_quality

# The S3 URL is wrapped in single quotes; presumably the profilers interpolate it
# into SQL where a table name would normally go — TODO confirm against sql_profile.
s3_path = "'s3://{bucket}/tips.csv'".format(bucket=bucket)

# Same profiling steps as for the registered table, but pointed at the S3 object.
profile_dict = sql_field_profile(conn, s3_path)
dataset_summary = sql_table_profile(conn, s3_path)

dq_summary = assess_data_quality(profile_dict)

In [None]:
# One row per profiled field of the S3-backed CSV, printed in full width.
profile_df = pd.DataFrame.from_dict(profile_dict, orient="index")
profile_table_text = profile_df.to_string()
print(profile_table_text)

           data_types  missing_values  missing_values_pct  unique_values  unique_values_pct                                       most_frequent                                                                                                                                                                             statistics
total_bill      float               0                 0.0            229               0.94  {13.42: 3, 10.34: 2, 10.07: 2, 20.69: 2, 18.29: 2}  {'min': 3.07, 'max': 50.81, 'mean': 19.79, 'median': 17.8, 'std': 8.9, 'var': 79.25, 'skew': 1.13, 'kurt': 1.22, 'iqr': 10.78, 'cv': 0.45, 'lower_bound': -2.82, 'upper_bound': 40.3}
tip             float               0                 0.0            123               0.50       {2.0: 33, 3.0: 23, 4.0: 12, 2.5: 10, 5.0: 10}        {'min': 1.0, 'max': 10.0, 'mean': 3.0, 'median': 2.9, 'std': 1.38, 'var': 1.91, 'skew': 1.47, 'kurt': 3.65, 'iqr': 1.56, 'cv': 0.46, 'lower_bound': -0.34, 'upper_bound': 5.91}
sex            stri

In [None]:
# Combine the dataset summary with each field's profile and quality summary
# for the S3-backed data, then render the result as text.
profile = {
    "dataset_summary": dataset_summary,
    "fields": {
        field_name: {
            "profile": field_profile,
            "quality": dq_summary[field_name]["summary"],
        }
        for field_name, field_profile in profile_dict.items()
    },
}
profile_str = format_profile(profile)
print(profile_str)

# Dataset Overview
**Size:** 244 rows × 7 columns
**Duplicates:** 1 duplicate record(s) found

# Fields
## total_bill
**Type:** float
**Missing:** 0 (0.0%)
**Unique:** 229 (94.0%)
**Quality:** good
**Issues:**
  - Moderately skewed distribution (skew: 1.13) (minor)
**Stats:** min=3.07, max=50.81, mean=19.79, skew=1.13

## tip
**Type:** float
**Missing:** 0 (0.0%)
**Unique:** 123 (50.0%)
**Quality:** good
**Issues:**
  - Moderately skewed distribution (skew: 1.47) (minor)
**Stats:** min=1.0, max=10.0, mean=3.0, skew=1.47

## sex
**Type:** string
**Missing:** 0 (0.0%)
**Unique:** 2 (1.0%)
**Quality:** good
**Stats:** mode=Male, avg_length=4.71

## smoker
**Type:** unknown
**Missing:** 0 (0.0%)
**Unique:** 2 (1.0%)
**Quality:** good

## day
**Type:** string
**Missing:** 0 (0.0%)
**Unique:** 4 (2.0%)
**Quality:** good
**Stats:** mode=Sat, avg_length=3.25

## time
**Type:** string
**Missing:** 0 (0.0%)
**Unique:** 2 (1.0%)
**Quality:** good
**Stats:** mode=Dinner, avg_length=5.72

## size
*

In [None]:
from broinsight.experiment.ollama import LocalOpenAI
from broprompt import Prompt

# Ask the same data-quality question about the profile text built from the S3-backed CSV.
model = LocalOpenAI()
prompt = Prompt.from_markdown("broinsight/prompt_hub/dq_suggestion.md")

question = "Do we have any concern about this one?"
user_input_template = "PROFILE:\n\n{profile}\n\nUSER_INPUT:\n\n{question}\n\n"
user_input = user_input_template.format(profile=profile_str, question=question)

messages = [model.UserMessage(text=user_input)]
response = model.run(system_prompt=prompt.str, messages=messages)
response

{'content': '**Data Quality Assessment: READY**\n\n**Critical Issues Found:** None – all fields have no missing data, appropriate types, and “good” quality ratings.\n\n**Recommended Actions:**\n\n1. **Duplicate Record**  \n   - *What to do:* Verify whether the single duplicate row is truly redundant or represents a legitimate repeat observation.  \n   - *If redundant:* Drop it to keep the dataset size accurate.  \n   - *If legitimate:* Keep it and document that it represents a repeated visit.\n\n2. **Moderately Skewed Distributions**  \n   - *Fields affected:* `total_bill`, `tip`, and `size`.  \n   - *Impact:* Minor skewness is unlikely to distort most descriptive or classification tasks. However, if you plan to use models that assume normality (e.g., linear regression, ANOVA) or perform statistical tests that are sensitive to distribution shape, consider a log or Box–Cox transformation for these columns.\n\n3. **Documentation**  \n   - Record the duplicate handling decision and any tr

In [None]:
# Print the 'content' field so the markdown reply is shown with real line breaks.
print(response["content"])

**Data Quality Assessment: READY**

**Critical Issues Found:** None – all fields have no missing data, appropriate types, and “good” quality ratings.

**Recommended Actions:**

1. **Duplicate Record**  
   - *What to do:* Verify whether the single duplicate row is truly redundant or represents a legitimate repeat observation.  
   - *If redundant:* Drop it to keep the dataset size accurate.  
   - *If legitimate:* Keep it and document that it represents a repeated visit.

2. **Moderately Skewed Distributions**  
   - *Fields affected:* `total_bill`, `tip`, and `size`.  
   - *Impact:* Minor skewness is unlikely to distort most descriptive or classification tasks. However, if you plan to use models that assume normality (e.g., linear regression, ANOVA) or perform statistical tests that are sensitive to distribution shape, consider a log or Box–Cox transformation for these columns.

3. **Documentation**  
   - Record the duplicate handling decision and any transformations applied so futu