## **Feature:** Dataset summaries

**Names:** Zim

### **What it does**
Creates summaries of a dataset, covering both its numerical and its categorical columns.

### **Helper Functions**
`numerical_summary`: Returns summary dataframe for given columns of numeric type

`categorical_summary`: Returns summary dataframe for columns of categorical type

In [33]:
# Get API Key

import os
# Look the key up in the environment; a missing or empty value only triggers
# a printed warning, it does not stop the notebook.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    print("OpenAI API Key not found")

# Import libraries
import pandas as pd
import numpy as np
import math
import re
import datetime
from sklearn import preprocessing, impute

# Langchain imports
from langchain.chat_models import ChatOpenAI  
from langchain.schema import HumanMessage, SystemMessage

In [34]:
def numerical_summary(df, columns=None, metrics=None):
    """
    Returns summary dataframe for given columns of numeric type

    The result has one row per summarised column and one column per metric.

    - params:
        - df: pd.DataFrame to summarise (any other type yields an empty DataFrame)
        - columns: list[str], a single column name, or None (defaults to numeric columns).
          Names that are not present in df are silently ignored.
        - metrics: list[str] (or a single name) subset of
          ["count","missing","mean","std","min","median","p25","p75","max"].
          Matching is case-insensitive, unknown names are ignored and "count" is
          always included. None or empty means "all metrics".
    - returns: pd.DataFrame (empty when there is nothing to summarise)
    """

    if not isinstance(df, pd.DataFrame):
        return pd.DataFrame()

    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]
    if isinstance(metrics, str):
        metrics = [metrics]

    # Default to numeric columns if no specific columns given
    if columns:
        columns = [col for col in columns if col in df.columns]
    else:
        columns = df.select_dtypes(include=[np.number]).columns.tolist()

    if not columns:
        # Must be an instance: returning the class itself breaks callers that
        # treat the result as a DataFrame.
        return pd.DataFrame()

    filtered_df = df[columns]

    # Metric name -> zero-argument callable, so only requested metrics are computed.
    # Dict order defines the column order of the output.
    metric_builders = {
        "count": filtered_df.count,
        "missing": lambda: filtered_df.isna().sum(),
        "mean": filtered_df.mean,
        "std": filtered_df.std,
        "min": filtered_df.min,
        "p25": lambda: filtered_df.quantile(0.25),
        "median": filtered_df.median,
        "p75": lambda: filtered_df.quantile(0.75),
        "max": filtered_df.max,
    }

    # Filter to requested metrics (always including count)
    if metrics:
        requested = {m.lower() for m in metrics}
        selected = [
            name for name in metric_builders
            if name == "count" or name in requested
        ]
    else:
        selected = list(metric_builders)

    return pd.DataFrame({name: metric_builders[name]() for name in selected})

In [35]:
def _top_value(col):
    """Return the most frequent non-null value of *col*, or NaN if it has none."""
    modes = col.mode(dropna=True)
    return modes.iloc[0] if not modes.empty else np.nan


def _top_frequency(col):
    """Return how often the most frequent non-null value of *col* occurs (0 if none)."""
    counts = col.value_counts(dropna=True)
    return counts.iloc[0] if not counts.empty else 0


def categorical_summary(df, columns=None):
    """
    Returns summary dataframe for given columns of categorical type

    The result has one row per summarised column with the metrics
    count, missing, nunique, top (most frequent value) and top_freq
    (number of occurrences of that value). Nulls are excluded from
    nunique, top and top_freq.

    - params:
        - df: pd.DataFrame to summarise (any other type yields an empty DataFrame)
        - columns: list[str], a single column name, or None (defaults to non-numeric columns).
          Names that are not present in df are silently ignored.
    - returns: pd.DataFrame (empty when there is nothing to summarise)
    """

    if not isinstance(df, pd.DataFrame):
        return pd.DataFrame()

    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]

    # Default to non-numeric columns if no specific columns given
    if columns:
        columns = [col for col in columns if col in df.columns]
    else:
        numeric_columns = set(df.select_dtypes(include=[np.number]).columns.tolist())
        columns = [col for col in df.columns if col not in numeric_columns]

    if not columns:
        return pd.DataFrame()

    filtered_df = df[columns]

    output_df = pd.DataFrame({
        "count": filtered_df.count(),
        "missing": filtered_df.isna().sum(),
        "nunique": filtered_df.nunique(dropna=True),
        "top": filtered_df.apply(_top_value),
        "top_freq": filtered_df.apply(_top_frequency),
    })

    return output_df


# Backward-compatible alias: the function was originally defined under this
# doubled name, while the notebook text and the LLM helper docs call it
# `categorical_summary`.
categorical_summary_summary = categorical_summary

In [36]:
def _strip_code_fences(text):
    """Return *text* without a surrounding Markdown code fence, if one is present."""
    code = text.strip()
    if not code.startswith("```"):
        return code
    # Drop the opening fence line (it may carry a language tag such as ```python).
    _, _, code = code.partition("\n")
    code = code.rstrip()
    if code.endswith("```"):
        code = code[:-3]
    return code.strip()


def get_summaries(user_query, df):
    """
    Main function that gets called by the main router.
    MUST take (user_query, df) and return df

    Asks the chat model to write Python that answers ``user_query`` against
    ``df`` (using the summary helpers described in the prompt), executes that
    code and returns whatever it stored in ``result_df``.

    - params:
        - user_query: str, the natural-language request
        - df: pd.DataFrame the request refers to
    - returns:
        - the object the generated code bound to ``result_df``;
        - ``df`` itself when the code ran but did not define ``result_df``;
        - a copy of ``df`` taken before execution when the code raised.
    """

    # TODO: Create helper docs (Reimplement with functions)
    helper_docs = """
    - numerical_summary(df, columns=None, metrics=None): Returns a numeric summary DataFrame for given columns. 
      Metrics can include: count, missing, mean, std, min, median, p25, p75, max.
    - categorical_summary(df, columns=None): Returns a categorical summary DataFrame (count, missing, nunique, top, top_freq).
    """

    # Create message chain
    messages = []
    messages.append(SystemMessage(content=f"""
    You are a data cleaning agent.
    
    Dataset info: Shape: {df.shape}, Sample: {df.head(3).to_string()}
    
    Helper functions available:
    {helper_docs}

    Libraries available:
    - pd (pandas), np (numpy)
    - math, re, datetime
    - preprocessing, impute (from sklearn)
    
    Rules:
    - Return only executable Python code, no explanations, no markdown blocks
    - Use helper functions where possible
    - Store final result in 'result_df'
    - No explanations, just code
    """))
    messages.append(HumanMessage(content=f"User request: {user_query}"))

    # Call LLM with message chain
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
    response = llm.invoke(messages)
    # Models sometimes wrap the reply in a Markdown fence despite the rules
    # above; exec would fail on the fence with a SyntaxError.
    generated_code = _strip_code_fences(response.content)

    # Snapshot taken before anything runs so the error path below always has
    # a frame to hand back.
    original_df = df.copy()

    # Use ONE namespace for the generated code. With separate globals/locals
    # exec behaves like a class body, so comprehensions, lambdas and functions
    # in the generated code could not see `df`.
    namespace = dict(globals())
    namespace.pop("result_df", None)  # never report a stale module-level result
    namespace["df"] = df
    namespace["user_query"] = user_query

    # SECURITY NOTE: this executes model-generated code with the full
    # privileges of this process. Only use with trusted input/environments,
    # or run it in a sandbox.
    try:
        exec(generated_code, namespace)
    except Exception as e:
        print(f"Error: {e}")
        print(f"Generated Code:{generated_code}")
        return original_df

    # Expect the LLM to store the result in 'result_df'
    if "result_df" in namespace:
        return namespace["result_df"]

    print("No result_df found, returning df")
    return df

In [37]:
# TEST OUT YOUR FEATURE
# Load the sample dataset; the path is relative to the working directory.
life_expectancy_df = pd.read_csv('../sample_data/Life Expectancy Data.csv')

# Natural-language requests used to exercise get_summaries.
# NOTE(review): "alochol" is misspelled in the third request; it is kept
# as-is because the recorded output echoes it. Confirm whether it is deliberate.
queries = [
    "Summarize the status and country columns",
    "Give me the mean and median for life expectancy",
    "Create summary statistics for the alochol column",
    "Give me all 5 quantiles for infant deaths column",
    "What year was the most common?",
]

# Run each request against the same frame and print the request followed by
# whatever get_summaries returned for it.
for query in queries:
    summary = get_summaries(query, life_expectancy_df)
    print(query, ':\n', summary, '\n\n')

Summarize the status and country columns :
           count  missing  nunique         top  top_freq   count  missing  \
Status   2938.0      0.0      2.0  Developing    2426.0     NaN      NaN   
Country     NaN      NaN      NaN         NaN       NaN  2938.0      0.0   

         nunique          top  top_freq  
Status       NaN          NaN       NaN  
Country    193.0  Afghanistan      16.0   


Give me the mean and median for life expectancy :
                  count       mean  median
Life expectancy   2928  69.224932    72.1 


Create summary statistics for the alochol column :
          count  missing      mean       std   min     p25  median     p75  \
Alcohol   2744      194  4.602861  4.052413  0.01  0.8775   3.755  7.7025   

           max  
Alcohol  17.87   


Give me all 5 quantiles for infant deaths column :
                count  min  p25  median   p75   max
infant deaths   2938    0  0.0     3.0  22.0  1800 


What year was the most common? :
       count  missing  nun