## **Feature:** Dataset Summaries

**Names:** Dhruv

### **What it does**
Generates statistical summaries including descriptive statistics (mean, median, mode, variance, std, quartiles), distribution analysis, correlation matrices, and basic visualisations for dataset exploration.

### **Helper Functions**
- `calculate_basic_stats(df, columns=None)`: Calculate mean, median, std, variance, skewness, kurtosis for numeric columns
- `calculate_five_number_summary(df, columns=None)`: Calculate min, Q1, median, Q3, max, IQR for numeric columns
- `calculate_mode_stats(df, columns=None)`: Calculate mode, frequency, unique values for all columns
- `generate_correlation_matrix(df, method='pearson')`: Create correlation matrix and heatmap for numeric columns
- `create_distribution_plots(df, columns=None, max_plots=6)`: Create histograms for numeric columns
- `analyze_categorical_columns(df, columns=None, top_n=5)`: Analyze categorical columns with frequency tables

In [None]:
# Get API Key
import os
from dotenv import load_dotenv
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# Warn instead of raising so the helper cells below can still be loaded
# without a key. (The check previously read the undefined name OPEN_API_KEY,
# which raised NameError every time this cell ran.)
if not OPENAI_API_KEY:
    print("OpenAI API Key not found")

# Import libraries
import pandas as pd
import numpy as np
import math
import re
import datetime
from sklearn import preprocessing, impute
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Langchain imports
from langchain.chat_models import ChatOpenAI  
from langchain.schema import HumanMessage, SystemMessage

In [None]:
def calculate_basic_stats(df, columns=None):
    """Calculate basic descriptive statistics for numeric columns.

    Args:
        df: DataFrame to summarise.
        columns: Optional iterable of column names to restrict the summary to.
            Names that are missing from ``df`` or that are not numeric are
            skipped. When ``None``, every numeric column is used.

    Returns:
        DataFrame with one row per selected numeric column and the columns
        ``count``, ``mean``, ``median``, ``std``, ``variance``, ``min``,
        ``max``, ``skewness`` and ``kurtosis``. Empty when no numeric column
        is selected.
    """
    # Apply the same numeric test on both paths. Comparing the dtype against
    # ['int64', 'float64'] silently dropped int32/float32 columns whenever the
    # caller passed an explicit column list.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if columns is not None:
        numeric_cols = [col for col in columns if col in numeric_cols]

    stats_dict = {}
    for col in numeric_cols:
        series = df[col]
        stats_dict[col] = {
            'count': series.count(),
            'mean': series.mean(),
            'median': series.median(),
            'std': series.std(),
            'variance': series.var(),
            'min': series.min(),
            'max': series.max(),
            'skewness': series.skew(),
            'kurtosis': series.kurtosis()
        }

    # Transpose so each analysed column becomes a row and each statistic a column.
    return pd.DataFrame(stats_dict).T

In [None]:
def calculate_five_number_summary(df, columns=None):
    """Calculate the five number summary plus IQR for numeric columns.

    Args:
        df: DataFrame to summarise.
        columns: Optional iterable of column names to restrict the summary to.
            Names that are missing from ``df`` or that are not numeric are
            skipped. When ``None``, every numeric column is used.

    Returns:
        DataFrame with one row per selected numeric column and the columns
        ``min``, ``Q1``, ``median``, ``Q3``, ``max`` and ``IQR``. Empty when
        no numeric column is selected.
    """
    # Apply the same numeric test on both paths. Comparing the dtype against
    # ['int64', 'float64'] silently dropped int32/float32 columns whenever the
    # caller passed an explicit column list.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if columns is not None:
        numeric_cols = [col for col in columns if col in numeric_cols]

    summary_dict = {}
    for col in numeric_cols:
        series = df[col]
        # Compute the quartiles once and reuse them for the IQR.
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        summary_dict[col] = {
            'min': series.min(),
            'Q1': q1,
            'median': series.median(),
            'Q3': q3,
            'max': series.max(),
            'IQR': q3 - q1
        }

    # Transpose so each analysed column becomes a row and each statistic a column.
    return pd.DataFrame(summary_dict).T

In [None]:
def calculate_mode_stats(df, columns=None):
    """Report the mode and value-frequency information for each column.

    Args:
        df: DataFrame to inspect.
        columns: Optional iterable of column names; names not present in
            ``df`` are skipped. When ``None``, all columns are inspected.

    Returns:
        Dict keyed by column name. Each value is a dict with ``mode`` (first
        mode, or ``None`` if there is none), ``mode_frequency`` (count of the
        most frequent value, or 0), ``unique_values`` and
        ``most_frequent_values`` (the top three value counts as a dict).
    """
    targets = df.columns if columns is None else columns

    summary = {}
    for name in targets:
        if name not in df.columns:
            continue

        series = df[name]
        modes = series.mode()
        counts = series.value_counts()

        summary[name] = {
            'mode': None if modes.empty else modes.iloc[0],
            'mode_frequency': 0 if counts.empty else counts.iloc[0],
            'unique_values': series.nunique(),
            'most_frequent_values': counts.head(3).to_dict(),
        }

    return summary

In [None]:
def _strip_code_fences(text):
    """Remove a surrounding Markdown code fence from LLM output, if present."""
    stripped = text.strip()
    match = re.match(r"^```[\w+-]*[ \t]*\n(.*?)\n?```\s*$", stripped, re.DOTALL)
    return match.group(1) if match else stripped


def feature_function(df, user_query):
    """
    Main function that gets called by the main router.

    Asks the LLM for Python code that answers ``user_query`` and executes it.
    Note the argument order is ``(df, user_query)``.

    Args:
        df: DataFrame to work on; exposed to the generated code as ``df``.
        user_query: Natural-language request from the user.

    Returns:
        The DataFrame that the generated code stored in ``result_df``. If no
        DataFrame was stored there, the ``df`` visible to the generated code
        after it ran. If the generated code raises, the error and the code are
        printed and a copy of ``df`` taken before execution is returned.
    """

    # TODO: Create helper docs (Reimplement with functions)
    # NOTE(review): generate_correlation_matrix, create_distribution_plots and
    # analyze_categorical_columns are advertised below but are not defined in
    # the part of the notebook visible here -- confirm they exist, otherwise
    # generated code that calls them fails with NameError.
    helper_docs = """
    - calculate_basic_stats(df, columns=None): Calculate mean, median, std, variance, skewness, kurtosis for numeric columns
    - calculate_five_number_summary(df, columns=None): Calculate min, Q1, median, Q3, max, IQR for numeric columns
    - calculate_mode_stats(df, columns=None): Calculate mode, frequency, unique values for all columns
    - generate_correlation_matrix(df, method='pearson'): Create correlation matrix and heatmap for numeric columns
    - create_distribution_plots(df, columns=None, max_plots=6): Create histograms for numeric columns
    - analyze_categorical_columns(df, columns=None, top_n=5): Analyze categorical columns with frequency tables
    """

    # Create message chain
    # NOTE(review): the prompt calls the model a "data cleaning agent" although
    # this feature produces dataset summaries -- confirm the intended role.
    messages = []
    messages.append(SystemMessage(content=f"""
    You are a data cleaning agent.
    
    Dataset info: Shape: {df.shape}, Sample: {df.head(3).to_string()}
    
    Helper functions available:
    {helper_docs}

    Libraries available:
    - pd (pandas), np (numpy)
    - math, re, datetime
    - preprocessing, impute (from sklearn)
    - plt (matplotlib.pyplot), sns (seaborn)
    - stats (from scipy)
    
    Rules:
    - Return only executable Python code, no explanations, no markdown blocks
    - Use helper functions where possible
    - Store final result in 'result_df'
    - No explanations, just code
    """))
    messages.append(HumanMessage(content=f"User request: {user_query}"))

    # Call LLM with message chain
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
    response = llm.invoke(messages)
    # Models sometimes wrap code in ``` fences despite the rules above; exec()
    # would reject those with a SyntaxError.
    generated_code = _strip_code_fences(response.content)

    # Snapshot before running anything so the error path always has a clean
    # frame to return.
    original_df = df.copy()

    # Run in an explicit namespace: inside a function, names assigned by a bare
    # exec() (such as 'result_df') cannot be read back afterwards. The copy of
    # the module globals gives the code access to the imported libraries and
    # the helper functions without letting it rebind module-level names.
    namespace = dict(globals())
    namespace["df"] = df

    # SECURITY: this executes model-generated code with full interpreter
    # privileges. Only use with trusted prompts/data or inside a sandbox.
    try:
        exec(generated_code, namespace)
    except Exception as e:
        print(f"Error: {e}")
        print(f"Generated Code:{generated_code}")
        return original_df

    # Prefer the result the prompt asked for; fall back to the working frame.
    # Non-DataFrame values are ignored so the router always receives a DataFrame.
    for candidate in (namespace.get("result_df"), namespace.get("df")):
        if isinstance(candidate, pd.DataFrame):
            return candidate
    return df

In [None]:
# TEST OUT YOUR FEATURE

## Import Data

## Run code