## **Feature:** Dataset Summaries

**Names:** Dhruv

### **What it does**
Generates statistical summaries including descriptive statistics (mean, median, mode, variance, std, quartiles), distribution analysis, correlation matrices, and basic visualisations for dataset exploration.

### **Helper Functions**
- calculate_basic_stats(df, columns=None): Calculate mean, median, std, variance, skewness, kurtosis for numeric columns
- calculate_five_number_summary(df, columns=None): Calculate min, Q1, median, Q3, max, IQR for numeric columns
- calculate_mode_stats(df, columns=None): Calculate mode, frequency, unique values for all columns
- generate_correlation_matrix(df, method='pearson'): Create correlation matrix and heatmap for numeric columns
- create_distribution_plots(df, columns=None, max_plots=6): Create histograms for numeric columns
- analyse_categorical_columns(df, columns=None, top_n=5): Analyse categorical columns with frequency tables

In [None]:
# Get API Key
import os
from dotenv import load_dotenv
load_dotenv()
# Read the key from the environment (populated by load_dotenv above) and warn
# early when it is missing, since the LLM call cannot work without it.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    print("OpenAI API Key not found")

# Import libraries
import pandas as pd
import numpy as np
import math
import re
import datetime
from sklearn import preprocessing, impute
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Langchain imports
from langchain.chat_models import ChatOpenAI  
from langchain.schema import HumanMessage, SystemMessage

In [None]:
def calculate_basic_stats(df, columns=None):
    """Compute basic descriptive statistics for numeric columns.

    Args:
        df: DataFrame to summarise.
        columns: Optional iterable of column names to restrict the summary to.
            Names that are not in ``df`` or are not numeric are skipped.
            When None, every numeric column of ``df`` is used.

    Returns:
        A DataFrame with one row per selected column and the columns
        ``count``, ``mean``, ``median``, ``std``, ``variance``, ``min``,
        ``max``, ``skewness`` and ``kurtosis``. It is empty when no numeric
        column is selected.
    """
    if columns is None:
        numeric_cols = df.select_dtypes(include=[np.number]).columns
    else:
        # Apply the same np.number test as the default branch so that numeric
        # dtypes other than int64/float64 (e.g. int32, float32) are kept.
        requested = [col for col in columns if col in df.columns]
        numeric_cols = df[requested].select_dtypes(include=[np.number]).columns

    stats_dict = {}
    for col in numeric_cols:
        series = df[col]
        stats_dict[col] = {
            'count': series.count(),
            'mean': series.mean(),
            'median': series.median(),
            'std': series.std(),
            'variance': series.var(),
            'min': series.min(),
            'max': series.max(),
            'skewness': series.skew(),
            'kurtosis': series.kurtosis(),
        }

    # Transpose so that each input column becomes a row of the result.
    return pd.DataFrame(stats_dict).T

In [None]:
def calculate_five_number_summary(df, columns=None):
    """Compute the five-number summary (plus IQR) for numeric columns.

    Args:
        df: DataFrame to summarise.
        columns: Optional iterable of column names to restrict the summary to.
            Names that are not in ``df`` or are not numeric are skipped.
            When None, every numeric column of ``df`` is used.

    Returns:
        A DataFrame with one row per selected column and the columns
        ``min``, ``Q1``, ``median``, ``Q3``, ``max`` and ``IQR``. It is empty
        when no numeric column is selected.
    """
    if columns is None:
        numeric_cols = df.select_dtypes(include=[np.number]).columns
    else:
        # Apply the same np.number test as the default branch so that numeric
        # dtypes other than int64/float64 (e.g. int32, float32) are kept.
        requested = [col for col in columns if col in df.columns]
        numeric_cols = df[requested].select_dtypes(include=[np.number]).columns

    summary_dict = {}
    for col in numeric_cols:
        series = df[col]
        # Compute each quartile once and reuse it for the IQR.
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        summary_dict[col] = {
            'min': series.min(),
            'Q1': q1,
            'median': series.median(),
            'Q3': q3,
            'max': series.max(),
            'IQR': q3 - q1,
        }

    # Transpose so that each input column becomes a row of the result.
    return pd.DataFrame(summary_dict).T

In [None]:
def calculate_mode_stats(df, columns=None):
    """Report the mode and value-frequency figures for each requested column.

    ``columns`` defaults to every column of ``df``; requested names that are
    not present in ``df`` are skipped. The result is a dict keyed by column
    name whose values are dicts with the keys ``mode``, ``mode_frequency``,
    ``unique_values`` and ``most_frequent_values`` (the three most common
    values mapped to their counts).
    """
    requested = df.columns if columns is None else columns

    summary = {}
    for name in requested:
        if name not in df.columns:
            continue

        series = df[name]
        modes = series.mode()
        counts = series.value_counts()

        summary[name] = {
            # A column with no non-missing values has no mode.
            'mode': None if modes.empty else modes.iloc[0],
            'mode_frequency': 0 if counts.empty else counts.iloc[0],
            'unique_values': series.nunique(),
            'most_frequent_values': counts.head(3).to_dict(),
        }

    return summary

In [None]:
def generate_correlation_matrix(df, method='pearson'):
    """Compute and plot the correlation matrix of the numeric columns.

    Prints a notice and returns None when ``df`` has no numeric data.
    Otherwise the matrix is computed with ``DataFrame.corr(method=method)``,
    drawn as an annotated seaborn heatmap, shown, and returned.
    """
    numeric_only = df.select_dtypes(include=[np.number])
    if numeric_only.empty:
        print("No numeric columns found for correlation analysis")
        return None

    corr = numeric_only.corr(method=method)

    # Draw the matrix as an annotated heatmap centred on zero correlation.
    heatmap_options = {
        'annot': True,
        'cmap': 'coolwarm',
        'center': 0,
        'square': True,
        'linewidths': 0.5,
    }
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr, **heatmap_options)
    plt.title(f'Correlation Matrix ({method.title()})')
    plt.tight_layout()
    plt.show()

    return corr

In [None]:
def create_distribution_plots(df, columns=None, max_plots=6):
    """Draw a histogram for each numeric column, at most three per row.

    Args:
        df: DataFrame whose columns are plotted.
        columns: Optional iterable of column names to plot. Names that are
            not in ``df`` or are not numeric are skipped. When None, every
            numeric column of ``df`` is considered.
        max_plots: Upper bound on the number of histograms drawn; only the
            first ``max_plots`` selected columns are plotted.

    Returns:
        None. The figure is displayed with ``plt.show()``; when nothing is
        left to plot a notice is printed instead.
    """
    if columns is None:
        numeric_cols = df.select_dtypes(include=[np.number]).columns
    else:
        # Apply the same np.number test as the default branch so that numeric
        # dtypes other than int64/float64 (e.g. int32, float32) are kept.
        requested = [col for col in columns if col in df.columns]
        numeric_cols = df[requested].select_dtypes(include=[np.number]).columns

    numeric_cols = list(numeric_cols[:max_plots])  # Limit number of plots

    if not numeric_cols:
        print("No numeric columns found for distribution plots")
        return

    n_cols = min(3, len(numeric_cols))
    n_rows = math.ceil(len(numeric_cols) / n_cols)

    # squeeze=False always yields a 2-D array of Axes, so a single flatten()
    # covers the 1x1, single-row and multi-row layouts alike.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False
    )
    axes = axes.flatten()

    for ax, col in zip(axes, numeric_cols):
        ax.hist(df[col].dropna(), bins=30, alpha=0.7, edgecolor='black')
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')
        ax.grid(True, alpha=0.3)

    # Hide the unused cells of the grid.
    for ax in axes[len(numeric_cols):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
def analyse_categorical_columns(df, columns=None, top_n=5):
    """Build a frequency profile for each categorical column.

    Columns of dtype ``object`` or ``category`` are profiled. When ``columns``
    is given, only those names are considered, and names that are missing
    from ``df`` or have another dtype are skipped. The result is a dict keyed
    by column name whose values hold ``unique_count``, ``most_frequent``,
    ``most_frequent_count``, ``top_values`` (the ``top_n`` most common values
    mapped to their counts) and ``missing_count``.
    """
    categorical_kinds = ['object', 'category']

    if columns is not None:
        selected = [
            name
            for name in columns
            if name in df.columns and df[name].dtype in categorical_kinds
        ]
    else:
        selected = df.select_dtypes(include=categorical_kinds).columns

    report = {}
    for name in selected:
        series = df[name]
        counts = series.value_counts()
        has_values = not counts.empty

        report[name] = {
            'unique_count': series.nunique(),
            'most_frequent': counts.index[0] if has_values else None,
            'most_frequent_count': counts.iloc[0] if has_values else 0,
            'top_values': counts.head(top_n).to_dict(),
            'missing_count': series.isna().sum(),
        }

    return report

In [None]:
def _strip_code_fences(code):
    """Return ``code`` without a surrounding Markdown ``` fence, if present."""
    text = code.strip()
    fenced = re.match(r"^```[\w+-]*[ \t]*\n(.*?)\n?```$", text, flags=re.DOTALL)
    return fenced.group(1).strip() if fenced else text


def get_summaries(df, user_query):
    """Generate and run LLM-written summary code for ``df``.

    Main function that gets called by the main router. It asks the chat model
    for Python code that answers ``user_query`` using the helper functions in
    this file, prints that code, and executes it.

    Args:
        df: DataFrame to summarise.
        user_query: Natural-language description of the summary wanted.

    Returns:
        ``df`` when the generated code runs without raising; otherwise a copy
        of ``df`` taken before execution, after printing the error and the
        generated code.
    """

    # TODO: Create helper docs (Reimplement with functions)
    helper_docs = """
    - calculate_basic_stats(df, columns=None): Calculate mean, median, std, variance, skewness, kurtosis for numeric columns
    - calculate_five_number_summary(df, columns=None): Calculate min, Q1, median, Q3, max, IQR for numeric columns
    - calculate_mode_stats(df, columns=None): Calculate mode, frequency, unique values for all columns
    - generate_correlation_matrix(df, method='pearson'): Create correlation matrix and heatmap for numeric columns
    - create_distribution_plots(df, columns=None, max_plots=6): Create histograms for numeric columns
    - analyse_categorical_columns(df, columns=None, top_n=5): Analyse categorical columns with frequency tables
    """
    
    # Create message chain
    messages = []
    messages.append(SystemMessage(content=f"""
    You are a data cleaning agent trying to generate dataset summaries.
    
    Dataset info: Shape: {df.shape}, Sample: {df.head(3).to_string()}
    
    Helper functions available:
    {helper_docs}

    Libraries available:
    - pd (pandas), np (numpy)
    - math, re, datetime
    - preprocessing, impute (from sklearn)
    - plt (matplotlib.pyplot), sns (seaborn)
    - stats (from scipy)
    
    Rules:
    - Return only executable Python code, no explanations, no markdown blocks
    - Use helper functions where possible
    - Store final result in 'result_df'
    - Always print results using print() statements
    - For specific columns mentioned in query, pass them as lists to helper functions
    - Use df.copy() at the start to avoid modifying original
    - Set result_df = df at the end (even though df won't be modified for summaries)
    - No explanations, just code

    Examples:
    - "show basic stats for age and income" -> calculate_basic_stats(df, columns=['age', 'income'])
    - "correlation matrix" -> generate_correlation_matrix(df)
    - "summary of all data" -> use multiple helper functions
    - "distribution plots" -> create_distribution_plots(df)
    """))
    messages.append(HumanMessage(content=f"User request: {user_query}"))

    # Call LLM with message chain
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
    response = llm.invoke(messages)
    # Models sometimes wrap the answer in a Markdown fence despite the rules;
    # the fence itself is not valid Python, so drop it before executing.
    generated_code = _strip_code_fences(response.content)

    print(f"Generated code for summaries:\n{generated_code}\n")

    # Snapshot before running anything so the fallback is always defined.
    original_df = df.copy()

    # Namespace handed to the generated code. It exposes everything the
    # system prompt advertises as available.
    exec_globals = {
        'df': df,
        'pd': pd,
        'np': np,
        'math': math,
        're': re,
        'datetime': datetime,
        'preprocessing': preprocessing,
        'impute': impute,
        'plt': plt,
        'sns': sns,
        'stats': stats,
        'calculate_basic_stats': calculate_basic_stats,
        'calculate_five_number_summary': calculate_five_number_summary,
        'calculate_mode_stats': calculate_mode_stats,
        'generate_correlation_matrix': generate_correlation_matrix,
        'create_distribution_plots': create_distribution_plots,
        # The prompt uses the British spelling; keep the American alias that
        # was registered previously as well.
        'analyse_categorical_columns': analyse_categorical_columns,
        'analyze_categorical_columns': analyse_categorical_columns,
        'print': print
    }

    try:
        # SECURITY: this executes model-generated code with full interpreter
        # privileges. Only use with trusted queries/data, or sandbox it.
        exec(generated_code, exec_globals)
    except Exception as e:
        print(f"Error: {e}")
        print(f"Generated Code:{generated_code}")
        return original_df
    return df

In [None]:
# TEST OUT YOUR FEATURE

## Import Data

## Run code