## **Feature:** Dataset Summaries

**Names:** Ishan

### **What it does**
Generates statistical summaries including descriptive statistics (mean, median, mode, variance, std, quartiles), distribution analysis, correlation matrices, and basic visualisations for dataset exploration.

### **Helper Functions**
- calculate_basic_stats(df, columns=None): Calculate mean, median, std, variance, skewness, kurtosis for numeric columns
- calculate_five_number_summary(df, columns=None): Calculate min, Q1, median, Q3, max, IQR for numeric columns
- calculate_mode_stats(df, columns=None): Calculate mode, frequency, unique values for all columns
- generate_correlation_matrix(df, method='pearson'): Create correlation matrix and heatmap for numeric columns
- create_distribution_plots(df, columns=None, max_plots=6): Create histograms for numeric columns
- analyse_categorical_columns(df, columns=None, top_n=5): Analyse categorical columns with frequency tables

In [66]:
# Get API Key
import os
from dotenv import load_dotenv
from sklearn.preprocessing import LabelEncoder

# load_dotenv() runs first so that the key can be picked up from the
# environment below. A missing or empty key is only reported, not fatal.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    print("OpenAI API Key not found")

# Import libraries
import pandas as pd
import numpy as np
import math
import re
import datetime
from sklearn import preprocessing, impute
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Langchain imports
from langchain.chat_models import ChatOpenAI  
from langchain.schema import HumanMessage, SystemMessage

In [67]:
def one_hot_encode(df, user_query=None):
    """
    One-hot encode the categorical columns of a DataFrame.

    Args:
        df: pandas DataFrame to encode. It is not modified in place.
        user_query: Optional string describing what to encode. It is not used
            by this function; it is accepted only so the signature is uniform
            with the other encoding helpers.

    Returns:
        A new DataFrame in which every column of dtype ``object`` or
        ``category`` has been expanded by ``pd.get_dummies`` with
        ``drop_first=False`` (no level is dropped). If no such column exists,
        a copy of ``df`` is returned.
    """
    # Detect categorical columns automatically
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

    # Nothing to encode: still hand back a copy so callers always get a new object.
    if not categorical_cols:
        print("No categorical columns found to encode.")
        return df.copy()

    print(f"Encoding columns: {categorical_cols}")

    # One-hot encode categorical columns
    df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=False)

    print(f"New DataFrame shape after encoding: {df_encoded.shape}")
    return df_encoded


In [68]:

def label_encode(df, user_query=None):
    """
    Label encode the categorical columns of a DataFrame.

    Args:
        df: pandas DataFrame to encode. It is not modified in place.
        user_query: Optional string describing what to encode. It is not used
            by this function; it is accepted only so the signature is uniform
            with the other encoding helpers.

    Returns:
        A new DataFrame in which every column of dtype ``object`` or
        ``category`` has been replaced by the integer labels produced by a
        fresh ``LabelEncoder`` fitted on that column. If no such column
        exists, a copy of ``df`` is returned.
    """
    # Detect categorical columns automatically
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

    if not categorical_cols:
        print("No categorical columns found to encode.")
        return df.copy()

    print(f"Label encoding columns: {categorical_cols}")

    # Create a copy to avoid modifying original
    df_encoded = df.copy()

    # Encode each categorical column with its own encoder.
    # NOTE(review): values are cast to str first, so missing values may be
    # encoded as an ordinary label (e.g. "nan") instead of staying missing;
    # confirm this is intended for the pandas version in use.
    for col in categorical_cols:
        le = LabelEncoder()
        df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))

    print(f"New DataFrame shape after label encoding: {df_encoded.shape}")
    return df_encoded


In [None]:
def _strip_code_fences(text):
    """Return ``text`` without a wrapping Markdown code fence, if it has one."""
    code = text.strip()
    if code.startswith("```"):
        # Drop the whole opening fence line (it may carry a language tag).
        newline_at = code.find("\n")
        code = code[newline_at + 1:] if newline_at != -1 else ""
        if code.rstrip().endswith("```"):
            code = code.rstrip()[:-3]
    return code.strip()


def encode_categorical(df, user_query):
    """
    Main function that gets called by the main router.

    Asks the LLM to write encoding code for the request, executes that code,
    and returns the resulting DataFrame.

    Args:
        df: pandas DataFrame to encode. It is not modified; the generated
            code runs against a copy.
        user_query: Free-text request, e.g. mentioning 'one-hot' or 'label'.

    Returns:
        The DataFrame the generated code stored in ``encoded_df``. If the
        code did not define ``encoded_df``, the working copy it ran against
        is returned. If the code raised, or ``encoded_df`` is not a
        DataFrame, an unmodified copy of ``df`` is returned.
    """

    helper_docs = """
    - One-hot encoding: pd.get_dummies(df, columns=[categorical_cols], drop_first=False)
    - Label encoding: sklearn.preprocessing.LabelEncoder()
    """

    messages = []
    messages.append(SystemMessage(content=f"""
    You are a data cleaning agent trying to encode categorical columns in a dataset.

    Dataset info: Shape: {df.shape}, Sample: {df.head(3).to_string()}

    Helper functions available:
    {helper_docs}

    Libraries available:
    - pd (pandas), np (numpy)
    - preprocessing (from sklearn)
    
    Rules:
    - Return only executable Python code, no explanations, no markdown blocks
    - Automatically detect categorical columns
    - If query mentions 'one-hot' or 'one hot', perform one-hot encoding
    - If query mentions 'label', perform label encoding
    - Store final result in 'encoded_df'
    - Always print which columns are encoded and new DataFrame shape
    - Use df.copy() at the start to avoid modifying original
    """))
    messages.append(HumanMessage(content=f"User request: {user_query}"))

    # Call LLM with message chain
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
    response = llm.invoke(messages)
    # The reply may still be wrapped in ``` fences despite the prompt; exec()
    # would reject those with a SyntaxError, so remove them first.
    generated_code = _strip_code_fences(response.content)

    print(f"Generated code for encoding:\n{generated_code}\n")

    # Run the generated code against a copy so that a failure part-way
    # through cannot leave the caller's DataFrame half-modified.
    working_df = df.copy()
    exec_globals = {
        'df': working_df,
        'pd': pd,
        'np': np,
        'preprocessing': preprocessing,
        'LabelEncoder': preprocessing.LabelEncoder,
        'print': print
    }

    try:
        # SECURITY: exec() runs model-generated code with the full privileges
        # of this process. Only use with trusted queries/data, or sandbox it.
        exec(generated_code, exec_globals)
    except Exception as e:
        print(f"Error: {e}")
        print(f"Generated Code:{generated_code}")
        return df.copy()

    result = exec_globals.get('encoded_df', working_df)
    if not isinstance(result, pd.DataFrame):
        # The router expects a DataFrame back; fall back to the input data.
        print(f"Error: 'encoded_df' is not a DataFrame (got {type(result).__name__})")
        print(f"Generated Code:{generated_code}")
        return df.copy()
    return result
