## **Feature:** Missing Values

**Names:** Tanat

### **What it does**
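Inspects each column of a pandas DataFrame and either suggests or applies a missing-value strategy, chosen from the column's dtype, share of missing values, skewness (numeric columns), and cardinality (categorical columns). Columns whose share of missing values exceeds `drop_threshold` are dropped.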

### **Helper Functions**
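- `analyze_missing_values(df, drop_threshold=0.5)`: returns a dict mapping each column to its dtype, share of missing values, and a suggested imputation method.
- `auto_impute(df, drop_threshold=0.5)`: returns the DataFrame with missing values imputed according to the same kind of heuristics.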

In [12]:
# Read the OpenAI API key from the environment.
# load_dotenv() must run before the lookup so that any values it loads
# are visible in os.environ.
import os

from dotenv import load_dotenv

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    # Only warn: the notebook keeps running without a key.
    print("OpenAI API Key not found")

# Import libraries
import pandas as pd
import numpy as np
import math
import re
import datetime

# Langchain imports
from langchain.chat_models import ChatOpenAI  
from langchain.schema import HumanMessage, SystemMessage

In [10]:
def analyze_missing_values(df, drop_threshold=0.5):
    """
    Suggest a missing-value imputation method for every column of a DataFrame.

    The suggestion is based on simple best-practice heuristics:

    * share of missing values > ``drop_threshold`` -> drop the column
    * numeric column -> mean if ``abs(skew) < 1``, otherwise median
      (median is also chosen when the skewness is NaN)
    * object / string / categorical column -> mode when it has at most 10
      distinct values, otherwise "Unknown" or a predictive model
    * datetime column (naive or tz-aware) -> forward/backward fill
    * anything else -> custom handling

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect. It is not modified.
    drop_threshold : float, default 0.5
        Fraction of missing values above which dropping is suggested.

    Returns
    -------
    dict
        ``{column: {"dtype": str, "missing_pct": float, "suggestion": str}}``
        with ``missing_pct`` rounded to 3 decimals.
    """
    types = pd.api.types
    suggestions = {}

    for col in df.columns:
        series = df[col]
        dtype = series.dtype
        missing_pct = series.isna().mean()

        # pandas dtype predicates are used instead of np.issubdtype because
        # the latter raises TypeError on extension dtypes (categorical,
        # tz-aware datetime, string), which made those branches unreachable.
        is_numeric = types.is_numeric_dtype(dtype) and not types.is_bool_dtype(dtype)
        is_categorical_like = (
            isinstance(dtype, pd.CategoricalDtype)
            or types.is_object_dtype(dtype)
            or types.is_string_dtype(dtype)
        )

        if missing_pct > drop_threshold:
            suggestion = "Drop column (too many missing values)"
        elif is_numeric:
            # Mean is sensitive to outliers, so only use it for roughly
            # symmetric distributions. NaN skewness fails the comparison
            # and therefore falls back to the median.
            skewness = series.dropna().skew()
            if abs(skewness) < 1:
                suggestion = "Mean imputation"
            else:
                suggestion = "Median imputation (skewed distribution)"
        elif is_categorical_like:
            # Low vs. high cardinality
            if series.nunique(dropna=True) <= 10:
                suggestion = "Mode imputation (most frequent)"
            else:
                suggestion = "Impute with 'Unknown' or predictive model"
        elif types.is_datetime64_any_dtype(dtype):
            suggestion = "Forward/Backward fill or interpolation (time series)"
        else:
            suggestion = "Custom handling needed"

        suggestions[col] = {
            "dtype": str(dtype),
            "missing_pct": round(float(missing_pct), 3),
            "suggestion": suggestion,
        }

    return suggestions

In [14]:
import pandas as pd
import numpy as np

def _fill_allowing_new_category(series, value):
    """Fill missing entries with ``value``, adding it as a category if needed."""
    # fillna on a categorical Series raises when the fill value is not one
    # of its categories, so register it first.
    if isinstance(series.dtype, pd.CategoricalDtype) and value not in series.cat.categories:
        series = series.cat.add_categories([value])
    return series.fillna(value)


def auto_impute(df, drop_threshold=0.5):
    """
    Automatically impute missing values in a DataFrame based on simple
    best-practice heuristics.

    Per column:

    * share of missing values > ``drop_threshold`` -> column is dropped
    * no missing values -> left untouched (and nothing is printed)
    * numeric -> mean if ``abs(skew) < 1``, otherwise median
    * object / string / categorical -> mode when low cardinality
      (at most 20 distinct values, or distinct/rows < 0.05), otherwise the
      literal ``"Unknown"``
    * datetime (naive or tz-aware) -> forward fill, then backward fill
    * anything else -> cast to object and filled with ``"Unknown"``

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is NOT modified; the function works on a copy.
    drop_threshold : float, default 0.5
        Fraction of missing values above which a column is dropped.

    Returns
    -------
    pandas.DataFrame
        A new DataFrame with imputed values and without the dropped columns.
    """
    types = pd.api.types

    # Work on a copy: the original assigned into the caller's frame until the
    # first drop() rebound `df`, leaving the input partially modified.
    df = df.copy()
    columns_to_drop = []

    for col in df.columns:
        series = df[col]
        dtype = series.dtype
        missing_pct = series.isna().mean()

        # Drop if too many missing
        if missing_pct > drop_threshold:
            columns_to_drop.append(col)
            continue

        # Nothing to impute: skip so we do not report an imputation that
        # never happened.
        if not series.isna().any():
            continue

        # pandas dtype predicates are used instead of np.issubdtype because
        # the latter raises TypeError on extension dtypes (categorical,
        # tz-aware datetime, string).
        is_numeric = types.is_numeric_dtype(dtype) and not types.is_bool_dtype(dtype)
        is_categorical_like = (
            isinstance(dtype, pd.CategoricalDtype)
            or types.is_object_dtype(dtype)
            or types.is_string_dtype(dtype)
        )

        # Numerical features (mean or median)
        if is_numeric:
            skewness = series.dropna().skew()
            if abs(skewness) < 1:
                fill_value = series.mean()
                print(f"Imputed '{col}' with mean")
            else:
                # Also reached when skewness is NaN (comparison is False).
                fill_value = series.median()
                print(f"'{col}' is skewed, imputed with median")
            df[col] = series.fillna(fill_value)

        # Categorical features
        elif is_categorical_like:
            n_unique = series.nunique(dropna=True)
            unique_ratio = n_unique / df.shape[0]
            # NOTE(review): analyze_missing_values uses a different
            # low-cardinality rule (<= 10 distinct values); confirm which
            # threshold is intended.
            # Low cardinality: fill with the mode, else impute with "Unknown"
            if n_unique <= 20 or unique_ratio < 0.05:
                modes = series.mode(dropna=True)
                fill_value = modes.iloc[0] if not modes.empty else "Unknown"
                df[col] = _fill_allowing_new_category(series, fill_value)
                print(f"{col} has low Cardinality, imputed with mode")
            else:
                df[col] = _fill_allowing_new_category(series, "Unknown")

        # Datetime features
        elif types.is_datetime64_any_dtype(dtype):
            # ffill()/bfill() replace the deprecated fillna(method=...) form.
            df[col] = series.ffill().bfill()

        # Fallback
        else:
            # Cast to object first so a string placeholder is always a
            # valid value regardless of the column's original dtype.
            df[col] = series.astype(object).fillna("Unknown")

    if columns_to_drop:
        df = df.drop(columns=columns_to_drop)
    return df