# Dataset links:
1. https://www.kaggle.com/datasets/pavansubhasht/ibm-hr-analytics-attrition-dataset
2. https://www.kaggle.com/datasets/stealthtechnologies/employee-attrition-dataset

# Discussions

1. How do we use Gen-AI in this project? (Code plugin) - Done by Xiwen & Aaron
	- RAG (broad-based recommendation?)
		- Scrape or get the best practices from top companies
		- Use the information to form the knowledge base for the LLM
		- Use the LLM to make broad-based recommendations based on the parameters of a group of employees.
2. EDA & Features - Done by Shenghao
	- Feature selection  (Domain knowledge)
	- Dimension reduction? - Done by (TBD)
3. Classification / NN-DL - Done by Jia Hong & Kelda
	- Train a few ensemble models and stack the best 2-3? 
	- Hyper-param tuning 
	- NN / DL: Are we going to use classical supervised learning or NN/DL?
4. Deployment - Done by Alexander
	- Streamlit
	- FastAPI
5. Report structure - Done by Kelda
6. Presentation structure - Done by Jia Hong
7. Report + Presentation content - By Everyone


# Tentative deadline: (Main project deadline: 7th November 6.30pm)
- Best practices from top companies (Research) : 27 Oct
- Feature selection : 27 Oct
- Classification : 1 Nov
- Gen-AI RAG : 1 Nov
- Deployment : 3 Nov
- Draft report + presentation : 5 Nov
- Report + presentation : 6 Nov

In [None]:
%pip install matplotlib seaborn pandas numpy openai scikit-learn fastapi streamlit dotenv pydantic requests
%pip install torch torchvision --index-url https://download.pytorch.org/whl/cu130
%pip install langchain-core langchain-openai langgraph 
%pip install langchain

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from openai import OpenAI
from pydantic import BaseModel

In [2]:
# Load the OpenAI API key: from Colab secrets when running in Google Colab,
# otherwise from the environment (optionally populated from a local .env file).

OPENAI_API_KEY = ''
try:
    from google.colab import userdata
except ImportError:
    # Not running in Colab.
    import dotenv
    dotenv.load_dotenv()
    print("Running locally")
    OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', '')
else:
    print("Running in Google Colab")
    try:
        # userdata.get() is called with the secret name only; a missing secret or denied
        # notebook access is reported here instead of aborting the cell.
        OPENAI_API_KEY = userdata.get('OPENAI_API_KEY') or ''
    except Exception as e:
        print("Could not read OPENAI_API_KEY from Colab secrets:", e)

# NOTE: the client instance is deliberately named `openai`; later cells call
# `openai.chat.completions.create(...)` on it.
openai = OpenAI(api_key=OPENAI_API_KEY)
try:
    # Listing models is a cheap request that fails when the key is rejected.
    response = openai.models.list()
    print("Available models:", response.data)
    print("OpenAI API key is valid.")
except Exception as e:
    print("Error with OpenAI API key:", e)

Running locally
Available models: [Model(id='gpt-4o-mini', created=1721172741, object='model', owned_by='system'), Model(id='gpt-4o-mini-realtime-preview', created=1734387380, object='model', owned_by='system'), Model(id='gpt-5-nano', created=1754426384, object='model', owned_by='system'), Model(id='text-embedding-ada-002', created=1671217299, object='model', owned_by='openai-internal')]
OpenAI API key is valid.


In [3]:
# Load the IBM HR attrition CSV (path is relative to the notebook's working directory)
df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')
# Preview the first 10 rows
df.head(10)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
5,32,No,Travel_Frequently,1005,Research & Development,2,2,Life Sciences,1,8,...,3,80,0,8,2,2,7,7,3,6
6,59,No,Travel_Rarely,1324,Research & Development,3,3,Medical,1,10,...,1,80,3,12,3,2,1,0,0,0
7,30,No,Travel_Rarely,1358,Research & Development,24,1,Life Sciences,1,11,...,2,80,1,1,2,3,1,0,0,0
8,38,No,Travel_Frequently,216,Research & Development,23,3,Life Sciences,1,12,...,2,80,0,10,2,3,9,7,1,8
9,36,No,Travel_Rarely,1299,Research & Development,27,3,Medical,1,13,...,2,80,2,17,3,2,7,7,7,7


In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# (rows, columns) of the dataset
df.shape

In [None]:
# Column dtypes, non-null counts and memory usage
df.info()

In [None]:
# Derive numeric versions of the main categorical columns; the original columns are kept.

# Attrition -> Attrition_Binary: Yes=1, anything else=0
df["Attrition_Binary"] = df["Attrition"].eq("Yes").astype("int64")

# MaritalStatus -> Marital_Status_Class: Single=0, Married=1, anything else (e.g. Divorced)=2
df["Marital_Status_Class"] = (
    df["MaritalStatus"].map({"Single": 0, "Married": 1}).fillna(2).astype("int64")
)

# Gender -> Gender_Binary: Male=1, anything else=0
df["Gender_Binary"] = df["Gender"].eq("Male").astype("int64")

# OverTime -> OverTime_Binary: Yes=1, anything else=0
df["OverTime_Binary"] = df["OverTime"].eq("Yes").astype("int64")

# BusinessTravel -> Business_Travel_Class: Non-Travel=0, Travel_Rarely=1, anything else (e.g. Travel_Frequently)=2
df["Business_Travel_Class"] = (
    df["BusinessTravel"].map({"Non-Travel": 0, "Travel_Rarely": 1}).fillna(2).astype("int64")
)

display(df.head(10))

In [None]:

# Class balance of the target
df['Attrition_Binary'].value_counts().plot.bar()
plt.title('Attrition Distribution')
plt.show()

# Share of each marital-status class
df['Marital_Status_Class'].value_counts().plot.pie(autopct='%1.1f%%')
plt.title('Marital Status Distribution')
plt.show()

# Age histogram
df['Age'].plot.hist(bins=10)
plt.title('Age Distribution')
plt.xlabel('Age')
plt.show()

# Pairwise correlations between all numeric columns
import seaborn as sns
correlation_matrix = df.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(12,10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the dataset
df.describe()

# Experiment with Gen-AI

In [None]:
# Ask the LLM for free-text insights based on the dataset's summary statistics

from IPython.display import Markdown, display

system_prompt = """You are a data analyst. Given the summary statistics of a dataset, provide insights about the data."""

# Only df.describe() is sent to the model, not the raw rows.
summary_stats_text = df.describe().to_string()
user_prompt = (
    "Here are the summary statistics of the dataset:\n"
    f"{summary_stats_text}\n"
    "Provide insights about the data."
)

insight_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
]
response = openai.chat.completions.create(model="gpt-4o-mini", messages=insight_messages)

print("AI-generated insights:")
# Render the reply as Markdown in the notebook output
display(Markdown(response.choices[0].message.content))

In [None]:
# Ask the LLM for the most important features and parse the reply into a typed object


# Expected shape of the model's JSON reply
class FeatureSuggestions(BaseModel):
    important_features: list[str]  # feature names suggested by the model
    reasoning: str                 # short justification for the selection


system_prompt_fs = """You are a feature engineering expert.
Return your answer as a JSON object like this:
{
  "important_features": ["feature1", "feature2"],
  "reasoning": "short explanation"
}"""

stats_for_fs = df.describe().to_string()
user_prompt_fs = (
    "Here are the summary statistics of the dataset:\n"
    f"{stats_for_fs}\n"
    "Suggest the top 10 most important features for predicting employee attrition."
)

fs_messages = [
    {"role": "system", "content": system_prompt_fs},
    {"role": "user", "content": user_prompt_fs},
]
# JSON mode: the reply is requested as a single JSON object
response_fs = openai.chat.completions.create(
    model="gpt-4o-mini",
    response_format={"type": "json_object"},
    temperature=0.5,
    messages=fs_messages,
)
response_json = response_fs.choices[0].message.content

# Validate the reply against the schema; raises if it does not match
parsed = FeatureSuggestions.model_validate_json(response_json)

print(parsed.model_dump_json(indent=2))
print("Important features:", parsed.important_features)

In [None]:
# TODO: Use Openai API to perform EDA, will need to perform function calling

import json


# Define functions that the AI can call
def check_missing_data(df_name="df"):
    """Summarise missing values in the notebook-level ``df``.

    ``df_name`` is accepted so the tool schema can pass it, but it is not used:
    the global ``df`` is always inspected.

    Returns a text report listing count and percentage of missing values for
    every column that has at least one, or a fixed message when there are none.
    """
    missing_count = df.isna().sum()
    report = pd.DataFrame({
        'Missing_Count': missing_count,
        'Missing_Percentage': (missing_count / len(df)) * 100,
    })
    # Keep only the columns that actually have gaps
    report = report.loc[report['Missing_Count'] > 0]

    if report.empty:
        return "No missing data found in the dataset."
    return f"Missing data summary:\n{report.to_string()}"

def plot_visualization(plot_type, column_name, title=None, bins=20):
    """Draw one plot for a column of the notebook-level ``df`` and report the outcome as text.

    Args:
        plot_type: One of "histogram", "bar", "pie", "box" or "scatter".
        column_name: Column to plot. For scatter plots, two column names in
            "column1,column2" form (whitespace around the comma is ignored).
        title: Optional plot title; a default is generated when omitted.
        bins: Number of bins, used by histograms only.

    Returns:
        A status string. Failures are returned as "Error ..." strings instead of
        being raised, so the caller can pass them back to the model.
    """
    try:
        if plot_type == "scatter":
            # The model often writes "A, B"; strip so the space does not become part of the name.
            cols = [part.strip() for part in column_name.split(',')]
            if len(cols) != 2:
                return "Error: Scatter plot requires two columns separated by comma"
            x_col, y_col = cols
            # Look the columns up before opening a figure so a bad name leaves no empty plot behind.
            x_values, y_values = df[x_col], df[y_col]
            plt.figure(figsize=(10, 6))
            plt.scatter(x_values, y_values, alpha=0.5)
            plt.xlabel(x_col)
            plt.ylabel(y_col)
            plt.title(title or f'Scatter Plot: {x_col} vs {y_col}')
            plt.grid(alpha=0.3)
            plt.show()
            return f"Created scatter plot for {x_col} vs {y_col}"

        if plot_type not in ("histogram", "bar", "pie", "box"):
            return f"Error: Unknown plot type '{plot_type}'"

        # Same reasoning as above: fail on an unknown column before creating the figure.
        series = df[column_name]
        plt.figure(figsize=(10, 6))

        if plot_type == "histogram":
            series.plot(kind='hist', bins=bins, edgecolor='black')
            plt.title(title or f'Histogram of {column_name}')
            plt.xlabel(column_name)
            plt.ylabel('Frequency')
            plt.grid(axis='y', alpha=0.3)
            plt.show()
            return f"Created histogram for {column_name}"

        if plot_type == "bar":
            series.value_counts().plot(kind='bar', edgecolor='black')
            plt.title(title or f'Bar Chart of {column_name}')
            plt.xlabel(column_name)
            plt.ylabel('Count')
            plt.xticks(rotation=45)
            plt.grid(axis='y', alpha=0.3)
            plt.show()
            return f"Created bar chart for {column_name}"

        if plot_type == "pie":
            series.value_counts().plot(kind='pie', autopct='%1.1f%%')
            plt.title(title or f'Pie Chart of {column_name}')
            plt.ylabel('')
            plt.show()
            return f"Created pie chart for {column_name}"

        # Remaining case: "box"
        df.boxplot(column=column_name)
        plt.title(title or f'Box Plot of {column_name}')
        plt.ylabel(column_name)
        plt.grid(axis='y', alpha=0.3)
        plt.show()
        return f"Created box plot for {column_name}"
    except Exception as e:
        return f"Error creating {plot_type} for {column_name}: {str(e)}"

def encode_categorical_column(column_name):
    """Encode a categorical column of the notebook-level ``df`` as integer codes.

    Each distinct value gets a code (0, 1, 2, ...) in order of first appearance.
    The codes are written to a new column named ``<column_name>_Encoded``; the
    source column is left unchanged.

    Args:
        column_name: Name of the column to encode.

    Returns:
        A status string describing the mapping that was applied, or why nothing
        was done. Failures are returned as text instead of being raised.
    """
    try:
        if column_name not in df.columns:
            return f"Column '{column_name}' not found in dataset"

        # Re-coding a numeric column would replace real values with arbitrary codes.
        if pd.api.types.is_numeric_dtype(df[column_name]):
            return f"Column '{column_name}' is already numeric ({df[column_name].dtype}); no encoding needed"

        mapping_dict = {val: idx for idx, val in enumerate(df[column_name].unique())}

        new_column_name = f"{column_name}_Encoded"
        df[new_column_name] = df[column_name].map(mapping_dict)

        # Check if any values were not mapped
        unmapped = df[df[new_column_name].isna()][column_name].unique()
        if len(unmapped) > 0:
            return f"Warning: Column '{column_name}' encoded to '{new_column_name}', but values {unmapped.tolist()} were not in mapping"

        return f"Successfully encoded '{column_name}' to '{new_column_name}' with mapping: {mapping_dict}"
    except Exception as e:
        return f"Error encoding {column_name}: {str(e)}"

def get_column_info(column_name=None):
    """Describe the notebook-level ``df`` as text.

    Without ``column_name`` (or with an empty one) the reply lists every column
    and its dtype. With a column name it reports that column's dtype, its
    number of distinct values (at most 20 of them shown) and the first 5 rows.
    An unknown column yields a "not found" message.
    """
    if not column_name:
        return f"Available columns:\n{df.columns.tolist()}\n\nData types:\n{df.dtypes.to_string()}"

    if column_name not in df.columns:
        return f"Column '{column_name}' not found in dataset"

    series = df[column_name]
    distinct = series.unique()
    return (
        f"Column '{column_name}':\n"
        f"- Type: {series.dtype}\n"
        f"- Unique values ({len(distinct)}): {distinct.tolist()[:20]}\n"
        f"- Sample data: {series.head(5).tolist()}"
    )

# Function (tool) schemas advertised to the OpenAI model.
# Each entry must describe what the matching Python function really accepts,
# because the model builds its call arguments from these descriptions.
functions = [
    {
        "name": "check_missing_data",
        "description": "Check for missing data in the dataset and return a summary",
        "parameters": {
            "type": "object",
            "properties": {
                "df_name": {
                    "type": "string",
                    "description": "Name of the dataframe (default: 'df')"
                }
            }
        }
    },
    {
        "name": "plot_visualization",
        "description": "Create a visualization (histogram, bar, pie, box, or scatter plot) for a specified column",
        "parameters": {
            "type": "object",
            "properties": {
                "plot_type": {
                    "type": "string",
                    "enum": ["histogram", "bar", "pie", "box", "scatter"],
                    "description": "Type of plot to create"
                },
                "column_name": {
                    "type": "string",
                    "description": "Name of the column to plot. For scatter plots, use 'column1,column2' format"
                },
                "title": {
                    "type": "string",
                    "description": "Optional title for the plot"
                },
                "bins": {
                    "type": "integer",
                    "description": "Number of bins for histogram (default: 20)"
                }
            },
            "required": ["plot_type", "column_name"]
        }
    },
    {
        "name": "encode_categorical_column",
        # The tool takes only a column name; it derives the value-to-code mapping itself.
        "description": "Encode a categorical column to integer codes (assigned in order of first appearance) and store them in a new '<column_name>_Encoded' column",
        "parameters": {
            "type": "object",
            "properties": {
                "column_name": {
                    "type": "string",
                    "description": "Name of the column to encode"
                }
            },
            "required": ["column_name"]
        }
    },
    {
        "name": "get_column_info",
        "description": "Get information about columns in the dataset, including data types and unique values",
        "parameters": {
            "type": "object",
            "properties": {
                "column_name": {
                    "type": "string",
                    "description": "Name of specific column to inspect. If not provided, returns info about all columns"
                }
            }
        }
    }
]

# Dispatch table: tool name reported by the model -> Python callable.
# Keys are the functions' own names, which match the names used in the schemas.
available_functions = {
    fn.__name__: fn
    for fn in (
        check_missing_data,
        plot_visualization,
        encode_categorical_column,
        get_column_info,
    )
}

print("✅ EDA functions and schemas defined successfully!")
print(f"Available functions: {list(available_functions)}")

In [None]:
# Execute OpenAI-powered EDA with function calling.
# The model is called repeatedly; whenever it requests tool calls they are executed
# locally and their text results are appended to the conversation, until the model
# answers without requesting a tool (or the iteration cap is reached).

# Create the initial prompt
system_prompt_eda = """You are a data analyst assistant. You have access to functions to perform EDA on an employee attrition dataset.

Your task is to:
1. Check for missing data in the dataset
2. Create at least 4 different visualizations to explore the data (choose appropriate plot types for different columns)
3. Identify and encode categorical columns that need to be converted to numerical values for machine learning

Use the available functions to accomplish these tasks. Call get_column_info first to understand the dataset structure."""

user_prompt_eda = """Please perform comprehensive EDA on the employee attrition dataset. 
First, check what columns are available, then check for missing data, create insightful visualizations, 
and finally encode any remaining categorical columns that haven't been encoded yet."""

# Initialize conversation
messages = [
    {"role": "system", "content": system_prompt_eda},
    {"role": "user", "content": user_prompt_eda}
]

print("🤖 Starting AI-powered EDA with function calling...\n")
print("=" * 80)

# The tool list does not change between iterations, so build it once.
tools = [{"type": "function", "function": func} for func in functions]

# Run the conversation loop
max_iterations = 15  # Prevent infinite loops
iteration = 0
finished = False  # set when the model answers without requesting a tool

while iteration < max_iterations:
    iteration += 1
    print(f"\n--- Iteration {iteration} ---")

    # Make API call
    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        tools=tools,
        tool_choice="auto"
    )

    response_message = response.choices[0].message
    messages.append(response_message)

    # Check if the model wants to call functions
    tool_calls = response_message.tool_calls

    if not tool_calls:
        # No more function calls - the assistant has finished
        finished = True
        print("\n✅ AI Assistant finished:")
        print(response_message.content)
        break

    # Process each function call. Every tool call gets a reply message, even on
    # failure, so the conversation stays well-formed and the model can see the
    # error text and correct itself.
    for tool_call in tool_calls:
        function_name = tool_call.function.name
        raw_arguments = tool_call.function.arguments

        print(f"\n🔧 Calling function: {function_name}")

        try:
            function_args = json.loads(raw_arguments) if raw_arguments else {}
        except json.JSONDecodeError as e:
            function_response = f"Error: could not parse arguments for {function_name}: {e}"
        else:
            print(f"   Arguments: {function_args}")
            function_to_call = available_functions.get(function_name)
            if function_to_call is None:
                function_response = f"Error: Unknown function '{function_name}'"
            elif not isinstance(function_args, dict):
                function_response = f"Error: arguments for {function_name} must be a JSON object"
            else:
                try:
                    # Call the actual function
                    function_response = function_to_call(**function_args)
                except Exception as e:
                    # Broad on purpose: e.g. unexpected keyword arguments from the model.
                    function_response = f"Error calling {function_name}: {e}"

        result_text = str(function_response)
        print(f"   Result: {result_text[:200]}..." if len(result_text) > 200 else f"   Result: {result_text}")

        # Add function response to messages
        messages.append({
            "role": "tool",
            "tool_call_id": tool_call.id,
            "name": function_name,
            "content": result_text
        })

print("\n" + "=" * 80)
if finished:
    print("✅ EDA completed!")
else:
    print(f"⚠️ Stopped after {max_iterations} iterations before the assistant finished.")
print(f"\nTotal iterations: {iteration}")
print(f"Check the visualizations above and the new encoded columns in the dataframe.")


In [None]:
# Show the dataframe as it stands after the AI-driven EDA step
current_columns = df.columns.tolist()

print("\n📊 Updated DataFrame Info:")
print(f"Shape: {df.shape}")
print(f"\nColumns: {current_columns}")
print("\n" + "=" * 80)
print("\nFirst few rows with all columns (including newly encoded):")

# Last expression of the cell: rendered as a table by the notebook
df.head(10)


In [None]:
# Ask the LLM to suggest an ML model and starting parameters, as schema-constrained JSON
from pydantic import BaseModel, Field
from typing import Optional


# Expected shape of the model's reply; the Field descriptions become part of the JSON schema
class ModelSuggestion(BaseModel):
    model_name: str = Field(description="Name of the suggested machine learning model")
    parameters: dict = Field(description="Suggested starting parameters for the model")
    reasoning: str = Field(description="Reasoning behind the model choice and parameters")


# Payload for response_format: a schema name plus the schema generated from the Pydantic model
model_suggestion_schema = {
    "name": "ModelSuggestion",
    "schema": ModelSuggestion.model_json_schema()
}

system_prompt_ml = """You are a machine learning expert. Based on the dataset provided, suggest an appropriate machine learning model for predicting employee attrition along with starting parameters.
Return your answer based on the following JSON schema:
{
  "model_name": "string",
  "parameters": {"type": "object"},
  "reasoning": "string"
}"""

stats_for_ml = df.describe().to_string()
user_prompt_ml = (
    "Here are the summary statistics of the dataset:\n"
    f"{stats_for_ml}\n"
    "Suggest an appropriate machine learning model for predicting employee attrition along with starting parameters."
)

ml_messages = [
    {"role": "system", "content": system_prompt_ml},
    {"role": "user", "content": user_prompt_ml},
]
response_ml = openai.chat.completions.create(
    model="gpt-4o-mini",
    response_format={"type": "json_schema", "json_schema": model_suggestion_schema},
    temperature=0.5,
    messages=ml_messages,
)
response_json_ml = response_ml.choices[0].message.content

# Validate the reply against the Pydantic model; raises if it does not match
parsed_ml = ModelSuggestion.model_validate_json(response_json_ml)

print("✅ Machine Learning Model Suggestion:")
print(parsed_ml.model_dump_json(indent=2))

# Agentic AI Experimentation

# 🔁 Multi-Agent EDA with LangChain + LangGraph

This section builds a small agentic pipeline that collaborates on EDA for the employee attrition dataset using OpenAI tools:

- Data dictionary: generates column descriptions with an LLM (Pydantic-typed)
- Missing values: decides mean vs median and imputes (Pydantic-typed)
- Visualizations: produces plots based on the dictionary
- Encoding: recommends and applies encodings for non-numeric columns (LLM-guided, Pydantic-typed)
- Insights: generates human-readable Markdown
- Feature selection: suggests whether to use all columns or a subset (Pydantic-typed)

You can re-run this section safely; it’s designed to be idempotent for repeated runs.

In [None]:
# Pydantic models and helpers for the multi-agent EDA
from typing import List, Literal, Optional, Dict, Any
from pydantic import BaseModel, Field
import pandas as pd
import numpy as np

# 1) Data dictionary
# NOTE(review): these models are documented with `#` comments rather than docstrings because a
# class docstring would presumably end up in the JSON schema used as format instructions — confirm.

# One entry of the data dictionary: describes a single dataset column.
class DataDictionaryField(BaseModel):
    name: str          # column name
    dtype: str         # dtype as text
    description: str   # human-readable description of the column
    is_numeric: bool   # whether the column holds numeric data
    unique_values_sample: List[Any] = Field(default_factory=list)  # sample of distinct values; empty list by default

# The full data dictionary: a dataset name plus one field entry per column.
class DataDictionary(BaseModel):
    dataset_name: str
    fields: List[DataDictionaryField]

# 2) Missing values decisions
# Allowed imputation strategies; "none" means the column is left as it is.
ImputeStrategy = Literal["mean", "median", "none"]

# Imputation choice for one column, with the stated reason for it.
class ImputationDecision(BaseModel):
    column: str               # column the decision applies to
    strategy: ImputeStrategy  # one of the strategies above
    reason: str               # free-text justification

# Collection of per-column imputation decisions.
class ImputationPlan(BaseModel):
    decisions: List[ImputationDecision]

# 4) Encoding decisions
# Allowed encoding strategies for a column.
EncodingType = Literal[
    "one_hot",    # small cardinality categorical
    "ordinal",    # natural order
    "binary",     # boolean/yes-no
    "target",     # high-cardinality target encoding; not implemented here, intended to fall back to one_hot
                  # NOTE(review): the fallback has to be provided by the code that applies the plan - confirm it does
    "none"        # leave the column unencoded
]

# Encoding choice for one column, with the stated reason for it.
class EncodingDecision(BaseModel):
    column: str                  # column the decision applies to
    encode: bool                 # whether the column should be encoded at all
    encoding_type: EncodingType  # one of the strategies above
    reason: str                  # free-text justification

# Collection of per-column encoding decisions.
class EncodingPlan(BaseModel):
    decisions: List[EncodingDecision]

# 6) Feature selection
# Recommendation on which columns to use as model features.
class FeatureSelection(BaseModel):
    use_all_columns: bool        # True when every column should be used
    selected_columns: List[str]  # chosen subset of column names
    reason: str                  # free-text justification

# Helpers
# Tuple of dtype classes treated as numeric.
# NOTE(review): not referenced in the visible part of this notebook — confirm it is still needed.
NUMERIC_DTYPES = (np.number,)


def summarize_dataframe(df: pd.DataFrame) -> Dict[str, Any]:
    """Build a plain-Python summary of *df*.

    Args:
        df: The dataframe to summarise.

    Returns:
        A dict with these keys:
        - "shape": ``df.shape``
        - "columns": one dict per column with its name, dtype (as text),
          numeric flag, number of distinct non-missing values and up to 10
          distinct non-missing sample values
        - "missing": missing-value count per column
        - "desc_numeric": ``describe()`` of the numeric columns as a nested
          dict; empty when the frame has no numeric columns
        - "dtypes": dtype (as text) per column
    """
    numeric_df = df.select_dtypes(include=[np.number])
    # describe() raises ValueError on a frame without columns, so report an empty summary instead.
    desc_numeric = numeric_df.describe().to_dict() if numeric_df.shape[1] else {}

    columns = []
    for col in df.columns:
        series = df[col]
        columns.append({
            "name": col,
            "dtype": str(series.dtype),
            "is_numeric": pd.api.types.is_numeric_dtype(series),
            "n_unique": int(series.nunique(dropna=True)),
            # Slice before converting so only the sample is turned into Python objects.
            "unique_values_sample": series.dropna().unique()[:10].tolist(),
        })

    return {
        "shape": df.shape,
        "columns": columns,
        "missing": df.isna().sum().to_dict(),
        "desc_numeric": desc_numeric,
        "dtypes": df.dtypes.astype(str).to_dict(),
    }

print("✅ Models and helpers ready.")

In [None]:
# LangChain + LangGraph setup and LLMs
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import PydanticOutputParser, StrOutputParser
from langgraph.graph import StateGraph, END
from typing import TypedDict
import os

# LangChain's OpenAI integration reads the key from the environment, so copy the
# notebook-level OPENAI_API_KEY there when it is set and the environment has none.
_notebook_api_key = globals().get('OPENAI_API_KEY')
if _notebook_api_key and not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = _notebook_api_key

# Two chat models with identical settings: one used for structured (parsed) output,
# one for free-text output.
llm_json = ChatOpenAI(model="gpt-4o-mini", temperature=0.2)
llm_text = ChatOpenAI(model="gpt-4o-mini", temperature=0.2)

# Output parsers: each turns the model's reply into the named Pydantic model
parser_data_dict = PydanticOutputParser(pydantic_object=DataDictionary)
parser_impute = PydanticOutputParser(pydantic_object=ImputationPlan)
parser_encode = PydanticOutputParser(pydantic_object=EncodingPlan)
parser_features = PydanticOutputParser(pydantic_object=FeatureSelection)

# Prompts
# Each template lists its input variables in {braces}. Where present, {format_instructions}
# is pre-filled via .partial() with the matching parser's schema instructions, so callers
# only supply the remaining variables.

# Data dictionary prompt. Inputs: dataset_name, columns_json.
prompt_data_dict = ChatPromptTemplate.from_messages([
    ("system", "You are a data dictionary generator for an HR attrition dataset. Given column summaries, write concise, practical descriptions for business analysts."),
    ("human", "Dataset name: {dataset_name}\n\nColumns JSON (with dtype, numeric flag, unique samples):\n{columns_json}\n\nReturn a JSON strictly matching this schema:\n{format_instructions}")
]).partial(format_instructions=parser_data_dict.get_format_instructions())

# Imputation prompt. Inputs: missing_json, desc_json, dtypes_json.
prompt_impute = ChatPromptTemplate.from_messages([
    ("system", "You decide imputation for missing values. Use mean for symmetric numeric distributions without strong outliers; use median otherwise. For non-numeric, choose 'none'."),
    ("human", "Missing summary: {missing_json}\nNumeric describe: {desc_json}\nDtypes: {dtypes_json}\nReturn a JSON plan with one decision per column.\n{format_instructions}")
]).partial(format_instructions=parser_impute.get_format_instructions())

# Encoding prompt. Inputs: dtypes_json, unique_counts_json.
prompt_encode = ChatPromptTemplate.from_messages([
    ("system", "You are an encoding planner. For non-numeric columns, decide whether to encode and how. Use: binary for yes/no; one_hot for low-cardinality categoricals; ordinal only if natural order is evident; none for IDs/unique identifiers. Avoid target encoding here."),
    ("human", "Dtypes: {dtypes_json}\nUnique counts: {unique_counts_json}\nReturn a JSON encoding plan.\n{format_instructions}")
]).partial(format_instructions=parser_encode.get_format_instructions())

# Insights prompt (free-text Markdown, no parser). Inputs: shape, fields, missing_json, desc_json.
prompt_insights = ChatPromptTemplate.from_messages([
    ("system", "You are a senior data analyst. Write crisp, well-structured Markdown insights for stakeholders. Avoid hallucinating unknown business rules."),
    ("human", "Provide insights based on: shape={shape}, dictionary_fields={fields}, missing={missing_json}, numeric_summary={desc_json}. Include 4-7 bullet points with concrete observations and cautions.")
])

# Feature selection prompt. Inputs: dtypes_json, encoded_cols, target.
prompt_features = ChatPromptTemplate.from_messages([
    ("system", "You recommend feature subsets for employee attrition prediction. Avoid leakage (e.g., Attrition label). Prefer columns with predictive signal and reasonable cardinality."),
    ("human", "Columns and dtypes: {dtypes_json}\nEncoded columns present: {encoded_cols}\nBusiness target column: {target}\nReturn JSON with use_all_columns and selected_columns if not all.\n{format_instructions}")
]).partial(format_instructions=parser_features.get_format_instructions())

print("✅ LangChain+LangGraph and prompts initialized.")

In [None]:
# Agent node functions
import json
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display

# Define a graph state
class EDAState(TypedDict, total=False):
    """State passed between the EDA graph nodes.

    ``total=False`` makes every key optional: each node adds the keys it produces.
    """
    df: pd.DataFrame                     # working dataframe; replaced by nodes that impute or encode
    summary: Dict[str, Any]              # output of summarize_dataframe
    data_dictionary: DataDictionary      # LLM-generated column descriptions
    imputation_plan: ImputationPlan      # LLM decisions on how to fill missing values
    encoding_plan: EncodingPlan          # LLM decisions on how to encode columns
    insights_md: str                     # Markdown insights text
    feature_selection: FeatureSelection  # LLM recommendation of feature columns


def node_summarize(state: EDAState) -> EDAState:
    """Store a summary of ``state["df"]`` under ``state["summary"]`` and return the state."""
    state["summary"] = summarize_dataframe(state["df"])
    return state


def node_data_dictionary(state: EDAState) -> EDAState:
    """Ask the LLM for a data dictionary and store it under ``state["data_dictionary"]``.

    The per-column summaries from ``state["summary"]`` are sent as JSON; the
    reply is parsed into a ``DataDictionary``.
    """
    columns_payload = json.dumps(state["summary"]["columns"], ensure_ascii=False)
    pipeline = prompt_data_dict | llm_json | parser_data_dict
    state["data_dictionary"] = pipeline.invoke({
        "dataset_name": "IBM HR Attrition",
        "columns_json": columns_payload,
    })
    return state


def node_missing_values(state: EDAState) -> EDAState:
    """Let the LLM choose an imputation strategy per column and apply it.

    Works on a copy of ``state["df"]``. Only numeric columns are filled, with
    their mean or median as decided; decisions for unknown columns, non-numeric
    columns or the "none" strategy change nothing. The filled frame replaces
    ``state["df"]`` and the plan is stored under ``state["imputation_plan"]``.
    """
    frame = state["df"].copy()
    stats = state["summary"]

    # Prompt variable -> key of the summary section it carries
    sections = (
        ("missing_json", "missing"),
        ("desc_json", "desc_numeric"),
        ("dtypes_json", "dtypes"),
    )
    payload = {
        variable: json.dumps(stats[section], ensure_ascii=False)
        for variable, section in sections
    }
    plan = (prompt_impute | llm_json | parser_impute).invoke(payload)

    # Strategy name -> function computing the fill value of a column
    fill_value_of = {"mean": pd.Series.mean, "median": pd.Series.median}
    for decision in plan.decisions:
        name = decision.column
        compute_fill = fill_value_of.get(decision.strategy)
        if compute_fill is None or name not in frame.columns:
            continue
        if not pd.api.types.is_numeric_dtype(frame[name]):
            continue
        frame[name] = frame[name].fillna(compute_fill(frame[name]))

    state["df"] = frame
    state["imputation_plan"] = plan
    return state


def node_visualize(state: EDAState) -> EDAState:
    """Plot a quick visual overview of ``state["df"]``; the state is returned unchanged.

    Column choice follows the data dictionary: histograms for the first 6
    fields flagged numeric, bar charts (top 20 values) for the first 4 other
    fields, then a correlation heatmap of all numeric columns. A plot that
    fails is skipped with a printed message.
    """
    frame = state["df"]
    dictionary_fields = state["data_dictionary"].fields

    numeric_names = [field.name for field in dictionary_fields if field.is_numeric][:6]
    categorical_names = [field.name for field in dictionary_fields if not field.is_numeric][:4]

    # Histograms (with KDE) for numeric columns
    for name in numeric_names:
        try:
            plt.figure(figsize=(6,4))
            sns.histplot(frame[name].dropna(), kde=True)
            plt.title(f"Distribution of {name}")
            plt.tight_layout()
            plt.show()
        except Exception as e:
            print(f"[viz] Skipped {name}: {e}")

    # Value-count bar charts for categorical columns (missing values counted too)
    for name in categorical_names:
        try:
            plt.figure(figsize=(6,4))
            top_counts = frame[name].value_counts(dropna=False).head(20)
            top_counts.plot(kind='bar', edgecolor='black')
            plt.title(f"Value Counts of {name}")
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.show()
        except Exception as e:
            print(f"[viz] Skipped {name}: {e}")

    # Heatmap of pairwise correlations between numeric columns
    try:
        plt.figure(figsize=(10,8))
        numeric_corr = frame.select_dtypes(include=[np.number]).corr()
        sns.heatmap(numeric_corr, cmap='coolwarm', center=0)
        plt.title('Correlation Heatmap (Numeric)')
        plt.tight_layout()
        plt.show()
    except Exception as e:
        print(f"[viz] Heatmap skipped: {e}")

    return state


def node_encoding(state: EDAState) -> EDAState:
    """Let the LLM plan column encodings and apply them to a copy of ``state["df"]``.

    Applied encodings (source columns are always kept):
    - "binary": non-numeric columns with exactly two levels get ``<col>_bin``
      with 0/1 assigned in sorted order of the levels, so the result does not
      depend on row order (e.g. No=0/Yes=1, Female=0/Male=1).
    - "ordinal": non-numeric columns get ``<col>_ord`` with factorize codes.
    - "one_hot" and "target": dummy columns prefixed ``<col>_``; with more than
      30 levels the column falls back to ``<col>_ord`` factorize codes. Target
      encoding itself is not implemented and is treated as one-hot.
    Decisions for unknown columns, with ``encode`` false or of type "none" are skipped.

    The encoded frame replaces ``state["df"]`` and the plan is stored under
    ``state["encoding_plan"]``.
    """
    df_local = state["df"].copy()
    summary = summarize_dataframe(df_local)  # refresh after imputation
    unique_counts = {c["name"]: c["n_unique"] for c in summary["columns"]}
    chain = prompt_encode | llm_json | parser_encode
    plan = chain.invoke({
        "dtypes_json": json.dumps(summary["dtypes"], ensure_ascii=False),
        "unique_counts_json": json.dumps(unique_counts, ensure_ascii=False),
    })

    # Apply encoding decisions
    for d in plan.decisions:
        col = d.column
        if col not in df_local.columns:
            continue
        if d.encoding_type == "none" or not d.encode:
            continue

        if d.encoding_type == "binary":
            if pd.api.types.is_numeric_dtype(df_local[col]):
                continue
            levels = df_local[col].dropna().unique().tolist()
            if len(levels) != 2:
                print(f"[encode] {col} binary skipped: expected 2 levels, found {len(levels)}")
                continue
            try:
                ordered = sorted(levels)
            except TypeError:
                # Levels of mixed types cannot be compared directly; order them by their text form.
                ordered = sorted(levels, key=str)
            mapping = {ordered[0]: 0, ordered[1]: 1}
            df_local[f"{col}_bin"] = df_local[col].map(mapping)

        elif d.encoding_type == "ordinal":
            # No natural order is known here, so codes follow order of first appearance.
            if not pd.api.types.is_numeric_dtype(df_local[col]):
                codes, _ = pd.factorize(df_local[col])
                df_local[f"{col}_ord"] = codes

        elif d.encoding_type in ("one_hot", "target"):
            try:
                dummies = pd.get_dummies(df_local[col], prefix=col, dummy_na=False)
                # Avoid column explosion for very high cardinality
                if dummies.shape[1] <= 30:
                    # Skip dummy columns that already exist so repeated runs do not duplicate labels.
                    new_dummies = dummies.loc[:, ~dummies.columns.isin(df_local.columns)]
                    df_local = pd.concat([df_local, new_dummies], axis=1)
                else:
                    # fallback to factorize when too many levels
                    codes, _ = pd.factorize(df_local[col])
                    df_local[f"{col}_ord"] = codes
            except Exception as e:
                print(f"[encode] {col} one_hot failed: {e}")

    state["df"] = df_local
    state["encoding_plan"] = plan
    return state


def node_insights(state: EDAState) -> EDAState:
    """Produce the insights text for the dataset and store it in the state.

    Reads ``state["summary"]`` (required) and ``state["data_dictionary"]``
    (optional), serialises the relevant parts to JSON strings, and runs them
    through ``prompt_insights | llm_text | StrOutputParser()``.

    Returns:
        The same ``state`` object with ``"insights_md"`` set to the chain's
        string output.
    """
    stats = state["summary"]
    dictionary = state.get("data_dictionary")

    # A missing data dictionary simply yields an empty field list.
    field_records = []
    if dictionary:
        field_records = [field.model_dump() for field in dictionary.fields]

    insight_chain = prompt_insights | llm_text | StrOutputParser()

    def as_json(value):
        # ensure_ascii=False keeps non-ASCII text readable in the prompt.
        return json.dumps(value, ensure_ascii=False)

    payload = {
        "shape": str(stats.get("shape")),
        "fields": as_json(field_records),
        "missing_json": as_json(stats.get("missing", {})),
        "desc_json": as_json(stats.get("desc_numeric", {})),
    }
    state["insights_md"] = insight_chain.invoke(payload)
    return state


def node_feature_selection(state: EDAState) -> EDAState:
    """Request a feature-selection recommendation and store it in the state.

    Sends the column dtypes of ``state["df"]``, the list of columns that look
    encoded, and the fixed target name ``"Attrition_Binary"`` through
    ``prompt_features | llm_json | parser_features``.

    Returns:
        The same ``state`` object with ``"feature_selection"`` set to the
        parsed chain output.
    """
    frame = state["df"]
    column_types = frame.dtypes.astype(str).to_dict()

    named_markers = ["Attrition_Binary", "Business_Travel_Class"]

    def looks_encoded(name):
        # Either carries an encoding suffix, or contains an underscore and
        # one of the known encoded-column names.
        if name.endswith(("_bin", "_ord")):
            return True
        return "_" in name and any(marker in name for marker in named_markers)

    encoded_cols = [name for name in frame.columns if looks_encoded(name)]

    selection_chain = prompt_features | llm_json | parser_features
    payload = {
        "dtypes_json": json.dumps(column_types, ensure_ascii=False),
        "encoded_cols": json.dumps(encoded_cols, ensure_ascii=False),
        "target": "Attrition_Binary",
    }
    state["feature_selection"] = selection_chain.invoke(payload)
    return state

# Confirmation message shown once this cell has finished defining the node functions.
print("✅ Agent node functions defined.")

In [None]:
# Build and run the LangGraph pipeline
# END is imported alongside StateGraph so this cell does not depend on an
# earlier cell having brought it into scope.
from langgraph.graph import END, StateGraph

# Guard: ensure df exists. An explicit check is used instead of `assert`,
# because assertions are stripped under `python -O` and the failure would then
# only surface later as a NameError.
if 'df' not in globals():
    raise RuntimeError("DataFrame 'df' not found. Run earlier cells to load the dataset.")

# Register one node per EDA stage.
eda_graph = StateGraph(EDAState)
eda_graph.add_node("summarize", node_summarize)
eda_graph.add_node("data_dictionary", node_data_dictionary)
eda_graph.add_node("missing_values", node_missing_values)
eda_graph.add_node("visualize", node_visualize)
eda_graph.add_node("encoding", node_encoding)
eda_graph.add_node("insights", node_insights)
eda_graph.add_node("feature_selection", node_feature_selection)

# Order: summarize -> data_dictionary -> missing_values -> visualize -> encoding -> insights -> feature_selection
eda_graph.add_edge("summarize", "data_dictionary")
eda_graph.add_edge("data_dictionary", "missing_values")
eda_graph.add_edge("missing_values", "visualize")
eda_graph.add_edge("visualize", "encoding")
eda_graph.add_edge("encoding", "insights")
eda_graph.add_edge("insights", "feature_selection")
eda_graph.add_edge("feature_selection", END)
eda_graph.set_entry_point("summarize")

compiled = eda_graph.compile()

# Run on a copy so the notebook-level `df` is not replaced by the pipeline's
# working frame.
initial_state: EDAState = {"df": df.copy()}
result_state = compiled.invoke(initial_state)

# Store to variables for easy access
eda_state = result_state
print("✅ EDA graph executed.")

In [None]:
# Render the outputs of the multi-agent EDA run
from IPython.display import display, Markdown

if 'eda_state' not in globals():
    raise RuntimeError("EDA state not found. Run the graph execution cell first.")

# 1) Data dictionary (only the first 20 fields are displayed)
print("\n📘 Data Dictionary (first 20 fields):")
if not eda_state.get("data_dictionary"):
    print("No data dictionary produced.")
else:
    dd_df = pd.DataFrame(
        [field.model_dump() for field in eda_state["data_dictionary"].fields]
    )
    display(dd_df.head(20))

# 2) Imputation plan, one row per decision
print("\n🧩 Imputation Decisions:")
if not eda_state.get("imputation_plan"):
    print("No imputation plan produced.")
else:
    imp_df = pd.DataFrame(
        [decision.model_dump() for decision in eda_state["imputation_plan"].decisions]
    )
    display(imp_df)

# 3) Encoding plan, one row per decision
print("\n🔡 Encoding Decisions:")
if not eda_state.get("encoding_plan"):
    print("No encoding plan produced.")
else:
    enc_df = pd.DataFrame(
        [decision.model_dump() for decision in eda_state["encoding_plan"].decisions]
    )
    display(enc_df)

# 4) Insights, rendered as Markdown
print("\n🧠 Insights:")
if not eda_state.get("insights_md"):
    print("No insights produced.")
else:
    display(Markdown(eda_state["insights_md"]))

# 5) Feature selection recommendation
print("\n🎯 Feature Selection Recommendation:")
if not eda_state.get("feature_selection"):
    print("No feature selection produced.")
else:
    fs = eda_state["feature_selection"]
    print("Use all columns:", fs.use_all_columns)
    # The explicit column list is only shown when a subset was recommended.
    if not fs.use_all_columns:
        print("Selected columns (suggested):", fs.selected_columns)
    print("Reason:\n", fs.reason)

# 6) Preview of the dataframe held in the EDA state (falls back to `df`)
print("\n📊 Updated DataFrame preview:")
updated_df = eda_state.get("df", df)
display(updated_df.head())
print("Shape:", updated_df.shape)