In [None]:
# install necesary packages
%pip install import-ipynb

%pip install pandas
%pip install pint 
%pip install numpy
%pip install sklearn
%pip install langchain langchain-community openai

In [2]:
# Lang chain imports
from langchain.chat_models import ChatOpenAI  
from langchain.schema import HumanMessage, SystemMessage

# Read the OpenAI credential from the environment; warn (without stopping)
# when the variable is unset or empty.
import os
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    print("OpenAI API Key not found")

# Import libraries
import pandas as pd
import numpy as np

In [None]:
# Import features (TODO: Add features as we develop)
import import_ipynb
# from features.summaries import get_summaries
# from features.outliers import detect_outliers

In [44]:
def _strip_code_fences(text):
    """Return `text` without a surrounding Markdown code fence, if one is present."""
    lines = text.strip().splitlines()
    if lines and lines[0].lstrip().startswith("```"):
        lines = lines[1:]
        if lines and lines[-1].strip() == "```":
            lines = lines[:-1]
    return "\n".join(lines).strip()


def main(user_query, df):
    """Ask the LLM for data-cleaning code for `user_query`, run it, and return the resulting frame.

    The system prompt describes the dataset (shape and first three rows) and the
    available feature functions; the model is asked to reply with Python code
    that reassigns `df`. That code is executed against a copy of `df`.

    Args:
        user_query: Natural-language description of the cleaning to perform.
        df: pandas DataFrame to operate on. It is never modified in place by
            this function; the generated code works on a copy.

    Returns:
        The DataFrame bound to `df` after the generated code has run. If the
        code raises, or leaves something other than a DataFrame in `df`, a
        message is printed and the original `df` is returned unchanged.
    """
    
    # List features for agent 
    features = """
    Available features:
    - get_summaries(): summaries 
    - handle_missing_vals(): imputation, missing values 
    - detect_outliers(): outliers
    """

    # Create message chain
    # NOTE: the examples must use the exact function names listed in `features`
    # and must themselves be executable, since the model imitates them.
    messages = []
    messages.append(SystemMessage(content=f"""
    You are a data cleaning agent
                                  
    Dataset info: Shape: {df.shape}, Sample: {df.head(3).to_string()}

    {features}

    Rules:
    - Each function takes (user_query, df) and returns modified df
    - Generate Python code that calls the functions with SPECIFIC instructions
    - Each function call should have a targeted query explaining exactly what to do
    - Return only executable Python code, no explanations, no markdown blocks
    - Only if no actions can be taken, print a descriptive message why

    Examples:
    - User: Find outliers for price and stock, Generated: df = detect_outliers("find outliers in price, stock", df)
    - User: Impute numeric columns and generate mean and std for age, Generated: df = handle_missing_vals("impute numeric columns", df); df = get_summaries("calculate mean and std for age", df)
    """))
    messages.append(HumanMessage(content=f"User request: {user_query}"))

    # Call LLM
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
    response = llm.invoke(messages)
    # Models sometimes wrap code in a Markdown fence despite the rules above.
    generated_code = _strip_code_fences(response.content)
    print(generated_code)

    # Execute AI generated code
    # SECURITY NOTE(review): exec() runs whatever the model returned with the
    # full privileges of this kernel. Only use with trusted prompts/data, or
    # move execution into a sandbox.
    #
    # exec() cannot rebind a function's local variables, so the code is run in
    # an explicit namespace and `df` is read back from it afterwards. The
    # namespace is a copy of the module globals so the feature functions are
    # visible without the generated code being able to add names to them.
    namespace = dict(globals())
    namespace["df"] = df.copy()
    try:
        exec(generated_code, namespace)
    except Exception as e:
        print(f"Error: {e}")
        print(f"Generated Code:{generated_code}")
        return df

    result = namespace.get("df")
    if not isinstance(result, pd.DataFrame):
        print(f"Error: generated code left df as {type(result).__name__}, expected a DataFrame")
        print(f"Generated Code:{generated_code}")
        return df
    return result


In [49]:
# Demo inputs: the natural-language request for the agent and the sample CSV it should clean
user_query = "Impute missing values and then find any outliers in the dataset"
df = pd.read_csv("sample_data/smoke.csv")

In [None]:
# Run the agent on the sample request; as the cell's last expression, the returned frame is displayed
main(user_query=user_query, df=df)