In [1]:
# install necesary packages
%pip install import-ipynb

%pip install pandas pint numpy sklearn seaborn matplotlib
%pip install langchain langchain-community langchain-openai
%pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.
Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> [15 lines of output]
      The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
      rather than 'sklearn' for pip commands.
      
      Here is how to fix this error in the main use cases:
      - use 'pip install scikit-learn' rather than 'pip install sklearn'
      - replace 'sklearn' by 'scikit-learn' in your pip requirements files
        (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
      - if the 'sklearn' package is used by one of your dependencies,
        it would be great if you take some time to track which package uses
        'sklearn' instead of 'scikit-learn' and report it to their issue tracker
      - as a last resort, set the environment variable
        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
      
      More information is available at
      https://github.com/scikit-learn/sklearn-pypi-packag

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# LangChain imports
from langchain.chat_models import ChatOpenAI  
from langchain.schema import HumanMessage, SystemMessage
from dotenv import load_dotenv

# Get API KEY
import os

# Pull variables from a local .env file into the process environment, then
# warn (without stopping) when the OpenAI key is missing or empty.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    print("OpenAI API Key not found")

# Import libraries
import pandas as pd
import numpy as np

In [5]:
# Import features (TODO: Add features as we develop)
import import_ipynb
from features.summaries import get_summaries
from features.pattern_matching import pattern_matching
from features.missing_vals import missing_vals

In [48]:
def _strip_code_fences(code):
    """Return `code` without a surrounding Markdown code fence, if one is present."""
    text = code.strip()
    if not text.startswith("```"):
        return text
    lines = text.splitlines()
    if len(lines) == 1:
        # Whole reply on one line, e.g. ```print("hi")```
        return text.strip("`").strip()
    lines = lines[1:]  # opening fence, possibly followed by a language tag
    if lines and lines[-1].strip().startswith("```"):
        lines = lines[:-1]
    return "\n".join(lines).strip()


def route_query(user_query, df):
    """Ask the LLM to turn a user request into feature calls and run them.

    The model receives the dataset shape, a three-row sample and the list of
    available feature functions, and must answer with plain Python code that
    operates on a variable named ``df``. That code is executed on a copy of
    ``df`` so the caller's frame is never modified in place.

    Args:
        user_query: Free-text request typed by the user.
        df: DataFrame the request refers to.

    Returns:
        The DataFrame bound to ``df`` after the generated code ran. If the
        code raises, or leaves ``df`` bound to something that is not a
        DataFrame, the original ``df`` is returned unchanged.
    """
    # List features for agent
    features = """
    Available features:
    - pattern_matching(): handle object columns, preprocessing, cleaning, how to clean data?
    - get_summaries(): data summaries, visualise data 
    - missing_vals(): handling missing data, imputation recommendations
    """
    # Planned but not yet available: handle_missing_vals(), detect_outliers()

    # Create message chain. The examples only use functions from the feature
    # list above; naming functions that do not exist leads the model to
    # generate calls that fail with NameError.
    messages = [
        SystemMessage(content=f"""
    You are a data cleaning agent

    Dataset info: Shape: {df.shape}, Sample: {df.head(3).to_string()}

    {features}

    Route:
    - Any general queries about cleaning data go to pattern_matching
    - Any questions about analysing data go to get_summaries
    - Any questions about missing values go to missing_vals

    Rules:
    - Each function takes (df, user_query) and returns modified df
    - Only call the functions listed under Available features
    - Generate Python code that calls the functions with instructions or suggestions
    - Each function call should have a targeted query explaining exactly what to do
    - Return only executable Python code, no explanations, NO MARKDOWN BLOCKS
    - Only if no actions can be taken, print a descriptive message why
    - In order to generate a response/message to the user use print statements
    print("message")

    Examples:
    - User: How do I clean the text columns?, Generated: df = pattern_matching(df, "clean the text columns")
    - User: Impute numeric columns and generate mean and std for age, Generated: df = missing_vals(df, "impute numeric columns"); df = get_summaries(df, "calculate mean and std for age")
    """),
        HumanMessage(content=f"User request: {user_query}"),
    ]

    # Call LLM
    llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
    response = llm.invoke(messages)
    # The model sometimes wraps its answer in ``` fences despite the rules.
    generated_code = _strip_code_fences(response.content)
    print(generated_code)

    # Execute AI generated code.
    # SECURITY NOTE: this runs model-generated code with the full privileges
    # of the notebook process; only use it with trusted data and prompts.
    #
    # exec() inside a function cannot rebind the function's local variables,
    # so the code runs in an explicit namespace and the resulting `df` is read
    # back from it. The namespace is a copy of the module globals so the
    # feature functions and imported libraries are visible to the code.
    namespace = dict(globals())
    namespace["df"] = df.copy()  # work on a copy; a failure leaves `df` intact
    try:
        exec(generated_code, namespace)
    except Exception as e:
        print(f"Error: {e}")
        print(f"Generated Code:{generated_code}")
        return df

    result = namespace.get("df")
    if not isinstance(result, pd.DataFrame):
        # e.g. a feature returned None and the code assigned that to df
        print("Generated code did not leave a DataFrame in 'df'; keeping the previous data.")
        return df
    return result


In [42]:
def user_load_dataset(datasets_path="datasets"):
    """Interactively pick a CSV file from a folder and load it.

    Lists the ``.csv`` files found in ``datasets_path`` in alphabetical order
    and asks the user for either the number shown or the exact file name.

    Args:
        datasets_path: Folder that is searched for CSV files.

    Returns:
        The loaded DataFrame, or ``None`` when the folder is missing, holds no
        CSV files, the user exits with ``x``, the selection is invalid, or the
        file cannot be read.
    """
    print("LOAD YOUR DATASET TO GET STARTED!")

    if not os.path.isdir(datasets_path):
        print(f"'{datasets_path}' folder not found.")
        return None

    # os.listdir returns entries in arbitrary order; sort so the menu numbers
    # are stable between runs.
    file_paths = sorted(
        f for f in os.listdir(datasets_path) if f.lower().endswith('.csv')
    )
    if not file_paths:
        print(f"No CSV files found in '{datasets_path}' folder.")
        return None

    print(f"CSV files in '{datasets_path}':")
    for number, fname in enumerate(file_paths, start=1):
        print(f"{number}. {fname}")

    print(f"\nNot seeing your data? Make sure your CSV file is in '{datasets_path}' folder")
    print("Type x to exit.")
    # Strip so stray spaces/newlines around the answer do not break matching.
    user_file = input(
        "To load your dataset, input either the number or name of your desired CSV: "
    ).strip()

    if user_file.lower() == "x":
        print("Exiting.")
        return None

    # Try to load by number
    try:
        idx = int(user_file) - 1
    except ValueError:
        # Try to load by name
        if user_file not in file_paths:
            print("File not found.")
            return None
        selected_file = user_file
    else:
        if not 0 <= idx < len(file_paths):
            print("Invalid number.")
            return None
        selected_file = file_paths[idx]

    try:
        df = pd.read_csv(os.path.join(datasets_path, selected_file))
    except (OSError, UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        print(f"Could not load '{selected_file}': {e}")
        return None

    print(f"\nLoaded '{selected_file}' with shape {df.shape}")
    print(df.head(5))
    return df

In [43]:
def user_actions(df, out_path="datasets_out"):
    """Run the interactive command loop on a loaded DataFrame.

    Built-in commands: ``1`` prints the head of the frame, ``2`` writes it to
    ``output.csv`` inside ``out_path``, ``x`` leaves the loop. Any other
    non-empty input is forwarded to ``route_query`` and the frame is replaced
    by its return value.

    Args:
        df: DataFrame to inspect and clean.
        out_path: Folder that receives ``output.csv``; created when missing.

    Returns:
        The DataFrame as it is when the user exits.
    """
    while True:
        print("\nCommands:")
        print("1. View dataframe (head)")
        print("2. Output dataframe to CSV")
        print("x. Exit")
        # Strip so " x " or "2 " are still recognised as commands.
        user_query = input("\nEnter a command number or a data cleaning/query command: ").strip()
        if not user_query:
            # Nothing typed: re-prompt instead of sending an empty request to the LLM.
            continue
        if user_query.lower() == "x":
            print("Exiting.")
            return df
        elif user_query == "1":
            print(df.head())
            _ = input("enter anything to continue")
        elif user_query == "2":
            # exist_ok avoids the check-then-create race of a separate exists() test.
            os.makedirs(out_path, exist_ok=True)
            out_file = os.path.join(out_path, "output.csv")
            df.to_csv(out_file, index=False)
            print(f"Dataframe saved to {out_file}")
            _ = input("enter anything to continue")
        else:
            df = route_query(user_query, df)

In [44]:
def main():
    """Load a dataset interactively, then run the command loop on it.

    Stops right after loading when no dataset was loaded: user_load_dataset
    returns None when the user exits or the selection fails, and the command
    loop cannot work without a DataFrame.
    """
    df = user_load_dataset()
    if df is None:
        return
    user_actions(df)

In [None]:
# Start the interactive session: pick a dataset, then enter the command loop.
main()

In [50]:
# Read the smoke sample CSV and keep a separate copy of it in test_df
df = pd.read_csv(filepath_or_buffer='datasets/smoke.csv')
test_df = df.copy(deep=True)

In [None]:
# Route a sample request; the returned frame is shown as the cell output
user_query = "how to clean data"
route_query(df=df, user_query=user_query)

df = pattern_matching(df, "handle object columns, preprocessing, cleaning")
df['State'] = df['State'].replace({'\s+': ''}, regex=True)  # Remove any extra spaces
df['Smoke everyday'] = handle_numeric(df['Smoke everyday'])
df['Smoke some days'] = handle_numeric(df['Smoke some days'])
df['Former smoker'] = handle_numeric(df['Former smoker'])
df['Never smoked'] = handle_numeric(df['Never smoked'])

print("Object columns have been preprocessed and cleaned. The 'State' column had extra spaces removed, and all percentage columns have been converted to numeric format for consistency.")
numeric vals: Failed to clean 'twenty-threepoint6percent'
Object columns have been preprocessed and cleaned. The 'State' column had extra spaces removed, and all percentage columns have been converted to numeric format for consistency.


Unnamed: 0,Year,State,Smoke everyday,Smoke some days,Former smoker,Never smoked
0,2010,AL,0.156,0.063,0.239,0.542
1,2010,AK,0.135,0.068,0.261,0.536
2,2010,AZ,0.107,0.044,0.279,0.571
3,2100,Arkansas,0.173,0.056,0.241,0.530
4,2010,California,0.075,0.046,0.231,0.648
...,...,...,...,...,...,...
871,1995,Virginia,0.187,0.027,0.252,0.535
872,1995,Washington,0.175,0.024,0.299,0.502
873,1995,WestVirginia,0.237,0.019,0.233,0.511
874,1995,Wisconsin,0.182,0.035,0.276,0.507
