# Init values

In [19]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
# --- Experiment configuration ---------------------------------------------
# Dataset directory read by the cells below. Alternatives:
#   "datasets/financial", "datasets/healthcare", "datasets/eligibility"
dataset_folder = "datasets/nasa-FD002"

# Seed used as `random_state` when fitting the reference classifier.
SEED = 42

# Model identifier given to ChatOpenRouter.
# Alternative: "meta-llama/llama-3.1-8b-instruct:free"
LLM_NAME = "meta-llama/llama-3.1-8b-instruct"
TEMPERATURE = 0.9  # temperature given to the LLM client
MAX_ITERATIONS = 1  # iteration budget forwarded to the agents


In [21]:
from dotenv import load_dotenv
from langchain_community.cache import SQLiteCache
from langchain.globals import set_llm_cache
from caia.utils import save_yaml_results
from caia.utils import ChatOpenRouter

import json
import uuid


# dataset_folder = "datasets/financial"
# Load the JSON description of the selected dataset; it is passed to the
# agents below as context. The encoding is pinned so the file is decoded the
# same way regardless of the platform's default locale.
with open(f"{dataset_folder}/dataset_description.json", "r", encoding="utf-8") as f:
    dataset_description = json.load(f)

# Read environment variables from the file named "env".
# NOTE(review): presumably this holds the OpenRouter credentials — confirm.
load_dotenv("env")

# LLM response caching is currently disabled; to enable it:
# set_llm_cache(SQLiteCache(database_path=".cache_langchain.db"))

# Shared LLM client used by every agent in this notebook.
llm_generator = ChatOpenRouter(
    model_name=LLM_NAME,
    cache=False,
    temperature=TEMPERATURE,
)

In [22]:
from rich import print

# Source of the reference training script that is handed to the agents as
# `model_code`. It is an f-string so that the dataset path and the seed follow
# the configuration cell instead of being hard-coded. Doubled braces are
# literal braces that must survive into the generated script, where they are
# the script's own f-string placeholders.
training_code = f"""
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# load the old data
dataset_folder = "{dataset_folder}"
X_train_old = pd.read_csv(f"{{dataset_folder}}/X_train_old.csv")
X_test_old = pd.read_csv(f"{{dataset_folder}}/X_test_old.csv")
y_train_old = pd.read_csv(f"{{dataset_folder}}/y_train_old.csv").squeeze("columns")
y_test_old = pd.read_csv(f"{{dataset_folder}}/y_test_old.csv").squeeze("columns")

model_old = RandomForestClassifier(random_state={SEED})


model_old.fit(X_train_old, y_train_old)

# Test the model on the old test set
old_accuracy = model_old.score(X_test_old, y_test_old)

print(f'Model trained and evaluated on the old distribution: {{old_accuracy}}')
"""
# Show the generated script so the interpolated values can be checked.
print(training_code)

In [23]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier


# Reference ("old") distribution: feature tables are kept as DataFrames,
# label tables are squeezed along the column axis.
X_train_old = pd.read_csv(f"{dataset_folder}/X_train_old.csv")
X_test_old = pd.read_csv(f"{dataset_folder}/X_test_old.csv")
y_train_old = pd.read_csv(f"{dataset_folder}/y_train_old.csv").squeeze(axis="columns")
y_test_old = pd.read_csv(f"{dataset_folder}/y_test_old.csv").squeeze(axis="columns")

print(f"X_train_old shape: {X_train_old.shape}")
print(f"X_test_old shape: {X_test_old.shape}")

# Fit the reference classifier on the old training split
# (`fit` returns the estimator itself, so the call can be chained).
model_old = RandomForestClassifier(random_state=SEED).fit(X_train_old, y_train_old)

# Accuracy on the old test split.
initial_accuracy = model_old.score(X_test_old, y_test_old)
print(f'Model trained and evaluated on the old distribution: {initial_accuracy}')

# Accuracy of the same model on the new (drifted) test split.
X_test_new = pd.read_csv(f"{dataset_folder}/X_test_new.csv")
y_test_new = pd.read_csv(f"{dataset_folder}/y_test_new.csv").squeeze(axis="columns")
print(f"X_test_new shape: {X_test_new.shape}")

drifted_accuracy = model_old.score(X_test_new, y_test_new)
print(f'Model evaluated on the new distribution: {drifted_accuracy}')

# Calculate the mean of the two accuracies.
average_accuracy = (initial_accuracy + drifted_accuracy) / 2
print(f'Average accuracy on both distributions: {average_accuracy}')

In [24]:
# Accuracy of the reference model on each distribution, in the nested layout
# that is passed to the agents as `metrics`.
metrics = dict(
    model_old_score=dict(
        on_new_data=drifted_accuracy,
        on_old_data=initial_accuracy,
    )
)
print(metrics)

# Baseline agent

In [25]:
from caia.benchmark.baseline import StandardGraph
# Build the baseline agent around the shared LLM client.
standard_graph = StandardGraph(llm_generator, debug=False)

# Inputs for the run: reference script, its accuracies, the iteration budget
# and the dataset description.
initial_state = dict(
    model_code=training_code,
    metrics=metrics,
    max_iterations=MAX_ITERATIONS,
    dataset_description=dataset_description,
)

output = standard_graph.run(initial_state)

# The results file is named after the run settings; a short random suffix
# makes name collisions between repeated runs unlikely.
short_uuid = str(uuid.uuid4())[:8]
llm_tag = LLM_NAME.split('/')[1].split(':')[0]  # text after the first "/" and before any ":"
dataset_tag = dataset_folder.split('/')[-1]  # last path component
filename = (
    f"results/baseline_temp_{TEMPERATURE}_max_iter_{MAX_ITERATIONS}"
    f"_llm_{llm_tag}_dataset_{dataset_tag}_{short_uuid}.yaml"
)
save_yaml_results(output, filename)

[31m
>>>>>>>> USING AUTO REPLY...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


# ReAct agent

In [26]:
from caia.benchmark.react import ReactImprover

# Build the ReAct agent around the shared LLM client.
react_graph = ReactImprover(llm_generator)

# Inputs for the run: reference script, its accuracies, the iteration budget
# and the dataset description.
initial_state = dict(
    model_code=training_code,
    metrics=metrics,
    max_iterations=MAX_ITERATIONS,
    dataset_description=dataset_description,
)

# Execute the agent.
output = react_graph.run(initial_state)

# The results file is named after the run settings; a short random suffix
# makes name collisions between repeated runs unlikely.
short_uuid = str(uuid.uuid4())[:8]
llm_tag = LLM_NAME.split('/')[1].split(':')[0]  # text after the first "/" and before any ":"
dataset_tag = dataset_folder.split('/')[-1]  # last path component
filename = (
    f"results/react_temp_{TEMPERATURE}_max_iter_{MAX_ITERATIONS}"
    f"_llm_{llm_tag}_dataset_{dataset_tag}_{short_uuid}.yaml"
)
save_yaml_results(output, filename)

[31m
>>>>>>>> USING AUTO REPLY...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


# Plan and execute agent

In [27]:
from caia.benchmark.plan_and_execute import PlanAndExecuteGraph

# Build the plan-and-execute agent with an iteration budget and a failure cap.
plan_execute_graph = PlanAndExecuteGraph(
    llm_generator,
    max_iterations=MAX_ITERATIONS,
    max_failures=3,  # Will stop after 3 consecutive failures
)

# Inputs for the run: reference script, its accuracies and the dataset
# description (the iteration budget is set on the graph itself).
initial_state = dict(
    model_code=training_code,
    metrics=metrics,
    dataset_description=dataset_description,
)

# Execute the agent.
output = plan_execute_graph.run(initial_state)

# The results file is named after the run settings; a short random suffix
# (first 8 characters of a UUID4) makes name collisions unlikely.
short_uuid = str(uuid.uuid4())[:8]
llm_tag = LLM_NAME.split('/')[1].split(':')[0]  # text after the first "/" and before any ":"
dataset_tag = dataset_folder.split('/')[-1]  # last path component
filename = (
    f"results/planexecute_temp_{TEMPERATURE}_max_iter_{MAX_ITERATIONS}"
    f"_llm_{llm_tag}_dataset_{dataset_tag}_{short_uuid}.yaml"
)
save_yaml_results(output, filename)

[31m
>>>>>>>> USING AUTO REPLY...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


[31m
>>>>>>>> USING AUTO REPLY...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


[31m
>>>>>>>> USING AUTO REPLY...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


# Reflection agent

In [28]:
from caia.benchmark.reflection import ReflectionGraph


# Build the reflection agent with an iteration budget and a failure cap.
reflection_graph = ReflectionGraph(
    llm_generator,
    max_iterations=MAX_ITERATIONS,
    max_failures=3,  # Will stop after 3 consecutive failures
)

# Inputs for the run: reference script, its accuracies and the dataset
# description (the iteration budget is set on the graph itself).
initial_state = dict(
    model_code=training_code,
    metrics=metrics,
    dataset_description=dataset_description,
)

# Execute the agent.
output = reflection_graph.run(initial_state)

# The results file is named after the run settings; a short random suffix
# makes name collisions between repeated runs unlikely.
short_uuid = str(uuid.uuid4())[:8]
llm_tag = LLM_NAME.split('/')[1].split(':')[0]  # text after the first "/" and before any ":"
dataset_tag = dataset_folder.split('/')[-1]  # last path component
filename = (
    f"results/reflection_temp_{TEMPERATURE}_max_iter_{MAX_ITERATIONS}"
    f"_llm_{llm_tag}_dataset_{dataset_tag}_{short_uuid}.yaml"
)
save_yaml_results(output, filename)

[31m
>>>>>>>> USING AUTO REPLY...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


[31m
>>>>>>>> USING AUTO REPLY...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


# Tree of Thoughts (ToT) agent

In [29]:
from caia.benchmark.tot import TreeOfThoughtsGraph
import json



# Build the Tree-of-Thoughts agent with its search settings.
tot_graph = TreeOfThoughtsGraph(
    llm_generator,
    max_iterations=MAX_ITERATIONS,
    beam_width=3,  # Number of candidates to keep after pruning
    num_candidates=3,  # Number of candidates to generate in each expansion
    threshold=0.9,  # Score threshold for accepting a solution
    max_depth=3,  # Maximum search depth in the tree
    max_failures=3,  # Will stop after 3 consecutive failures
)

# Inputs for the run: reference script, its accuracies and the dataset
# description (the iteration budget is set on the graph itself).
initial_state = dict(
    model_code=training_code,
    metrics=metrics,
    dataset_description=dataset_description,
)

# Execute the agent.
output = tot_graph.run(initial_state)

# The results file is named after the run settings; a short random suffix
# makes name collisions between repeated runs unlikely.
short_uuid = str(uuid.uuid4())[:8]
llm_tag = LLM_NAME.split('/')[1].split(':')[0]  # text after the first "/" and before any ":"
dataset_tag = dataset_folder.split('/')[-1]  # last path component
filename = (
    f"results/tot_temp_{TEMPERATURE}_max_iter_{MAX_ITERATIONS}"
    f"_llm_{llm_tag}_dataset_{dataset_tag}_{short_uuid}.yaml"
)
save_yaml_results(output, filename)

[31m
>>>>>>>> USING AUTO REPLY...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


[31m
>>>>>>>> USING AUTO REPLY...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


[31m
>>>>>>>> USING AUTO REPLY...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


# Self-Discovery agent

In [30]:
from caia.benchmark.self_discover import SelfDiscoverGraph

# Build the Self-Discovery agent with an iteration budget and a failure cap.
# The inline comment previously said "3" although the value passed is 4; the
# comment now matches the code.
self_discovery_agent = SelfDiscoverGraph(
    llm_generator,
    max_iterations=MAX_ITERATIONS,
    max_failures=4,  # Will stop after 4 consecutive failures
)

# Inputs for the run: reference script, its accuracies and the dataset
# description (the iteration budget is set on the graph itself).
initial_state = {
    "model_code": training_code,
    "metrics": metrics,
    "dataset_description": dataset_description,
}

# Run the agent
output = self_discovery_agent.run(initial_state)

# The results file is named after the run settings; a short random suffix
# makes name collisions between repeated runs unlikely.
short_uuid = str(uuid.uuid4())[:8]
filename = f"results/selfdiscovery_temp_{TEMPERATURE}_max_iter_{MAX_ITERATIONS}_llm_{LLM_NAME.split('/')[1].split(':')[0]}_dataset_{dataset_folder.split('/')[-1]}_{short_uuid}.yaml"
save_yaml_results(output, filename)


🚀 Starting Self-Discovery Model Improvement Process
Dataset: NASA Turbofan FD002 Maintenance Windows Classification Dataset
Error handling: stopping after 4 consecutive failures

🔍 SELECTING REASONING MODULES
Selected modules: 3
- 1. How could I simpl...
- 2. What are the key ...
- 3. How can I impleme...

Current Token Usage:
Prompt: 0
Completion: 0
Total: 0

🛠️ ADAPTING MODULES
Adapted modules: **Adapted Solution**

**Simplifying the Problem**

To simplify the problem, we'll focus on the following:

*   **Scaling and Normalization**: Since all features are numerical, we'll apply standard scaling to reduce the effect of feature magnitude and improve model stability.
*   **Handling Missing Values**: We'll use imputation techniques to replace missing values, as our dataset is relatively small and has no categorical features.
*   **Feature Engineering**: We'll avoid over-engineering our features but instead rely on the provided 7 numerical features.

**Improving on Distribution Shifts**

# Improver

## Semantic memory

### Training code

In [31]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Re-train the reference model on the old distribution. The fitted
# `model_old` is stored in the semantic memory below as `model_object`.
X_train_old = pd.read_csv(f"{dataset_folder}/X_train_old.csv")
X_test_old = pd.read_csv(f"{dataset_folder}/X_test_old.csv")
y_train_old = pd.read_csv(f"{dataset_folder}/y_train_old.csv").squeeze("columns")
y_test_old = pd.read_csv(f"{dataset_folder}/y_test_old.csv").squeeze("columns")

# Use the notebook-wide SEED rather than a literal 42 so this model stays
# consistent with the configuration cell if the seed is changed.
model_old = RandomForestClassifier(random_state=SEED)
model_old.fit(X_train_old, y_train_old)

# Test the model on the old test set
old_accuracy = model_old.score(X_test_old, y_test_old)

print(f'Model trained and evaluated on the old distribution: {old_accuracy}')

In [32]:
from caia.memory import WorkingMemory, EpisodicMemory, SemanticMemory
from caia.memory import Dataset


# tools = get_tools([calculate_trust_score])


# At the beginning, the agent has 1 entry in the semantic memory.
# Here we put the path of each dataset file in the semantic memory.
dataset_old = Dataset(
    X_train=f"{dataset_folder}/X_train_old.csv",
    X_test=f"{dataset_folder}/X_test_old.csv",
    y_train=f"{dataset_folder}/y_train_old.csv",
    y_test=f"{dataset_folder}/y_test_old.csv",
    description=dataset_description,
)

# Source of the training script stored alongside the fitted model. It is an
# f-string so that the dataset path and the seed follow the configuration
# cell, matching the paths given to `dataset_old` above. Doubled braces are
# literal braces that must survive into the generated script, where they are
# the script's own f-string placeholders.
model_code = f"""
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# load the old data
dataset_folder = "{dataset_folder}"
X_train_old = pd.read_csv(f"{{dataset_folder}}/X_train_old.csv")
X_test_old = pd.read_csv(f"{{dataset_folder}}/X_test_old.csv")
y_train_old = pd.read_csv(f"{{dataset_folder}}/y_train_old.csv").squeeze("columns")
y_test_old = pd.read_csv(f"{{dataset_folder}}/y_test_old.csv").squeeze("columns")

model_old = RandomForestClassifier(random_state={SEED})


model_old.fit(X_train_old, y_train_old)

# Test the model on the old test set
old_accuracy = model_old.score(X_test_old, y_test_old)

print(f'Model trained and evaluated on the old distribution: {{old_accuracy}}')
"""

# Initial semantic memory: old dataset paths, the fitted model and its source.
init_semantic_memory = SemanticMemory(
    dataset_old=dataset_old,
    model_object=model_old,
    model_code=model_code,
)
# Show the stored script so the interpolated values can be checked.
print(init_semantic_memory.model_code)

## Episodic memory

In [33]:
from caia.memory import Dataset
from docarray import DocList

# Paths to the new-distribution splits, with the same dataset description.
dataset_new = Dataset(
    X_train=f"{dataset_folder}/X_train_new.csv",
    X_test=f"{dataset_folder}/X_test_new.csv",
    y_train=f"{dataset_folder}/y_train_new.csv",
    y_test=f"{dataset_folder}/y_test_new.csv",
    description=dataset_description,
)

# The episodic memory starts with a single entry for the new dataset, with an
# empty quick insight and no deep insight.
first_episodic_memory = EpisodicMemory(
    dataset_new=dataset_new,
    quick_insight={},
    deep_insight=None,
)
init_episodic_memory = DocList[EpisodicMemory]([first_episodic_memory])

# Last expression of the cell: display the entry that was just stored.
init_episodic_memory[0]


## Fast graph

In [34]:
from caia.fast.fast_graph import FastGraph
from caia.utils import save_yaml_results

# Working memory for the first fast-graph run: the initial memories, an
# improvement threshold and empty generation / history slots.
working_memory = WorkingMemory(
    episodic_memory=init_episodic_memory,
    semantic_memory=init_semantic_memory,
    threshold=0.05,
    generations_fast_graph={},
    generations_slow_graph={},
    improvement_history=[],
)

# Build and run the fast graph on that working memory.
fast_graph = FastGraph(llm_generator, debug=False)
output_fast_graph = fast_graph.run(working_memory)

# The results file is named after the run settings; a short random suffix
# makes name collisions between repeated runs unlikely.
short_uuid = str(uuid.uuid4())[:8]
llm_tag = LLM_NAME.split('/')[1].split(':')[0]  # text after the first "/" and before any ":"
dataset_tag = dataset_folder.split('/')[-1]  # last path component
filename = (
    f"results/fast_temp_{TEMPERATURE}_max_iter_{MAX_ITERATIONS}"
    f"_llm_{llm_tag}_dataset_{dataset_tag}_{short_uuid}.yaml"
)
save_yaml_results(output_fast_graph, filename)

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


## Slow graph

In [37]:
from caia.slow.slow_graph import SlowGraph
from caia.utils import save_yaml_results

# Add the run limits to the existing working memory before handing it to the
# slow graph. Note that this mutates `working_memory` in place.
working_memory["max_iterations"] = MAX_ITERATIONS
working_memory["max_failures"] = 5

# Build and run the slow graph on that working memory.
slow_graph = SlowGraph(llm_generator, debug=False)
output_slow_graph = slow_graph.run(working_memory)

# The results file is named after the run settings; a short random suffix
# makes name collisions between repeated runs unlikely.
short_uuid = str(uuid.uuid4())[:8]
llm_tag = LLM_NAME.split('/')[1].split(':')[0]  # text after the first "/" and before any ":"
dataset_tag = dataset_folder.split('/')[-1]  # last path component
filename = (
    f"results/slow_temp_{TEMPERATURE}_max_iter_{MAX_ITERATIONS}"
    f"_llm_{llm_tag}_dataset_{dataset_tag}_{short_uuid}.yaml"
)
save_yaml_results(output_slow_graph, filename)

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


## Fast graph again

In [38]:
from caia.fast.fast_graph import FastGraph
from caia.utils import save_yaml_results

# Rebuild the working memory from the initial memories, this time seeding
# `generations_slow_graph` with the slow graph's output.
working_memory = WorkingMemory(
    episodic_memory=init_episodic_memory,
    semantic_memory=init_semantic_memory,
    threshold=0.05,
    generations_fast_graph={},
    generations_slow_graph=output_slow_graph,
    improvement_history=[],
)

# Build and run the fast graph on that working memory.
fast_graph = FastGraph(llm_generator, debug=False)
output_fast_graph = fast_graph.run(working_memory)

# The results file is named after the run settings; a short random suffix
# makes name collisions between repeated runs unlikely. The suffix is printed
# so the file can be identified afterwards.
short_uuid = str(uuid.uuid4())[:8]
print(short_uuid)
llm_tag = LLM_NAME.split('/')[1].split(':')[0]  # text after the first "/" and before any ":"
dataset_tag = dataset_folder.split('/')[-1]  # last path component
filename = (
    f"results/improver_temp_{TEMPERATURE}_max_iter_{MAX_ITERATIONS}"
    f"_llm_{llm_tag}_dataset_{dataset_tag}_{short_uuid}.yaml"
)
save_yaml_results(output_fast_graph, filename)

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m
