# Init values

In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (e.g. the project's `caia` package) are reloaded before each cell executes,
# picking up source edits without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
# ---- Experiment configuration ----------------------------------------------
# Dataset under test. Uncomment exactly one of the alternatives to switch.
dataset_folder = "datasets/financial"
# dataset_folder = "datasets/healthcare"
# dataset_folder = "datasets/eligibility"

# LLM used as the generator, and its sampling temperature.
LLM_NAME = "meta-llama/llama-3.1-8b-instruct:free"
# LLM_NAME = "meta-llama/llama-3.1-8b-instruct:free"
TEMPERATURE = 1.0

# Iteration budget handed to the agents below.
MAX_ITERATIONS = 1

# Seed constant. NOTE(review): not referenced by any other visible cell.
SEED = 42


In [3]:
from dotenv import load_dotenv
from langchain_community.cache import SQLiteCache
from langchain.globals import set_llm_cache
from caia.utils import save_yaml_results
from caia.utils import ChatOpenRouter

import json
import uuid


# Read the JSON metadata that describes the selected dataset.
description_path = f'{dataset_folder}/dataset_description.json'
with open(description_path, 'r') as description_file:
    dataset_description = json.load(description_file)

# Load environment variables from the dotenv file named "env"
# (presumably the OpenRouter credentials -- verify against your setup).
load_dotenv("env")
# Optional LangChain response cache, currently disabled:
# set_llm_cache(SQLiteCache(database_path=".cache_langchain.db"))

# Generator LLM shared by every agent in this notebook; caching is off.
llm_generator = ChatOpenRouter(
    model_name=LLM_NAME,
    cache=False,
    temperature=TEMPERATURE,
)

In [4]:
from rich import print

# Baseline training script handed to every agent as the code to improve.
# It is kept as a string and only printed here, not executed.
#
# The dataset folder is injected from the notebook configuration rather than
# hard-coded, so the script always targets the same dataset as
# `dataset_description`. With the default configuration the resulting string
# is identical to the previous hard-coded version.
training_code = """
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# load the old data
dataset_folder = "__DATASET_FOLDER__"
X_train_old = pd.read_csv(f"{dataset_folder}/X_train_old.csv")
X_test_old = pd.read_csv(f"{dataset_folder}/X_test_old.csv")
y_train_old = pd.read_csv(f"{dataset_folder}/y_train_old.csv").squeeze("columns")
y_test_old = pd.read_csv(f"{dataset_folder}/y_test_old.csv").squeeze("columns")

model_old = RandomForestClassifier(random_state=42)


model_old.fit(X_train_old, y_train_old)

# Test the model on the old test set
old_accuracy = model_old.score(X_test_old, y_test_old)

print(f'Model trained and evaluated on the old distribution: {old_accuracy}')
""".replace("__DATASET_FOLDER__", dataset_folder)
print(training_code)

# Baseline agent

In [None]:
from caia.benchmark.baseline import StandardGraph
# Baseline agent: a single standard graph driven by the shared generator LLM.
standard_graph = StandardGraph(llm_generator, debug=False)

# Scores of the original model, passed to the agent as its starting point.
old_model_scores = {
    "on_new_data": 0.7166666666666667,
    "on_old_data": 0.9133333333333333
}
initial_state = {
    "model_code": training_code,
    "metrics": {"model_old_score": old_model_scores},
    "max_iterations": MAX_ITERATIONS,
    "dataset_description": dataset_description
}

output = standard_graph.run(initial_state)

# Persist the run under a name that encodes the configuration plus a short
# random suffix (first 8 hex digits of a UUID4).
short_uuid = uuid.uuid4().hex[:8]
filename = f"results/baseline_temp_{TEMPERATURE}_max_iter_{MAX_ITERATIONS}_llm_{LLM_NAME.split('/')[1].split(':')[0]}_dataset_{dataset_folder.split('/')[-1]}_{short_uuid}.yaml"
save_yaml_results(output, filename)

[31m
>>>>>>>> USING AUTO REPLY...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


# ReAct agent

In [6]:
from caia.benchmark.react import ReactImprover

# ReAct agent built on the shared generator LLM.
react_graph = ReactImprover(llm_generator)

# Starting point for the agent: baseline code, its scores, and the budget.
old_model_scores = {
    "on_new_data": 0.7166666666666667,
    "on_old_data": 0.9133333333333333
}
initial_state = {
    "model_code": training_code,
    "metrics": {"model_old_score": old_model_scores},
    "max_iterations": MAX_ITERATIONS,
    "dataset_description": dataset_description
}

# Execute the agent and keep whatever it returns for serialization.
output = react_graph.run(initial_state)

# Save under a configuration-derived name with an 8-hex-digit random suffix.
short_uuid = uuid.uuid4().hex[:8]
filename = f"results/react_temp_{TEMPERATURE}_max_iter_{MAX_ITERATIONS}_llm_{LLM_NAME.split('/')[1].split(':')[0]}_dataset_{dataset_folder.split('/')[-1]}_{short_uuid}.yaml"
save_yaml_results(output, filename)

[31m
>>>>>>>> USING AUTO REPLY...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


[31m
>>>>>>>> USING AUTO REPLY...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


[31m
>>>>>>>> USING AUTO REPLY...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


[31m
>>>>>>>> USING AUTO REPLY...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


[31m
>>>>>>>> USING AUTO REPLY...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


[31m
>>>>>>>> USING AUTO REPLY...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


Traceback (most recent call last):
  File "/home/guess/phd/improver/caia/benchmark/react.py", line 1158, in run
    for output in self.decision_procedure.stream(typed_state):
  File "/home/guess/mambaforge/envs/dspy/lib/python3.11/site-packages/langgraph/pregel/__init__.py", line 1000, in stream
    raise GraphRecursionError(
langgraph.errors.GraphRecursionError: Recursion limit of 25 reachedwithout hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.


GraphRecursionError: Recursion limit of 25 reachedwithout hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.

# Plan and execute agent

In [7]:
from caia.benchmark.plan_and_execute import PlanAndExecuteGraph

# Plan-and-execute agent. The iteration budget comes from the config cell;
# max_failures=3 is intended to stop the run after 3 consecutive failures.
plan_execute_graph = PlanAndExecuteGraph(
    llm_generator,
    max_iterations=MAX_ITERATIONS,
    max_failures=3,
)

# Starting point for the agent: baseline code and its scores.
old_model_scores = {
    "on_new_data": 0.7166666666666667,
    "on_old_data": 0.9133333333333333
}
initial_state = {
    "model_code": training_code,
    "metrics": {"model_old_score": old_model_scores},
    "dataset_description": dataset_description
}

# Execute the agent.
output = plan_execute_graph.run(initial_state)

# Save under a configuration-derived name with an 8-hex-digit random suffix.
short_uuid = uuid.uuid4().hex[:8]
filename = f"results/planexecute_temp_{TEMPERATURE}_max_iter_{MAX_ITERATIONS}_llm_{LLM_NAME.split('/')[1].split(':')[0]}_dataset_{dataset_folder.split('/')[-1]}_{short_uuid}.yaml"
save_yaml_results(output, filename)

[31m
>>>>>>>> USING AUTO REPLY...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


[31m
>>>>>>>> USING AUTO REPLY...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


[31m
>>>>>>>> USING AUTO REPLY...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


# Reflection agent

In [8]:
from caia.benchmark.reflection import ReflectionGraph


# Reflection agent. The iteration budget comes from the config cell;
# max_failures=3 is intended to stop the run after 3 consecutive failures.
reflection_graph = ReflectionGraph(
    llm_generator,
    max_iterations=MAX_ITERATIONS,
    max_failures=3,
)

# Starting point for the agent: baseline code and its scores.
old_model_scores = {
    "on_new_data": 0.7166666666666667,
    "on_old_data": 0.9133333333333333
}
initial_state = {
    "model_code": training_code,
    "metrics": {"model_old_score": old_model_scores},
    "dataset_description": dataset_description
}

# Execute the agent.
output = reflection_graph.run(initial_state)

# Save under a configuration-derived name with an 8-hex-digit random suffix.
short_uuid = uuid.uuid4().hex[:8]
filename = f"results/reflection_temp_{TEMPERATURE}_max_iter_{MAX_ITERATIONS}_llm_{LLM_NAME.split('/')[1].split(':')[0]}_dataset_{dataset_folder.split('/')[-1]}_{short_uuid}.yaml"
save_yaml_results(output, filename)

[31m
>>>>>>>> USING AUTO REPLY...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


# Tree of Thoughts (ToT) agent

In [44]:
from caia.benchmark.tot import TreeOfThoughtsGraph
import json



# Tree-of-Thoughts agent and its search settings.
tot_graph = TreeOfThoughtsGraph(
    llm_generator,
    max_iterations=MAX_ITERATIONS,
    beam_width=3,        # candidates kept after pruning
    num_candidates=3,    # candidates generated per expansion
    threshold=0.9,       # score needed to accept a solution
    max_depth=3,         # deepest level searched in the tree
    max_failures=3,      # stop after 3 consecutive failures
)

# Starting point for the agent: baseline code and its scores.
old_model_scores = {
    "on_new_data": 0.7166666666666667,
    "on_old_data": 0.9133333333333333
}
initial_state = {
    "model_code": training_code,
    "metrics": {"model_old_score": old_model_scores},
    "dataset_description": dataset_description
}

# Execute the agent.
output = tot_graph.run(initial_state)

# Save under a configuration-derived name with an 8-hex-digit random suffix.
short_uuid = uuid.uuid4().hex[:8]
filename = f"results/tot_temp_{TEMPERATURE}_max_iter_{MAX_ITERATIONS}_llm_{LLM_NAME.split('/')[1].split(':')[0]}_dataset_{dataset_folder.split('/')[-1]}_{short_uuid}.yaml"
save_yaml_results(output, filename)

[31m
>>>>>>>> USING AUTO REPLY...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


[31m
>>>>>>>> USING AUTO REPLY...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


[31m
>>>>>>>> USING AUTO REPLY...[0m
[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


# Self-Discovery agent

In [10]:
from caia.benchmark.self_discover import SelfDiscoverGraph

# Self-Discovery agent. The iteration budget comes from the config cell.
self_discovery_agent = SelfDiscoverGraph(
    llm_generator, 
    max_iterations=MAX_ITERATIONS,
    max_failures=4  # Will stop after 4 consecutive failures
)

# Prepare initial state.
# The baseline scores are given at full precision, identical to the values
# passed to the other benchmark agents, so that every agent starts from the
# same initial state.
initial_state = {
    "model_code": training_code,
    "metrics": {
        "model_old_score": {
            "on_new_data": 0.7166666666666667,
            "on_old_data": 0.9133333333333333
        }
    },
    "dataset_description": dataset_description
}

# Run the agent
output = self_discovery_agent.run(initial_state)

# Save under a configuration-derived name with a short random suffix.
short_uuid = str(uuid.uuid4())[:8]
filename = f"results/selfdiscovery_temp_{TEMPERATURE}_max_iter_{MAX_ITERATIONS}_llm_{LLM_NAME.split('/')[1].split(':')[0]}_dataset_{dataset_folder.split('/')[-1]}_{short_uuid}.yaml"
save_yaml_results(output, filename)


🚀 Starting Self-Discovery Model Improvement Process
Dataset: Loan Default Prediction Data
Error handling: stopping after 4 consecutive failures
Features: 10 total, 7 numerical, 3 categorical

🔍 SELECTING REASONING MODULES
Selected modules: 3
- 1. How could I simpl...
- 2. What are the key ...
- 3. How can I impleme...

Current Token Usage:
Prompt: 0
Completion: 0
Total: 0

🛠️ ADAPTING MODULES
Adapted modules: **Simplified Problem**

To simplify the problem, let's focus on a binary classification task: predicting the likelihood of a consumer defaulting on a loan based on their financial data. We'll use the existing datasets as is, without any modifications.

**Key Techniques for Improving Distribution Shifts**

To improve the model's robustness to distribution shifts, we'll employ the following techniques:

1.  **Data Ensembling**: Combine the old and new datasets to create a more comprehensive training set, reducing the impact of new data mismatch.
2.  **Regularization**: Use L1 and L

# Improver

## Semantic memory

### Training code

In [32]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Train the reference model on the old-distribution data and score it on the
# old test split.
#
# `dataset_folder` is deliberately NOT re-assigned here: it is set once in the
# configuration cell. Re-assigning it in this cell would silently reset the
# whole notebook (later cells and result filenames) to "datasets/financial",
# whatever dataset was selected at the top.
X_train_old = pd.read_csv(f"{dataset_folder}/X_train_old.csv")
X_test_old = pd.read_csv(f"{dataset_folder}/X_test_old.csv")
# Label files are read as DataFrames; squeeze("columns") is applied to each.
y_train_old = pd.read_csv(f"{dataset_folder}/y_train_old.csv").squeeze("columns")
y_test_old = pd.read_csv(f"{dataset_folder}/y_test_old.csv").squeeze("columns")

# Same estimator and random_state as in the `training_code` string.
model_old = RandomForestClassifier(random_state=42)


model_old.fit(X_train_old, y_train_old)

# Test the model on the old test set
old_accuracy = model_old.score(X_test_old, y_test_old)

print(f'Model trained and evaluated on the old distribution: {old_accuracy}')

In [33]:
from caia.memory import WorkingMemory, EpisodicMemory, SemanticMemory
from caia.memory import Dataset


# tools = get_tools([calculate_trust_score])


# At the beginning, the agent has 1 entry in the semantic memory.
# Here we put the path of each old-distribution dataset file in it.
dataset_old = Dataset(X_train=f"{dataset_folder}/X_train_old.csv",
                      X_test=f"{dataset_folder}/X_test_old.csv",
                      y_train=f"{dataset_folder}/y_train_old.csv",
                      y_test=f"{dataset_folder}/y_test_old.csv",
                      description=dataset_description)

# The code stored in semantic memory is the same baseline training script that
# the benchmark agents receive. Reuse the single `training_code` definition
# instead of keeping a second verbatim copy that could drift out of sync.
model_code = training_code

# Semantic memory: old dataset, the fitted reference model, and its source.
init_semantic_memory = SemanticMemory(dataset_old=dataset_old, 
                                        model_object=model_old, 
                                        model_code=model_code)
# semantic_memory
print(init_semantic_memory.model_code)

## Episodic memory

In [34]:
from caia.memory import Dataset
from docarray import DocList

# Paths of the new-distribution splits inside the selected dataset folder.
new_X_train = f"{dataset_folder}/X_train_new.csv"
new_X_test = f"{dataset_folder}/X_test_new.csv"
new_y_train = f"{dataset_folder}/y_train_new.csv"
new_y_test = f"{dataset_folder}/y_test_new.csv"

dataset_new = Dataset(
    X_train=new_X_train,
    X_test=new_X_test,
    y_train=new_y_train,
    y_test=new_y_test,
    description=dataset_description,
)

# The episodic memory starts with a single entry for the new dataset, with an
# empty quick insight and no deep insight.
first_episodic_memory = EpisodicMemory(
    dataset_new=dataset_new,
    quick_insight={},
    deep_insight=None,
)
init_episodic_memory = DocList[EpisodicMemory]([first_episodic_memory])

# Display the first (and only) entry as the cell output.
init_episodic_memory[0]


## Fast graph

In [36]:
from caia.fast.fast_graph import FastGraph
from caia.utils import save_yaml_results

# Fresh working memory: initial memories, no generations, no history yet.
working_memory_fields = {
    "episodic_memory": init_episodic_memory,
    "semantic_memory": init_semantic_memory,
    "threshold": 0.05,
    "generations_fast_graph": {},
    "generations_slow_graph": {},
    "improvement_history": [],
}
working_memory = WorkingMemory(**working_memory_fields)

# Run the fast graph on that memory.
fast_graph = FastGraph(llm_generator, debug=False)
output_fast_graph = fast_graph.run(working_memory)

# Save under a configuration-derived name with an 8-hex-digit random suffix.
short_uuid = uuid.uuid4().hex[:8]
filename = f"results/fast_temp_{TEMPERATURE}_max_iter_{MAX_ITERATIONS}_llm_{LLM_NAME.split('/')[1].split(':')[0]}_dataset_{dataset_folder.split('/')[-1]}_{short_uuid}.yaml"
save_yaml_results(output_fast_graph, filename)

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


## Slow graph

In [37]:
from caia.slow.slow_graph import SlowGraph
from caia.utils import save_yaml_results

# Run the slow graph on the working memory created for the fast graph.
# The iteration and failure limits are written into the memory itself
# (this mutates `working_memory` in place).
slow_graph = SlowGraph(llm_generator, debug=False)
working_memory["max_iterations"] = MAX_ITERATIONS
working_memory["max_failures"] = 5
output_slow_graph = slow_graph.run(working_memory)

# Save under a configuration-derived name with an 8-hex-digit random suffix.
short_uuid = uuid.uuid4().hex[:8]
filename = f"results/slow_temp_{TEMPERATURE}_max_iter_{MAX_ITERATIONS}_llm_{LLM_NAME.split('/')[1].split(':')[0]}_dataset_{dataset_folder.split('/')[-1]}_{short_uuid}.yaml"
save_yaml_results(output_slow_graph, filename)

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


## Fast graph again

In [43]:
from caia.fast.fast_graph import FastGraph
from caia.utils import save_yaml_results

# Second fast-graph pass, seeded with the slow graph's output.
# NOTE(review): the recorded run of this cell ended with
# KeyError: 'new_training_code'. Possibly `output_slow_graph` is not the
# mapping that `generations_slow_graph` is expected to hold -- TODO confirm
# against the return value of SlowGraph.run.
working_memory_fields = {
    "episodic_memory": init_episodic_memory,
    "semantic_memory": init_semantic_memory,
    "threshold": 0.05,
    "generations_fast_graph": {},
    "generations_slow_graph": output_slow_graph,
    "improvement_history": [],
}
working_memory = WorkingMemory(**working_memory_fields)

fast_graph = FastGraph(llm_generator, debug=False)
output_fast_graph = fast_graph.run(working_memory)

# Show the random suffix, then save under a configuration-derived name.
short_uuid = uuid.uuid4().hex[:8]
print(short_uuid)
filename = f"results/improver_temp_{TEMPERATURE}_max_iter_{MAX_ITERATIONS}_llm_{LLM_NAME.split('/')[1].split(':')[0]}_dataset_{dataset_folder.split('/')[-1]}_{short_uuid}.yaml"
save_yaml_results(output_fast_graph, filename)

[31m
>>>>>>>> EXECUTING CODE BLOCK (inferred language is python)...[0m


KeyError: 'new_training_code'

## Improver

In [18]:
working_memory

{'episodic_memory': <DocList[EpisodicMemory] (length=1)>,
 'semantic_memory': SemanticMemory(id='d20de7e820a8fc3701975092a486899b', dataset_old=Dataset(id='9a6f8eb95b5c75e63801f0b45a6b8ea7', X_train='datasets/financial/X_train_old.csv', X_test='datasets/financial/X_test_old.csv', y_train='datasets/financial/y_train_old.csv', y_test='datasets/financial/y_test_old.csv', description={'NUM_SAMPLES': 2000, 'FEATURES': ['Age', 'Income', 'Credit Score', 'Loan Amount', 'Loan Term', 'Interest Rate', 'Employment Length', 'Home Ownership', 'Marital Status', 'Dependents'], 'NUMERICAL_FEATURES': ['Age', 'Income', 'Credit Score', 'Loan Amount', 'Loan Term', 'Interest Rate', 'Employment Length'], 'CATEGORICAL_FEATURES': ['Home Ownership', 'Marital Status', 'Dependents'], 'LABEL': 'Loan Default', 'COLUMN_TYPES': {'Age': 'int', 'Income': 'float', 'Credit Score': 'int', 'Loan Amount': 'float', 'Loan Term': 'int', 'Interest Rate': 'float', 'Employment Length': 'int', 'Home Ownership': 'int', 'Marital Sta

In [None]:
from caia.improver import Improver
from caia.memory import WorkingMemory
from caia.utils import save_yaml_results
import uuid

# Fresh working memory: initial memories, no generations, no history yet.
working_memory_fields = {
    "episodic_memory": init_episodic_memory,
    "semantic_memory": init_semantic_memory,
    "threshold": 0.05,
    "generations_fast_graph": {},
    "generations_slow_graph": {},
    "improvement_history": [],
}
working_memory = WorkingMemory(**working_memory_fields)

# Improver agent with the configured iteration budget and a limit of
# 5 failures.
improver = Improver(
    llm_generator,
    max_iterations=MAX_ITERATIONS,
    max_failures=5,
    debug=False,
)

# Execute the agent on the working memory.
output = improver.run(working_memory)

# Save under a configuration-derived name with an 8-hex-digit random suffix.
short_uuid = uuid.uuid4().hex[:8]
filename = f"results/improver_temp_{TEMPERATURE}_max_iter_{MAX_ITERATIONS}_llm_{LLM_NAME.split('/')[1].split(':')[0]}_dataset_{dataset_folder.split('/')[-1]}_{short_uuid}.yaml"
save_yaml_results(output, filename)

In [None]:
# Run the existing `improver` again on the current `working_memory`
# (both must already be defined by an earlier cell).
output = improver.run(working_memory)

# Each run gets its own file: configuration-derived name plus a new
# 8-hex-digit random suffix.
short_uuid = uuid.uuid4().hex[:8]
filename = f"results/improver_temp_{TEMPERATURE}_max_iter_{MAX_ITERATIONS}_llm_{LLM_NAME.split('/')[1].split(':')[0]}_dataset_{dataset_folder.split('/')[-1]}_{short_uuid}.yaml"

save_yaml_results(output, filename)

# AutoML

In [18]:
from caia.benchmark.automl import run_automl_benchmark
import yaml


import uuid

# Iteration budget for the AutoML search. It is defined once and used both for
# the run and for the results filename; previously the run used a hard-coded 3
# while the filename was labelled with MAX_ITERATIONS, so the saved file
# misreported how many iterations had been executed.
AUTOML_MAX_ITERATIONS = 3

# Prepare initial metrics (scores of the original model).
initial_metrics = {
    "model_old_score": {
        "on_new_data": 0.7166666666666667,
        "on_old_data": 0.9133333333333333
    }
}

# Run AutoML benchmark
result = run_automl_benchmark(
    training_code, 
    dataset_description, 
    initial_metrics,
    max_iterations=AUTOML_MAX_ITERATIONS
)

# Save results under a name that encodes the actual iteration budget, the
# dataset, and a short random suffix.
short_uuid = str(uuid.uuid4())[:8]
filename = f"results/automl_max_iter_{AUTOML_MAX_ITERATIONS}_dataset_{dataset_folder.split('/')[-1]}_{short_uuid}.yaml"
with open(filename, 'w') as f:
    yaml.dump(result, f)


AutoML initialized with 7 numerical features and 3 categorical features
Max iterations: 3
Base model - Old distribution score: 0.9133
Base model - New distribution score: 0.7167




Trying random_forest...
  random_forest - Old distribution: 0.9083, New distribution: 0.8833
  Best params: {'classifier__max_depth': 10, 'classifier__n_estimators': 100, 'classifier__random_state': 42}

Trying gradient_boosting...
  gradient_boosting - Old distribution: 0.9117, New distribution: 0.8333
  Best params: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 5, 'classifier__n_estimators': 200, 'classifier__random_state': 42}

Trying logistic_regression...
  logistic_regression - Old distribution: 0.8600, New distribution: 0.8333
  Best params: {'classifier__C': 0.1, 'classifier__random_state': 42}

Iteration 1 complete in 6.18 seconds
Best model: random_forest
Old distribution: 0.9083, New distribution: 0.8833




Trying random_forest...
  random_forest - Old distribution: 0.9083, New dis



  adaboost - Old distribution: 0.8500, New distribution: 0.8333
  Best params: {'classifier__learning_rate': 0.5, 'classifier__n_estimators': 100, 'classifier__random_state': 42}

Iteration 2 complete in 23.05 seconds
Best model: random_forest
Old distribution: 0.9083, New distribution: 0.8833




Trying gradient_boosting...
  gradient_boosting - Old distribution: 0.9100, New distribution: 0.8500
  Best params: {'classifier__learning_rate': 0.05, 'classifier__max_depth': 7, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 500, 'classifier__random_state': 42, 'classifier__subsample': 0.6}

Trying svm...
  svm - Old distribution: 0.9317, New distribution: 0.8667
  Best params: {'classifier__C': 10.0, 'classifier__gamma': 'scale', 'classifier__kernel': 'rbf', 'classifier__probability': True, 'classifier__random_state': 42}

Iteration 3 complete in 152.32 seconds
Best model: svm
Old distribution: 0.9317, New distribution: 0.8667
