# Simpler Pipeline Optimization Demo

## Load Datasets

In [None]:
from autointent.context.data_handler import Dataset
from autointent.context.utils import load_data

# Read the two JSON files from ./data: the first becomes the dataset passed to
# the optimizer as scoring data, the second the one passed as prediction data.
scoring_dataset, prediction_dataset = map(
    load_data,
    ("./data/train_data.json", "./data/test_data.json"),
)

# Display the number of utterances in each dataset as a (scoring, prediction) pair.
tuple(len(dataset.utterances) for dataset in (scoring_dataset, prediction_dataset))

(165, 57)

## Define Search Space

In [2]:
from autointent.pipeline.optimization import PipelineOptimizer

# Search space handed to PipelineOptimizer.from_dict_config: one entry per
# pipeline node, each naming its metric and the candidate modules to try.
# List-valued settings are presumably the alternatives to search over.
config = {
    "nodes": [
        # --- scoring node (metric: scoring_roc_auc) ---
        {
            "node_type": "scoring",
            "metric": "scoring_roc_auc",
            "search_space": [
                {
                    "module_type": "knn",
                    "k": [5, 10],
                    "weights": ["uniform", "distance", "closest"],
                    "model_name": ["avsolatorio/GIST-small-Embedding-v0"],
                },
                {
                    "module_type": "linear",
                    "model_name": ["avsolatorio/GIST-small-Embedding-v0"],
                },
                # Disabled candidate, kept for reference:
                # {
                #     "module_type": "dnnc",
                #     "cross_encoder_name": ["cross-encoder/ms-marco-MiniLM-L-6-v2", "avsolatorio/GIST-small-Embedding-v0"],
                #     "search_model_name": ["avsolatorio/GIST-small-Embedding-v0"],
                #     "k": [1, 3],
                #     "train_head": [False, True],
                # },
            ],
        },
        # --- prediction node (metric: prediction_accuracy) ---
        {
            "node_type": "prediction",
            "metric": "prediction_accuracy",
            "search_space": [
                {
                    "module_type": "threshold",
                    "thresh": [0.5],
                },
                {
                    "module_type": "tunable",
                },
                # Disabled candidates, kept for reference:
                # {"module_type": "argmax"},
                # {"module_type": "jinoos"},
            ],
        },
    ],
}

# Build the optimizer from the dict-based search space defined above.
pipeline_optimizer = PipelineOptimizer.from_dict_config(config)

## [Optional] Configure Your Run

In [5]:
from autointent.configs.optimization_cli import LoggingConfig, VectorIndexConfig, EmbedderConfig
from pathlib import Path

# Each set_config call below passes one settings object to the optimizer.

# Logging: run name, output directory (the current working directory, made
# absolute), and dump_modules switched off.
pipeline_optimizer.set_config(
    LoggingConfig(
        run_name="sweet_cucumber",
        dirpath=Path(".").resolve(),
        dump_modules=False,
    )
)

# Vector index: database directory (made absolute) and device.
# NOTE(review): device is hard-coded to "cuda" — confirm a GPU is available
# wherever this notebook is run.
pipeline_optimizer.set_config(
    VectorIndexConfig(
        db_dir=Path("./my_vector_db").resolve(),
        device="cuda",
    )
)

# Embedder: batch size and maximum length.
pipeline_optimizer.set_config(
    EmbedderConfig(
        batch_size=16,
        max_length=32,
    )
)

## Run Optimization

In [6]:
# Run the optimization on the two datasets loaded earlier and keep the returned
# context for the final cell. Per the original author's note, the data ends up
# in partitions train_1, train_2, val_1, val_2, test — TODO confirm where this
# split is performed.
context = pipeline_optimizer.optimize_from_dataset(scoring_dataset, prediction_dataset)

[I 2024-11-05 18:08:17,123] A new study created in memory with name: no-name-5066322d-4fcd-4a17-8699-c3670e71e698


## Save Logs

In [7]:
# Save the logs of the optimization run held by the context.
# NOTE(review): presumably written under the LoggingConfig dirpath/run_name set
# earlier — confirm the output location.
context.dump()