In [1]:
%load_ext autoreload
%autoreload 2

# Simpler Pipeline Optimization Demo

## Replicate full-fledged optimization

### Load datasets

In [2]:
from autointent.context.data_handler import Dataset
from autointent.context.utils import load_data

scoring_dataset = load_data("./data/train_data.json")
prediction_dataset = load_data("./data/test_data.json")
len(scoring_dataset.utterances), len(prediction_dataset.utterances)

(165, 57)

### Define Search Space

In [3]:
from autointent.pipeline.optimization import PipelineOptimizer

config = {
    "nodes": [
        {
            "node_type": "scoring",
            "metric": "scoring_roc_auc",
            "search_space": [
                {"module_type": "knn", "k": [5, 10], "weights": ["uniform", "distance", "closest"], "model_name": ["avsolatorio/GIST-small-Embedding-v0"]},
                {"module_type": "linear", "model_name": ["avsolatorio/GIST-small-Embedding-v0"]},
                # {
                #     "module_type": "dnnc",
                #     "cross_encoder_name": ["cross-encoder/ms-marco-MiniLM-L-6-v2", "avsolatorio/GIST-small-Embedding-v0"],
                #     "search_model_name": ["avsolatorio/GIST-small-Embedding-v0"],
                #     "k": [1, 3],
                #     "train_head": [False, True],
                # },
            ],
        },
        {
            "node_type": "prediction",
            "metric": "prediction_accuracy",
            "search_space": [
                {"module_type": "threshold", "thresh": [0.5]},
                {"module_type": "tunable"},
                # {"module_type": "argmax"},
                # {"module_type": "jinoos"},
            ],
        },
    ]
}

pipeline_optimizer = PipelineOptimizer.from_dict_config(config)

### [Optional] Configure Your Run

In [4]:
from autointent.configs.optimization_cli import LoggingConfig, VectorIndexConfig, EmbedderConfig
from pathlib import Path

pipeline_optimizer.set_config(LoggingConfig(run_name="sweet_cucumber", dirpath=Path.cwd(), dump_modules=True, clear_ram=True))
pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path("./my_vector_db").resolve(), device="cuda"))
pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32))

### Run Optimization

In [5]:
context = pipeline_optimizer.optimize_from_dataset(scoring_dataset, prediction_dataset)

[I 2024-11-06 13:10:03,339] A new study created in memory with name: no-name-85c71fe7-cc94-448b-a9a0-46470688fb6b


### Save Logs

In [6]:
context.dump()

### Run Inference

In [7]:
from autointent.pipeline.inference import InferencePipeline

inference_pipeline = InferencePipeline.from_context(context)

In [8]:
inference_pipeline.predict(["hello world", "what is the eagles address"])

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]])

## No modules dumping

In [9]:
! rm -rf sweet_cucumber*

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [10]:
pipeline_optimizer.set_config(LoggingConfig(dump_modules=False, clear_ram=False))
pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path("./my_vector_db").resolve(), device="cuda"))
pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32))

In [11]:
context = pipeline_optimizer.optimize_from_dataset(scoring_dataset, prediction_dataset)

In [12]:
inference_pipeline = InferencePipeline.from_context(context)

In [13]:
inference_pipeline.predict(["hello world", "what is the eagles address"])

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]])