# Hydra configs for nodes

In [1]:
import logging

from hydra.utils import instantiate

# Logger named after the current module; a later cell hands it to `load_config`.
logger: logging.Logger = logging.getLogger(__name__)

In [2]:
import os
from pathlib import Path

from autointent import Context
from autointent.pipeline.optimization import get_db_dir, get_run_name, load_data

# Root of the AutoIntent checkout. The default is the location this notebook
# was originally written against; set AUTOINTENT_REPO_ROOT to run it from a
# different machine or checkout without editing the cell.
repo_root = Path(os.environ.get("AUTOINTENT_REPO_ROOT", "/home/voorhs/repos/AutoIntent"))
data_path = repo_root / "tests" / "minimal-optimization" / "data" / "clinc_subset.json"

run_name = get_run_name("multiclass-cpu")
db_dir = get_db_dir("", run_name)

# `load_data` was originally called with a plain string path, so keep passing one.
data = load_data(str(data_path), multilabel=False)

# Shared context that every node optimizer below is fitted against:
# multiclass mode on CPU, no multilabel or test utterances, fixed seed.
context = Context(
    multiclass_intent_records=data,
    multilabel_utterance_records=[],
    test_utterance_records=[],
    device="cpu",
    mode="multiclass",
    multilabel_generation_config="",
    db_dir=db_dir,
    regex_sampling=0,
    seed=0,
)

In [4]:
import os
from pathlib import Path

from autointent.pipeline.optimization import load_config

# Root of the AutoIntent checkout. The default is the location this notebook
# was originally written against; set AUTOINTENT_REPO_ROOT to run it from a
# different machine or checkout without editing the cell.
repo_root = Path(os.environ.get("AUTOINTENT_REPO_ROOT", "/home/voorhs/repos/AutoIntent"))
config_path = repo_root / "tests" / "minimal-optimization" / "configs" / "multiclass.yaml"

# `load_config` was originally called with a plain string path, so keep passing one.
config = load_config(
    config_path=str(config_path),
    multilabel=False,
    logger=logger,
)

In [5]:
from pprint import pprint

# Pretty-print the loaded config to inspect its nested node/module structure.
pprint(config)

{'nodes': [{'metric': 'retrieval_hit_rate',
            'modules': [{'k': [10],
                         'model_name': ['sentence-transformers/all-MiniLM-L6-v2',
                                        'avsolatorio/GIST-small-Embedding-v0'],
                         'module_type': 'vector_db'}],
            'node_type': 'retrieval'},
           {'metric': 'scoring_roc_auc',
            'modules': [{'k': [5, 10],
                         'module_type': 'knn',
                         'weights': ['uniform', 'distance', 'closest']},
                        {'module_type': 'linear'},
                        {'k': [1, 3],
                         'model_name': ['cross-encoder/ms-marco-MiniLM-L-6-v2',
                                        'avsolatorio/GIST-small-Embedding-v0'],
                         'module_type': 'dnnc',
                         'train_head': [False, True]}],
            'node_type': 'scoring'},
           {'metric': 'prediction_accuracy',
            'modules': [{'mod

In [6]:
from autointent.configs.modules import MODULES_CONFIGS, create_search_space_dataclass
from typing import Any


def parse_search_space(node_type: str, search_space: dict[str, Any]):
    """Turn one raw module entry from the config into a search-space object.

    Args:
        node_type: Key selecting this node's group of module configs in
            ``MODULES_CONFIGS``.
        search_space: Raw module entry. It must contain ``"module_type"``;
            every item of the dict (``"module_type"`` included) is forwarded
            as a keyword argument to the generated search-space class.

    Returns:
        An instance of whatever ``create_search_space_dataclass`` produces for
        the selected module config, built from ``search_space``.

    Raises:
        KeyError: If ``search_space`` has no ``"module_type"`` entry.
            Presumably also raised for an unknown node or module type, if
            ``MODULES_CONFIGS`` is a plain nested mapping (not visible here).
    """
    # Resolve the node-level group first, then the module within it, matching
    # the original left-to-right lookup order.
    configs_for_node = MODULES_CONFIGS[node_type]
    selected_config = configs_for_node[search_space["module_type"]]
    search_space_cls = create_search_space_dataclass(selected_config)
    return search_space_cls(**search_space)

## Retrieval

In [7]:
# Node type handled in this section. The `node_type` name is reassigned in the
# Scoring and Prediction sections, so its value depends on which cell ran last.
node_type = "retrieval"

In [8]:
from autointent.configs.node import NodeOptimizerConfig

# Pin this section's node type under its own name instead of reading the shared
# `node_type` global: that global is reassigned by the Scoring and Prediction
# sections, so re-running this cell after them would otherwise silently build
# the retrieval config with another section's node type.
retrieval_node_type = "retrieval"

# First entry of the loaded config's node list; the config printed earlier in
# the notebook shows this entry is the retrieval node.
retrieval_node = config["nodes"][0]

retrieval_optimizer_config = NodeOptimizerConfig(
    node_type=retrieval_node_type,
    # One parsed search space per module entry listed for this node.
    search_space=[parse_search_space(retrieval_node_type, ss) for ss in retrieval_node["modules"]],
    metric=retrieval_node["metric"],
)

In [9]:
from autointent.nodes.optimization import NodeOptimizer

# Hand the retrieval config to `hydra.utils.instantiate` (imported in the first
# cell); the result is annotated as a NodeOptimizer.
retrieval_optimizer: NodeOptimizer = instantiate(retrieval_optimizer_config)

In [10]:
# Run the retrieval optimizer's `fit` against the shared `context` built earlier.
retrieval_optimizer.fit(context)



## Scoring

In [11]:
# Node type handled in this section. This overwrites the value assigned in the
# Retrieval section, and the Prediction section overwrites it again.
node_type = "scoring"

In [13]:
# Pin this section's node type under its own name instead of reading the shared
# `node_type` global: that global is also assigned by the Retrieval and
# Prediction sections, so re-running this cell out of order would otherwise
# silently build the scoring config with another section's node type.
scoring_node_type = "scoring"

# Second entry of the loaded config's node list; the config printed earlier in
# the notebook shows this entry is the scoring node.
scoring_node = config["nodes"][1]

scoring_optimizer_config = NodeOptimizerConfig(
    node_type=scoring_node_type,
    # One parsed search space per module entry listed for this node.
    search_space=[parse_search_space(scoring_node_type, ss) for ss in scoring_node["modules"]],
    metric=scoring_node["metric"],
)

In [14]:
# Hand the scoring config to `hydra.utils.instantiate` (imported in the first
# cell); the result is annotated as a NodeOptimizer.
scoring_optimizer: NodeOptimizer = instantiate(scoring_optimizer_config)

In [15]:
# Run the scoring optimizer's `fit` against the shared `context` built earlier.
# NOTE(review): the captured output of this cell is a wall of repeated
# huggingface/tokenizers fork warnings. The warning text itself suggests
# setting the TOKENIZERS_PARALLELISM environment variable explicitly; doing so
# before the first tokenizer use would presumably silence them - TODO confirm.
scoring_optimizer.fit(context)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

## Prediction

In [16]:
# Node type handled in this section. This overwrites the value assigned in the
# Retrieval and Scoring sections.
node_type = "prediction"

In [17]:
# Pin this section's node type under its own name instead of reading the shared
# `node_type` global: that global is also assigned by the Retrieval and Scoring
# sections, so re-running this cell out of order would otherwise silently build
# the prediction config with another section's node type.
prediction_node_type = "prediction"

# Third entry of the loaded config's node list. The config printout earlier in
# the notebook is truncated at this entry; its visible metric is
# 'prediction_accuracy'.
prediction_node = config["nodes"][2]

prediction_optimizer_config = NodeOptimizerConfig(
    node_type=prediction_node_type,
    # One parsed search space per module entry listed for this node.
    search_space=[parse_search_space(prediction_node_type, ss) for ss in prediction_node["modules"]],
    metric=prediction_node["metric"],
)

In [18]:
# Hand the prediction config to `hydra.utils.instantiate` (imported in the
# first cell); the result is annotated as a NodeOptimizer.
prediction_optimizer: NodeOptimizer = instantiate(prediction_optimizer_config)

In [19]:
# Run the prediction optimizer's `fit` against the shared `context` built earlier.
# NOTE(review): the captured output warns that the data contains out-of-scope
# utterances which ArgmaxPredictor cannot detect, and suggests considering a
# different predictor.
prediction_optimizer.fit(context)

Your data contains out-of-scope utterances, but ArgmaxPredictor cannot detect them. Consider different predictor
