# Hydra configs for nodes

In [1]:
from hydra.utils import instantiate
from dataclasses import asdict
import logging

logger = logging.getLogger(__name__)

In [2]:
from autointent import Context
from autointent.pipeline.optimization import get_db_dir, get_run_name, load_data

run_name = get_run_name("multiclass-cpu")
db_dir = get_db_dir("", run_name)

data = load_data("/home/voorhs/repos/AutoIntent/tests/minimal-optimization/data/clinc_subset.json", multilabel=False)
context = Context(
    multiclass_intent_records=data,
    multilabel_utterance_records=[],
    test_utterance_records=[],
    device="cpu",
    mode="multiclass",
    multilabel_generation_config="",
    db_dir=db_dir,
    regex_sampling=0,
    seed=0,
)

In [3]:
from autointent.pipeline.pipeline import load_config

config = load_config(
    config_path="/home/voorhs/repos/AutoIntent/tests/minimal-optimization/configs/multiclass.yaml",
    mode="multiclass",
    logger=logger,
)

In [4]:
from pprint import pprint

pprint(config)

{'nodes': [{'metric': 'retrieval_hit_rate',
            'modules': [{'k': [10],
                         'model_name': ['sentence-transformers/all-MiniLM-L6-v2',
                                        'avsolatorio/GIST-small-Embedding-v0'],
                         'module_type': 'vector_db'}],
            'node_type': 'retrieval'},
           {'metric': 'scoring_roc_auc',
            'modules': [{'k': [5, 10],
                         'module_type': 'knn',
                         'weights': ['uniform', 'distance', 'closest']},
                        {'module_type': 'linear'},
                        {'k': [1, 3],
                         'model_name': ['cross-encoder/ms-marco-MiniLM-L-6-v2',
                                        'avsolatorio/GIST-small-Embedding-v0'],
                         'module_type': 'dnnc',
                         'train_head': [False, True]}],
            'node_type': 'scoring'},
           {'metric': 'prediction_accuracy',
            'modules': [{'mod

## Retrieval

In [5]:
from autointent.nodes import RetrievalNodeInfo

retrieval_node_info = RetrievalNodeInfo()

In [6]:
from autointent.configs.node import NodeOptimizerConfig

retrieval_optimizer_config = NodeOptimizerConfig(
    node_info=retrieval_node_info,
    search_space=config["nodes"][0]["modules"],
    metric=config["nodes"][0]["metric"],
)

In [7]:
from autointent.nodes.optimization import NodeOptimizer

retrieval_optimizer: NodeOptimizer = instantiate(retrieval_optimizer_config)

In [8]:
retrieval_optimizer.fit(context)



## Scoring

In [9]:
from autointent.nodes import ScoringNodeInfo

scoring_node_info = ScoringNodeInfo()

In [10]:
scoring_optimizer_config = NodeOptimizerConfig(
    node_info=scoring_node_info,
    search_space=config["nodes"][1]["modules"],
    metric=config["nodes"][1]["metric"],
)

In [11]:
scoring_optimizer: NodeOptimizer = instantiate(scoring_optimizer_config)

In [12]:
scoring_optimizer.fit(context)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

## Prediction

In [13]:
from autointent.nodes import PredictionNodeInfo

prediction_node_info = PredictionNodeInfo()

In [14]:
search_space = config["nodes"][2]["modules"]
search_space

[{'module_type': 'threshold', 'thresh': [0.5, [0.5, 0.5, 0.5]]},
 {'module_type': 'tunable'},
 {'module_type': 'argmax'},
 {'module_type': 'jinoos'}]

In [15]:
search_space.pop(0)
search_space

[{'module_type': 'tunable'},
 {'module_type': 'argmax'},
 {'module_type': 'jinoos'}]

In [16]:
prediction_optimizer_config = NodeOptimizerConfig(
    node_info=prediction_node_info,
    search_space=search_space,
    metric=config["nodes"][2]["metric"],
)

In [17]:
prediction_optimizer: NodeOptimizer = instantiate(prediction_optimizer_config)

In [18]:
prediction_optimizer.fit(context)

[I 2024-10-08 12:41:55,032] A new study created in memory with name: no-name-d6c41bce-e8f8-44d8-a446-527f18a8f418
Your data contains out-of-scope utterances, but ArgmaxPredictor cannot detect them. Consider different predictor
