# Hydra configs for modules

In [1]:
from hydra.utils import instantiate
from dataclasses import asdict

In [2]:
from autointent import Context
from autointent.pipeline.optimization import get_db_dir, get_run_name, load_data

run_name = get_run_name("multiclass-cpu")
db_dir = get_db_dir("", run_name)

data = load_data("/home/voorhs/repos/AutoIntent/tests/minimal-optimization/data/clinc_subset.json", multilabel=False)
context = Context(
    multiclass_intent_records=data,
    multilabel_utterance_records=[],
    test_utterance_records=[],
    device="cpu",
    mode="multiclass",
    multilabel_generation_config="",
    db_dir=db_dir,
    regex_sampling=0,
    seed=0,
)

## Retrieval

In [3]:
from autointent.modules import VectorDBModule
from autointent.configs.modules import VectorDBConfig
from autointent.metrics import retrieval_hit_rate

retriever_config = VectorDBConfig(k=3, model_name="sergeyzh/rubert-tiny-turbo")
retriever: VectorDBModule = instantiate(retriever_config)

retriever.fit(context)
metric_value = retriever.score(context, retrieval_hit_rate)
artifact = retriever.get_assets()

context.optimization_info.log_module_optimization(
    node_type="retrieval",
    module_type="vector_db",
    module_params=asdict(retriever_config),
    metric_value=metric_value,
    metric_name="retrieval_hit_rate",
    artifact=artifact,
)

## Scoring

In [4]:
from autointent.modules import KNNScorer
from autointent.configs.modules import KNNScorerConfig
# from autointent.configs.modules.scoring.knn import KNNWeightsType
from autointent.metrics import scoring_roc_auc

scorer_config = KNNScorerConfig(k=3, weights="distance")
scorer: KNNScorer = instantiate(scorer_config)

scorer.fit(context)
metric_value = scorer.score(context, scoring_roc_auc)
artifact = scorer.get_assets()

context.optimization_info.log_module_optimization(
    node_type="scoring",
    module_type="knn",
    module_params=asdict(scorer_config),
    metric_value=metric_value,
    metric_name="scoring_roc_auc",
    artifact=artifact,
)

In [5]:
from autointent.modules import LinearScorer
from autointent.configs.modules import LinearScorerConfig
from autointent.metrics import scoring_roc_auc

scorer_config = LinearScorerConfig()
scorer: LinearScorer = instantiate(scorer_config)

scorer.fit(context)
metric_value = scorer.score(context, scoring_roc_auc)
artifact = scorer.get_assets()

context.optimization_info.log_module_optimization(
    node_type="scoring",
    module_type="linear",
    module_params=asdict(scorer_config),
    metric_value=metric_value,
    metric_name="scoring_roc_auc",
    artifact=artifact,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [6]:
from autointent.modules import DNNCScorer
from autointent.configs.modules import DNNCScorerConfig
from autointent.metrics import scoring_roc_auc

scorer_config = DNNCScorerConfig(model_name="cross-encoder/ms-marco-MiniLM-L-6-v2", k=3, train_head=True)
scorer: DNNCScorer = instantiate(scorer_config)

scorer.fit(context)
metric_value = scorer.score(context, scoring_roc_auc)
artifact = scorer.get_assets()

context.optimization_info.log_module_optimization(
    node_type="scoring",
    module_type="dnnc",
    module_params=asdict(scorer_config),
    metric_value=metric_value,
    metric_name="scoring_roc_auc",
    artifact=artifact,
)

## Prediction

In [7]:
from autointent.modules import ThresholdPredictor
from autointent.configs.modules import ThresholdPredictorConfig
from autointent.metrics import prediction_accuracy

predictor_config = ThresholdPredictorConfig(thresh=0.5)
predictor: ThresholdPredictor = instantiate(predictor_config)

predictor.fit(context)
metric_value = predictor.score(context, prediction_accuracy)
artifact = predictor.get_assets()

context.optimization_info.log_module_optimization(
    node_type="prediction",
    module_type="threshold",
    module_params=asdict(predictor_config),
    metric_value=metric_value,
    metric_name="prediction_accuracy",
    artifact=artifact,
)

In [8]:
from autointent.modules import TunablePredictor
from autointent.configs.modules import TunablePredictorConfig
from autointent.metrics import prediction_accuracy

predictor_config = TunablePredictorConfig(n_trials=100)
predictor: TunablePredictor = instantiate(predictor_config)

predictor.fit(context)
metric_value = predictor.score(context, prediction_accuracy)
artifact = predictor.get_assets()

context.optimization_info.log_module_optimization(
    node_type="prediction",
    module_type="tunable",
    module_params=asdict(predictor_config),
    metric_value=metric_value,
    metric_name="prediction_accuracy",
    artifact=artifact,
)

[I 2024-10-07 11:29:23,904] A new study created in memory with name: no-name-6ae90f0e-5f5a-4484-a6d2-bec67e7f1cad


In [9]:
from autointent.modules import ArgmaxPredictor
from autointent.configs.modules import ArgmaxPredictorConfig
from autointent.metrics import prediction_accuracy

predictor_config = ArgmaxPredictorConfig()
predictor: ArgmaxPredictor = instantiate(predictor_config)

predictor.fit(context)
metric_value = predictor.score(context, prediction_accuracy)
artifact = predictor.get_assets()

context.optimization_info.log_module_optimization(
    node_type="prediction",
    module_type="argmax",
    module_params=asdict(predictor_config),
    metric_value=metric_value,
    metric_name="prediction_accuracy",
    artifact=artifact,
)

Your data contains out-of-scope utterances, but ArgmaxPredictor cannot detect them. Consider different predictor


In [10]:
from autointent.modules import JinoosPredictor
from autointent.configs.modules import JinoosPredictorConfig
from autointent.metrics import prediction_accuracy

predictor_config = JinoosPredictorConfig()
predictor: JinoosPredictor = instantiate(predictor_config)

predictor.fit(context)
metric_value = predictor.score(context, prediction_accuracy)
artifact = predictor.get_assets()

context.optimization_info.log_module_optimization(
    node_type="prediction",
    module_type="jinoos",
    module_params=asdict(predictor_config),
    metric_value=metric_value,
    metric_name="prediction_accuracy",
    artifact=artifact,
)