In [1]:
from pathlib import Path
from typing import Literal
from autointent.pipeline.optimization.utils import load_config

TaskType = Literal["multiclass", "multilabel", "description"]


def setup_environment() -> tuple[str, str]:
    logs_dir = Path.cwd() / "logs"
    db_dir = logs_dir / "db"
    dump_dir = logs_dir / "modules_dump"
    return db_dir, dump_dir, logs_dir

def get_search_space_path(task_type: TaskType):
    return Path("/home/voorhs/repos/AutoIntent/tests/assets/configs").joinpath(f"{task_type}.yaml")


def get_search_space(task_type: TaskType):
    path = get_search_space_path(task_type)
    return load_config(str(path), multilabel=task_type == "multilabel")

In [2]:
setup_environment()

(PosixPath('/home/voorhs/repos/AutoIntent/experiments/predict_with_metadata/logs/db'),
 PosixPath('/home/voorhs/repos/AutoIntent/experiments/predict_with_metadata/logs/modules_dump'),
 PosixPath('/home/voorhs/repos/AutoIntent/experiments/predict_with_metadata/logs'))

In [3]:
get_search_space("multiclass")

{'nodes': [{'node_type': 'retrieval',
   'metric': 'retrieval_hit_rate',
   'search_space': [{'module_type': 'vector_db',
     'k': [10],
     'embedder_name': ['sentence-transformers/all-MiniLM-L6-v2',
      'avsolatorio/GIST-small-Embedding-v0']}]},
  {'node_type': 'scoring',
   'metric': 'scoring_roc_auc',
   'search_space': [{'module_type': 'knn',
     'k': [5, 10],
     'weights': ['uniform', 'distance', 'closest']},
    {'module_type': 'linear'},
    {'module_type': 'dnnc',
     'cross_encoder_name': ['cross-encoder/ms-marco-MiniLM-L-6-v2',
      'avsolatorio/GIST-small-Embedding-v0'],
     'k': [1, 3],
     'train_head': [False, True]}]},
  {'node_type': 'prediction',
   'metric': 'prediction_accuracy',
   'search_space': [{'module_type': 'threshold',
     'thresh': [0.5, [0.5, 0.5, 0.5]]},
    {'module_type': 'tunable'},
    {'module_type': 'argmax'},
    {'module_type': 'jinoos'}]}]}

In [10]:
from autointent.context.utils import load_data


def get_dataset_path():
    return Path("/home/voorhs/repos/AutoIntent/tests/assets/data").joinpath("clinc_subset.json")


def get_dataset():
    return load_data(get_dataset_path())

In [11]:
task_type = "multiclass"

In [12]:
from autointent.pipeline.optimization import PipelineOptimizer
from autointent.configs.optimization_cli import LoggingConfig, VectorIndexConfig, EmbedderConfig

db_dir, dump_dir, logs_dir = setup_environment()
search_space = get_search_space(task_type)

pipeline_optimizer = PipelineOptimizer.from_dict_config(search_space)

pipeline_optimizer.set_config(LoggingConfig(dirpath=Path(logs_dir).resolve(), dump_modules=True))
pipeline_optimizer.set_config(VectorIndexConfig(db_dir=Path(db_dir).resolve(), device="cpu", save_db=True))
pipeline_optimizer.set_config(EmbedderConfig(batch_size=16, max_length=32))


dataset = get_dataset()
context = pipeline_optimizer.optimize_from_dataset(dataset, force_multilabel=(task_type == "multilabel"))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [13]:
from autointent.pipeline.inference import InferencePipeline


inference_pipeline = InferencePipeline.from_context(context)
prediction = inference_pipeline.predict_with_metadata(["123", "hello world"])

In [17]:
from pprint import pprint

pprint(prediction.predictions)
pprint(prediction.utterances)

[2, 2]
[InferencePipelineUtteranceOutput(utterance='123', prediction=2, regexp_prediction=None, regexp_prediction_metadata=None, score=[0.0, 0.4, 0.6], score_metadata={'neighbors': ['set my alarm for getting up', 'wake me up at noon tomorrow', 'i am nost sure why my account is blocked', 'i think my account is blocked but i do not know the reason', 'please set an alarm for mid day']}),
 InferencePipelineUtteranceOutput(utterance='hello world', prediction=2, regexp_prediction=None, regexp_prediction_metadata=None, score=[0.0, 0.4, 0.6], score_metadata={'neighbors': ['wake me up at noon tomorrow', 'set my alarm for getting up', 'please set an alarm for mid day', 'why is there a hold on my american saving bank account', 'i am nost sure why my account is blocked']})]


In [None]:
if task_type == "multilabel":
    assert prediction.shape == (2, len(dataset.intents))
else:
    assert prediction.shape == (2,)

context.dump()
context.vector_index_client.delete_db()