In [1]:
# Enable the IPython autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Basic concepts covered in this notebook:

- dataset
- modules
- nodes
- pipeline

## Dataset

In [22]:
import os
from pathlib import Path

from autointent.utils import load_dataset

# Directory that holds the intent-record JSON files. Set the AUTOINTENT_DATA_DIR
# environment variable to point at your own checkout; the fallback is the
# location the notebook was originally executed against.
DATA_DIR = Path(os.environ.get("AUTOINTENT_DATA_DIR", "/home/voorhs/repos/AutoIntent/data"))

# Load the hwu64 intent records. The path is passed as a plain string, exactly
# as load_dataset was called originally.
dataset = load_dataset(str(DATA_DIR / "intent_records" / "hwu64.json"))
dataset

Dataset({
    type: multiclass,
    utterances: {
        features: ["text", "label"],
        num_rows: 320,
    }
    intents: {
        features: ["id", "name", "tags", "regexp_full_match", "regexp_partial_match", "description"],
        num_rows: 64
    }
})

In [23]:
from pprint import pprint

# Peek at the second and the last utterance of the dataset.
for position in (1, -1):
    pprint(dataset[position])

Utterance(text="checkout today alarm of meeting", label=0)
Utterance(text="is it going to snow tonight", label=63)


In [24]:
# Slicing the dataset yields a list of utterances (here: the first three).
pprint(dataset[:3])

[Utterance(text="what alarms do i have set right now", label=0),
 Utterance(text="checkout today alarm of meeting", label=0),
 Utterance(text="report alarm settings", label=0)]


In [25]:
# Intent metadata is exposed separately from the utterances; show the first intent.
pprint(dataset.intents[0])

Intent(id=0, name="alarm_query")


In [26]:
# Convert the dataset to its multilabel form, then print the dataset summary
# followed by the second and the last sample for comparison with the
# multiclass version.
ml_dataset = dataset.to_multilabel()
pprint(ml_dataset)
for position in (1, -1):
    pprint(ml_dataset[position])

Dataset({
    type: multilabel,
    utterances: {
        features: ["text", "label"],
        num_rows: 320,
    }
    intents: {
        features: ["id", "name", "tags", "regexp_full_match", "regexp_partial_match", "description"],
        num_rows: 64
    }
})
Utterance(text="checkout today alarm of meeting", label=[0])
Utterance(text="is it going to snow tonight", label=[63])


In [27]:
# Preview the first ten raw texts together with their labels.
first_ten = slice(10)
pprint(dataset.texts[first_ten])
pprint(dataset.labels[first_ten])

['what alarms do i have set right now',
 'checkout today alarm of meeting',
 'report alarm settings',
 'see see for me the alarms that you have set tomorrow morning',
 'is there an alarm for ten am',
 'delete alarm',
 'remove the latest alarm',
 'stop wake up calls for this week',
 'remove the first alarm',
 'remove my earliest alarm for tomorrow']
[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]


- обёртка над hugging face datasets, чтобы можно было так же как hf обращаться к семплам и загружать с диска / на диск / в хаб
- стратифицированный сплиттинг не только на train/test, а на train/val для каждого модуля и test в конце
- опция в конфиге: какому модулю какие партиции использовать для train/val

## Modules

In [None]:
# Schematic usage of two modules chained by hand; every `...` is a placeholder
# for real arguments, so this cell is illustrative rather than executable as-is.
from autointent.modules import LinearScorer, TunablePredictor

# Scoring step: fit on the first training partition, then score the second one.
scorer = LinearScorer(...)  # hyperparams
scorer.fit(...)  # train_part_1 data
probs = scorer.predict(...)  # train_part_2 data

# Prediction step: fit on the scorer's outputs for the second partition,
# then predict on the test data.
predictor = TunablePredictor(...)  # hyperparams
predictor.fit(probs, ...)  # train_part_2 data
predictor.predict(...)  # test data

## Node Optimizer

In [None]:
from autointent.nodes.optimization import NodeOptimizer

In [None]:
# Search space for the scoring node: a list with one dict per candidate module,
# where list-valued entries enumerate the hyperparameter values to try.
# NOTE: this must be a plain list. Wrapping the list in `{...}` builds a set
# literal and fails with `TypeError: unhashable type: 'list'`.
search_space = [
    {"module_type": "knn", "k": [1, 3, 5, 10], "weights": ["uniform", "distance", "closest"]},
    {"module_type": "linear"},
    {
        "module_type": "dnnc",
        "model_name": ["BAAI/bge-reranker-base", "cross-encoder/ms-marco-MiniLM-L-6-v2"],
        "k": [1, 3, 5, 10],
    },
]

# `...` marks placeholder arguments (the data partitions named in the comments).
scoring_optimizer = NodeOptimizer(node_type="scoring", search_space=search_space, metric="scoring_roc_auc")
scoring_optimizer.fit(...)  # train_1, val_1
scorer = scoring_optimizer.best()
probs = scorer.predict(...)  # train_2

In [None]:
# Search space for the prediction node: a list with one dict per candidate module.
# NOTE: this must be a list. Putting the dicts directly inside `{...}` builds a
# set literal and fails with `TypeError: unhashable type: 'dict'`.
search_space = [
    {"module_type": "threshold", "thresh": [0.5, [0.5, 0.5, 0.5]]},
    {"module_type": "tunable", "n_trials": 100},
    {"module_type": "argmax"},
    {"module_type": "jinoos"},
]

# `...` marks placeholder arguments (the data partitions named in the comments).
prediction_optimizer = NodeOptimizer(node_type="prediction", search_space=search_space, metric="prediction_accuracy")
prediction_optimizer.fit(probs, ...)  # train_2, val_2
predictor = prediction_optimizer.best()
predictor.predict(...)  # test data

## Pipeline

In [None]:
from autointent.pipeline.optimization import PipelineOptimizer

In [None]:
# Scoring node: candidate modules and the metric used to compare them.
scoring_node = {
    "node_type": "scoring",
    "metric": "scoring_roc_auc",
    "search_space": [
        {"module_type": "knn", "k": [5, 10], "weights": ["uniform", "distance", "closest"]},
        {"module_type": "linear"},
        {
            "module_type": "dnnc",
            "model_name": ["cross-encoder/ms-marco-MiniLM-L-6-v2", "avsolatorio/GIST-small-Embedding-v0"],
            "k": [1, 3],
            "train_head": [False, True],
        },
    ],
}

# Prediction node: candidate modules and the metric used to compare them.
prediction_node = {
    "node_type": "prediction",
    "metric": "prediction_accuracy",
    "search_space": [
        {"module_type": "threshold", "thresh": [0.5, [0.5, 0.5, 0.5]]},
        {"module_type": "tunable"},
        {"module_type": "argmax"},
        {"module_type": "jinoos"},
    ],
}

# The pipeline config lists its nodes in order: scoring first, then prediction.
config = {"nodes": [scoring_node, prediction_node]}

pipeline_optimizer = PipelineOptimizer.from_dict_config(config)
pipeline_optimizer.optimize(...)  # data with partitions: train_1, train_2, val_1, val_2, test

## CLI

In [None]:
%%bash
# override default parameters
# (the %%bash cell magic is required: a bare shell command in a Python code
# cell is a SyntaxError)
autointent data.train_path=default-multiclass \
           data.test_path=data/intent_records/banking77_test.json \
           seed=42

Related Hydra documentation (automatic schema matching): https://hydra.cc/docs/1.2/upgrades/1.0_to_1.1/automatic_schema_matching/

In [None]:
%%bash
# use entire new config
# (the %%bash cell magic is required: a bare shell command in a Python code
# cell is a SyntaxError; FULL_PATH is a placeholder to replace)
autointent --config-path FULL_PATH/test_config --config-name config