In [None]:
import os
from git import Repo
import mlflow
from spacy.util import load_config
import json
import optuna
import shutil

import scripts.art_relevance.operations as ops
from scripts.utils.config import Config
from scripts.utils import load_spacy, flatten_config

# Setup MLflow

In [None]:
# Human-readable summary of the experiment; it is attached to the experiment
# through the "mlflow.note.content" tag so it appears in the UI.
experiment_description = "Prototype model architectures for article relevance classifier."

# Searchable tags shared by every run in this experiment. The "task" value is
# also reused as the Optuna study name further down in the notebook.
experiment_tags = dict(project_name="qjn", task="art_relevance")
experiment_tags["mlflow.note.content"] = experiment_description

In [None]:
# Point the MLflow client at the local tracking server, select the experiment
# all runs in this notebook belong to, and attach the experiment-level tags.
tracking_uri = "http://127.0.0.1:8080"
experiment_name = "art_relevance_models"

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)
mlflow.set_experiment_tags(experiment_tags)

In [None]:
def normalize_param_keys(params: dict) -> dict:
    """Return a copy of ``params`` whose keys have every "@" replaced by "_AT_".

    Values are carried over unchanged and the input mapping is not modified.
    If two keys collapse to the same name, the later one wins.
    """
    renamed = {}
    for key, value in params.items():
        renamed[key.replace("@", "_AT_")] = value
    return renamed

In [None]:
def mlflow_log_eval(run_name, params, model_path, nested=False):
    """Log a trained model's parameters, CRIME scores and artifact to MLflow.

    Reads ``<model_path>/model-best/meta.json``, pulls the precision, recall
    and F1 stored for the ``CRIME`` category, and logs them together with the
    flattened ``params`` and the best model inside a new MLflow run.

    Args:
        run_name: Name of the MLflow run; also passed as the second argument
            to ``mlflow.spacy.log_model``.
        params: Nested training configuration to log. The dict passed in is
            not modified at its top level.
        model_path: Training output directory that contains ``model-best``.
        nested: Forwarded to ``mlflow.start_run``; set it when calling from
            inside an already-active parent run.

    Returns:
        dict with the keys ``precision``, ``recall`` and ``f1``.

    Note:
        Uses the notebook-level ``config`` object to locate the git
        repository, so ``config`` must exist before this function is called.
    """
    # Evaluate model: the scores were written by training into meta.json.
    best_model_path = os.path.join(model_path, "model-best")
    metric_file = os.path.join(best_model_path, "meta.json")
    with open(metric_file) as fp:
        crime_scores = json.load(fp)['performance']['cats_f_per_type']['CRIME']
    track_metrics = dict(
        precision=crime_scores['p'],
        recall=crime_scores['r'],
        f1=crime_scores['f'],
    )

    # Record the commit that is actually checked out. Reading the tip of the
    # "main" branch would log the wrong hash on any other branch and fail
    # outright in a repository that has no "main" head.
    repo = Repo(config._LOCAL_PROJECT_DIR, search_parent_directories=True)
    # Build a new dict instead of writing into the caller's params.
    params = {**params, 'git_hash': repo.head.commit.hexsha}

    # Reshape params for logging; the "@" keys are rewritten by the shared helper.
    params = normalize_param_keys(flatten_config(params))

    # TODO: Maybe integrate with dagster? Maybe not? https://docs.dagster.io/api/python-api/libraries/dagster-mlflow
    with mlflow.start_run(run_name=run_name, nested=nested):
        mlflow.log_params(params)
        mlflow.log_metrics(track_metrics)
        mlflow.spacy.log_model(load_spacy(best_model_path), run_name)

    return track_metrics

# Setup Experiments

In [None]:
# Resolve every location the experiments need from the project configuration.
config = Config()

# Training and dev corpora.
train_path, dev_path = (
    config.get_data_path("art_relevance.article_text_train"),
    config.get_data_path("art_relevance.article_text_dev"),
)

# Base and full training config files.
base_cfg, full_cfg = (
    config.get_file_path("art_relevance.base_cfg"),
    config.get_file_path("art_relevance.full_cfg"),
)

# Model output directories: the stable one and its scratch counterpart
# (the scratch one is where the Optuna trials train).
out_path = config.get_file_path("art_relevance.trained_model")
out_path_scratch = config.get_file_path("art_relevance.trained_model", scratch=True)

In [None]:
# Prepare the training config used by every run below.
# NOTE(review): presumably writes the full config at `full_cfg` from the
# template at `base_cfg` — confirm in scripts.art_relevance.operations.
ops.init_config(base_cfg, full_cfg)

# Quickstart Model

In [None]:
# Train once with the unmodified config, writing into the stable output directory.
ops.train(train_path, dev_path, full_cfg, out_path)

# Log the interpolated config alongside the resulting scores.
quickstart_cfg = load_config(full_cfg).interpolate()
params = dict(quickstart_cfg)
mlflow_log_eval("quickstart_model", params, out_path)

# Hyperparams

In [None]:
def objective_base(trial, overrides):
    """Train with ``overrides``, log the trial to MLflow, and return its F1.

    The model is trained into the scratch output directory, then logged as a
    nested MLflow run named ``optuna_trial_<trial.number>``.
    """
    ops.train(train_path, dev_path, full_cfg, out_path_scratch, overrides)

    # Training keeps the base config and applies the overrides at run time,
    # so the config is re-loaded with the same overrides for logging.
    effective_cfg = load_config(full_cfg, overrides).interpolate()

    scores = mlflow_log_eval(
        f"optuna_trial_{trial.number}",
        dict(effective_cfg),
        out_path_scratch,
        nested=True,
    )
    return scores['f1']

In [None]:
class ArchiveBestModelCallback:
    """Optuna callback that archives the scratch model when a trial is the best so far.

    After each trial, if the trial's value ties or beats the study's best
    value (respecting the study direction), ``<out_path_scratch>/model-best``
    is copied over ``<out_path>/model-best``.
    """

    def __call__(self, study: optuna.study.Study, trial: optuna.trial.FrozenTrial) -> None:
        best_scratch = os.path.join(out_path_scratch, 'model-best')
        best_stable = os.path.join(out_path, 'model-best')
        if best_stable == best_scratch:
            return  # Nothing to do.

        # A failed or pruned trial has no value; comparing None with a float
        # would raise TypeError, and there is nothing worth archiving anyway.
        if trial.value is None:
            return

        if study.direction == optuna.study.StudyDirection.MAXIMIZE:
            is_best = study.best_value <= trial.value
        elif study.direction == optuna.study.StudyDirection.MINIMIZE:
            is_best = study.best_value >= trial.value
        else:
            is_best = False
        if not is_best:
            return

        # copytree refuses to write into an existing directory, and the stable
        # directory exists after the first archived model. Remove it first so
        # no files from the previous model are left behind.
        if os.path.isdir(best_stable):
            shutil.rmtree(best_stable)
        shutil.copytree(best_scratch, best_stable)

## Start Size

In [None]:
def objective(trial):
    """Optuna objective: sample the batcher start size and return the trial's F1."""
    param_name = "training.batcher.size.start"
    start_size = trial.suggest_int(param_name, 1, 100)  # Tune batch start
    return objective_base(trial, {param_name: start_size})
    

In [None]:
# Note: as currently configured, the optuna_db ends up wherever create_study
# is called from, i.e. next to this notebook.
study = optuna.create_study(
    study_name=experiment_tags['task'],
    storage=config.get_param("art_relevance.optuna_db"),
    direction="maximize",
    load_if_exists=True,
)

# Parent MLflow run; each trial logs itself as a nested run underneath it.
with mlflow.start_run(run_name="opt_batch_start_size"):
    study.optimize(objective, n_trials=10, callbacks=[ArchiveBestModelCallback()])
    best_found = {f"best_{name}": value for name, value in study.best_params.items()}
    mlflow.log_params(best_found)

## Base Model

In [None]:
def _best_sampled_param(study, name):
    """Return ``name`` from the best completed trial that actually sampled it.

    ``study.best_params`` only holds the parameters of the single best trial.
    Base-model trials sample ``paths.vectors`` only, so as soon as one of them
    becomes the study's best, looking up any other parameter there raises
    KeyError. Restricting the search to trials that sampled ``name`` avoids that.

    Raises:
        KeyError: if no completed trial sampled ``name``.
    """
    sampled = [
        t for t in study.trials
        if t.state == optuna.trial.TrialState.COMPLETE
        and t.value is not None
        and name in t.params
    ]
    if not sampled:
        raise KeyError(name)
    pick = max if study.direction == optuna.study.StudyDirection.MAXIMIZE else min
    return pick(sampled, key=lambda t: t.value).params[name]


# Resolve the tuned start size once, before any base-model trial runs, so
# every trial in this phase trains with the same value.
tuned_start_size = _best_sampled_param(study, "training.batcher.size.start")


def objective(trial):
    """Optuna objective: sample the base vectors model and return the trial's F1.

    The batcher start size is fixed to the value tuned in the previous phase.
    """
    base_model = trial.suggest_categorical("paths.vectors", ["en_core_web_sm", "en_core_web_md"])
    overrides = {"paths.vectors": base_model,
                 "training.batcher.size.start": tuned_start_size}
    return objective_base(trial, overrides)
    

In [None]:
# Continue the existing study, this time searching over the base model.
# Each trial logs itself as a nested run under this parent run.
with mlflow.start_run(run_name="opt_base_model"):
    study.optimize(objective, n_trials=2, callbacks=[ArchiveBestModelCallback()])
    best_found = {f"best_{name}": value for name, value in study.best_params.items()}
    mlflow.log_params(best_found)