In [None]:
import os
from git import Repo
from spacy.util import load_config
from spacy import Language

import mlflow
from mlflow.entities import ViewType
import optuna

from scripts.entity_recognition.components import block_matcher, intersection_matcher, street_vs_neighborhood
import scripts.entity_recognition.operations as ops
from scripts.utils.config import Config
from scripts.utils import flatten_config
from scripts.utils.spacy import load_spacy, load_metrics
from scripts.utils.optuna import ArchiveBestModelCallback

# Setup MLflow

In [None]:
# Identifiers used for the MLflow experiment and for the task-keyed config lookups.
task = "entity_recognition"
experiment_name = "entity_recognition_models"

# Experiment description that will appear in the UI.
experiment_description = "Prototype model architectures for place / person recognition."

# Searchable tags describing the runs that belong to this experiment.
experiment_tags = {"project_name": "qjn", "task": task}
experiment_tags["mlflow.note.content"] = experiment_description

# Config keys tuned so far; each tuning section adds its own keys to this set.
hyperparams = set()

In [None]:
# Point the MLflow client at the tracking server. A non-empty
# MLFLOW_TRACKING_URI environment variable takes precedence over the local
# default, so the notebook can target another server without being edited.
tracking_uri = os.environ.get("MLFLOW_TRACKING_URI") or "http://127.0.0.1:8081"
mlflow.set_tracking_uri(tracking_uri)

# Make the experiment active for the runs started below and attach its tags.
mlflow.set_experiment(experiment_name)
mlflow.set_experiment_tags(experiment_tags)

In [None]:
def mlflow_log_eval(run_name, params, model_path, nested=False):
    """Log a trained model's parameters, metrics and best checkpoint to MLflow.

    Args:
        run_name: Name of the MLflow run; also passed to
            ``mlflow.spacy.log_model`` as its second argument.
        params: (Possibly nested) config mapping to log. It is copied before
            the git hash is added, so the caller's mapping is left untouched.
        model_path: Training output directory. Metrics are loaded from it and
            the model is loaded from its ``model-best`` sub-directory.
        nested: Forwarded to ``mlflow.start_run``.

    Returns:
        The metrics returned by ``load_metrics(model_path, 'ner')``.
    """
    # Evaluate model
    metrics = load_metrics(model_path, 'ner')
    best_model_path = os.path.join(model_path, "model-best")

    # Record the commit that is actually checked out (HEAD), not the tip of a
    # branch named "main", which may be a different commit or not exist at all.
    repo = Repo(config._LOCAL_PROJECT_DIR, search_parent_directories=True)
    params = dict(params)
    params['git_hash'] = repo.head.commit.hexsha

    # Reshape params for logging
    params = flatten_config(params)
    # XXX: This might have a bug converting children to strings?
    params = {k.replace("@", "_AT_"): v for k, v in params.items()}

    with mlflow.start_run(run_name=run_name, nested=nested) as run:
        mlflow.log_params(params)
        mlflow.log_metrics(metrics)
        mlflow.spacy.log_model(load_spacy(best_model_path), run_name)

    return metrics

# Setup Experiments

In [None]:
# Resolve every path the experiments need from the project Config.
# Task-specific entries are looked up under the "<task>." prefix.
config = Config()
# Training and dev label files handed to ops.train.
train_path = config.get_data_path(f"{task}.labels_train")
dev_path = config.get_data_path(f"{task}.labels_dev")
# spaCy config files: both are passed to ops.train, and full_cfg is also the
# file loaded when logging parameters.
base_cfg = config.get_file_path(f"{task}.base_cfg")
full_cfg = config.get_file_path(f"{task}.full_cfg")
# Model directories: out_path holds the model logged as "quickstart_model";
# out_path_scratch is where each Optuna trial trains and is evaluated.
out_path = config.get_file_path(f"{task}.trained_model")
out_path_scratch = config.get_file_path(f"{task}.trained_model", scratch=True)
# Geometry / street-name data passed through to ops.train.
comm_area_path = config.get_data_path("geoms.comm_areas")
neighborhood_path = config.get_data_path("geoms.neighborhoods")
street_name_path = config.get_data_path("geoms.street_names")

In [None]:
# Register the project's custom pipeline functions with spaCy under these names.
# NOTE(review): presumably the training config refers to them by name, so this
# must run before any model is loaded or trained — confirm against the config.
Language.component('block_matcher', func=block_matcher)
Language.component('intersection_matcher', func=intersection_matcher)
Language.component('street_vs_neighborhood', func=street_vs_neighborhood)

# Quickstart Model

In [None]:
# Log the model stored at out_path as a baseline run, using the interpolated
# full config as its parameters. No training happens in this cell.
params = dict(load_config(full_cfg).interpolate())
# The trailing ';' keeps the returned metrics dict out of the cell output.
mlflow_log_eval("quickstart_model", params, out_path);

# Hyperparams

In [None]:
def objective_base(trial, overrides=None):
    """Train one model with ``overrides`` applied and log it as a nested MLflow run.

    Args:
        trial: Optuna trial; only ``trial.number`` is read, to name the run.
        overrides: Optional mapping of dotted config keys to override values.
            ``None`` (the default) means "no overrides".

    Returns:
        The ``ents_f`` metric of the model trained for this trial.
    """
    if overrides is None:
        # Use a fresh dict per call instead of one shared mutable default.
        overrides = {}

    print("Training with overrides:\n", overrides)
    ops.train(
        base_cfg,
        full_cfg,
        train_path,
        dev_path,
        out_path_scratch,
        comm_area_path,
        neighborhood_path,
        street_name_path,
        overrides,
    )

    # Train will keep base config and apply overrides at run-time.
    # So we load the config with the overrides for logging.
    params = dict(load_config(full_cfg, overrides).interpolate())

    run_name = f"optuna_trial_{trial.number}"
    metrics = mlflow_log_eval(run_name, params, out_path_scratch, nested=True)

    return metrics['ents_f']

In [None]:
# One Optuna study, named after the task, shared by all tuning sections below.
# direction="maximize" because the objectives return the ents_f score.
# The storage location comes from the project config; load_if_exists=True
# reuses a study of the same name already present in that storage.
study = optuna.create_study(study_name=experiment_tags['task'],
                            direction="maximize",
                            storage=config.get_param(f"{task}.optuna_db"),
                            load_if_exists=True)

In [None]:
# Callback passed to every study.optimize call below.
# NOTE(review): presumably copies the best trial's model from out_path_scratch
# to out_path — confirm in scripts.utils.optuna.
archiver = ArchiveBestModelCallback(out_path=out_path, out_path_scratch=out_path_scratch)

In [None]:
def get_best():
    """Return the active run with the highest ``ents_f`` in this experiment.

    Returns:
        The single row returned by ``mlflow.search_runs`` (ordered by
        ``metrics.ents_f`` descending, limited to one result), as a Series.

    Raises:
        IndexError: If the experiment has no active runs yet.
    """
    runs = mlflow.search_runs(
        experiment_names=[experiment_name],
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=1,
        order_by=["metrics.ents_f DESC"],
    )
    if len(runs) == 0:
        # Same exception type as positional indexing on an empty frame would
        # raise, but with a message that says what to do about it.
        raise IndexError(
            f"No active runs found in MLflow experiment {experiment_name!r}; "
            "log at least one run before querying the best one."
        )
    return runs.iloc[0]

def best_metrics():
    """Return the best run's entries whose label starts with "metrics", as a dict."""
    run = get_best()
    is_metric = run.index.str.startswith("metrics")
    return run[is_metric].to_dict()

def best_params(keys):
    """Look up the given parameter keys on the best run.

    Args:
        keys: Iterable of parameter names without the ``params.`` prefix.

    Returns:
        Dict mapping each key to the value stored under ``params.<key>`` in
        the best run's row.
    """
    run = get_best()
    selected = {}
    for key in keys:
        selected[key] = run.loc["params." + key]
    return selected
    
def best_model():
    """Build the location of the best run's logged spaCy model.

    Joins the run's artifact URI, its run name and ``model.spacy``.
    """
    run = get_best()
    artifact_root = run['artifact_uri']
    run_name = run['tags.mlflow.runName']
    return os.path.join(artifact_root, run_name, 'model.spacy')

## Vector Source

In [None]:
# Config keys that are all set to the name of the pretrained pipeline under test.
hps = [
    "paths.vectors",
    "components.ner.source",
    "components.tok2vec.source",
    "initialize.vectors",
]
hyperparams.update(hps)

def objective(trial):
    """Train with one suggested pretrained pipeline and return the trial score.

    Starts from the best run's values for every key in ``hyperparams``, then
    sets each key in ``hps`` to the suggested ``base_model``.
    """
    overrides = best_params(hyperparams)
    base_model = trial.suggest_categorical("base_model", ["en_core_web_sm", "en_core_web_md"])
    overrides.update(dict.fromkeys(hps, base_model))
    return objective_base(trial, overrides)
    

In [None]:
# Note: as currently configured, the optuna_db ends up in the location of
# create_study's caller, i.e. this notebook.
# The parent run groups the nested per-trial runs logged by objective_base.
with mlflow.start_run(run_name="opt_base_model"):
    study.optimize(objective, n_trials=2, callbacks=[archiver])

## Base Model 2

In [None]:
# Same pipeline-source keys as the vector-source section; here they are pinned
# to "en_core_web_md" while the static-vectors switch is varied.
hps = [
    "paths.vectors",
    "components.ner.source",
    "components.tok2vec.source",
    "initialize.vectors",
]
hyperparams.update(hps)

def objective(trial: optuna.Trial):
    """Train with static vectors switched on or off and return the trial score.

    Starts from the best run's values for every key in ``hyperparams``, sets
    each key in ``hps`` to ``"en_core_web_md"`` and adds the suggested
    ``include_static_vectors`` value.
    """
    overrides = best_params(hyperparams)
    static_vecs = trial.suggest_categorical("static_vecs", ["false","true"])
    overrides.update(dict.fromkeys(hps, "en_core_web_md"))
    overrides["components.tok2vec.model.embed.include_static_vectors"] = static_vecs
    return objective_base(trial, overrides)
    

In [None]:
# Note: as currently configured, the optuna_db ends up in the location of
# create_study's caller, i.e. this notebook.
# The parent run groups the nested per-trial runs logged by objective_base.
with mlflow.start_run(run_name="opt_base_model_2"):
    study.optimize(objective, n_trials=2, callbacks=[archiver])

## Depth

In [None]:
# Encoder size keys: index 0 is the width, index 1 is the depth.
hps = [
    'components.tok2vec.model.encode.width',
    'components.tok2vec.model.encode.depth',
]
hyperparams.update(hps)

def objective(trial: optuna.Trial):
    """Train with a suggested tok2vec encoder width and depth; return the trial score.

    Starts from the best run's values for every key in ``hyperparams`` and
    overwrites the two encoder keys with the suggested values.
    """
    overrides = best_params(hyperparams)
    width = trial.suggest_categorical("tok2vec.width", [96, 128, 160, 192, 224, 256])
    depth = trial.suggest_categorical("tok2vec.depth", [4, 8])
    overrides[hps[0]] = width
    overrides[hps[1]] = depth
    return objective_base(trial, overrides)
    

In [None]:
# Note: as currently configured, the optuna_db ends up in the location of
# create_study's caller, i.e. this notebook.
# The parent run groups the nested per-trial runs logged by objective_base.
with mlflow.start_run(run_name="opt_tok2vec_size"):
    study.optimize(objective, n_trials=4, callbacks=[archiver])