In [None]:
import os
import json
from git import Repo
import shutil
from pathlib import Path

import pandas as pd
import numpy as np
import secrets
import spacy
from spacy.util import load_config
from spacy.cli import evaluate, apply
from spacy.tokens import DocBin, Doc
from spacy.scorer import Scorer
import mlflow
from mlflow.entities import ViewType
import optuna
from thinc.api import Config as SpacyConfig

import scripts.art_relevance.operations as ops
from scripts.utils.config import Config
from scripts.utils import load_spacy, flatten_config, nest_config
from scripts.utils.spacy import load_metrics
from scripts.utils.optuna import ArchiveBestModelCallback

# Setup MLflow

In [2]:
# Name under which every run of this notebook is grouped in the MLflow UI.
experiment_name = "art_relevance_models"

# Free-text summary of the experiment; exposed in the UI through the
# "mlflow.note.content" tag below.
experiment_description = "Prototype model architectures for article relevance classifier."

# Searchable tags describing the runs that belong to this experiment.
# The note-content key contains dots, so it is passed through ** unpacking.
experiment_tags = dict(
    project_name="qjn",
    task="art_relevance",
    **{"mlflow.note.content": experiment_description},
)

In [3]:
# Point the MLflow client at the tracking server (a local instance on port 8080),
# select the experiment that all later runs are recorded under, and attach the
# descriptive tags to that experiment.
mlflow.set_tracking_uri("http://127.0.0.1:8080")
mlflow.set_experiment(experiment_name)
mlflow.set_experiment_tags(experiment_tags)

In [4]:
def normalize_param_keys(params: dict):
    """Return a copy of ``params`` with every "@" in a key replaced by "_AT_".

    Values are passed through untouched; only the (string) keys are rewritten.
    Presumably needed because MLflow rejects "@" in parameter names -- confirm.
    """
    # XXX: This might have a bug converting children to strings?
    renamed = {}
    for key, value in params.items():
        renamed[key.replace("@", "_AT_")] = value
    return renamed

In [None]:
def mlflow_log_eval(run_name, params, model_path, nested=False):
    """Log a trained model's parameters, metrics and best checkpoint to MLflow.

    Args:
        run_name: Name of the MLflow run; also used as the artifact path under
            which the spaCy model is logged.
        params: Nested mapping of config values. It is not modified; a copy
            with an added ``git_hash`` entry is flattened and logged.
        model_path: Training output directory. Metrics are read from it with
            ``load_metrics`` and the model is loaded from ``<model_path>/model-best``.
        nested: Forwarded to ``mlflow.start_run``; set True when called inside
            an already-active parent run.

    Returns:
        The metrics object returned by ``load_metrics(model_path)``.
    """
    best_model_path = os.path.join(model_path, "model-best")
    metrics = load_metrics(model_path)

    # Record the commit that is actually checked out. `repo.heads.main` would
    # log the tip of `main` even when working on another branch, and raises
    # when the repository has no branch called `main`.
    repo = Repo(config._LOCAL_PROJECT_DIR, search_parent_directories=True)
    # Build a new dict so the caller's `params` is left untouched.
    params = {**params, "git_hash": repo.head.commit.hexsha}

    # Reshape params for logging: flatten the nested config, then make the
    # keys loggable with the same helper used elsewhere in this notebook.
    # XXX: This might have a bug converting children to strings?
    params = normalize_param_keys(flatten_config(params))

    # TODO: Maybe integrate with dagster? Maybe not? https://docs.dagster.io/api/python-api/libraries/dagster-mlflow
    with mlflow.start_run(run_name=run_name, nested=nested):
        mlflow.log_params(params)
        mlflow.log_metrics(metrics)
        mlflow.spacy.log_model(load_spacy(best_model_path), run_name)

    return metrics

# Setup Experiments

In [6]:
# Resolve every dataset, config and model location through the project Config
# object so that no path is hard-coded in this notebook.
config = Config()
train_path = config.get_data_path("art_relevance.article_text_train")
dev_path = config.get_data_path("art_relevance.article_text_dev")
base_cfg = config.get_file_path("art_relevance.base_cfg")
full_cfg = config.get_file_path("art_relevance.full_cfg")
out_path = config.get_file_path("art_relevance.trained_model")
# Same model entry resolved with scratch=True; the Optuna trials below train into this location.
out_path_scratch = config.get_file_path("art_relevance.trained_model", scratch=True)

In [None]:
# Presumably expands the base config into the full training config at full_cfg
# -- confirm in scripts.art_relevance.operations.init_config.
ops.init_config(base_cfg, full_cfg)

# Quickstart Model

In [None]:
# Train the baseline model from the unmodified config.
# NOTE(review): the positional arguments here (train_path, dev_path, full_cfg, out_path)
# do not match the order used by the ops.train call in objective_base
# (base_cfg, full_cfg, train_path, dev_path, out_path_scratch, overrides).
# One of the two calls is likely stale -- confirm against the ops.train signature.
ops.train(train_path, dev_path, full_cfg, out_path)

# Log the fully interpolated config as the run's parameters, together with
# the metrics and model found under out_path.
params = dict(load_config(full_cfg).interpolate())
mlflow_log_eval("quickstart_model", params, out_path)

# Hyperparams

In [None]:
def objective_base(trial, overrides=None):
    """Train one Optuna trial with config overrides and log it to MLflow.

    Args:
        trial: The Optuna trial being evaluated; its number names the MLflow run.
        overrides: Optional mapping of dotted config keys to values, applied on
            top of the config at train time. Defaults to no overrides.

    Returns:
        The value of ``metrics['CRIME']['f']`` for the trained model, used as
        the optimisation target.
    """
    # Use a fresh dict per call: a `{}` default would be one shared object
    # that any in-place modification leaks into later calls.
    if overrides is None:
        overrides = {}

    ops.train(base_cfg, full_cfg, train_path, dev_path, out_path_scratch, overrides)

    # Train will keep base config and apply overrides at run-time.
    # So we load the config with the overrides for logging.
    params = dict(load_config(full_cfg, overrides).interpolate())

    run_name = f"optuna_trial_{trial.number}"
    metrics = mlflow_log_eval(run_name, params, out_path_scratch, nested=True)

    return metrics['CRIME']['f']

In [None]:
# One Optuna study, named after the task tag, shared by all search cells below.
# It maximises the objective's return value and is persisted in the storage
# configured under "art_relevance.optuna_db"; load_if_exists=True resumes an
# existing study of the same name instead of raising.
study = optuna.create_study(study_name=experiment_tags['task'],
                            direction="maximize",
                            storage=config.get_param("art_relevance.optuna_db"),
                            load_if_exists=True)

## Start Size

In [None]:
def objective(trial):
    """Optuna objective: search the batcher's starting batch size in [1, 100]."""
    param_key = "training.batcher.size.start"
    start_size = trial.suggest_int(param_key, 1, 100)  # Tune batch start
    return objective_base(trial, {param_key: start_size})
    

In [None]:
# Note: as currently configured, the optuna_db goes into the caller of create_study ie the notebook
# The callback receives both the scratch and the final model path; presumably it
# copies the best trial's model from scratch to out_path -- confirm in scripts.utils.optuna.
archiver = ArchiveBestModelCallback(out_path=out_path, out_path_scratch=out_path_scratch)
# Parent MLflow run; every trial logs itself as a nested child run (objective_base passes nested=True).
with mlflow.start_run(run_name="opt_batch_start_size"):
    study.optimize(objective, n_trials=10, callbacks=[archiver])

## Vector source

In [None]:
def objective(trial):
    """Optuna objective: choose the vector source, reusing the best tuned start size.

    The start size is taken from the best completed trial that actually tuned
    "training.batcher.size.start". ``study.best_params`` cannot be used for
    this: it holds the parameters of the overall best trial, and as soon as a
    trial from this search space becomes the best one it no longer contains
    the start-size key, so the next trial would fail with ``KeyError``.

    Raises:
        ValueError: If no completed trial in the study tuned the start size.
    """
    start_key = "training.batcher.size.start"
    base_model = trial.suggest_categorical("paths.vectors", ["en_core_web_sm", "en_core_web_md"])

    tuned_trials = [
        t for t in study.trials
        if t.state == optuna.trial.TrialState.COMPLETE and start_key in t.params
    ]
    if not tuned_trials:
        raise ValueError(f"No completed trial has tuned '{start_key}'; run the start-size search first.")
    # The study is created with direction="maximize", so the best trial has the largest value.
    best_start_size = max(tuned_trials, key=lambda t: t.value).params[start_key]

    overrides = {"paths.vectors": base_model,
                 start_key: best_start_size}
    return objective_base(trial, overrides)
    

In [None]:
# Note: as currently configured, the optuna_db goes into the caller of create_study ie the notebook
with mlflow.start_run(run_name="opt_base_model"):
    study.optimize(objective, n_trials=2, callbacks=[archiver])

**Note**: It looks like the hyperparameters have no effect on the metrics,
but we also only have 20 data points in the dev set, so there aren't many
ways the predictions can fall. In the next section, I verify that a different
hyperparameter set actually does return differing metrics per run, so the engineering is working.

## Bow Length


In [None]:
def objective(trial: optuna.trial.Trial):
    """Optuna objective: search the textcat model length over powers of two (2**1 .. 2**18).

    Vector source and batch start size are held fixed at "en_core_web_sm" and 32.
    """
    # NOTE(review): the Optuna parameter is registered as "components.textcat.model"
    # but holds the log2 exponent of "components.textcat.model.length" -- confirm the name is intended.
    exponent = trial.suggest_int("components.textcat.model", 1, 18)
    overrides = {
        "paths.vectors": "en_core_web_sm",
        "training.batcher.size.start": 32,
        "components.textcat.model.length": 2**exponent,
    }
    # Disabled alternative that rewrites the config files on disk:
    # load_config(base_cfg, overrides).to_disk(base_cfg)
    # ops.init_config(base_cfg, full_cfg)
    return objective_base(trial, overrides)

In [None]:
# Parent MLflow run for the length search; reuses the `archiver` callback
# created in the start-size cell, so that cell must have run first.
with mlflow.start_run(run_name="opt_linear_length"):
    study.optimize(objective, n_trials=9, callbacks=[archiver])

# Null Model

In [7]:
class NullModel:
    """Chance-level baseline for the CRIME / IRRELEVANT classifier.

    ``train`` estimates the share of CRIME-labelled documents. ``eval`` then
    labels each evaluation document CRIME with that probability, independently
    of its text, and averages precision / recall / F1 over repeated random draws.
    """

    # Fraction in [0, 1] (despite the name) of training docs with cats['CRIME'] == 1.
    pos_percent = 0

    @staticmethod
    def _load_docs(data_path):
        """Return all Docs stored in the DocBin file at ``data_path`` as a list."""
        blank = spacy.blank("en")
        return list(DocBin().from_disk(data_path).get_docs(blank.vocab))

    @staticmethod
    def _safe_ratio(numerator, denominator):
        """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
        return numerator / denominator if denominator else 0.0

    def train(self, data_path: str):
        """Set ``pos_percent`` to the fraction of docs whose CRIME category equals 1.

        Raises:
            ValueError: If the DocBin at ``data_path`` contains no documents
                (the fraction would otherwise silently become NaN).
        """
        docs = self._load_docs(data_path)
        if not docs:
            raise ValueError(f"No documents found in {data_path!r}; cannot estimate the positive rate.")
        self.pos_percent = float(np.mean([d.cats['CRIME'] == 1 for d in docs]))

    def eval(self, data_path, n_trials=100, seed=None):
        """Average precision / recall / F1 of random guessing over ``n_trials`` draws.

        Args:
            data_path: DocBin file with gold ``cats`` on every document.
            n_trials: Number of independent random labelings to average over.
            seed: Optional seed for reproducible results. When None, a fresh
                128-bit seed is drawn from ``secrets`` on every call.

        Returns:
            Dict with the mean ``precision``, ``recall`` and ``f1`` over all draws.
            A metric whose denominator is zero in a draw (e.g. precision when
            nothing was predicted CRIME) counts as 0.0 instead of raising
            ``ZeroDivisionError``.
        """
        docs = self._load_docs(data_path)
        gold_pos = np.array([d.cats['CRIME'] == 1 for d in docs], dtype=bool)
        gold_neg = np.array([d.cats['CRIME'] == 0 for d in docs], dtype=bool)
        rng = np.random.default_rng(secrets.randbits(128) if seed is None else seed)

        trials = []
        for _ in range(n_trials):
            # Each doc is predicted CRIME with probability `pos_percent`.
            pred_pos = rng.random(len(docs)) < self.pos_percent
            tp = int(np.sum(pred_pos & gold_pos))
            fp = int(np.sum(pred_pos & gold_neg))
            fn = int(np.sum(~pred_pos & gold_pos))
            trials.append(dict(
                precision=self._safe_ratio(tp, tp + fp),
                recall=self._safe_ratio(tp, tp + fn),
                f1=self._safe_ratio(2 * tp, 2 * tp + fp + fn),
            ))
        return pd.DataFrame.from_records(trials).mean().to_dict()
    
# Fit the chance-level baseline on the training labels and score it on the dev set.
null_model = NullModel()
null_model.train(train_path)
metrics = null_model.eval(dev_path)
# Log the expected metrics of random guessing as their own top-level run,
# together with the positive rate the guesses were drawn with.
with mlflow.start_run(run_name="null_model_expectation", nested=False) as run:
        mlflow.log_metrics(metrics)
        mlflow.log_param("pos_proba", null_model.pos_percent)

🏃 View run null_model_expectation at: http://127.0.0.1:8080/#/experiments/937901472817136779/runs/652fdf563458421081a1315873a04d67
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/937901472817136779


# Evaluate

In [None]:
# Fetch the single active run of this experiment with the highest `f1` metric.
# NOTE(review): the variable name says "accuracy" but the ordering key is metrics.f1.
# NOTE(review): the null-model run also logs an `f1` metric, so it takes part in
# this ranking -- confirm that is intended.
highest_accuracy_run = mlflow.search_runs(
        experiment_names=[experiment_name],
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=1,
        order_by=["metrics.f1 DESC"],
    )

### Metrics

In [None]:
# This isn't needed anymore because I found where mlflow saves the actual spacy models.
# But might be nice to have later.
def mlflow_to_cfg(best_runs: pd.DataFrame) -> SpacyConfig:
    """Rebuild a spaCy config from the logged params of the first row of ``best_runs``.

    Takes every entry of the first row whose label starts with "params",
    turns "None" strings into "null", restores "@" in the keys (undoing the
    "_AT_" substitution applied when logging), re-nests the flat keys and drops
    the ``git_hash`` entry. The resulting config is written to ``full_cfg``
    and returned.
    """
    top_run = best_runs.iloc[0]
    is_param = top_run.index.str.startswith('params')
    logged_params = top_run[is_param].replace("None", "null")

    # XXX: This might have a bug converting children to strings?
    restored = {}
    for name, value in logged_params.to_dict().items():
        restored[name.replace("_AT_", "@")] = value

    nested_params = nest_config(restored)['params']
    del nested_params['git_hash']

    best_config = SpacyConfig(nested_params)
    best_config.to_disk(full_cfg)
    return best_config

In [None]:
# Locate the spaCy model logged with the selected run and re-score it on the dev set.
# NOTE(review): the artifact sub-directory is hard-coded to "optuna_trial_25", while the
# run itself is chosen dynamically by the search above; mlflow_log_eval logs the model
# under the run's own name, so this path is only right when trial 25 is the selected run.
best_model_uri = os.path.join(highest_accuracy_run.iloc[0]['artifact_uri'],"optuna_trial_25", "model.spacy")
best_metrics = evaluate(
    best_model_uri,
    dev_path,
)
best_metrics

### Visualize Predictions

In [None]:
# Run the selected model over the dev data and write its output to
# ./preds.spacy, one document per batch in a single process.
apply(data_path=Path(dev_path), 
      output_file=Path("./preds.spacy"), 
      model=best_model_uri, 
      json_field="text", 
      batch_size=1,
      n_process=1)

In [None]:
# Load the model's predictions and the gold dev annotations for a side-by-side comparison.
docs_pred = DocBin().from_disk("./preds.spacy")
docs_gold = DocBin().from_disk(dev_path)

In [None]:
# Walk predicted and gold docs in lockstep; the assert checks that each pair
# holds the same text.
records = []
for pred, gold in zip(docs_pred.get_docs(spacy.blank("en").vocab),
                      docs_gold.get_docs(spacy.blank("en").vocab)):
    assert pred.text == gold.text
    records.append(dict(
        text=gold.text,
        # Gold label is CRIME only when its score strictly exceeds IRRELEVANT.
        label='CRIME' if gold.cats['CRIME'] > gold.cats['IRRELEVANT'] else 'IRRELEVANT',
        IRRELEVANT=pred.cats['IRRELEVANT'],
        CRIME=pred.cats['CRIME'],
    ))

# One row per document, plus boolean confusion-matrix flags. A document is
# predicted CRIME when its CRIME score is strictly greater than its IRRELEVANT score.
preds = pd.DataFrame.from_records(records).assign(
    tp=lambda df: (df['CRIME'] > df['IRRELEVANT']) & (df['label'] == 'CRIME'),
    fp=lambda df: (df['CRIME'] > df['IRRELEVANT']) & (df['label'] != 'CRIME'),
    tn=lambda df: (df['CRIME'] <= df['IRRELEVANT']) & (df['label'] != 'CRIME'),
    fn=lambda df: (df['CRIME'] <= df['IRRELEVANT']) & (df['label'] == 'CRIME'),
)
preds

# Conclusions

At this point I'm guessing hyperparameter tuning won't help the model.
It would be better to either label more data and try again,
or move on to the next part, maybe re-optimizing in the interim for
precision to reduce the number of sentence labels I have to manually throw out.