In [1]:
import os
from git import Repo
from pathlib import Path

import pandas as pd
import numpy as np
import secrets
import spacy
from spacy.util import load_config
from spacy.cli import apply
from spacy.scorer import Scorer, Example
from spacy.tokens import DocBin, Doc
import mlflow
from mlflow.entities import ViewType
import optuna

import scripts.sent_relevance.operations as ops
from scripts.utils.config import Config
from scripts.utils import flatten_config
from scripts.utils.spacy import load_spacy, train, load_metrics
from scripts.utils.optuna import ArchiveBestModelCallback

# Setup MLFLow

In [2]:
experiment_name = "sent_relevance_models"
task = "sent_relevance"

# Provide an Experiment description that will appear in the UI
experiment_description = (
    "Prototype model architectures for sentence relevance classifier."
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "qjn",
    "task": task,
    "mlflow.note.content": experiment_description,
}

hyperparams = set([])

In [3]:
mlflow.set_tracking_uri("http://127.0.0.1:8080")
mlflow.set_experiment(experiment_name)
mlflow.set_experiment_tags(experiment_tags)

2025-03-20 13:41:12,083 - urllib3.connectionpool - DEBUG - Starting new HTTP connection (1): 127.0.0.1:8080
2025-03-20 13:41:12,111 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/experiments/get-by-name?experiment_name=sent_relevance_models HTTP/1.1" 404 115
2025-03-20 13:41:12,117 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:41:12,143 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/experiments/create HTTP/1.1" 200 43
2025/03/20 13:41:12 INFO mlflow.tracking.fluent: Experiment with name 'sent_relevance_models' does not exist. Creating a new experiment.
2025-03-20 13:41:12,152 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:41:12,159 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/experiments/get?experiment_id=485071347047209202 HTTP/1.1" 200 296
2025-03-20 13:41:12,165 - urllib3.connectionpool - DEBUG - Resettin

In [4]:
def mlflow_log_eval(run_name, params, model_path, nested=False):
    
    # Evaluate model
    metrics = load_metrics(model_path)
    best_model_path = os.path.join(model_path, "model-best")

    # Load model params
    repo = Repo(config._LOCAL_PROJECT_DIR, search_parent_directories=True)
    params['git_hash'] = repo.heads.main.commit.hexsha

    # Reshape params for logging
    params = flatten_config(params)
    # XXX: This migth have a bug converting children to strings?
    params = {k.replace("@","_AT_"): v for k,v in params.items()}

    with mlflow.start_run(run_name=run_name, nested=nested) as run:
        mlflow.log_params(params)
        mlflow.log_metrics(metrics)
        mlflow.spacy.log_model(load_spacy(best_model_path), run_name)

    return metrics

# Setup Experiments

In [5]:
config = Config()
train_path = config.get_data_path(f"{task}.article_text_train")
dev_path = config.get_data_path(f"{task}.article_text_dev")
base_cfg = config.get_file_path(f"{task}.base_cfg")
full_cfg = config.get_file_path(f"{task}.full_cfg")
out_path = config.get_file_path(f"{task}.trained_model")
out_path_scratch = config.get_file_path(f"{task}.trained_model", scratch=True)

# Quickstart Model

In [6]:
train(train_path, dev_path, full_cfg, out_path)

params = dict(load_config(full_cfg).interpolate())
mlflow_log_eval("quickstart_model", params, out_path)

[38;5;4mℹ Saving to output directory:
/Users/eric/Dev/quantify-news/models/sent_relevance[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       71.43    0.71
  0     200          39.32       60.44    0.60
  2     400          24.72       66.08    0.66
  3     600          16.28       73.09    0.73
  5     800          10.69       77.53    0.78
  7    1000           6.50       80.40    0.80
 10    1200           4.50       81.27    0.81
 13    1400           2.95       82.51    0.83
 17    1600           2.05       82.20    0.82
 23    1800           1.43       82.91    0.83
 29    2000           0.99       83.05    0.83
 37    2200           0.70       82.98    0.83
 48    2400           0.50       83.31    0.83
 60    2600           0.36       83.06  

2025-03-20 13:42:26,532 - git.util - DEBUG - Failed checking if running in CYGWIN due to: FileNotFoundError(2, 'No such file or directory')
2025-03-20 13:42:26,538 - git.cmd - DEBUG - Popen(['git', 'cat-file', '--batch-check'], cwd=/Users/eric/Dev/quantify-news, stdin=<valid stream>, shell=False, universal_newlines=False)


Training time: 73.9s


2025-03-20 13:42:26,561 - git.cmd - DEBUG - Popen(['git', 'check-ignore', '/Users/eric/Dev/quantify-news/.venv/lib/python3.12/site-packages'], cwd=/Users/eric/Dev/quantify-news, stdin=None, shell=False, universal_newlines=False)
2025-03-20 13:42:26,588 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:42:26,609 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 962
2025-03-20 13:42:26,619 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:42:26,675 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:42:26,685 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:42:26,712 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:42:35,783 - urllib3.connectionpool - DEBUG - Resetting drop

🏃 View run quickstart_model at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/bba5128d3d054bf09b3267803d05f33e
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202


{'cats_micro_p': 0.8260869565,
 'cats_micro_r': 0.7169811321,
 'cats_micro_f': 0.7676767677,
 'cats_macro_p': 0.6993103448,
 'cats_macro_r': 0.4926436782,
 'cats_macro_f': 0.5532451819,
 'cats_f_per_type.WHO.p': 1.0,
 'cats_f_per_type.WHO.r': 0.4,
 'cats_f_per_type.WHO.f': 0.5714285714,
 'cats_f_per_type.WHAT.p': 0.6,
 'cats_f_per_type.WHAT.r': 0.6666666667,
 'cats_f_per_type.WHAT.f': 0.6315789474,
 'cats_f_per_type.WHERE.p': 1.0,
 'cats_f_per_type.WHERE.r': 0.5,
 'cats_f_per_type.WHERE.f': 0.6666666667,
 'cats_f_per_type.WHEN.p': 0.0,
 'cats_f_per_type.WHEN.r': 0.0,
 'cats_f_per_type.WHEN.f': 0.0,
 'cats_f_per_type.IRRELEVANT.p': 0.8965517241,
 'cats_f_per_type.IRRELEVANT.r': 0.8965517241,
 'cats_f_per_type.IRRELEVANT.f': 0.8965517241,
 'cats_score': 0.8350646821}

# Hyperparams

In [7]:
def objective_base(trial, overrides = {}):
    print("Training with overrides:\n", overrides)
    ops.train(base_cfg, full_cfg, train_path, dev_path, out_path_scratch, overrides)
    
    # Train will keep base config and apply overrides at run-time.
    # So we load the config with the overrides for logging.
    params = dict(load_config(full_cfg, overrides).interpolate())

    run_name = f"optuna_trial_{trial.number}"
    metrics = mlflow_log_eval(run_name, params, out_path_scratch, nested=True)

    return metrics['cats_macro_f']

In [8]:
study = optuna.create_study(study_name=experiment_tags['task'],
                            direction="maximize",
                            storage=config.get_param(f"{task}.optuna_db"),
                            load_if_exists=True)

[I 2025-03-20 13:42:37,372] Using an existing study with name 'sent_relevance' instead of creating a new one.


In [9]:
archiver = ArchiveBestModelCallback(out_path=out_path, out_path_scratch=out_path_scratch)

In [10]:
def get_best(): 
    best = mlflow.search_runs(
        experiment_names=[experiment_name],
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=1,
        order_by=["metrics.cats_macro_f DESC"],
    ).iloc[0].T
    return best

def best_metrics():
    best = get_best()
    return best[best.index.str.startswith("metrics")].to_dict()

def best_params(keys):
    best = get_best()
    return {key: best.loc["params." + key] for key in keys}
    
def best_model():
    best = get_best()
    return os.path.join(best['artifact_uri'], best['tags.mlflow.runName'], 'model.spacy')

## Start Size

In [11]:
hp = "training.batcher.size.start"
hyperparams.add(hp)

def objective(trial):
    hp_start_size = trial.suggest_int(hp, 1, 100)  # Tune batch start
    overrides = {hp: hp_start_size}
    return objective_base(trial, overrides)
    

In [12]:
# Note: as currently configured, the optuna_db goes into the caller notebook folder
with mlflow.start_run(run_name="opt_batch_start_size"):
    study.optimize(objective, n_trials=10, callbacks=[archiver])

2025-03-20 13:42:37,887 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:42:37,916 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 970


Training with overrides:
 {'training.batcher.size.start': 30}
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/Users/eric/Dev/quantify-news/scripts/sent_relevance/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory:
/Users/eric/Dev/quantify-news/models/sent_relevance[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       71.43    0.71
  0     200          39.09       59.34    0.59
  1     400          26.06       67.08    0.67
  3     600          16.77       74.07    0.74
  4     800          10.77       76.57    0.77
  6    1000           7.04       80.23    0.80
  9    1200      

2025-03-20 13:44:02,312 - git.cmd - DEBUG - Popen(['git', 'cat-file', '--batch-check'], cwd=/Users/eric/Dev/quantify-news, stdin=<valid stream>, shell=False, universal_newlines=False)


Training time: 82.3s


2025-03-20 13:44:02,338 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:44:02,361 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 1074
2025-03-20 13:44:02,371 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:44:02,442 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:44:02,453 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:44:02,480 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:44:09,692 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:44:09,721 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/runs/get?run_uuid=ce5383ad15b243cb9cf666a8d0dc6017&run_id=ce5383ad15b243cb9cf666a8d0dc6017 HTTP/1.1"

🏃 View run optuna_trial_32 at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/ce5383ad15b243cb9cf666a8d0dc6017
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202
Training with overrides:
 {'training.batcher.size.start': 30}
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/Users/eric/Dev/quantify-news/scripts/sent_relevance/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory:
/Users/eric/Dev/quantify-news/models/sent_relevance[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       71.43    0.71
  0     200          39.09       59.34    0.59
  1 

2025-03-20 13:45:33,990 - git.cmd - DEBUG - Popen(['git', 'cat-file', '--batch-check'], cwd=/Users/eric/Dev/quantify-news, stdin=<valid stream>, shell=False, universal_newlines=False)
2025-03-20 13:45:34,016 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1


Training time: 82.0s


2025-03-20 13:45:34,041 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 1074
2025-03-20 13:45:34,052 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:45:34,130 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:45:34,142 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:45:34,171 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:45:41,952 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:45:41,984 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/runs/get?run_uuid=f9fef3649e5b43ed8367af618c698a09&run_id=f9fef3649e5b43ed8367af618c698a09 HTTP/1.1" 200 12950
2025-03-20 13:45:42,011 - urllib3.connectionpool - DEBUG - Resetting dropped connection:

🏃 View run optuna_trial_33 at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/f9fef3649e5b43ed8367af618c698a09
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202
Training with overrides:
 {'training.batcher.size.start': 3}
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/Users/eric/Dev/quantify-news/scripts/sent_relevance/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory:
/Users/eric/Dev/quantify-news/models/sent_relevance[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       71.43    0.71
  0     200          40.19       60.95    0.61
  1  

2025-03-20 13:46:26,944 - git.cmd - DEBUG - Popen(['git', 'cat-file', '--batch-check'], cwd=/Users/eric/Dev/quantify-news, stdin=<valid stream>, shell=False, universal_newlines=False)


Training time: 42.7s


2025-03-20 13:46:26,974 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:46:27,020 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 1074
2025-03-20 13:46:27,044 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:46:27,132 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:46:27,143 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:46:27,173 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:46:34,602 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:46:34,633 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/runs/get?run_uuid=2a70319755bf4f76a3e5a7fd4435e655&run_id=2a70319755bf4f76a3e5a7fd4435e655 HTTP/1.1"

🏃 View run optuna_trial_34 at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/2a70319755bf4f76a3e5a7fd4435e655
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202
Training with overrides:
 {'training.batcher.size.start': 38}
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/Users/eric/Dev/quantify-news/scripts/sent_relevance/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory:
/Users/eric/Dev/quantify-news/models/sent_relevance[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       69.74    0.70
  1     200          38.24       59.79    0.60
  2 

2025-03-20 13:47:32,722 - git.cmd - DEBUG - Popen(['git', 'cat-file', '--batch-check'], cwd=/Users/eric/Dev/quantify-news, stdin=<valid stream>, shell=False, universal_newlines=False)


Training time: 55.8s


2025-03-20 13:47:32,756 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:47:32,794 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 1074
2025-03-20 13:47:32,804 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:47:32,882 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:47:32,893 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:47:32,924 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:47:40,511 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:47:40,550 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/runs/get?run_uuid=ea7cd88c85fd47e99ff97e348eb5f61c&run_id=ea7cd88c85fd47e99ff97e348eb5f61c HTTP/1.1"

🏃 View run optuna_trial_35 at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/ea7cd88c85fd47e99ff97e348eb5f61c
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202
Training with overrides:
 {'training.batcher.size.start': 17}
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/Users/eric/Dev/quantify-news/scripts/sent_relevance/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory:
/Users/eric/Dev/quantify-news/models/sent_relevance[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       71.43    0.71
  0     200          40.32       59.18    0.59
  1 

2025-03-20 13:48:59,133 - git.cmd - DEBUG - Popen(['git', 'cat-file', '--batch-check'], cwd=/Users/eric/Dev/quantify-news, stdin=<valid stream>, shell=False, universal_newlines=False)
2025-03-20 13:48:59,163 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1


Training time: 76.3s


2025-03-20 13:48:59,198 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 1074
2025-03-20 13:48:59,210 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:48:59,286 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:48:59,297 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:48:59,320 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:49:06,974 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:49:07,009 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/runs/get?run_uuid=aa599633b34045ccb2dfbbcac1256a2b&run_id=aa599633b34045ccb2dfbbcac1256a2b HTTP/1.1" 200 12950
2025-03-20 13:49:07,037 - urllib3.connectionpool - DEBUG - Resetting dropped connection:

🏃 View run optuna_trial_36 at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/aa599633b34045ccb2dfbbcac1256a2b
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202
Training with overrides:
 {'training.batcher.size.start': 38}
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/Users/eric/Dev/quantify-news/scripts/sent_relevance/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory:
/Users/eric/Dev/quantify-news/models/sent_relevance[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       69.74    0.70
  1     200          38.24       59.79    0.60
  2 

2025-03-20 13:50:08,332 - git.cmd - DEBUG - Popen(['git', 'cat-file', '--batch-check'], cwd=/Users/eric/Dev/quantify-news, stdin=<valid stream>, shell=False, universal_newlines=False)
2025-03-20 13:50:08,361 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1


Training time: 58.1s


2025-03-20 13:50:08,391 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 1074
2025-03-20 13:50:08,401 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:50:08,588 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:50:08,597 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:50:08,622 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:50:16,164 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:50:16,196 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/runs/get?run_uuid=5e3ab35f619d48a9b9cf303fdbbf1d85&run_id=5e3ab35f619d48a9b9cf303fdbbf1d85 HTTP/1.1" 200 12958
2025-03-20 13:50:16,230 - urllib3.connectionpool - DEBUG - Resetting dropped connection:

🏃 View run optuna_trial_37 at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/5e3ab35f619d48a9b9cf303fdbbf1d85
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202
Training with overrides:
 {'training.batcher.size.start': 44}
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/Users/eric/Dev/quantify-news/scripts/sent_relevance/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory:
/Users/eric/Dev/quantify-news/models/sent_relevance[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       69.74    0.70
  1     200          36.35       61.57    0.62
  2 

2025-03-20 13:51:33,556 - git.cmd - DEBUG - Popen(['git', 'cat-file', '--batch-check'], cwd=/Users/eric/Dev/quantify-news, stdin=<valid stream>, shell=False, universal_newlines=False)


Training time: 74.9s


2025-03-20 13:51:33,608 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:51:33,647 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 1074
2025-03-20 13:51:33,660 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:51:33,740 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:51:33,753 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:51:33,825 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:51:41,796 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:51:41,826 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/runs/get?run_uuid=a32ba84ae9d34b98803b4bec793e420e&run_id=a32ba84ae9d34b98803b4bec793e420e HTTP/1.1"

🏃 View run optuna_trial_38 at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/a32ba84ae9d34b98803b4bec793e420e
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202
Training with overrides:
 {'training.batcher.size.start': 17}
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/Users/eric/Dev/quantify-news/scripts/sent_relevance/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory:
/Users/eric/Dev/quantify-news/models/sent_relevance[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       71.43    0.71
  0     200          40.32       59.18    0.59
  1 

2025-03-20 13:52:58,491 - git.cmd - DEBUG - Popen(['git', 'cat-file', '--batch-check'], cwd=/Users/eric/Dev/quantify-news, stdin=<valid stream>, shell=False, universal_newlines=False)


Training time: 74.3s


2025-03-20 13:52:58,536 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:52:58,589 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 1074
2025-03-20 13:52:58,599 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:52:58,688 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:52:58,696 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:52:58,751 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:53:07,157 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:53:07,211 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/runs/get?run_uuid=0fca877effa8438daa92807c6a2b1209&run_id=0fca877effa8438daa92807c6a2b1209 HTTP/1.1"

🏃 View run optuna_trial_39 at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/0fca877effa8438daa92807c6a2b1209
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202
Training with overrides:
 {'training.batcher.size.start': 10}
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/Users/eric/Dev/quantify-news/scripts/sent_relevance/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory:
/Users/eric/Dev/quantify-news/models/sent_relevance[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       71.43    0.71
  0     200          40.21       60.24    0.60
  1 

2025-03-20 13:54:19,326 - git.cmd - DEBUG - Popen(['git', 'cat-file', '--batch-check'], cwd=/Users/eric/Dev/quantify-news, stdin=<valid stream>, shell=False, universal_newlines=False)


Training time: 69.3s


2025-03-20 13:54:19,370 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:54:19,408 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 1074
2025-03-20 13:54:19,422 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:54:19,495 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:54:19,507 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:54:19,530 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:54:27,444 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:54:27,489 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/runs/get?run_uuid=c1095f3fdda14487988173bad7746741&run_id=c1095f3fdda14487988173bad7746741 HTTP/1.1"

🏃 View run optuna_trial_40 at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/c1095f3fdda14487988173bad7746741
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202
Training with overrides:
 {'training.batcher.size.start': 26}
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/Users/eric/Dev/quantify-news/scripts/sent_relevance/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory:
/Users/eric/Dev/quantify-news/models/sent_relevance[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       71.43    0.71
  0     200          38.99       60.07    0.60
  1 

2025-03-20 13:55:39,070 - git.cmd - DEBUG - Popen(['git', 'cat-file', '--batch-check'], cwd=/Users/eric/Dev/quantify-news, stdin=<valid stream>, shell=False, universal_newlines=False)
2025-03-20 13:55:39,122 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:55:39,159 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 1074
2025-03-20 13:55:39,172 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:55:39,242 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:55:39,253 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1


Training time: 69.0s


2025-03-20 13:55:39,291 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:55:48,856 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:55:48,910 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/runs/get?run_uuid=38e32c1f943c4c5dbd82ae1d8c63d5c1&run_id=38e32c1f943c4c5dbd82ae1d8c63d5c1 HTTP/1.1" 200 12950
2025-03-20 13:55:48,946 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:55:48,956 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-model HTTP/1.1" 200 2
2025-03-20 13:55:48,984 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:55:49,025 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/runs/get?run_uuid=38e32c1f943c4c5dbd82ae1d8c63d5c1&run_id=38e32c1f943c4c5dbd82ae1d8c63d5c1 HTTP/1.1" 200 13533
2025-03-2

🏃 View run optuna_trial_41 at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/38e32c1f943c4c5dbd82ae1d8c63d5c1
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202
🏃 View run opt_batch_start_size at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/06bd48db33a245e2a747dcda59201d30
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202


## Vector source

In [13]:
hp = "paths.vectors"
hyperparams.add(hp)

def objective(trial):
    base_model = trial.suggest_categorical(hp, ["en_core_web_sm", "en_core_web_md"])
    overrides = {hp: base_model} | best_params(hyperparams)
    return objective_base(trial, overrides)
    

In [14]:
# Note: as currently configured, the optuna_db goes into the caller of create_study ie the notebook
with mlflow.start_run(run_name="opt_base_model"):
    study.optimize(objective, n_trials=2, callbacks=[archiver])

2025-03-20 13:55:49,571 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:55:49,599 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 958
2025-03-20 13:55:49,653 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:55:49,674 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/experiments/get-by-name?experiment_name=sent_relevance_models HTTP/1.1" 200 600
2025-03-20 13:55:49,683 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:55:50,034 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/search HTTP/1.1" 200 14683


Training with overrides:
 {'paths.vectors': 'en_core_web_sm', 'training.batcher.size.start': '17'}
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/Users/eric/Dev/quantify-news/scripts/sent_relevance/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory:
/Users/eric/Dev/quantify-news/models/sent_relevance[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       71.43    0.71
  0     200          40.32       59.18    0.59
  1     400          25.98       62.87    0.63
  2     600          22.77       70.11    0.70
  3     800          15.34       73.57    0.74
  4    1000          10.67 

2025-03-20 13:57:05,728 - git.cmd - DEBUG - Popen(['git', 'cat-file', '--batch-check'], cwd=/Users/eric/Dev/quantify-news, stdin=<valid stream>, shell=False, universal_newlines=False)


Training time: 73.2s


2025-03-20 13:57:05,761 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:57:05,787 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 1074
2025-03-20 13:57:05,798 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:57:05,858 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:57:05,867 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:57:05,885 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:57:13,800 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:57:13,850 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/runs/get?run_uuid=a914aa71fe264a62a78d40ebec0365be&run_id=a914aa71fe264a62a78d40ebec0365be HTTP/1.1"

🏃 View run optuna_trial_42 at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/a914aa71fe264a62a78d40ebec0365be
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202


2025-03-20 13:57:14,352 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/search HTTP/1.1" 200 14683


Training with overrides:
 {'paths.vectors': 'en_core_web_sm', 'training.batcher.size.start': '17'}
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/Users/eric/Dev/quantify-news/scripts/sent_relevance/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory:
/Users/eric/Dev/quantify-news/models/sent_relevance[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       71.43    0.71
  0     200          40.32       59.18    0.59
  1     400          25.98       62.87    0.63
  2     600          22.77       70.11    0.70
  3     800          15.34       73.57    0.74
  4    1000          10.67 

2025-03-20 13:58:27,431 - git.cmd - DEBUG - Popen(['git', 'cat-file', '--batch-check'], cwd=/Users/eric/Dev/quantify-news, stdin=<valid stream>, shell=False, universal_newlines=False)
2025-03-20 13:58:27,463 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1


Training time: 70.7s


2025-03-20 13:58:27,494 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 1074
2025-03-20 13:58:27,506 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:58:27,573 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:58:27,582 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:58:27,606 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:58:35,539 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:58:35,574 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/runs/get?run_uuid=69f84d27710c4cd7a5846b00fd9622ac&run_id=69f84d27710c4cd7a5846b00fd9622ac HTTP/1.1" 200 12950
2025-03-20 13:58:35,600 - urllib3.connectionpool - DEBUG - Resetting dropped connection:

🏃 View run optuna_trial_43 at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/69f84d27710c4cd7a5846b00fd9622ac
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202
🏃 View run opt_base_model at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/9f95f7377df34fc98e6530ea4349577a
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202


## Bow Length


In [15]:
hp = "components.textcat_multilabel.model.length"
hyperparams.add(hp)

def objective(trial: optuna.trial.Trial):
    length = trial.suggest_int(hp, 1, 18)
    overrides = ({hp: 2**length} |
                best_params(hyperparams))
    return objective_base(trial, overrides)

In [16]:
with mlflow.start_run(run_name="opt_linear_length"):
    study.optimize(objective, n_trials=9, callbacks=[archiver])

2025-03-20 13:58:36,093 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:58:36,116 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 964
2025-03-20 13:58:36,163 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:58:36,172 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/experiments/get-by-name?experiment_name=sent_relevance_models HTTP/1.1" 200 600
2025-03-20 13:58:36,179 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:58:36,508 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/search HTTP/1.1" 200 14683


Training with overrides:
 {'components.textcat_multilabel.model.length': '262144', 'training.batcher.size.start': '17', 'paths.vectors': 'en_core_web_sm'}
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/Users/eric/Dev/quantify-news/scripts/sent_relevance/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory:
/Users/eric/Dev/quantify-news/models/sent_relevance[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       71.43    0.71
  0     200          40.32       59.18    0.59
  1     400          25.98       62.87    0.63
  2     600          22.77       70.11    0.70
  3     800       

2025-03-20 13:59:51,191 - git.cmd - DEBUG - Popen(['git', 'cat-file', '--batch-check'], cwd=/Users/eric/Dev/quantify-news, stdin=<valid stream>, shell=False, universal_newlines=False)


Training time: 72.1s


2025-03-20 13:59:51,230 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:59:51,260 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 1074
2025-03-20 13:59:51,274 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:59:51,350 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:59:51,360 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:59:51,378 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 13:59:59,397 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 13:59:59,434 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/runs/get?run_uuid=f79dae01650849d4997098fb9853af19&run_id=f79dae01650849d4997098fb9853af19 HTTP/1.1"

🏃 View run optuna_trial_44 at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/f79dae01650849d4997098fb9853af19
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202


2025-03-20 13:59:59,964 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/search HTTP/1.1" 200 14683


Training with overrides:
 {'components.textcat_multilabel.model.length': '262144', 'training.batcher.size.start': '17', 'paths.vectors': 'en_core_web_sm'}
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/Users/eric/Dev/quantify-news/scripts/sent_relevance/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory:
/Users/eric/Dev/quantify-news/models/sent_relevance[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       71.43    0.71
  0     200          40.32       59.18    0.59
  1     400          25.98       62.87    0.63
  2     600          22.77       70.11    0.70
  3     800       

2025-03-20 14:01:14,691 - git.cmd - DEBUG - Popen(['git', 'cat-file', '--batch-check'], cwd=/Users/eric/Dev/quantify-news, stdin=<valid stream>, shell=False, universal_newlines=False)


Training time: 72.4s


2025-03-20 14:01:14,728 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:01:14,769 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 1074
2025-03-20 14:01:14,780 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:01:14,852 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 14:01:14,863 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:01:14,885 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 14:01:23,945 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:01:23,984 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/runs/get?run_uuid=242d54f7cfa24e578878ac90cd7d5a79&run_id=242d54f7cfa24e578878ac90cd7d5a79 HTTP/1.1"

🏃 View run optuna_trial_45 at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/242d54f7cfa24e578878ac90cd7d5a79
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202


2025-03-20 14:01:24,961 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/search HTTP/1.1" 200 14683


Training with overrides:
 {'components.textcat_multilabel.model.length': '262144', 'training.batcher.size.start': '17', 'paths.vectors': 'en_core_web_sm'}
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/Users/eric/Dev/quantify-news/scripts/sent_relevance/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory:
/Users/eric/Dev/quantify-news/models/sent_relevance[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       71.43    0.71
  0     200          40.32       59.18    0.59
  1     400          25.98       62.87    0.63
  2     600          22.77       70.11    0.70
  3     800       

2025-03-20 14:02:37,775 - git.cmd - DEBUG - Popen(['git', 'cat-file', '--batch-check'], cwd=/Users/eric/Dev/quantify-news, stdin=<valid stream>, shell=False, universal_newlines=False)


Training time: 70.3s


2025-03-20 14:02:37,841 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:02:38,006 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 1074
2025-03-20 14:02:38,023 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:02:38,099 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 14:02:38,111 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:02:38,134 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 14:02:47,062 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:02:47,139 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/runs/get?run_uuid=bb495a2fc38241ecbed70aa8bc6e2bda&run_id=bb495a2fc38241ecbed70aa8bc6e2bda HTTP/1.1"

🏃 View run optuna_trial_46 at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/bb495a2fc38241ecbed70aa8bc6e2bda
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202


2025-03-20 14:02:48,197 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/search HTTP/1.1" 200 14683


Training with overrides:
 {'components.textcat_multilabel.model.length': '262144', 'training.batcher.size.start': '17', 'paths.vectors': 'en_core_web_sm'}
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/Users/eric/Dev/quantify-news/scripts/sent_relevance/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory:
/Users/eric/Dev/quantify-news/models/sent_relevance[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       71.43    0.71
  0     200          40.32       59.18    0.59
  1     400          25.98       62.87    0.63
  2     600          22.77       70.11    0.70
  3     800       

2025-03-20 14:04:00,948 - git.cmd - DEBUG - Popen(['git', 'cat-file', '--batch-check'], cwd=/Users/eric/Dev/quantify-news, stdin=<valid stream>, shell=False, universal_newlines=False)
2025-03-20 14:04:00,985 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1


Training time: 70.2s


2025-03-20 14:04:01,020 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 1074
2025-03-20 14:04:01,031 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:04:01,160 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 14:04:01,170 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:04:01,195 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 14:04:09,344 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:04:09,377 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/runs/get?run_uuid=55f8c6a2e976464293a80822469a6d42&run_id=55f8c6a2e976464293a80822469a6d42 HTTP/1.1" 200 12950
2025-03-20 14:04:09,409 - urllib3.connectionpool - DEBUG - Resetting dropped connection:

🏃 View run optuna_trial_47 at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/55f8c6a2e976464293a80822469a6d42
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202


2025-03-20 14:04:10,362 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/search HTTP/1.1" 200 14683


Training with overrides:
 {'components.textcat_multilabel.model.length': '262144', 'training.batcher.size.start': '17', 'paths.vectors': 'en_core_web_sm'}
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/Users/eric/Dev/quantify-news/scripts/sent_relevance/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory:
/Users/eric/Dev/quantify-news/models/sent_relevance[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       71.43    0.71
  0     200          40.32       59.18    0.59
  1     400          25.98       62.87    0.63
  2     600          22.77       70.11    0.70
  3     800       

2025-03-20 14:05:27,875 - git.cmd - DEBUG - Popen(['git', 'cat-file', '--batch-check'], cwd=/Users/eric/Dev/quantify-news, stdin=<valid stream>, shell=False, universal_newlines=False)


Training time: 75.1s


2025-03-20 14:05:27,944 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:05:27,983 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 1074
2025-03-20 14:05:27,997 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:05:28,082 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 14:05:28,094 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:05:28,123 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 14:05:37,717 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:05:37,757 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/runs/get?run_uuid=273f29122a3844cea00daa0a7948d249&run_id=273f29122a3844cea00daa0a7948d249 HTTP/1.1"

🏃 View run optuna_trial_48 at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/273f29122a3844cea00daa0a7948d249
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202


2025-03-20 14:05:38,210 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:05:39,487 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/search HTTP/1.1" 200 14683


Training with overrides:
 {'components.textcat_multilabel.model.length': '262144', 'training.batcher.size.start': '17', 'paths.vectors': 'en_core_web_sm'}
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/Users/eric/Dev/quantify-news/scripts/sent_relevance/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory:
/Users/eric/Dev/quantify-news/models/sent_relevance[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       71.43    0.71
  0     200          40.32       59.18    0.59
  1     400          25.98       62.87    0.63
  2     600          22.77       70.11    0.70
  3     800       

2025-03-20 14:06:56,853 - git.cmd - DEBUG - Popen(['git', 'cat-file', '--batch-check'], cwd=/Users/eric/Dev/quantify-news, stdin=<valid stream>, shell=False, universal_newlines=False)


Training time: 74.3s


2025-03-20 14:06:56,889 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:06:56,928 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 1074
2025-03-20 14:06:56,938 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:06:57,149 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 14:06:57,158 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:06:57,180 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 14:07:04,929 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:07:04,964 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/runs/get?run_uuid=8e5129b471f94bf783ef6ab770110a74&run_id=8e5129b471f94bf783ef6ab770110a74 HTTP/1.1"

🏃 View run optuna_trial_49 at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/8e5129b471f94bf783ef6ab770110a74
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202


2025-03-20 14:07:05,623 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/search HTTP/1.1" 200 14683


Training with overrides:
 {'components.textcat_multilabel.model.length': '262144', 'training.batcher.size.start': '17', 'paths.vectors': 'en_core_web_sm'}
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/Users/eric/Dev/quantify-news/scripts/sent_relevance/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory:
/Users/eric/Dev/quantify-news/models/sent_relevance[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       71.43    0.71
  0     200          40.32       59.18    0.59
  1     400          25.98       62.87    0.63
  2     600          22.77       70.11    0.70
  3     800       

2025-03-20 14:08:05,464 - git.cmd - DEBUG - Popen(['git', 'cat-file', '--batch-check'], cwd=/Users/eric/Dev/quantify-news, stdin=<valid stream>, shell=False, universal_newlines=False)


Training time: 57.6s


2025-03-20 14:08:05,489 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:08:05,514 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 1074
2025-03-20 14:08:05,522 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:08:05,565 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 14:08:05,570 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:08:05,584 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 14:08:10,863 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:08:10,884 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/runs/get?run_uuid=9c2eb558cef249119e774836e6c06932&run_id=9c2eb558cef249119e774836e6c06932 HTTP/1.1"

🏃 View run optuna_trial_50 at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/9c2eb558cef249119e774836e6c06932
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202


2025-03-20 14:08:11,339 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/search HTTP/1.1" 200 14683


Training with overrides:
 {'components.textcat_multilabel.model.length': '262144', 'training.batcher.size.start': '17', 'paths.vectors': 'en_core_web_sm'}
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/Users/eric/Dev/quantify-news/scripts/sent_relevance/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory:
/Users/eric/Dev/quantify-news/models/sent_relevance[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       71.43    0.71
  0     200          40.32       59.18    0.59
  1     400          25.98       62.87    0.63
  2     600          22.77       70.11    0.70
  3     800       

2025-03-20 14:09:02,487 - git.cmd - DEBUG - Popen(['git', 'cat-file', '--batch-check'], cwd=/Users/eric/Dev/quantify-news, stdin=<valid stream>, shell=False, universal_newlines=False)
2025-03-20 14:09:02,512 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1


Training time: 49.7s


2025-03-20 14:09:02,533 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 1074
2025-03-20 14:09:02,542 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:09:02,590 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 14:09:02,597 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:09:02,612 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 14:09:07,719 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:09:07,743 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/runs/get?run_uuid=0be6a60bb952444f93b0ac8b41911bff&run_id=0be6a60bb952444f93b0ac8b41911bff HTTP/1.1" 200 12950
2025-03-20 14:09:07,763 - urllib3.connectionpool - DEBUG - Resetting dropped connection:

🏃 View run optuna_trial_51 at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/0be6a60bb952444f93b0ac8b41911bff
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202


2025-03-20 14:09:08,212 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/search HTTP/1.1" 200 14683


Training with overrides:
 {'components.textcat_multilabel.model.length': '262144', 'training.batcher.size.start': '17', 'paths.vectors': 'en_core_web_sm'}
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/Users/eric/Dev/quantify-news/scripts/sent_relevance/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory:
/Users/eric/Dev/quantify-news/models/sent_relevance[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       71.43    0.71
  0     200          40.32       59.18    0.59
  1     400          25.98       62.87    0.63
  2     600          22.77       70.11    0.70
  3     800       

2025-03-20 14:10:05,079 - git.cmd - DEBUG - Popen(['git', 'cat-file', '--batch-check'], cwd=/Users/eric/Dev/quantify-news, stdin=<valid stream>, shell=False, universal_newlines=False)


Training time: 55.4s


2025-03-20 14:10:05,115 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:10:05,152 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 1074
2025-03-20 14:10:05,163 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:10:05,221 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 14:10:05,228 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:10:05,250 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 14:10:11,711 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:10:11,740 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/runs/get?run_uuid=3d57811e15fe4bca82c23081b0a1bfb1&run_id=3d57811e15fe4bca82c23081b0a1bfb1 HTTP/1.1"

🏃 View run optuna_trial_52 at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/3d57811e15fe4bca82c23081b0a1bfb1
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202
🏃 View run opt_linear_length at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/e7f61d93fa0a4b56ad90f118eb6633f3
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202


# NGram

In [17]:
hp = "components.textcat_multilabel.model.ngram_size"
hyperparams.add(hp)

def objective(trial: optuna.trial.Trial):
    length = trial.suggest_int(hp, 1, 2)
    overrides = ({hp: length} | best_params(hyperparams))
    return objective_base(trial, overrides)

In [18]:
with mlflow.start_run(run_name="opt_ngram"):
    study.optimize(objective, n_trials=2, callbacks=[archiver])

2025-03-20 14:10:12,220 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:10:12,237 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 948
2025-03-20 14:10:12,269 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:10:12,277 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/experiments/get-by-name?experiment_name=sent_relevance_models HTTP/1.1" 200 600
2025-03-20 14:10:12,283 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:10:12,643 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/search HTTP/1.1" 200 14683


Training with overrides:
 {'components.textcat_multilabel.model.ngram_size': '1', 'training.batcher.size.start': '17', 'paths.vectors': 'en_core_web_sm', 'components.textcat_multilabel.model.length': '262144'}
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/Users/eric/Dev/quantify-news/scripts/sent_relevance/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory:
/Users/eric/Dev/quantify-news/models/sent_relevance[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       71.43    0.71
  0     200          40.32       59.18    0.59
  1     400          25.98       62.87    0.63
  2     60

2025-03-20 14:11:10,466 - git.cmd - DEBUG - Popen(['git', 'cat-file', '--batch-check'], cwd=/Users/eric/Dev/quantify-news, stdin=<valid stream>, shell=False, universal_newlines=False)
2025-03-20 14:11:10,498 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:11:10,522 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 1074
2025-03-20 14:11:10,530 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:11:10,576 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 14:11:10,584 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:11:10,602 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 14:11:16,600 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:11:1

🏃 View run optuna_trial_53 at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/9e284812e4974a19b50fc60175284c80
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202


2025-03-20 14:11:17,172 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/search HTTP/1.1" 200 14683


Training with overrides:
 {'components.textcat_multilabel.model.ngram_size': '1', 'training.batcher.size.start': '17', 'paths.vectors': 'en_core_web_sm', 'components.textcat_multilabel.model.length': '262144'}
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
/Users/eric/Dev/quantify-news/scripts/sent_relevance/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy
[38;5;4mℹ Saving to output directory:
/Users/eric/Dev/quantify-news/models/sent_relevance[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat_multilabel'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTC...  CATS_SCORE  SCORE 
---  ------  -------------  ----------  ------
  0       0           0.25       71.43    0.71
  0     200          40.32       59.18    0.59
  1     400          25.98       62.87    0.63
  2     60

2025-03-20 14:12:13,398 - git.cmd - DEBUG - Popen(['git', 'cat-file', '--batch-check'], cwd=/Users/eric/Dev/quantify-news, stdin=<valid stream>, shell=False, universal_newlines=False)
2025-03-20 14:12:13,429 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:12:13,452 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 1074
2025-03-20 14:12:13,461 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:12:13,570 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 14:12:13,577 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:12:13,592 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 14:12:19,474 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:12:1

🏃 View run optuna_trial_54 at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/88042ea6a2904452b7501fe9d25c3cb8
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202
🏃 View run opt_ngram at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/173b494a571a4b98a57c73b4a6234408
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202


# Null Model

In [19]:
class NullModel:

    def train(self, data_path: str):
        blank = spacy.blank("en")
        docs = DocBin().from_disk(data_path).get_docs(blank.vocab)
        probs = {}
        n_docs = 0
        for d in docs:
             n_docs += 1
             for c in d.cats:
                  probs[c] = probs.setdefault(c, 0) + d.cats[c]
        self.probs = {c: p / n_docs for c,p in probs.items()}
        self.n_docs = n_docs

    def eval(self, data_path):
        blank = spacy.blank("en")
        golds = list(DocBin().from_disk(data_path).get_docs(blank.vocab))
        rng = np.random.default_rng(seed=secrets.randbits(128))
        scores = pd.DataFrame.from_records([self.eval_trial(rng, golds) for _ in range(100)])
        scores = scores.select_dtypes('number')
        return scores.mean().to_dict()

    def eval_trial(self, rng, golds):
        preds = pd.concat([
            pd.Series(rng.choice(2, size=self.n_docs, p=[pc, 1-pc]), name=c)
            for c, pc in self.probs.items()
        ], axis=1)

        examples = []
        for row,gold in zip(preds.iterrows(), golds):
            d = Doc(gold.vocab).from_bytes(gold.to_bytes())
            for c in d.cats:
                d.cats[c] = row[1][c]
            examples.append(Example(d, gold))

        scorer = Scorer(default_pipeline=['textcat_multilabel'])
        return flatten_config(scorer.score_cats(examples, "cats", labels=preds.columns.to_list()))


In [20]:
null_model = NullModel()
null_model.train(train_path)
metrics = null_model.eval(dev_path)
with mlflow.start_run(run_name="null_model_expectation", nested=False) as run:
        mlflow.log_metrics(metrics)
        mlflow.log_params(null_model.probs)

2025-03-20 14:12:33,128 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:12:33,144 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 974
2025-03-20 14:12:33,150 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:12:33,168 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 14:12:33,174 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:12:33,182 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 200 2
2025-03-20 14:12:33,188 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:12:33,201 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/runs/get?run_uuid=38d3b3773f86499fbae95e2936c35a6d&run_id=38d3b3773f86499fbae95e2936c35a6d HTTP/1.1" 

🏃 View run null_model_expectation at: http://127.0.0.1:8080/#/experiments/485071347047209202/runs/38d3b3773f86499fbae95e2936c35a6d
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/485071347047209202


# Evaluate

## Params

In [21]:
best_params(hyperparams)

2025-03-20 14:12:33,290 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:12:33,296 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/experiments/get-by-name?experiment_name=sent_relevance_models HTTP/1.1" 200 600
2025-03-20 14:12:33,301 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:12:33,710 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/search HTTP/1.1" 200 14683


{'training.batcher.size.start': '17',
 'paths.vectors': 'en_core_web_sm',
 'components.textcat_multilabel.model.ngram_size': '1',
 'components.textcat_multilabel.model.length': '262144'}

### Metrics

In [22]:
best_metrics()

2025-03-20 14:12:33,792 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:12:33,799 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/experiments/get-by-name?experiment_name=sent_relevance_models HTTP/1.1" 200 600
2025-03-20 14:12:33,805 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:12:34,178 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/search HTTP/1.1" 200 14683


{'metrics.cats_macro_p': 0.6993103448,
 'metrics.cats_micro_r': 0.7169811321,
 'metrics.cats_macro_r': 0.4926436782,
 'metrics.cats_f_per_type.WHAT.f': 0.6315789474,
 'metrics.cats_f_per_type.WHEN.f': 0.0,
 'metrics.cats_macro_f': 0.5532451819,
 'metrics.cats_f_per_type.WHEN.p': 0.0,
 'metrics.cats_f_per_type.WHO.r': 0.4,
 'metrics.cats_f_per_type.WHERE.f': 0.6666666667,
 'metrics.cats_f_per_type.WHEN.r': 0.0,
 'metrics.cats_f_per_type.WHO.p': 1.0,
 'metrics.cats_f_per_type.WHERE.p': 1.0,
 'metrics.cats_f_per_type.WHERE.r': 0.5,
 'metrics.cats_score': 0.8371653214,
 'metrics.cats_f_per_type.IRRELEVANT.f': 0.8965517241,
 'metrics.cats_f_per_type.WHAT.r': 0.6666666667,
 'metrics.cats_f_per_type.IRRELEVANT.r': 0.8965517241,
 'metrics.cats_micro_p': 0.8260869565,
 'metrics.cats_f_per_type.IRRELEVANT.p': 0.8965517241,
 'metrics.cats_micro_f': 0.7676767677,
 'metrics.cats_f_per_type.WHO.f': 0.5714285714,
 'metrics.cats_f_per_type.WHAT.p': 0.6}

### Visualize Predictions

In [23]:
apply(data_path=Path(dev_path), 
      output_file=Path("./preds.spacy"), 
      model=best_model(), 
      json_field="text", 
      batch_size=1,
      n_process=1)

2025-03-20 14:12:34,348 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:12:34,359 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "GET /api/2.0/mlflow/experiments/get-by-name?experiment_name=sent_relevance_models HTTP/1.1" 200 600
2025-03-20 14:12:34,366 - urllib3.connectionpool - DEBUG - Resetting dropped connection: 127.0.0.1
2025-03-20 14:12:34,739 - urllib3.connectionpool - DEBUG - http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/search HTTP/1.1" 200 14683


[38;5;2m✔ Loaded model
/Users/eric/Dev/quantify-news/mlruns/485071347047209202/88042ea6a2904452b7501fe9d25c3cb8/artifacts/optuna_trial_54/model.spacy[0m


In [24]:
docs_pred = DocBin().from_disk("./preds.spacy")
docs_gold = DocBin().from_disk(dev_path)

In [28]:
compare = pd.DataFrame.from_records([({'text': gold.text} 
                           | {k+".x": v for k,v in gold.cats.items()} 
                           | {k+".y": 1 if v > .5 else 0 for k,v in pred.cats.items()} )
    for pred,gold in zip(docs_pred.get_docs(spacy.blank("en").vocab),
                        docs_gold.get_docs(spacy.blank("en").vocab))])
compare.sort_index(axis=1, ascending=False)

Unnamed: 0,text,WHO.y,WHO.x,WHERE.y,WHERE.x,WHEN.y,WHEN.x,WHAT.y,WHAT.x,IRRELEVANT.y,IRRELEVANT.x
0,"The other driver, a 23-year-old man, was cited...",0,1,0,0,0,0,0,1,1,0
1,"It states the boat ""shall not be operated wate...",0,0,0,0,0,0,0,0,1,1
2,champion James Holzhauer on working for the Cu...,0,0,0,0,0,0,0,0,1,1
3,#####,0,0,0,0,0,0,0,0,1,1
4,“It seems that CDOT is focused on tipping the ...,0,0,0,0,0,0,0,0,1,1
5,"The visitors apparently fell from Taft Point, ...",0,0,0,0,0,0,0,0,1,1
6,There are songs that I wish were made and I wa...,0,0,0,0,0,0,0,0,1,1
7,"Around that time, two people were shot a few b...",1,1,1,1,1,0,1,1,0,0
8,The shooter suffered a self-inflicted gunshot ...,1,1,0,0,0,0,1,1,0,0
9,[1] **,0,0,0,0,0,0,0,0,1,1


# Conclusions

At this point i'm guessing hp tuning won't help the model.
Though I haven't tried any of the more advanced model architectures yet. 
I'm not sure if this task is a) trainable or b) useful in the contest of the pipeline.
It would be better to either label more data and try again,
or move onto the next part. 