In [None]:
# Install Optuna into the kernel's environment; it is imported below and drives
# the hyperparameter search in the "Hyperparameter tuning" section.
%pip install optuna

In [None]:
import shutil
import subprocess
import os
import time
import json

import pandas as pd
import numpy as np
from tqdm import tqdm

import spacy
import optuna
from thinc.api import Config

In [None]:
# Switch between a Google Colab run (Drive-mounted paths, GPU id "0") and a
# local run (relative paths, GPU id "-1"); read by the configuration cell below.
COLAB = True

In [None]:
# Root locations. All directory constants end with "/" so that file names can
# be appended with plain string concatenation.
DATA_DIR = "gdrive/MyDrive/Work/quantify-news/data/" if COLAB else "data/"
PROJECT_DIR = "gdrive/MyDrive/Work/quantify-news/" if COLAB else "./"
TMP_DIR = "/content/" if COLAB else "./"

# Passed to the spaCy CLI as --gpu-id.
GPU_ID = "0" if COLAB else "-1"

# Serialized DocBin files: train/dev/test inputs and the prediction output.
DATA_TRAIN_BIN_PATH = DATA_DIR + "ner_train.spacy"
DATA_DEV_BIN_PATH = DATA_DIR + "ner_dev.spacy"
DATA_TEST_BIN_PATH = DATA_DIR + "ner_test.spacy"
PRED_BIN_PATH = DATA_DIR + "ner_pred.spacy"

# Base config (hand-written) and the auto-filled config derived from it.
SPACY_CONFIG_PATH = PROJECT_DIR + ("spacy_base_config_colab.cfg" if COLAB else "spacy_base_config.cfg")
SPACY_FULL_CONFIG_PATH = PROJECT_DIR + "spacy_config.cfg"

# For spacy training inner loop
TRAINED_MODEL_PATH = TMP_DIR + "models/"
BEST_MODEL_TRAIN_PATH = TRAINED_MODEL_PATH + "model-best/"
BEST_MODEL_RESULTS_PATH = BEST_MODEL_TRAIN_PATH + "meta.json"

# For optuna outer loop
BEST_MODEL_OPT_PATH = PROJECT_DIR + "models/model-best/"
METRICS_DIR = PROJECT_DIR + "metrics/"
METRICS_OUTPUT_PATH = METRICS_DIR + "metrics.json"
SAMPLE_OUTPUT_PATH = METRICS_DIR + "metrics_sample/"

HP_STUDY_NAME = "ner_hp"
# PROJECT_DIR already ends with "/", so no extra separator is inserted here
# (the previous "{}/{}" template produced ".../quantify-news//ner_hp.db").
HP_HISTORY_PATH = "sqlite:///{}{}.db".format(PROJECT_DIR, HP_STUDY_NAME)
PARAMS_OUTPUT_PATH = PROJECT_DIR + "best_params.json"


In [None]:
if COLAB:
    import locale
    from google.colab import drive

    # Mount Google Drive so the gdrive/... paths resolve, then insist on a GPU.
    drive.mount('/content/gdrive')
    assert spacy.require_gpu()

    # Show the encoding currently reported, then replace the lookup with one
    # that always answers UTF-8.
    print(locale.getpreferredencoding())

    def getpreferredencoding(do_setlocale=True):
        """Stand-in for locale.getpreferredencoding that always returns UTF-8."""
        return 'UTF-8'

    locale.getpreferredencoding = getpreferredencoding

Mounted at /content/gdrive
UTF-8


In [None]:
# Load the base spaCy config from disk so its values can be inspected below.
spacy_config = Config().from_disk(SPACY_CONFIG_PATH)

In [None]:
# Name of the pipeline the config's "ner" component is sourced from; it is
# loaded (and downloaded if necessary) in the next cell.
SPACY_BASE_MODEL = spacy_config['components']['ner']['source']

In [None]:
# Load the base pipeline, downloading it first if it is not installed yet.
try:
    nlp_base = spacy.load(SPACY_BASE_MODEL)
except OSError:
    # spacy.load raises OSError when the package cannot be found. Catching only
    # that keeps KeyboardInterrupt and unrelated failures from silently
    # triggering a download.
    spacy.cli.download(SPACY_BASE_MODEL)
    nlp_base = spacy.load(SPACY_BASE_MODEL)

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# Fine-Tune Spacy

In [None]:
# Expand the base config at SPACY_CONFIG_PATH into a fully filled config and
# save it to SPACY_FULL_CONFIG_PATH, which the training commands below use.
!python -m spacy init fill-config {SPACY_CONFIG_PATH} {SPACY_FULL_CONFIG_PATH}

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
gdrive/MyDrive/Work/quantify-news/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
# Single training run with the filled config and no hyperparameter overrides;
# output is saved under TRAINED_MODEL_PATH.
!python -m spacy train \
    {SPACY_FULL_CONFIG_PATH} \
    --gpu-id {GPU_ID} \
    --output {TRAINED_MODEL_PATH} \
    --verbose
# --code ./entity_remapper.py \

[38;5;2m✔ Created output directory: /content/models[0m
[38;5;4mℹ Saving to output directory: /content/models[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2025-03-11 21:34:09,251] [INFO] Set up nlp object from config
[2025-03-11 21:34:09,265] [DEBUG] Loading corpus from path: gdrive/MyDrive/Work/quantify-news/data/ner_dev.spacy
[2025-03-11 21:34:09,267] [DEBUG] Loading corpus from path: gdrive/MyDrive/Work/quantify-news/data/ner_train.spacy
[2025-03-11 21:34:09,268] [INFO] Pipeline: ['ner']
[2025-03-11 21:34:09,268] [INFO] Resuming training for: ['ner']
[2025-03-11 21:34:09,276] [INFO] Created vocabulary
[2025-03-11 21:34:11,200] [INFO] Added vectors: en_core_web_md
[2025-03-11 21:34:11,307] [INFO] Finished initializing nlp object
[2025-03-11 21:34:11,307] [INFO] Initialized pipeline components: []
[38;5;2m✔ Initialized pipeline[0m
[1m
[2025-03-11 21:34:11,322] [DEBUG] Loading corpus from path: gdrive/MyDrive/Work/quantify-news/data/ner_dev.spacy
[2025-03-11 21:34:11,324] [DEBUG] Loading

## Hyperparameter tuning

In [None]:
def shell(command, time_fmt=None, check=False):
    """Run ``command`` as a subprocess, echoing its output line by line.

    stderr is merged into stdout, and each line is printed as soon as the
    child process emits it.

    Args:
        command: Argument list handed to ``subprocess.Popen`` (``shell=False``).
        time_fmt: Optional ``str.format`` template that receives the elapsed
            wall-clock seconds as its only positional argument. When ``None``
            no timing line is printed.
        check: When true, raise ``subprocess.CalledProcessError`` if the
            command exits with a non-zero status.

    Returns:
        The exit status of the command.

    Raises:
        subprocess.CalledProcessError: If ``check`` is true and the command
            exits with a non-zero status.
    """
    start = time.time()
    # The context manager closes the pipe and reaps the child even if printing
    # is interrupted part-way through.
    with subprocess.Popen(command, shell=False,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT,
                          text=True,
                          encoding='utf-8',
                          bufsize=1) as process:
        for line in process.stdout:
            print(line, end='', flush=True)
        returncode = process.wait()
    elapsed = time.time() - start

    # time_fmt defaults to None; formatting it unconditionally used to raise
    # AttributeError for callers that did not ask for a timing line.
    if time_fmt is not None:
        print(time_fmt.format(elapsed))

    if check and returncode != 0:
        raise subprocess.CalledProcessError(returncode, command)
    return returncode


In [None]:
def objective(trial):
    """Optuna objective: train spaCy with sampled hyperparameters and score it.

    Samples the batcher start/stop sizes and the optimizer learning rate from
    ``trial``, runs ``spacy train`` in a subprocess with those overrides, and
    reads the resulting ``meta.json`` of the best model.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The ``ents_f`` value stored under ``performance`` in ``meta.json``.

    Raises:
        RuntimeError: If the training run did not produce ``meta.json``.
    """
    start_size = trial.suggest_int("start_size", 100, 500)  # Tune batch start
    stop_size = trial.suggest_int("stop_size", 1000, 5000)  # Tune batch stop
    learning_rate = trial.suggest_float("lr", 1e-5, 1e-3, log=True)

    # Remove the results file left by an earlier run: every trial writes to the
    # same output directory, so a failed training run would otherwise be scored
    # with the previous trial's metrics.
    if os.path.exists(BEST_MODEL_RESULTS_PATH):
        os.remove(BEST_MODEL_RESULTS_PATH)

    command = [
        "python", "-m", "spacy",
        "train", SPACY_FULL_CONFIG_PATH,
        "--gpu-id", GPU_ID,
        "--training.batcher.size.start", str(start_size),
        "--training.batcher.size.stop", str(stop_size),
        "--training.optimizer.learn_rate", str(learning_rate),
        "--output", TRAINED_MODEL_PATH,
    ]
    # TODO: I can use spacy.cli.train.train here instead of shell proc!
    shell(command, "Training time: {:.4f} sec")

    if not os.path.exists(BEST_MODEL_RESULTS_PATH):
        raise RuntimeError(
            "spacy train did not produce {}; see the training output above".format(
                BEST_MODEL_RESULTS_PATH))

    with open(BEST_MODEL_RESULTS_PATH) as fp:
        result = json.load(fp)

    return result['performance']['ents_f']



# class SaveBestModelCallback:
#     def __call__(self, study: optuna.study.Study, trial: optuna.trial.FrozenTrial) -> None:
#         if study.best_value <= trial.value:
#             shutil.copytree(BEST_MODEL_TRAIN_PATH, BEST_MODEL_OPT_PATH)

In [None]:
# study = optuna.create_study(study_name=HP_STUDY_NAME, direction="maximize",
#                             storage=HP_HISTORY_PATH, load_if_exists=True,
#                             callbacks=[SaveBestModelCallback()])

In [None]:
# study.optimize(objective, n_trials=20)

In [None]:
# print("Best F1 (on val):")
# print(study.best_value)
# print("Best params:")
# print(study.best_params)

# Evaluate

## Visualize sample

In [None]:
# if not os.path.exists(SAMPLE_OUTPUT_PATH):
#     os.makedirs(SAMPLE_OUTPUT_PATH)

In [None]:
# !python -m spacy \
#   benchmark accuracy \
#   {BEST_MODEL_TRAIN_PATH} {DATA_DEV_BIN_PATH} \
#   --gpu-id {GPU_ID} \
#   --output {METRICS_OUTPUT_PATH} \
#   --displacy-path {SAMPLE_OUTPUT_PATH}

## Run and test confusion matrix

In [None]:
from spacy.tokens import DocBin

In [None]:
# Run the model at BEST_MODEL_OPT_PATH over the dev DocBin and write the
# resulting docs to PRED_BIN_PATH (--force overwrites an existing file).
# NOTE(review): DATA_DEV_BIN_PATH is also loaded as the gold set further down,
# so the input docs presumably already carry entity annotations. Confirm that
# these do not leak into the predictions.
!python -m spacy apply \
    {BEST_MODEL_OPT_PATH} \
    {DATA_DEV_BIN_PATH} \
    {PRED_BIN_PATH} \
    --gpu-id {GPU_ID} \
    --force

[38;5;4mℹ Using GPU: 0[0m
[38;5;2m✔ Loaded model gdrive/MyDrive/Work/quantify-news/models/model-best/[0m
448it [00:10, 42.83it/s]


In [None]:
# Load the model stored at BEST_MODEL_OPT_PATH; its vocab is used to
# deserialize the predicted docs below.
nlp = spacy.load(BEST_MODEL_OPT_PATH)

In [None]:
# Predict on annotation-free copies of the dev docs instead of reading
# PRED_BIN_PATH. That file comes from `spacy apply` run over the gold-annotated
# dev DocBin, and the NER component keeps entities that are already set on its
# input, so those "predictions" contain every gold entity and recall is
# trivially 1.0. Rebuilding each doc from its words and spaces keeps the
# tokenization identical to the gold docs while dropping the annotations.
from spacy.tokens import Doc

dev_docs = DocBin().from_disk(DATA_DEV_BIN_PATH).get_docs(nlp.vocab)
unannotated_docs = (
    Doc(nlp.vocab,
        words=[token.text for token in doc],
        spaces=[bool(token.whitespace_) for token in doc])
    for doc in dev_docs
)
pred_docs = list(nlp.pipe(unannotated_docs))

In [None]:
# Deserialize the gold dev docs with the evaluated model's vocab (the same one
# used for pred_docs) so both sides of each Example share one string store.
gold_docs = list(DocBin().from_disk(DATA_DEV_BIN_PATH).get_docs(nlp.vocab))

In [None]:
# The scoring cell pairs predictions and gold docs by position with zip(),
# which would silently truncate to the shorter list if the lengths differed.
assert len(pred_docs) == len(gold_docs)

In [None]:
from spacy.scorer import Scorer
from spacy.training import Example

# Pair every predicted doc with the gold doc at the same position, then score.
scorer = Scorer()
examples = list(map(Example, pred_docs, gold_docs))
scores = scorer.score(examples)

# Report the entity-level metrics from the score dict.
for heading, metric in (("F1: ", 'ents_f'),
                        ("Precision: ", 'ents_p'),
                        ("Recall: ", 'ents_r')):
    print(heading, scores[metric])


F1:  0.9418653283145128
Precision:  0.8901185770750988
Recall:  1.0


XXX: This actually looks like it's doing well! It has great recall; in fact,
it doesn't have any false negatives, and the false positives are all, IMO,
locations. So maybe we don't need to do anything else?

OK, but I'm confused: I replicated this on the dev data and also didn't get a
single false negative, yet the training trial didn't have 100% recall. So what
is going on here?