In [1]:
%pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.1-py3-none-any.whl.metadata (7.2 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.1-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.8/231.8 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [2]:
import shutil
import subprocess
import os
import time
import json

import pandas as pd
import numpy as np
from tqdm import tqdm

import spacy
import optuna
from thinc.api import Config

In [3]:
COLAB = True

In [23]:
DATA_DIR = "gdrive/MyDrive/Work/quantify-news/" if COLAB else "data/"
PROJECT_DIR = "gdrive/MyDrive/Work/quantify-news/" if COLAB else "./"
TMP_DIR = "/content/" if COLAB else "./"

GPU_ID = "0" if COLAB else "-1"

DATA_TRAIN_BIN_PATH = DATA_DIR + "ner_train.spacy"
DATA_DEV_BIN_PATH = DATA_DIR + "ner_dev.spacy"
DATA_TEST_BIN_PATH = DATA_DIR + "ner_test.spacy"
INFER_BIN_PATH = DATA_DIR + "ner_inf.spacy"

SPACY_CONFIG_PATH = PROJECT_DIR + ("spacy_base_config_colab.cfg" if COLAB else "spacy_base_config.cfg")
SPACY_FULL_CONFIG_PATH = PROJECT_DIR + "spacy_config.cfg"

MODEL_OUTPUT_PATH = TMP_DIR + "models/"
LAST_MODEL_PATH = MODEL_OUTPUT_PATH + "model-last/"
LAST_MODEL_RESULTS_PATH = LAST_MODEL_PATH + "meta.json"
BEST_MODEL_PATH = PROJECT_DIR + "models/model-best/" # For optuna, not spacy
METRICS_DIR = PROJECT_DIR + "metrics/"
METRICS_OUTPUT_PATH = METRICS_DIR + "metrics.json"
SAMPLE_OUTPUT_PATH = METRICS_DIR + "metrics_sample/"

HP_STUDY_NAME = "ner_hp"
HP_HISTORY_PATH = "sqlite:///{}/{}.db".format(PROJECT_DIR, HP_STUDY_NAME)
PARAMS_OUTPUT_PATH = PROJECT_DIR + "best_params.json"


In [6]:
if COLAB:
    from google.colab import drive
    drive.mount('/content/gdrive')
    # assert spacy.require_gpu()

    import locale
    print(locale.getpreferredencoding())
    def getpreferredencoding(do_setlocale=True):
        return 'UTF-8'
    locale.getpreferredencoding = getpreferredencoding

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
UTF-8


In [7]:
spacy_config = Config().from_disk(SPACY_CONFIG_PATH)

In [8]:
SPACY_BASE_MODEL = spacy_config['components']['ner']['source']

In [9]:
try:
    _ = spacy.load(SPACY_BASE_MODEL)
except:
    spacy.cli.download(SPACY_BASE_MODEL)

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# Fine-Tune Spacy

In [10]:
!python -m spacy init fill-config {SPACY_CONFIG_PATH} {SPACY_FULL_CONFIG_PATH}

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
gdrive/MyDrive/Work/quantify-news/spacy_config.cfg
You can now add your data and train your pipeline:
python -m spacy train spacy_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
# !python -m spacy train {SPACY_FULL_CONFIG_PATH} --gpu-id {GPU_ID} --output {MODEL_OUTPUT_PATH}

## Hyperparameter tuning

In [None]:
def shell(command, time_fmt=None):
    start = time.time()
    process = subprocess.Popen(command, shell=False,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT,
                               text=True,
                               encoding='utf-8',
                               bufsize=1)
    for line in process.stdout:
        print(line, end='', flush=True)
    process.wait()
    end = time.time()

    print(time_fmt.format(end - start))


In [None]:
def objective(trial):
    start_size = trial.suggest_int("start_size", 100, 500)  # Tune batch start
    stop_size = trial.suggest_int("stop_size", 1000, 5000)  # Tune batch stop
    learning_rate = trial.suggest_float("lr", 1e-5, 1e-3, log=True)

    command = [
        "python", "-m", "spacy",
        "train", SPACY_FULL_CONFIG_PATH,
        "--gpu-id", GPU_ID,
        "--training.batcher.size.start", str(start_size),
        "--training.batcher.size.stop", str(stop_size),
        "--training.optimizer.learn_rate", str(learning_rate),
        "--output", MODEL_OUTPUT_PATH,
    ]
    # TODO: I can use spacy.cli.train.train here instead of shell proc!
    shell(command, "Training time: {:.4f} sec")

    with open(LAST_MODEL_RESULTS_PATH) as fp:
        result = json.load(fp)

    return result['performance']['ents_f']



class SaveBestModelCallback:
    def __call__(self, study: optuna.study.Study, trial: optuna.trial.FrozenTrial) -> None:
        if study.best_value <= trial.value:
            shutil.copytree(LAST_MODEL_PATH, BEST_MODEL_PATH)

In [None]:
study = optuna.create_study(study_name=HP_STUDY_NAME, direction="maximize",
                            storage=HP_HISTORY_PATH, load_if_exists=True,
                            callbacks=[SaveBestModelCallback()])

In [None]:
study.optimize(objective, n_trials=20)

# Evaluate

In [None]:
print("Best F1 (on val):")
print(study.best_value)
print("Best params:")
print(study.best_params)

## Visualize sample

In [15]:
if not os.path.exists(SAMPLE_OUTPUT_PATH):
    os.makedirs(SAMPLE_OUTPUT_PATH)

In [18]:
!python -m spacy \
  benchmark accuracy \
  {BEST_MODEL_PATH} {DATA_TEST_BIN_PATH} \
  --output {METRICS_OUTPUT_PATH} \
  --displacy-path {SAMPLE_OUTPUT_PATH}

[38;5;4mℹ Using CPU[0m
[1m

TOK     100.00
NER P   74.64 
NER R   51.52 
NER F   60.97 
SPEED   15911 

[1m

               P       R       F
NEWS_LOC   74.64   51.52   60.97

<IPython.core.display.HTML object>
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/spacy/__main__.py", line 4, in <module>
    setup_cli()
  File "/usr/local/lib/python3.11/dist-packages/spacy/cli/_util.py", line 87, in setup_cli
    command(prog_name=COMMAND)
  File "/usr/local/lib/python3.11/dist-packages/click/core.py", line 1161, in __call__
    return self.main(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/typer/core.py", line 740, in main
    return _main(
           ^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/typer/core.py", line 195, in _main
    rv = self.invoke(ctx)
         ^^^^^^^^^^^^^^^^
  F

## Run and test confusion matrix

In [29]:
from spacy.tokens import DocBin

In [54]:
from dataclasses import dataclass

@dataclass
class RowScore:
    true_pos: int
    false_pos: int
    false_neg: int

    def __init__(self, true_pos, false_pos, false_neg):
        self.true_pos = true_pos #dataclass ought to handle this but doesn't in colab?
        self.false_pos = false_pos
        self.false_neg = false_neg
        if true_pos + false_pos == 0:
            self.precision = 0
        else:
            self.precision = true_pos / (true_pos + false_pos)

        if true_pos + false_neg == 0:
            self.recall = 0
        else:
            self.recall = true_pos / (true_pos + false_neg)

        if self.precision == 0 or self.recall == 0:
            self.f1 = 0
        else:
            self.f1 = 2 / (1 / self.precision + 1 / self.recall)

    def __str__(self):
        return f"RowScore(precision={self.precision:.2e}, recall={self.recall:.2e}, f1={self.f1:.2e})"

    def __add__(self, other):
        if not isinstance(other, RowScore):
            raise TypeError("other must be RowScore")
        return RowScore(self.true_pos + other.true_pos,
                        self.false_pos + other.false_pos,
                        self.false_neg + other.false_neg)

    def score(self):
        print(self.true_pos, self.false_pos, self.false_neg)
        print("Strict index and text matches:")
        print("Precision: {:.4f}".format(self.precision))
        print("Recall: {:.4f}".format(self.recall))
        print("F1: {:.4f}".format(self.f1))
        return self.f1

In [74]:
@dataclass
class Entity:
    label: str
    start: int
    end: int
    text: str

def score_row(d_pred, d_true):
    y_pred = {e.text for e in d_pred.ents}
    y_true = {e.text for e in d_true.ents}
    true_pos = len(y_true & y_pred)
    false_pos = len(y_pred - y_true)
    false_neg = len(y_true - y_pred)
    if false_pos:
        print("False pos:", y_pred - y_true)
    if false_neg:
        print("False neg:", y_true - y_pred)
    return RowScore(true_pos, false_pos, false_neg)

In [70]:
!python -m spacy apply \
    {BEST_MODEL_PATH} \
    {DATA_DEV_BIN_PATH} \ # TODO: Switch back to test path
    {INFER_BIN_PATH} \
    --gpu-id "-1" \
    --force

[38;5;4mℹ Using CPU[0m
[38;5;2m✔ Loaded model gdrive/MyDrive/Work/quantify-news/models/model-best/[0m
422it [00:10, 38.44it/s]


In [27]:
nlp = spacy.load(BEST_MODEL_PATH)

In [75]:
inf_doc = DocBin().from_disk(INFER_BIN_PATH).get_docs(nlp.vocab)

In [76]:
gold_doc = DocBin().from_disk(DATA_DEV_BIN_PATH).get_docs(nlp.vocab)  # TODO: Switch back to test path

In [77]:
total_score = RowScore(0,0,0)
for id, gd in zip(inf_doc, gold_doc):
    total_score += score_row(id, gd)
total_score.score()

False pos: {'4300 block of'}
False pos: {'East Garfield Park neighborhood on the West Side.'}
False pos: {'East Garfield Park neighborhood on the West Side.'}
False pos: {'2320 E. 93rd St.,'}
False pos: {'1800 block of South Farrar Drive,'}
False pos: {'East Garfield Park neighborhood on the West Side.'}
False pos: {'300 block of N. Parkside;', '900 block of N. Le Claire;', '1000 block of N. Waller.'}
False pos: {'3900 block of West Monroe'}
False pos: {'Back of the Yards neighborhood'}
False pos: {'East Garfield Park neighborhood on the West Side,'}
False pos: {'West  Chatham neighborhood'}
False pos: {'Gresham'}
False pos: {'7200 block of South University,'}
False pos: {'400 block of South Kostner,'}
False pos: {'5500 block of South Washtenaw,', '3100 block of West 71st Street,'}
False pos: {'Back of the Yards'}
False pos: {'2100 block of South Wabash'}
False pos: {'2189 75th St,'}
False pos: {'1800 block of North Ridgeway,'}
False pos: {'100 block of North Leamington Avenue,'}
False

0.9560489352061623

XXX: this actually looks like it's doing well! it has great recall.
actually it doen't have any false negatives. and the false positives are all
IMO locations. so maybe don't need to do anything else?

Ok but i'm confused because i replicated this on the dev data and also didn't
get a false negative. but the trial didn't have 100% recall. so what is
going on here?