In [None]:
# Install Optuna into the kernel's environment; it drives the hyperparameter search further down.
%pip install optuna

In [None]:
import shutil
import subprocess
import os
import time
import json

import pandas as pd
import numpy as np
from tqdm import tqdm

import spacy
import optuna
from thinc.api import Config

In [None]:
# Environment switch: True selects the Google Colab paths and GPU settings,
# False selects the local ones.
COLAB = True

In [None]:
# --- Environment-dependent roots -------------------------------------------
if COLAB:
    DATA_DIR = "gdrive/MyDrive/Work/quantify-news/"
    PROJECT_DIR = "gdrive/MyDrive/Work/quantify-news/"
    TMP_DIR = "/content/"
    GPU_ID = "0"
    SPACY_CONFIG_PATH = f"{PROJECT_DIR}spacy_base_config_colab.cfg"
else:
    DATA_DIR = "data/"
    PROJECT_DIR = "./"
    TMP_DIR = "./"
    GPU_ID = "-1"
    SPACY_CONFIG_PATH = f"{PROJECT_DIR}spacy_base_config.cfg"

# --- Binary spaCy corpora ---------------------------------------------------
DATA_TRAIN_BIN_PATH = f"{DATA_DIR}ner_train.spacy"
DATA_DEV_BIN_PATH = f"{DATA_DIR}ner_dev.spacy"
DATA_TEST_BIN_PATH = f"{DATA_DIR}ner_test.spacy"

# --- spaCy config and model output -----------------------------------------
SPACY_FULL_CONFIG_PATH = f"{PROJECT_DIR}spacy_config.cfg"

MODEL_OUTPUT_PATH = f"{TMP_DIR}models/"
LAST_MODEL_PATH = f"{MODEL_OUTPUT_PATH}model-last/"
LAST_MODEL_RESULTS_PATH = f"{LAST_MODEL_PATH}meta.json"
# Best model across Optuna trials (not spaCy's own per-run "model-best").
BEST_MODEL_PATH = f"{PROJECT_DIR}model-best/"
METRICS_OUTPUT_PATH = f"{PROJECT_DIR}metrics.json"
SAMPLE_OUTPUT_PATH = f"{PROJECT_DIR}metrics_sample"

# --- Optuna study -----------------------------------------------------------
HP_STUDY_NAME = "ner_hp"
HP_HISTORY_PATH = f"sqlite:///{PROJECT_DIR}/{HP_STUDY_NAME}.db"
PARAMS_OUTPUT_PATH = f"{PROJECT_DIR}best_params.json"


In [None]:
if COLAB:
    # Kept inside the branch so the Colab-only import is attempted only when
    # the notebook is configured to run on Colab.
    from google.colab import drive
    drive.mount('/content/gdrive')

    # Call require_gpu() directly instead of wrapping it in `assert`: asserts
    # are stripped under `python -O`, which would skip GPU activation entirely.
    # require_gpu() raises on its own when no GPU can be allocated.
    spacy.require_gpu()

    import locale
    # Show what the runtime reports before it is overridden.
    print(locale.getpreferredencoding())
    def getpreferredencoding(do_setlocale=True):
        """Always report UTF-8; `do_setlocale` is accepted only for signature compatibility."""
        return 'UTF-8'
    # NOTE(review): presumably a workaround for Colab shell commands that
    # require a UTF-8 locale -- confirm it is still needed.
    locale.getpreferredencoding = getpreferredencoding

In [None]:
# Load the base spaCy training config (the Colab or local variant, per SPACY_CONFIG_PATH).
spacy_config = Config().from_disk(SPACY_CONFIG_PATH)

In [None]:
# Value of the `source` key of the `ner` component in the base config; it is passed
# to spacy.load / spacy.cli.download below to make sure that pipeline is installed.
SPACY_BASE_MODEL = spacy_config['components']['ner']['source']

In [None]:
# Make sure the pipeline the NER component is sourced from is installed.
try:
    spacy.load(SPACY_BASE_MODEL)
except OSError:
    # spacy.load raises OSError when the package or path cannot be found.
    # Catching only that keeps KeyboardInterrupt and unrelated failures
    # (e.g. a broken install) from silently triggering a download.
    spacy.cli.download(SPACY_BASE_MODEL)

# Fine-Tune spaCy

In [None]:
# Expand the base config into a complete training config written to SPACY_FULL_CONFIG_PATH;
# that full config is what the training runs below consume.
!python -m spacy init fill-config {SPACY_CONFIG_PATH} {SPACY_FULL_CONFIG_PATH}

In [None]:
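# Single training run with the config's own hyperparameters (no search).
# Kept for reference; uncomment to train once without Optuna.
# !python -m spacy train {SPACY_FULL_CONFIG_PATH} --gpu-id {GPU_ID} --output {MODEL_OUTPUT_PATH}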

## Hyperparameter tuning

In [None]:
def shell(command, time_fmt=None):
    """Run a command, streaming its combined stdout/stderr to the notebook.

    Args:
        command: Argument list passed to ``subprocess.Popen`` (no shell is
            involved, so every element must be a string).
        time_fmt: Optional format string with one placeholder for the elapsed
            wall-clock seconds, e.g. ``"Training time: {:.4f} sec"``. When
            ``None`` (the default) no timing line is printed.

    Returns:
        The exit status of the process, so callers can detect failures.
    """
    start = time.perf_counter()
    # The context manager closes the pipe and reaps the child even if
    # printing is interrupted (e.g. by stopping the cell).
    with subprocess.Popen(command, shell=False,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT,  # interleave stderr with stdout
                          text=True,
                          encoding='utf-8',
                          bufsize=1) as process:      # line-buffered for live output
        for line in process.stdout:
            print(line, end='', flush=True)
        return_code = process.wait()
    elapsed = time.perf_counter() - start

    # The default time_fmt is None; formatting unconditionally would raise
    # AttributeError after the command had already finished.
    if time_fmt is not None:
        print(time_fmt.format(elapsed))

    return return_code


In [None]:
def objective(trial):
    """Optuna objective: train spaCy once with sampled hyperparameters.

    Samples the batcher start/stop sizes and the learning rate, launches
    ``spacy train`` with those values as config overrides, then reads the
    score from the ``meta.json`` of the last saved model.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The ``ents_f`` entry of the ``performance`` section in
        LAST_MODEL_RESULTS_PATH.
    """
    start_size = trial.suggest_int("start_size", 100, 500)  # Tune batch start
    stop_size = trial.suggest_int("stop_size", 1000, 5000)  # Tune batch stop
    learning_rate = trial.suggest_float("lr", 5e-5, 1e-2, log=True)

    command = [
        "python", "-m", "spacy",
        "train", SPACY_FULL_CONFIG_PATH,
        # Must be a plain string: `{GPU_ID}` is only interpolation syntax in
        # `!` shell lines; in Python it builds a set, which Popen rejects.
        "--gpu-id", str(GPU_ID),
        "--training.batcher.size.start", str(start_size),
        "--training.batcher.size.stop", str(stop_size),
        "--training.optimizer.learn_rate", str(learning_rate),
        "--output", MODEL_OUTPUT_PATH,
    ]
    # TODO: I can use spacy.cli.train.train here instead of shell proc!
    shell(command, "Training time: {:.4f} sec")

    with open(LAST_MODEL_RESULTS_PATH) as fp:
        result = json.load(fp)

    return result['performance']['ents_f']



class SaveBestModelCallback:
    """Optuna callback that keeps a copy of the best model seen so far.

    After a trial finishes, the model in LAST_MODEL_PATH is copied to
    BEST_MODEL_PATH when the trial's value is at least the study's best value.
    """
    def __call__(self, study: optuna.study.Study, trial: optuna.trial.FrozenTrial) -> None:
        """Copy the last trained model to BEST_MODEL_PATH if this trial is the best."""
        # A trial without a value (e.g. failed or pruned) cannot be compared
        # and has produced no model worth keeping.
        if trial.value is None:
            return
        if study.best_value <= trial.value:
            # copytree refuses to write into an existing directory, so the
            # previous best has to go first. Removing it (instead of merging)
            # also guarantees no stale files from the older model survive.
            shutil.rmtree(BEST_MODEL_PATH, ignore_errors=True)
            shutil.copytree(LAST_MODEL_PATH, BEST_MODEL_PATH)

In [None]:
study = optuna.create_study(study_name=HP_STUDY_NAME, direction="maximize",
                            storage=HP_HISTORY_PATH, load_if_exists=True,
                            callbacks=[SaveBestModelCallback()])

XXX: Hyperparameter tuning isn't it. All non-zero runs get around 59% F1 score.
TODO is to look at the output of any non-zero training run and check
where the model is doing ok or failing.

In [None]:
study.optimize(objective, n_trials=20)

# Evaluate

In [None]:
# Report the best validation F1 and the hyperparameters that produced it,
# one item per line.
print(
    "Best F1 (on val):",
    study.best_value,
    "Best params:",
    study.best_params,
    sep="\n",
)

In [None]:
# Directory that receives the displaCy renderings from the benchmark below.
# makedirs(exist_ok=True) avoids the check-then-create race and also creates
# any missing parent directories.
os.makedirs(SAMPLE_OUTPUT_PATH, exist_ok=True)

In [None]:
# Score the model saved at BEST_MODEL_PATH against the held-out test corpus.
# Metrics are written to METRICS_OUTPUT_PATH and displaCy renderings to SAMPLE_OUTPUT_PATH.
!python -m spacy \
  benchmark accuracy \
  {BEST_MODEL_PATH} {DATA_TEST_BIN_PATH} \
  --output {METRICS_OUTPUT_PATH} \
  --displacy-path {SAMPLE_OUTPUT_PATH}