In [None]:
import pandas as pd
import numpy as np

from tqdm import tqdm
import spacy
import subprocess
import os

import time
import optuna
import json

from thinc.api import Config


In [None]:
# Execution-environment switch: True selects the Google Drive paths and runs the
# Colab-only setup (Drive mount, GPU requirement); False uses local relative paths.
COLAB = False

In [None]:
# Root folders: on Colab data and project files share one Google Drive folder,
# locally they are resolved relative to the working directory.
if COLAB:
    DATA_DIR = "gdrive/MyDrive/Work/quantify-news/"
    PROJECT_DIR = "gdrive/MyDrive/Work/quantify-news/"
else:
    DATA_DIR = "data/"
    PROJECT_DIR = "./"

# Train / dev / test corpora (binary `.spacy` files).
DATA_TRAIN_BIN_PATH = f"{DATA_DIR}ner_train.spacy"
DATA_DEV_BIN_PATH = f"{DATA_DIR}ner_dev.spacy"
DATA_TEST_BIN_PATH = f"{DATA_DIR}ner_test.spacy"

# Base config (environment-specific file) and the filled config generated from it.
if COLAB:
    SPACY_CONFIG_PATH = f"{PROJECT_DIR}spacy_base_config_colab.cfg"
else:
    SPACY_CONFIG_PATH = f"{PROJECT_DIR}spacy_base_config.cfg"
SPACY_FULL_CONFIG_PATH = f"{PROJECT_DIR}spacy_config.cfg"

# Training output, evaluation metrics and rendered sample predictions.
MODEL_OUTPUT_PATH = f"{PROJECT_DIR}models"
BEST_MODEL_PATH = f"{MODEL_OUTPUT_PATH}/model-best"
METRICS_OUTPUT_PATH = f"{PROJECT_DIR}metrics.json"
SAMPLE_OUTPUT_PATH = f"{PROJECT_DIR}metrics_sample"


In [None]:
if COLAB:
    from google.colab import drive

    # Mount Google Drive at /content/gdrive.
    drive.mount('/content/gdrive')

    # require_gpu() raises on its own when no GPU is available, so call it
    # directly: wrapped in `assert`, the call would be skipped entirely when
    # Python runs with assertions disabled (-O).
    spacy.require_gpu()

    # Fixes encoding issue on Colab with GPUs
    import locale
    print(locale.getpreferredencoding())

    def getpreferredencoding(do_setlocale=True):
        """Report UTF-8 regardless of the locale the runtime is configured with."""
        return 'UTF-8'

    locale.getpreferredencoding = getpreferredencoding

# Download base model

In [None]:
# Load the base training config (the file that `init fill-config` expands later)
# so that its settings can be read from Python.
spacy_config = Config().from_disk(SPACY_CONFIG_PATH)

In [None]:
# Value of `components.ner.source` in the base config; it is used below as the
# name of the pipeline to load or download.
SPACY_BASE_MODEL = spacy_config['components']['ner']['source']

In [None]:
# Make sure the base pipeline is installed: try to load it, download it if missing.
try:
    spacy.load(SPACY_BASE_MODEL)
except OSError:
    # spaCy raises OSError when the pipeline package cannot be found. Catching
    # only that keeps KeyboardInterrupt and unrelated failures visible instead
    # of silently triggering a download.
    spacy.cli.download(SPACY_BASE_MODEL)

# Fine-tune spaCy

In [None]:

# Expand the base config into a complete training config with
# `spacy init fill-config <base> <output>`.
command = [
    "python", "-m", "spacy",
    "init", "fill-config",
    SPACY_CONFIG_PATH, SPACY_FULL_CONFIG_PATH,
]

# Pass the argv list directly instead of joining it into a shell string, so
# paths containing spaces or shell metacharacters reach spaCy unchanged.
subprocess.run(command, check=True)


VS Code shows the stdout of a `subprocess` call in the cell output, but Colab does not.
Running the command with IPython's `!` shell escape makes Colab print the output as well.

In [None]:
# Baseline fine-tuning run on GPU 0 using the filled config; output goes to
# MODEL_OUTPUT_PATH. Run through the `!` shell escape (not subprocess) so the
# training log is also printed in Colab.
!python -m spacy train {SPACY_FULL_CONFIG_PATH} --gpu-id 0 --output {MODEL_OUTPUT_PATH}

## Hyperparameter tuning

In [None]:
def objective(trial):
    """Optuna objective: train spaCy once with sampled hyperparameters and score it.

    Samples the batcher start/stop sizes and the optimizer learning rate, runs
    `spacy train` in a subprocess with those values as config overrides while
    echoing its log line by line, then reads the score from the resulting
    `model-best/meta.json`.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The `ents_f` entry of the `performance` section in `model-best/meta.json`.

    Raises:
        subprocess.CalledProcessError: If `spacy train` exits with a non-zero
            status. Without this check a failed run would be scored from the
            `meta.json` left behind by a previous run.
    """
    start_size = trial.suggest_int("start_size", 100, 500)  # Tune batch start
    stop_size = trial.suggest_int("stop_size", 1000, 5000)  # Tune batch stop
    learning_rate = trial.suggest_float("lr", 1e-5, 1e-3, log=True)

    command = [
        "python", "-m", "spacy",
        "train", SPACY_FULL_CONFIG_PATH,
        "--training.batcher.size.start", str(start_size),
        "--training.batcher.size.stop", str(stop_size),
        "--training.optimizer.learn_rate", str(learning_rate),
        "--output", MODEL_OUTPUT_PATH,
    ]

    start = time.perf_counter()
    # The argv list is passed without a shell so paths with spaces survive.
    # stderr is merged into a line-buffered text pipe and echoed as it arrives,
    # which makes the training log show up in real time in Colab.
    with subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
    ) as process:
        for line in process.stdout:
            print(line, end='')
        return_code = process.wait()
    elapsed = time.perf_counter() - start
    print(f"Training time: {elapsed:.4f} sec")

    if return_code != 0:
        raise subprocess.CalledProcessError(return_code, command)

    # NOTE: every trial writes to the same output directory, so this file
    # belongs to the run that just finished and replaces the previous trial's.
    meta_path = os.path.join(MODEL_OUTPUT_PATH, "model-best", "meta.json")
    with open(meta_path) as fp:
        result = json.load(fp)
    return result['performance']['ents_f']

In [None]:
# Search budget and sampler seed. Seeding the sampler makes the sequence of
# suggested hyperparameters repeatable across kernel restarts (the training
# runs themselves may still vary).
HP_N_TRIALS = 20
HP_SAMPLER_SEED = 42

study = optuna.create_study(
    study_name="ner_hp",
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=HP_SAMPLER_SEED),
)
study.optimize(objective, n_trials=HP_N_TRIALS)
print(study.best_params)

# Evaluate

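**TODO**: Every Optuna trial writes to the same output directory, so `model-best` is most likely the model from the last trial rather than the best one.
Either retrain with `study.best_params`, or give each trial its own output directory so that earlier models are not overwritten.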

In [None]:
# Directory that receives the displaCy renderings. exist_ok=True replaces the
# check-then-create sequence and also creates missing parent directories.
os.makedirs(SAMPLE_OUTPUT_PATH, exist_ok=True)

# Score the best model on the test corpus, writing the metrics to
# METRICS_OUTPUT_PATH and the rendered samples to SAMPLE_OUTPUT_PATH.
command = [
    "python", "-m", "spacy",
    "benchmark", "accuracy",
    BEST_MODEL_PATH, DATA_TEST_BIN_PATH,
    "--output", METRICS_OUTPUT_PATH,
    "--displacy-path", SAMPLE_OUTPUT_PATH,
]

# Pass the argv list directly (no shell) so paths containing spaces or shell
# metacharacters are handed to spaCy unchanged.
subprocess.run(command, check=True)
