### The following models are trained in this notebook:
### Model 0: Blank spacy model using patterns with EntityRuler
### Model 1: pre-trained spacy model trained using training data AND rehearsal data
### Model 2: pre-trained spacy model trained using training data only (1 epoch, catastrophic forgetting)
### Model 3: Blank spacy model trained using training data only
### Model 4: pre-trained spacy model using patterns with EntityRuler
### Model 6: pre-trained spacy model trained using rehearsal data only

In [11]:
import numpy as np
import pandas as pd
import datetime as dt
# other modules, code
from html import unescape
import unicodedata
import ast

import spacy
import srsly
from spacy import displacy
from spacy.training import docs_to_json, offsets_to_biluo_tags, biluo_tags_to_spans
from spacy.pipeline import EntityRuler
from spacy.tokens import DocBin

from spacy.training import Example
from spacy.util import minibatch, compounding
from spacy.tokens import Span
from toolz import partition_all

In [12]:
def get_list_from_docbin(filename):
    """Load a serialized DocBin and convert it to offset-style NER tuples.

    Args:
        filename: Path of a file readable by ``DocBin.from_disk``.

    Returns:
        A ``(list_data, labels)`` tuple. ``list_data`` holds one
        ``(text, {'entities': [(start_char, end_char, label), ...]})`` item
        per doc; ``labels`` is the set of entity labels found in the docs.
    """
    # Only a vocab is needed to deserialize the docs; no pipeline component
    # is ever run on them, so none is added.
    vocab = spacy.blank('en').vocab
    doc_bin = DocBin().from_disk(filename)

    list_data = []
    labels = set()
    for doc in doc_bin.get_docs(vocab):
        spans = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
        list_data.append((doc.text, {'entities': spans}))
        # Reuse the spans just built instead of walking doc.ents a second time
        labels.update(label for _, _, label in spans)
    return (list_data, labels)

## ----------ML MODEL TRAINING----------

### Initialize basic arguments
TODO: Need to share the basic arguments globally across all notebooks for consistency

In [13]:
# Seed NumPy's global RNG (np.random.shuffle is used during training)
np.random.seed(42)

months = ['May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct']
sub_group = 'train'

# Per-month example window; n_end is excluded (40 is the commented-out alternative)
n_start, n_end = 0, 80

# number of examples from each month (n_end - n_start) * number of months
model_suffix = (n_end - n_start) * len(months)

### Annotated files generated from Doccano & rehearsal data notebook

In [14]:
# SET this flag to False if not using SYMPTOM as part of NER model
keep_symptom_ent = False

# Tag the suffix for the no-SYMPTOM variant. The endswith() guard keeps this
# cell idempotent: re-running it must not append the tag a second time.
if not keep_symptom_ent and not str(model_suffix).endswith('_NS_v2'):
    model_suffix = f'{model_suffix}_NS_v2'

# final annotated output from Doccano
train_filename = './data/json/nsamples_480_v2_2021_6m_doccano.jsonl' #'./nsamples_240_2021_6m_doccano.jsonl'
val_filename = './data/json/val_nsamples_240_doccano.jsonl'

# annotated rehearsal filenames from generate_rehearsal_data.ipynb
rehearsal_train_filename = './data/json/nlp_rehearsal_1000.json'
rehearsal_val_filename = './data/json/test_nlp_rehearsal_1000.json'

# Pattern files.
# NOTE(review): new_ptrns_fname also uses the 'old_patterns_' prefix - confirm this is intended.
old_ptrns_fname = './data/patterns/old_patterns_240.csv' #f'./old_patterns_60.csv'
new_ptrns_fname = f'./data/patterns/old_patterns_{model_suffix}.csv'

### binary data sets generated from preprocessing

In [9]:
# Only the training set path follows model_suffix; the validation and rehearsal
# paths are pinned to the 240_NS files (templated alternatives kept in the
# trailing comments). Plain literals are used where nothing is interpolated.
docbin_train = f'./outputs/train_n_{model_suffix}.spacy'
docbin_val = './outputs/val_n_240_NS.spacy' #f'./outputs/val_n_{model_suffix}.spacy'
docbin_rhrsl_train = './outputs/rhrsl_train_n_240_NS.spacy' #f'./outputs/rhrsl_train_n_{model_suffix}.spacy'
docbin_rhrsl_val = './outputs/rhrsl_val_n_240_NS.spacy' #f'./outputs/rhrsl_val_n_{model_suffix}.spacy'

# Number of passes over the data used by the training loops
epochs = 40

In [10]:
# Load (text, {'entities': [...]}) tuples from the preprocessed DocBin files.
# Only the label set of the training file is kept; the others are discarded via `_`.
train_data, labels = get_list_from_docbin(docbin_train)
val_data, _ = get_list_from_docbin(docbin_val)

# Rehearsal data, loaded the same way
rhrsl_train_data, _ = get_list_from_docbin(docbin_rhrsl_train)
rhrsl_val_data, _ = get_list_from_docbin(docbin_rhrsl_val)

## MODEL 1

### Model training / fine-tuning using minibatch, pre-trained model: pseudo rehearsal

This model is trained using the pre-trained model from spacy. To avoid the problem of catastrophic forgetting, this model uses rehearsal data in addition to the train dataset. Rehearsal data is entity data generated using the default pre-trained spacy model. Using such examples in the training set helps the model 'remember' to predict the entities it was originally trained on.

In [152]:
# Model 1: load the pre-trained English pipeline
nlp_fine_tune = spacy.load('en_core_web_sm')

# Register every custom entity label on the pipeline's NER component
ner = nlp_fine_tune.get_pipe('ner')
for label in labels:
    ner.add_label(label)

# Collect the entity labels that occur in the rehearsal training data
labels_rehearsal = {
    ent[2]
    for _, annotations in rhrsl_train_data
    for ent in annotations['entities']
}
labels_rehearsal;

In [153]:
# Get names of other pipes to disable them during training to train only NER
other_pipes = [pipe for pipe in nlp_fine_tune.pipe_names if pipe != 'ner']

# resume_training() returns the optimizer passed to every update below
optimizer_ft = nlp_fine_tune.resume_training()
# optimizer_ft.learn_rate = 0.0005
# Concatenation builds a new list, so shuffling it leaves train_data and
# rhrsl_train_data in their original order
combined_data = train_data + rhrsl_train_data
# select_pipes(disable=...) replaces the deprecated disable_pipes(*names) in spaCy v3
with nlp_fine_tune.select_pipes(disable=other_pipes):
    for i in range(epochs):
        losses = {}
        np.random.shuffle(combined_data)
        batches = minibatch(combined_data, size=compounding(4.0, 16.0, 1.001))
        for batch in batches:
            # One Example per (text, annotations) pair in the batch
            examples = [
                Example.from_dict(
                    nlp_fine_tune.make_doc(text),
                    {"entities": annotation['entities']},
                )
                for text, annotation in batch
            ]

            # Update the model
            nlp_fine_tune.update(examples, sgd=optimizer_ft, drop=0.2, losses=losses)
        print(f'losses at iteration {i} - {dt.datetime.now()}: {losses}')

losses at iteration 0 - 2021-12-20 02:11:12.363157: {'ner': 5576.467792162419}
losses at iteration 1 - 2021-12-20 02:11:44.061580: {'ner': 4332.883394004506}
losses at iteration 2 - 2021-12-20 02:12:15.859403: {'ner': 4047.6502895367644}
losses at iteration 3 - 2021-12-20 02:12:47.976512: {'ner': 3189.793058603752}
losses at iteration 4 - 2021-12-20 02:13:20.359124: {'ner': 2636.6828106756566}
losses at iteration 5 - 2021-12-20 02:13:52.581651: {'ner': 2381.0459341477003}
losses at iteration 6 - 2021-12-20 02:14:24.672039: {'ner': 1959.2390458196207}
losses at iteration 7 - 2021-12-20 02:14:56.631376: {'ner': 1785.542078595968}
losses at iteration 8 - 2021-12-20 02:15:28.191288: {'ner': 1511.8160156939323}
losses at iteration 9 - 2021-12-20 02:15:59.506220: {'ner': 1338.808810117102}
losses at iteration 10 - 2021-12-20 02:16:31.393060: {'ner': 1271.8543044296684}
losses at iteration 11 - 2021-12-20 02:17:02.661805: {'ner': 1128.3592588404174}
losses at iteration 12 - 2021-12-20 02:17:3

In [154]:
# Persist Model 1 to disk.
# NOTE(review): the '_LR_0005' suffix suggests a 0.0005 learning rate, but the
# learn_rate override in the training cell is commented out - confirm which rate was used.
nlp_fine_tune.to_disk(f'./models/model_1_n_{model_suffix}_LR_0005')

In [150]:
# Display the learning rate currently set on the fine-tuning optimizer
optimizer_ft.learn_rate

0.001

In [146]:
# # Save combined dataset used for training the model: rehearsal data + train data
# fine_tune_entities_dct = convert_to_spacy_BILOU_format(combined_data, 'fine_tune_tmp1_1000_spacy_format.json')

## MODEL 2: 

### Model training / fine-tuning using minibatch, pre-trained model: no rehearsal, 1 epoch ONLY

In [83]:
# Model 2: load the pre-trained English pipeline (trained without rehearsal data)
nlp_fine_tune_no_rhrsl = spacy.load('en_core_web_sm')

# Register every custom entity label on the pipeline's NER component
ner = nlp_fine_tune_no_rhrsl.get_pipe('ner')
for label in labels:
    ner.add_label(label)

In [84]:
# Get names of other pipes to disable them during training to train only NER
other_pipes = [pipe for pipe in nlp_fine_tune_no_rhrsl.pipe_names if pipe != 'ner']

# resume_training() returns the optimizer passed to every update below
optimizer_ft = nlp_fine_tune_no_rhrsl.resume_training()
# Shuffle a copy so the shared train_data list keeps its order for other cells
shuffled_train_data = list(train_data)
# select_pipes(disable=...) replaces the deprecated disable_pipes(*names) in spaCy v3
with nlp_fine_tune_no_rhrsl.select_pipes(disable=other_pipes):
    # Single epoch only
    for i in range(1):
        losses = {}
        np.random.shuffle(shuffled_train_data)
        batches = minibatch(shuffled_train_data, size=compounding(4.0, 16.0, 1.001))
        for batch in batches:
            # One Example per (text, annotations) pair in the batch
            examples = [
                Example.from_dict(
                    nlp_fine_tune_no_rhrsl.make_doc(text),
                    {"entities": annotation['entities']},
                )
                for text, annotation in batch
            ]

            # Update the model
            nlp_fine_tune_no_rhrsl.update(examples, sgd=optimizer_ft, drop=0.2, losses=losses)
        print(f'losses at iteration {i} - {dt.datetime.now()}: {losses}')

losses at iteration 0 - 2021-12-16 22:23:57.377006: {'ner': 3081.422375679847}


In [85]:
# Persist Model 2 to disk; the directory name carries model_suffix
nlp_fine_tune_no_rhrsl.to_disk(f'./models/model_2_n_{model_suffix}')

## MODEL 3

### Model training using minibatch, Blank spacy model

In [147]:
# Model 3: blank English pipeline with a fresh NER component appended
nlp_blank = spacy.blank('en')

# add_pipe returns the component it created, so no separate get_pipe is needed
ner = nlp_blank.add_pipe('ner', last=True)

# Register every custom entity label on the NER component
for label in labels:
    ner.add_label(label)

In [148]:
# Get names of other pipes to disable them during training to train only NER
other_pipes = [pipe for pipe in nlp_blank.pipe_names if pipe != 'ner']

# initialize() is the spaCy v3 name for the deprecated begin_training();
# it returns the optimizer passed to every update below
optimizer_blnk = nlp_blank.initialize()

# Shuffle a copy so the shared train_data list keeps its order for other cells
shuffled_train_data = list(train_data)
# select_pipes(disable=...) replaces the deprecated disable_pipes(*names) in spaCy v3
with nlp_blank.select_pipes(disable=other_pipes):
    for i in range(epochs):
        losses = {}
        np.random.shuffle(shuffled_train_data)
        batches = minibatch(shuffled_train_data, size=compounding(4.0, 16.0, 1.001))
        for batch in batches:
            # One Example per (text, annotations) pair in the batch
            examples = [
                Example.from_dict(
                    nlp_blank.make_doc(text),
                    {"entities": annotation['entities']},
                )
                for text, annotation in batch
            ]

            # Update the model
            nlp_blank.update(examples, sgd=optimizer_blnk, drop=0.2, losses=losses)
        print(f'losses at iteration {i} - {dt.datetime.now()}: {losses}')

losses at iteration 0 - 2021-12-20 01:58:31.025332: {'ner': 8646.016031546464}
losses at iteration 1 - 2021-12-20 01:58:45.443145: {'ner': 1747.1857977169348}
losses at iteration 2 - 2021-12-20 01:58:59.793075: {'ner': 1483.7506279624583}
losses at iteration 3 - 2021-12-20 01:59:14.321139: {'ner': 1267.7645954702518}
losses at iteration 4 - 2021-12-20 01:59:28.629892: {'ner': 797.0254093600046}
losses at iteration 5 - 2021-12-20 01:59:43.143051: {'ner': 640.9149698029494}
losses at iteration 6 - 2021-12-20 01:59:57.643876: {'ner': 502.67452125092115}
losses at iteration 7 - 2021-12-20 02:00:12.395149: {'ner': 479.7778080427033}
losses at iteration 8 - 2021-12-20 02:00:27.097337: {'ner': 408.4687196867791}
losses at iteration 9 - 2021-12-20 02:00:41.792577: {'ner': 348.01954025892303}
losses at iteration 10 - 2021-12-20 02:00:56.446528: {'ner': 318.60855383865794}
losses at iteration 11 - 2021-12-20 02:01:11.091614: {'ner': 246.3675319984598}
losses at iteration 12 - 2021-12-20 02:01:25

In [149]:
# Persist Model 3 to disk; the directory name carries model_suffix
nlp_blank.to_disk(f'./models/model_3_n_{model_suffix}')

## MODEL 6: 

### Model training pre-trained model: use only rehearsal data

In [92]:
# Model 6: load the pre-trained English pipeline (trained on rehearsal data only)
nlp_fine_tune_only_rhrsl = spacy.load('en_core_web_sm')

# Register every custom entity label on the pipeline's NER component
ner = nlp_fine_tune_only_rhrsl.get_pipe('ner')
for label in labels:
    ner.add_label(label)

In [93]:
# Get names of other pipes to disable them during training to train only NER
other_pipes = [pipe for pipe in nlp_fine_tune_only_rhrsl.pipe_names if pipe != 'ner']

# resume_training() returns the optimizer passed to every update below
optimizer_ft = nlp_fine_tune_only_rhrsl.resume_training()
# Shuffle a copy so the shared rhrsl_train_data list keeps its order for other cells
shuffled_rhrsl_data = list(rhrsl_train_data)
# select_pipes(disable=...) replaces the deprecated disable_pipes(*names) in spaCy v3
with nlp_fine_tune_only_rhrsl.select_pipes(disable=other_pipes):
    for i in range(epochs):
        losses = {}
        np.random.shuffle(shuffled_rhrsl_data)
        batches = minibatch(shuffled_rhrsl_data, size=compounding(4.0, 16.0, 1.001))
        for batch in batches:
            # One Example per (text, annotations) pair in the batch
            examples = [
                Example.from_dict(
                    nlp_fine_tune_only_rhrsl.make_doc(text),
                    {"entities": annotation['entities']},
                )
                for text, annotation in batch
            ]

            # Update the model
            nlp_fine_tune_only_rhrsl.update(examples, sgd=optimizer_ft, drop=0.2, losses=losses)
        print(f'losses at iteration {i} - {dt.datetime.now()}: {losses}')

losses at iteration 0 - 2021-12-16 23:15:34.427482: {'ner': 1745.24130276536}
losses at iteration 1 - 2021-12-16 23:15:48.032187: {'ner': 1347.4078114786373}
losses at iteration 2 - 2021-12-16 23:16:01.632819: {'ner': 1042.8760053135156}
losses at iteration 3 - 2021-12-16 23:16:15.189968: {'ner': 850.890962165866}
losses at iteration 4 - 2021-12-16 23:16:28.906513: {'ner': 768.8895616634314}
losses at iteration 5 - 2021-12-16 23:16:42.685366: {'ner': 643.108800899378}
losses at iteration 6 - 2021-12-16 23:16:56.334516: {'ner': 536.0692065412068}
losses at iteration 7 - 2021-12-16 23:17:10.040106: {'ner': 539.9952681452605}
losses at iteration 8 - 2021-12-16 23:17:23.692236: {'ner': 474.26512351170066}
losses at iteration 9 - 2021-12-16 23:17:37.513360: {'ner': 392.3544184032194}
losses at iteration 10 - 2021-12-16 23:17:51.228502: {'ner': 384.8074740658636}
losses at iteration 11 - 2021-12-16 23:18:04.854083: {'ner': 344.8514528440462}
losses at iteration 12 - 2021-12-16 23:18:18.51713

In [94]:
# Persist Model 6 to disk; the directory name carries model_suffix
nlp_fine_tune_only_rhrsl.to_disk(f'./models/model_6_n_{model_suffix}')

### Model training using single examples, Blank spacy model: COMMENTED

In [158]:
# nlp_b = spacy.blank('en')

# nlp_b.add_pipe('ner', last=True)
# ner = nlp_b.get_pipe('ner')
# #Add custom labels to NER component
# for l in labels:
#     ner.add_label(l)

In [159]:
# optimizer_b = nlp_b.begin_training()

# for i in range(1):
#     losses = {}
#     np.random.shuffle(train_data)
#     for raw_text, entity_offsets in train_data:
#         doc = nlp_b.make_doc(raw_text)
#         example = Example.from_dict(doc, {"entities": entity_offsets['entities']})
#         nlp_b.update([example], sgd=optimizer_b, losses=losses)
#     print (f'losses at iteration {i} - {dt.datetime.now()}: {losses}')