### This notebook performs the following tasks:
### 1. Re-format Doccano entity label output to spaCy format
### 2. Check for alignment between spaCy tokens and Doccano entity annotations

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
# other modules, code
from html import unescape
import unicodedata
import ast

import spacy
import srsly
from spacy import displacy
from spacy.training import docs_to_json, offsets_to_biluo_tags, biluo_tags_to_spans
from spacy.pipeline import EntityRuler
from spacy.tokens import DocBin

from spacy.training import Example
from spacy.util import minibatch, compounding
from spacy.tokens import Span
from toolz import partition_all

In [2]:
def reformat_doccano(df, colname = 'data', symptom_ent = True):
    """Turn Doccano label annotations into spaCy-style ``(text, annotations)`` tuples.

    Args:
        df: DataFrame with a ``'label'`` column whose cells are sequences of
            ``[start, end, label]`` spans, plus a text column named ``colname``.
            The frame is modified in place: ``'label'`` is overwritten with the
            reformatted dicts and a ``'combined'`` column is added.
        colname: name of the column holding the raw text.
        symptom_ent: when False, spans labelled ``'SYMPTOM'`` are dropped.

    Returns:
        A list of ``(text, {'entities': [(start, end, label), ...]})`` tuples,
        one per row of ``df``.
    """
    def to_spacy_annotation(spans):
        # Rebuild every span as a (start, end, label) tuple, optionally
        # skipping SYMPTOM spans.
        kept = []
        for span in spans:
            if symptom_ent or span[2] != 'SYMPTOM':
                kept.append((span[0], span[1], span[2]))
        return {'entities': kept}

    df['label'] = df['label'].apply(to_spacy_annotation)
    # Pair each text with its annotation dict and keep the pairs on the frame.
    df['combined'] = list(zip(df[colname], df['label']))
    return list(df['combined'])

In [3]:
def import_reformat_doccano(filename, colname, keep_symptom = True):
    """Load a Doccano JSONL export and reformat it for spaCy.

    Args:
        filename: path to a JSON-lines file (read with ``lines=True``).
        colname: name of the column holding the raw text.
        keep_symptom: forwarded to ``reformat_doccano``; when False, SYMPTOM
            spans are dropped.

    Returns:
        A two-element list ``[annotated, (texts, annotations)]`` where
        ``annotated`` is the list of ``(text, annotation)`` tuples and the
        second element holds the same data split into two parallel lists
        (kept for debugging).
    """
    annotated = reformat_doccano(
        pd.read_json(filename, lines=True), colname, keep_symptom
    )

    # Split the pairs into parallel lists of raw texts and annotation dicts.
    texts, annotations = [], []
    for text, annotation in annotated:
        texts.append(text)
        annotations.append(annotation)

    return [annotated, (texts, annotations)]

In [4]:
def check_for_alignment(spacy_data_list):
    """Print every token whose BILUO tag is '-' for the given annotated texts.

    Each text is tokenized with a blank English pipeline and its character
    offsets are converted to BILUO tags. For every token tagged ``'-'`` the
    token text, its character index and the tag are printed; after the tokens
    of an affected example, the example index and its raw text are printed.

    Args:
        spacy_data_list: iterable of ``(raw_text, {'entities': [...]})`` pairs.

    Returns:
        None. Results are reported via ``print`` only.
    """
    nlp = spacy.blank('en')
    for row, (raw_text, entity_offsets) in enumerate(spacy_data_list):
        # Tokenize once and reuse the Doc for both tagging and reporting.
        doc = nlp.make_doc(raw_text)
        tags = spacy.training.offsets_to_biluo_tags(doc, entity_offsets['entities'])
        flagged = [(token, tag) for token, tag in zip(doc, tags) if tag == '-']
        for token, tag in flagged:
            print (token.text, token.idx, tag)
        if flagged:
            print (f'---Tags for example #{row}---')
            print(raw_text)

In [5]:
def convert_to_spacy_BILOU_format(data_list, fname_prefix):
    """Serialize annotated texts to a ``.spacy`` DocBin file and collect entity texts.

    Every text is run through a blank English pipeline with a sentencizer, its
    character offsets are converted to BILUO tags and back to spans, and the
    spans are set as ``doc.ents`` before the doc is added to a ``DocBin``.

    Args:
        data_list: iterable of ``(text, {'entities': [...]})`` pairs.
        fname_prefix: output path without extension; ``'.spacy'`` is appended.

    Returns:
        A dict mapping each entity label to the set of lower-cased entity
        texts seen for that label (used to build EntityRuler patterns).
    """
    nlp = spacy.blank('en')
    nlp.add_pipe('sentencizer')
    db = DocBin()  # DocBin object for storing data on disk
    entities_dct = {}
    for text, annot in data_list:
        doc = nlp(text)
        tags = offsets_to_biluo_tags(doc, annot['entities'])
        entities = biluo_tags_to_spans(doc, tags)
        doc.ents = entities
        db.add(doc)
        # Group the entity texts by label for the EntityRuler patterns.
        for ent in entities:
            entities_dct.setdefault(ent.label_, set()).add(ent.text.lower())

    # JSON output (docs_to_json) is intentionally not produced; the DocBin
    # file is used instead.
    fname = fname_prefix + '.spacy'
    db.to_disk(fname)

    return entities_dct

In [6]:
# count the number of entities present in the data
def ent_counts(data_list):
    """Count annotated entities per label.

    Args:
        data_list: iterable of ``(text, {'entities': [(start, end, label), ...]})``
            pairs; only the label (third element of each entity) is used.

    Returns:
        A plain dict mapping each label to the number of entities carrying it.
        Empty input yields an empty dict.
    """
    ent_count_dict = {}
    for _, annotations in data_list:
        for ent in annotations['entities']:
            label = ent[2]
            # dict.get avoids using a bare except as control flow.
            ent_count_dict[label] = ent_count_dict.get(label, 0) + 1
    return ent_count_dict

In [7]:
def ents_to_patterns(entities_dct):
    """Build EntityRuler token patterns from the annotated entity texts.

    Assumes the entity texts are already LOWERCASE; each text is turned into
    tokens by SPLITTING ON WHITESPACE, one ``{'LOWER': token}`` dict per token.

    Args:
        entities_dct: dict mapping a label to a collection of entity texts.

    Returns:
        A list of ``{'label': ..., 'pattern': [...]}`` dicts, one per entity text.
    """
    return [
        {'label': label, 'pattern': [{'LOWER': piece} for piece in phrase.split()]}
        for label, phrases in entities_dct.items()
        for phrase in phrases
    ]

### Initialize basic arguments
TODO: Need to share the basic arguments globally across all notebooks for consistency

In [8]:
# Set seed for NumPy's global random number generator
np.random.seed(42)
# NOTE(review): months / n_start / n_end presumably mirror the sampling settings
# used when the annotated files were generated — confirm against that notebook.
months = ['May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct']
n_start = 0
n_end = 80 #40
# number of months * number of examples from each month (n_end  - n_start); n_end excluded
# With the values above this is the integer 6 * 80 = 480; it is used in output file names.
model_suffix = len(months) * (n_end - n_start)
sub_group = 'train'

### Annotated files generated from Doccano & rehearsal data notebook

In [9]:
# SET this flag to False if not using SYMPTOM as part of NER model
keep_symptom_ent = False

# Tag the no-SYMPTOM variant in every output name. The endswith() guard keeps
# the suffix from being appended a second time when this cell is re-run.
if not keep_symptom_ent and not str(model_suffix).endswith('_NS_v2'):
    model_suffix = str(model_suffix) + '_NS_v2'

# final annotated output from Doccano
train_filename = './data/json/nsamples_480_v2_2021_6m_doccano.jsonl' #'./nsamples_240_2021_6m_doccano.jsonl'
val_filename = './data/json/val_nsamples_240_doccano.jsonl'

# annotated rehearsal filenames from generate_rehearsal_data.ipynb
rehearsal_train_filename = './data/json/nlp_rehearsal_1000.json'
rehearsal_val_filename = './data/json/test_nlp_rehearsal_1000.json'

# Patterns saved by the previous iteration (input) and the file this run
# writes its merged patterns to (output).
old_ptrns_fname = './data/patterns/old_patterns_240.csv' #f'./old_patterns_60.csv'
new_ptrns_fname = f'./data/patterns/old_patterns_{model_suffix}.csv'

### Read, reformat TRAIN, VAL data from Doccano

In [10]:
# Datasets with DISEASE entity
# import_reformat_doccano returns [data, (texts, annotations)]; the separate
# annotation lists are discarded here. The text column in these files is 'data'.
train_data, (train_text, _) = import_reformat_doccano(train_filename, 'data', keep_symptom_ent)
val_data, (val_text, _) = import_reformat_doccano(val_filename, 'data', keep_symptom_ent)

In [11]:
# Rehearsal datasets: the text column is 'text', and keep_symptom is left at its
# default (True), so no label is filtered out. Only the (text, annotation) list is kept.
rhrsl_train_data, _ = import_reformat_doccano(rehearsal_train_filename, 'text')
rhrsl_val_data, _ = import_reformat_doccano(rehearsal_val_filename, 'text')

### Check if entities annotated in Doccano are aligned after tokenization in spacy

In [12]:
# Print any tokens tagged '-' (annotation offsets not matching token boundaries)
# for each of the four datasets; nothing is printed for fully aligned data.
check_for_alignment(train_data)
check_for_alignment(val_data)
check_for_alignment(rhrsl_train_data)
check_for_alignment(rhrsl_val_data)
#another way to check for alignment; this was recommendation from spacy warning message
# list(zip(doc, check_this));

### Convert TRAIN, VAL data into spaCy readable .JSON format, spacy binary format
### Also generate / return sets of entities annotated for each label as a dict

In [13]:
# Write the Doccano train/val sets as .spacy files. Only the training set's
# label -> entity-text sets are kept (used later to build EntityRuler patterns).
train_entities_dct = convert_to_spacy_BILOU_format(train_data, f'./outputs/train_n_{model_suffix}')
_ = convert_to_spacy_BILOU_format(val_data, f'./outputs/val_n_{model_suffix}')

In [14]:
# Write the rehearsal train/val sets as .spacy files; the returned entity sets are discarded.
_ = convert_to_spacy_BILOU_format(rhrsl_train_data, f'./outputs/rhrsl_train_n_{model_suffix}')
_ = convert_to_spacy_BILOU_format(rhrsl_val_data, f'./outputs/rhrsl_val_n_{model_suffix}')

In [15]:
# Write combined datasets (Doccano examples followed by rehearsal examples) as .spacy files.
_ = convert_to_spacy_BILOU_format(train_data + rhrsl_train_data, f'./outputs/combo_train_n_{model_suffix}')
_ = convert_to_spacy_BILOU_format(val_data + rhrsl_val_data, f'./outputs/combo_val_n_{model_suffix}')

In [16]:
# Number of annotated entities per label in the training set
ent_counts(train_data)

{'DISEASE': 748, 'DRUG': 698}

In [17]:
# Number of annotated entities per label in the validation set
ent_counts(val_data)

{'DISEASE': 379, 'DRUG': 271}

In [18]:
# Entity counts for the rehearsal sets; the trailing ';' suppresses the notebook output
ent_counts(rhrsl_train_data);
ent_counts(rhrsl_val_data);

## ----------RULE BASED MODEL----------

### Read in patterns from last iteration

In [19]:
#Read in old patterns; Make sure to use the literal_eval otherwise patterns will be read-in as strings
# ast.literal_eval parses each 'pattern' cell back into a Python literal.
old_patterns_df = pd.read_csv(old_ptrns_fname, converters={'pattern':ast.literal_eval})

In [20]:
# Rebuild the saved patterns as {'label', 'pattern'} dicts, leaving out
# SYMPTOM patterns when SYMPTOM is not part of the model.
old_patterns = []
for label, pattern in old_patterns_df.itertuples(index=False):
    if label == 'SYMPTOM' and not keep_symptom_ent:
        continue
    old_patterns.append({'label': label, 'pattern': pattern})
old_patterns;

### Generate patterns for EntityRuler based on entity sets generated previously

In [21]:
# Token patterns built from the entity texts annotated in the training set
patterns = ents_to_patterns(train_entities_dct)

In [22]:
# collect the distinct labels / entity-types that appear in the generated patterns
labels = {entry['label'] for entry in patterns}
labels

{'DISEASE', 'DRUG'}

In [23]:
# Look for patterns that occur more than once across the new and old pattern lists.
check_duplicates = pd.DataFrame(patterns + old_patterns)
# Compare patterns by their string form (the raw values are lists of dicts).
check_duplicates['pattern'] = check_duplicates['pattern'].map(str)
check_duplicates[check_duplicates.duplicated()].shape

(466, 2)

## ----------ML Model using spacy EntityRuler----------

### Model 0: Blank spacy model with EntityRuler

In [240]:
# Blank English pipeline whose only component is an entity_ruler loaded with
# the new and old patterns; saved to disk as model 0.
nlp_blank_rule = spacy.blank('en')
ruler = nlp_blank_rule.add_pipe('entity_ruler')
ruler.add_patterns(patterns + old_patterns)
nlp_blank_rule.to_disk(f'./models/model_0_n_{model_suffix}')

### Model 4: pre-trained spacy model with EntityRuler

In [212]:
# load default spacy model with pre-trained NER
nlp_rule_based = spacy.load('en_core_web_sm') # Model 4
# The entity_ruler is inserted ahead of the pre-trained 'ner' component in the pipeline.
ruler = nlp_rule_based.add_pipe('entity_ruler', before='ner')
ruler.add_patterns(patterns + old_patterns)
nlp_rule_based.to_disk(f'./models/model_4_n_{model_suffix}')

### Model 5: pre-trained spaCy model, NER only

In [168]:
# Load the pre-trained pipeline and strip every component except 'ner'.
nlp = spacy.load('en_core_web_sm')
# Collect the names first so the pipeline is not modified while being iterated.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
for pipe in other_pipes:
    nlp.remove_pipe(pipe)
# Bare expression: not the last statement of the cell, so it is not displayed.
nlp.pipe_names
nlp.to_disk(f'./models/model_5_n_{model_suffix}')

### Save generated patterns to disk for re-use in annotating next batch of training, val data

In [241]:
# Persist the merged (new + old) patterns as CSV without the index column.
# Duplicate patterns are not removed before saving.
patterns_df = pd.DataFrame(patterns + old_patterns)
patterns_df.to_csv(new_ptrns_fname, index=False)