# BERT for Named Entity Recognition

### This notebook builds off the official BERT Github, and draws inspiration from:
#### https://towardsdatascience.com/building-a-multi-label-text-classifier-using-bert-and-tensorflow-f188e0ecdc5d
#### https://github.com/kyzhouhzau/BERT-NER 

### Before getting started you will want to go and download your choice of BERT base model from https://github.com/google-research/bert
For this notebook we use Bert-Base, Cased
If you can fit Bert-Large into GPU memory then I congratulate you as you are a very wealthy person.
The zip file of the model will have six files in it, and your model will need them all.

#### HARDWARE REQUIREMENTS #### 
I trained the model on 4 x Tesla 16 GB GPUs + 52 GB ram for speed (20 minutes or so). When serving the pretrained model using a checkpoint already pretrained on the NER data you can just use regular RAM and probably can get away with 32 GB of regular RAM and no GPU.

In [1]:
import os
import collections
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime
import numpy as np

In [3]:
##install bert if not already done
!pip install bert-tensorflow

Collecting bert-tensorflow
  Using cached https://files.pythonhosted.org/packages/a6/66/7eb4e8b6ea35b7cc54c322c816f976167a43019750279a8473d355800a93/bert_tensorflow-1.0.1-py2.py3-none-any.whl
Collecting six (from bert-tensorflow)
  Using cached https://files.pythonhosted.org/packages/73/fb/00a976f728d0d1fecfe898238ce23f502a721c0ac0ecfedb80e0d88c64e9/six-1.12.0-py2.py3-none-any.whl
Installing collected packages: six, bert-tensorflow
Successfully installed bert-tensorflow-1.0.1 six-1.12.0


In [2]:
import bert
from bert import run_classifier
from bert import optimization
from bert import modeling
from bert import tokenization
from bert.tokenization import FullTokenizer

W0726 03:28:31.596731 139806183552768 deprecation_wrapper.py:119] From /home/jupyter/.local/lib/python3.5/site-packages/bert/optimization.py:87: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



In [3]:
from tqdm import tqdm
from tensorflow.keras import backend as K
from tensorflow.keras import Model
# Create a TF1 session up front; create_tokenizer_from_hub_module() uses it (as the
# global `sess`) to evaluate the hub module's tokenization_info outputs.
sess = tf.Session()

In [4]:
## Use the downloaded BERT model; change BERT_MODEL_DIR to wherever you unpacked it.
BERT_MODEL_DIR = 'BERTcased'
# Word-piece vocabulary, pre-trained weights and architecture config, respectively.
BERT_VOCAB = BERT_MODEL_DIR + '/vocab.txt'
BERT_INIT_CHKPNT = BERT_MODEL_DIR + '/bert_model.ckpt'
BERT_CONFIG = BERT_MODEL_DIR + '/bert_config.json'

In [7]:
## There are two ways to create the special tokenizer that BERT requires
##1. Use the hub_module as below (online)
bert_path = "https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
    """Build a `FullTokenizer` from the vocab file and casing flag of the Hub module.

    Loads the module at `bert_path`, evaluates its "tokenization_info" signature
    with the global session `sess`, and uses both returned values.

    Returns:
        A `FullTokenizer` configured with the module's vocab file and its
        `do_lower_case` setting.
    """
    bert_module = hub.Module(bert_path)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    vocab_file, do_lower_case = sess.run(
        [
            tokenization_info["vocab_file"],
            tokenization_info["do_lower_case"],
        ]
    )

    # Use the casing flag published with the module instead of a hard-coded value,
    # so the tokenizer stays consistent with whichever model `bert_path` points at.
    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
tokenizer = create_tokenizer_from_hub_module()

##2. Create from the downloaded vocab.txt and init checkpoint file as below (for offline / on-prem use).
##   Keep do_lower_case=False for the Cased model used in this notebook.
#tokenization.validate_case_matches_checkpoint(False,BERT_INIT_CHKPNT)
#tokenizer = tokenization.FullTokenizer(
#      vocab_file=BERT_VOCAB, do_lower_case=False)


In [8]:
# Sanity check that the tokenizer works: capitalisation should be preserved and
# out-of-vocabulary words split into '##'-prefixed word pieces
# (e.g. 'BERT' -> 'B', '##ER', '##T' in the output below).
tokenizer.tokenize("This here's an example of using the BERT tokenizer. Like it very much.")

['This',
 'here',
 "'",
 's',
 'an',
 'example',
 'of',
 'using',
 'the',
 'B',
 '##ER',
 '##T',
 'token',
 '##izer',
 '.',
 'Like',
 'it',
 'very',
 'much',
 '.']

In [9]:
## IMPORT DATA
## Change the path to wherever your copy of the NER dataset CSV lives.
data = pd.read_csv('datasets/ner_dataset.csv', encoding="ISO-8859-1")
data.head()
# The one-row-per-word layout shown here isn't the final format we want; a later cell
# groups the rows into one row per sentence holding lists of Word / POS / Tag.

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [10]:
# Get a list of all the possible labels for our entities by taking the unique
# values of the label (Tag) column.
s = data['Tag']
tag_labels = s.unique().tolist()
print(tag_labels)

['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim', 'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve', 'I-eve', 'I-nat']


In [11]:
# Column-name constants (not referenced elsewhere in the visible part of this notebook).
ID = 'id'
DATA_COLUMN = 'Word'
LABEL_COLUMNS = ['Tag']
# Extra labels for the special tokens: "X" marks non-initial word pieces of a split
# word, "[CLS]" / "[SEP]" label the tokens added around every sequence.
bert_tags = ["X","[CLS]","[SEP]"]
# "[PAD]" is placed first so that it receives id 0, the value used to pad label_ids.
padding_tag = ["[PAD]",]
# Below are the two important variables, we will reference these a lot.
LABELS = padding_tag + tag_labels + bert_tags
num_labels = len(LABELS)
print(num_labels)

21


In [12]:
# Build a lookup from each label string to its integer id (its position in LABELS),
# so tags can be fed to the network as integers and looked up again later.
label_map = dict(zip(LABELS, range(len(LABELS))))
print(label_map)

{'I-nat': 17, 'B-eve': 15, 'O': 1, 'I-eve': 16, '[PAD]': 0, '[SEP]': 20, 'I-org': 7, 'I-art': 10, 'B-geo': 2, 'X': 18, 'B-nat': 14, 'I-gpe': 12, 'B-gpe': 3, 'I-geo': 5, 'B-org': 6, 'B-art': 9, 'I-per': 11, 'I-tim': 13, 'B-per': 4, '[CLS]': 19, 'B-tim': 8}


In [13]:
# This particular dataset only states the sentence id on the first word of each
# sentence (like a pivot table), so forward-fill the blanks before grouping.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on data.loc[:, ...]: that selection may be a temporary copy,
# in which case the fill is silently lost and groupby drops every row whose key is NaN.
data['Sentence #'] = data['Sentence #'].ffill()
# Group each sentence into a single row whose Word / POS / Tag cells are lists.
# Note that the new dataframe is ordered by the sentence key as a string
# ("Sentence: 1", "Sentence: 10", ...); we aren't missing rows.
data = data.groupby('Sentence #').agg(list)
data.head(5)

Unnamed: 0_level_0,Word,POS,Tag
Sentence #,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sentence: 1,"[Thousands, of, demonstrators, have, marched, ...","[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo..."
Sentence: 10,"[Iranian, officials, say, they, expect, to, ge...","[JJ, NNS, VBP, PRP, VBP, TO, VB, NN, TO, JJ, J...","[B-gpe, O, O, O, O, O, O, O, O, O, O, O, O, O,..."
Sentence: 100,"[Helicopter, gunships, Saturday, pounded, mili...","[NN, NNS, NNP, VBD, JJ, NNS, IN, DT, NNP, JJ, ...","[O, O, B-tim, O, O, O, O, O, B-geo, O, O, O, O..."
Sentence: 1000,"[They, left, after, a, tense, hour-long, stand...","[PRP, VBD, IN, DT, NN, JJ, NN, IN, NN, NNS, .]","[O, O, O, O, O, O, O, O, O, O, O]"
Sentence: 10000,"[U.N., relief, coordinator, Jan, Egeland, said...","[NNP, NN, NN, NNP, NNP, VBD, NNP, ,, NNP, ,, J...","[B-geo, O, O, B-per, I-per, O, B-tim, O, B-geo..."


In [14]:
#STANDARD BERT DEFINITIONS as per BERT github
class InputExample(object):
    """Container for one raw (not yet featurized) training/test example."""

    def __init__(self, guid, text_a, text_b=None, labels=None):
        """Stores the given fields unchanged on the instance.

        Args:
            guid: Unique id for the example.
            text_a: The untokenized text of the first sequence. For single
                sequence tasks, only this sequence must be specified.
            text_b: (Optional) The untokenized text of the second sequence.
                Only needed for sequence pair tasks.
            labels: (Optional) [string]. The labels of the example. Should be
                given for train and dev examples, but not for test examples.
        """
        self.guid, self.text_a = guid, text_a
        self.text_b, self.labels = text_b, labels


class InputFeatures(object):
    """A single set of features of data.

    Attributes are stored exactly as passed in:
        input_ids: Vocabulary ids of the tokens.
        input_mask: 1 for real tokens, 0 for padding.
        segment_ids: Segment (token type) id per token.
        label_ids: Label id per token.
        is_real_example: False for padding placeholders.
    """

    def __init__(self, input_ids, input_mask, segment_ids, label_ids, is_real_example=True, is_predict=False):
        # `is_predict` is accepted for call compatibility but is not stored.
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        # No trailing comma here: it used to wrap label_ids in a 1-tuple.
        self.label_ids = label_ids
        self.is_real_example=is_real_example

In [15]:
# We'll set sequences to be at most 128 tokens long. Increasing this will hit your memory very hard!
# Every example is truncated/padded to this length, and the TFRecord readers use it
# as the fixed feature length.
MAX_SEQ_LENGTH = 128

In [16]:
#STANDARD BERT DEFINITIONS with slight tweak
def create_examples(df, labels_available=True, pretokenized=False):
    """Turns every row of `df` into an `InputExample`.

    Rows are read positionally from `df.values`: column 0 is the list of words
    (stored as `text_a`); column 2 is the list of target labels when
    `labels_available` is True. With the grouped frame built in this notebook
    those columns are Word and Tag.

    Args:
        df: DataFrame with one sentence per row.
        labels_available: Take the labels from column 2 of each row.
        pretokenized: Only consulted when `labels_available` is False; take the
            labels from column 1 instead of generating '[PAD]' placeholders.

    Returns:
        A list of `InputExample`, whose `guid` is the row position.
    """
    def _labels_for(row):
        # Pick the label list for one row according to the two flags.
        if labels_available:
            return row[2]
        if pretokenized:
            return row[1]
        # No labels known: one '[PAD]' placeholder per input word.
        return ['[PAD]']*len(row[0])

    return [
        InputExample(guid=position, text_a=row[0], labels=_labels_for(row))
        for (position, row) in tqdm(enumerate(df.values))
    ]

## PREPARE DATA FOR TRAINING ##

In [17]:
# Split the sentences into a training and a validation portion by position:
# the first TRAIN_VAL_RATIO of the rows train the model, the remainder validates it.
TRAIN_VAL_RATIO = 0.9
LEN = len(data)
SIZE_TRAIN = int(LEN * TRAIN_VAL_RATIO)

x_train, x_val = data.iloc[:SIZE_TRAIN], data.iloc[SIZE_TRAIN:]

train_examples = create_examples(x_train)

43163it [00:00, 511852.78it/s]


In [18]:
# Training hyperparameters (the step counts derived from them are computed in a later cell).
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 1.0
# Warmup is a period of time where the learning rate
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs: how often (in steps) checkpoints and summaries are written.
SAVE_CHECKPOINTS_STEPS = 1000
SAVE_SUMMARY_STEPS = 500

In [19]:
#STANDARD BERT DEFINITIONS - with tweaks
class PaddingInputExample(object):
    """Fake example so the num input examples is a multiple of the batch size.

    When running eval/predict on the TPU, we need to pad the number of examples
    to be a multiple of the batch size, because the TPU requires a fixed batch
    size. The alternative is to drop the last batch, which is bad because it means
    the entire output data won't be generated.

    We use this class instead of `None` because treating `None` as padding
    batches could cause silent errors.
    """
    
    
def convert_single_example(ex_index, example, max_seq_length,
                           tokenizer, labelmap = label_map, pretokenized=False):
    """Converts a single `InputExample` into a single `InputFeatures`.

    Args:
        ex_index: Position of the example in its dataset; only example 0 is logged.
        example: An `InputExample`, or a `PaddingInputExample` placeholder.
        max_seq_length: Length every returned feature list is padded/truncated to.
        tokenizer: Object providing `tokenize(text)` and `convert_tokens_to_ids(tokens)`.
        labelmap: Dict from label string to integer id. Must contain 'X', '[CLS]'
            and '[SEP]'.
        pretokenized: If True, `example.text_a` is used as the final token list
            as-is: no word-piece tokenization, no [CLS]/[SEP] insertion and no
            truncation is applied.

    Returns:
        A tuple `(feature, orig_to_tok_map)`. `orig_to_tok_map[i]` is the position,
        in the [CLS]-prefixed token sequence, of the first word piece of input
        word `i`; it is empty when `pretokenized` is True and `[0]` for padding
        examples.
    """

    if isinstance(example, PaddingInputExample):
        # All-zero placeholder features. (The original returned an undefined name
        # here, which raised NameError.)
        padding_feature = InputFeatures(
            input_ids=[0] * max_seq_length,
            input_mask=[0] * max_seq_length,
            segment_ids=[0] * max_seq_length,
            label_ids=[0] * max_seq_length,
            is_real_example=False)
        return padding_feature, [0]

    # orig_to_tok_map keeps track of which input word relates to which output
    # position; needed because BERT may split a word like 'running' into
    # 'run' '##ing'.
    orig_to_tok_map = []
    tokens_a = []
    label_ids = []
    x_label_id = int(labelmap['X'])

    if pretokenized:
        # Copy so that the padding appended further down does not modify the
        # caller's example in place.
        tokens_a = list(example.text_a)
        special_tokens = ('[SEP]', '[CLS]', 'X')
        # Only the special tokens get a real label id; everything else is 0.
        label_ids.extend(
            int(labelmap[token]) if token in special_tokens else 0
            for token in tokens_a)
    else:
        # The NER dataset is already split by word, so tokenize one word at a time.
        # Each word has one tag; when the tokenizer splits a word into several
        # pieces the first piece keeps the tag and the rest are labelled 'X'.
        for label, word in zip(example.labels, example.text_a):
            orig_to_tok_map.append(len(tokens_a) + 1)
            # tokenize() returns a list even for a single word; it can be empty.
            sub_words = tokenizer.tokenize(word)
            tokens_a.extend(sub_words)
            if sub_words:
                label_ids.append(int(labelmap[label]))
                label_ids.extend([x_label_id] * (len(sub_words) - 1))

    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        # NOTE(review): label_ids is neither truncated nor extended for tokens_b,
        # so sentence-pair inputs trip the length assertions below. This notebook
        # only builds single-sequence examples.
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    elif not pretokenized:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]
            label_ids = label_ids[0:(max_seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # "type_ids" (segment_ids) indicate whether a token belongs to the first or
    # the second sequence.
    if pretokenized:
        tokens = tokens_a
    else:
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        label_ids = [labelmap["[CLS]"]] + label_ids + [labelmap["[SEP]"]]
    segment_ids = [0] * len(tokens)

    if tokens_b:
        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)
        label_ids.append(0)
        tokens.append('[PAD]')

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    assert len(label_ids) == max_seq_length
    assert len(tokens) == max_seq_length

    # Log the first example only, as a spot check of the conversion.
    if ex_index < 1:
        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("tokens: %s" % " ".join(
            [tokenization.printable_text(x) for x in tokens]))
        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        tf.logging.info("label: %s (ids = %s)" % (example.labels, str(label_ids)))

    feature = InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_ids=label_ids,
        is_real_example=True)

    return feature,orig_to_tok_map


def file_based_convert_examples_to_features(
        examples, max_seq_length, tokenizer, output_file):
    """Convert a set of `InputExample`s to a TFRecord file.

    Each example is featurized with `convert_single_example` and written as one
    `tf.train.Example` holding the int64 features "input_ids", "input_mask",
    "segment_ids", "is_real_example" and "label_ids".

    Args:
        examples: Iterable of `InputExample`.
        max_seq_length: Length passed through to `convert_single_example`.
        tokenizer: Tokenizer passed through to `convert_single_example`.
        output_file: Path of the TFRecord file to write.
    """

    def create_int_feature(values):
        # Wrap an iterable of ints as an int64 tf.train.Feature.
        return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

    # The context manager closes the writer even if converting an example raises.
    with tf.python_io.TFRecordWriter(output_file) as writer:
        for (ex_index, example) in tqdm(enumerate(examples)):
            feature, _ = convert_single_example(ex_index, example,
                                                max_seq_length, tokenizer)

            label_ids = feature.label_ids
            if not isinstance(label_ids, list):
                # Tolerate label_ids that arrive wrapped in a 1-tuple.
                label_ids = label_ids[0]

            features = collections.OrderedDict()
            features["input_ids"] = create_int_feature(feature.input_ids)
            features["input_mask"] = create_int_feature(feature.input_mask)
            features["segment_ids"] = create_int_feature(feature.segment_ids)
            features["is_real_example"] = create_int_feature(
                [int(feature.is_real_example)])
            features["label_ids"] = create_int_feature(label_ids)

            tf_example = tf.train.Example(features=tf.train.Features(feature=features))
            writer.write(tf_example.SerializeToString())


def file_based_input_fn_builder(input_file, seq_length, is_training,
                                drop_remainder):
    """Builds an `input_fn` that reads batches of features from a TFRecord file.

    Args:
        input_file: Path of the TFRecord file to read.
        seq_length: Fixed length of the per-token features in each record.
        is_training: If True the dataset is repeated and then shuffled.
        drop_remainder: Forwarded to `map_and_batch`.

    Returns:
        A function taking `params` (a dict containing "batch_size") and returning
        the batched dataset.
    """

    # Four per-token features of length `seq_length`, plus one scalar flag.
    name_to_features = {
        key: tf.FixedLenFeature([seq_length], tf.int64)
        for key in ("input_ids", "input_mask", "segment_ids", "label_ids")
    }
    name_to_features["is_real_example"] = tf.FixedLenFeature([], tf.int64)

    def _decode_record(record, name_to_features):
        """Parses one serialized record and casts its int64 tensors to int32."""
        parsed = tf.parse_single_example(record, name_to_features)

        # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
        for name in list(parsed.keys()):
            tensor = parsed[name]
            parsed[name] = tf.to_int32(tensor) if tensor.dtype == tf.int64 else tensor

        return parsed

    def input_fn(params):
        """Creates the dataset; the batch size is read from `params`."""
        batch_size = params["batch_size"]

        dataset = tf.data.TFRecordDataset(input_file)
        if is_training:
            # Training repeats the data and shuffles it; eval reads it once, in order.
            dataset = dataset.repeat().shuffle(buffer_size=100)

        decode_and_batch = tf.contrib.data.map_and_batch(
            lambda record: _decode_record(record, name_to_features),
            batch_size=batch_size,
            drop_remainder=drop_remainder)
        return dataset.apply(decode_and_batch)

    return input_fn


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

In [20]:
# Compute the number of train and warmup steps from the batch size:
# total steps = examples / batch size * epochs (rounded down), and the first
# WARMUP_PROPORTION of those steps are warmup steps.
num_train_steps = int(len(train_examples) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

In [21]:
# Path of the TFRecord file that will hold the serialized training features.
train_file = os.path.join('./workingBERTNERrisky', "train.tf_record")
# Create the working directory first; without it the open() below fails on a
# fresh checkout where the directory does not exist yet.
os.makedirs(os.path.dirname(train_file), exist_ok=True)
if not os.path.exists(train_file):
    open(train_file, 'w').close()

In [22]:
# Featurize every training example and write the result to the TFRecord file.
file_based_convert_examples_to_features(
            train_examples, MAX_SEQ_LENGTH, tokenizer, train_file)
# Summary of the upcoming training run.
# NOTE(review): these INFO lines do not appear in the recorded output, presumably
# because the logging verbosity is only raised to INFO in the training cell - confirm.
tf.logging.info("***** Running training *****")
tf.logging.info("  Num examples = %d", len(train_examples))
tf.logging.info("  Batch size = %d", BATCH_SIZE)
tf.logging.info("  Num steps = %d", num_train_steps)

43163it [00:43, 992.94it/s]


In [23]:
# Input function for training: reads the training TFRecord file, repeating and
# shuffling it (is_training=True) and dropping any final partial batch.
train_input_fn = file_based_input_fn_builder(
    input_file=train_file,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=True)

## DEFINE MODEL ##

In [24]:
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings, is_predicting=False):
    """Builds BERT with a per-token classification layer on top.

    Args:
        bert_config: Configuration passed to `modeling.BertModel`.
        is_training: Enables dropout (here and inside the BERT model).
        input_ids, input_mask, segment_ids: Input tensors passed to the BERT model.
        labels: Integer label ids, one-hot encoded against `num_labels`.
        num_labels: Number of output classes.
        use_one_hot_embeddings: Passed to `modeling.BertModel`.
        is_predicting: If True, no loss is computed.

    Returns:
        `(predicted_labels, log_probs)` when `is_predicting` is True, otherwise
        `(loss, predicted_labels, log_probs)`.
    """
    bert = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # Every token is classified, so take the full sequence output rather than
    # get_pooled_output(), which would suit example-level tagging.
    token_output = bert.get_sequence_output()
    hidden_size = token_output.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout
            token_output = tf.nn.dropout(token_output, keep_prob=0.9)

        logits = tf.nn.bias_add(
            tf.matmul(token_output, output_weights, transpose_b=True),
            output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
        # NOTE(review): tf.squeeze drops every size-1 dimension, so a batch of a
        # single example presumably loses its batch axis here - confirm.
        predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))
        if is_predicting:
            return (predicted_labels, log_probs)

        # Cross-entropy per position, averaged over all positions. No mask is
        # applied, so padded positions contribute to the loss as well.
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, predicted_labels, log_probs)


def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps, use_tpu,
                     use_one_hot_embeddings):
    """Returns a `model_fn` closure for an Estimator.

    Args:
        bert_config: BERT configuration forwarded to `create_model`.
        num_labels: Number of output classes forwarded to `create_model`.
        init_checkpoint: Checkpoint to initialize matching variables from; skipped
            when falsy.
        learning_rate, num_train_steps, num_warmup_steps: Forwarded to
            `optimization.create_optimizer`.
        use_tpu: Selects the scaffold-based checkpoint initialization and is
            forwarded to the optimizer.
        use_one_hot_embeddings: Forwarded to `create_model`.
    """

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """Builds the graph and the `EstimatorSpec` for TRAIN, EVAL or PREDICT."""

        #tf.logging.info("*** Features ***")
        #for name in sorted(features.keys()):
            #tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

        # Labels arrive inside `features` ("label_ids"); the `labels` argument is unused.
        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]
        is_real_example = None
        
        # is_real_example is computed here but not referenced again in this function.
        if "is_real_example" in features:
             is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
        else:
             is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)
        
        if not is_predicting:   
            # TRAIN / EVAL: build the model (with loss) before the trainable
            # variables are collected below.
            (loss, predicted_labels, log_probs) = create_model(
                bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
                num_labels, use_one_hot_embeddings, is_predicting=False)
            #CALCULATE EVALUATION METRICS     

        # In PREDICT mode the model is only created in the final `else` branch, i.e.
        # after this point, so its variables are not part of `tvars` here.
        # NOTE(review): presumably harmless because the Estimator restores the
        # fine-tuned weights from model_dir when predicting - confirm.
        tvars = tf.trainable_variables()
        initialized_variable_names = {}
        scaffold_fn = None
        if init_checkpoint:
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
            if use_tpu:

                def tpu_scaffold():
                    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        #tf.logging.info("**** Trainable Variables ****")
        # This loop only feeds the logging call that is commented out below; it has
        # no other effect.
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            #tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,init_string)

        # NOTE(review): `scaffold_fn` is a function when use_tpu is True, while
        # tf.estimator.EstimatorSpec presumably expects a Scaffold instance for
        # `scaffold`; this notebook only uses use_tpu=False, where it is None.
        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:

            train_op = optimization.create_optimizer(
                loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)

            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=loss,
                train_op=train_op,
                scaffold=scaffold_fn)
        elif mode == tf.estimator.ModeKeys.EVAL:
            # Calculate evaluation metrics.
            # NOTE(review): the metrics are computed over every position, padding
            # included. tf.metrics.precision/recall and the true/false
            # positive/negative counters treat their inputs as booleans, so with
            # integer class ids they appear to measure "non-zero vs zero" (i.e.
            # non-[PAD] vs [PAD]) rather than per-entity quality - consistent with
            # the recorded precision = recall = 1.0. Confirm before relying on them.
            def metric_fn(label_ids, predicted_labels):
                accuracy = tf.metrics.accuracy(label_ids, predicted_labels)
                recall = tf.metrics.recall(
                    label_ids,
                    predicted_labels)
                precision = tf.metrics.precision(
                    label_ids,
                    predicted_labels) 
                true_pos = tf.metrics.true_positives(
                    label_ids,
                    predicted_labels)
                true_neg = tf.metrics.true_negatives(
                    label_ids,
                    predicted_labels)   
                false_pos = tf.metrics.false_positives(
                    label_ids,
                    predicted_labels)  
                false_neg = tf.metrics.false_negatives(
                    label_ids,
                    predicted_labels)
                return {
                    "eval_accuracy": accuracy,
                    "precision": precision,
                    "recall": recall,
                    "true_positives": true_pos,
                    "true_negatives": true_neg,
                    "false_positives": false_pos,
                    "false_negatives": false_neg
                }

            eval_metrics = metric_fn(label_ids, predicted_labels)
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                loss=loss,
                eval_metric_ops=eval_metrics,
                scaffold=scaffold_fn)
        else:
            # PREDICT: build the model without a loss and expose the per-token log
            # probabilities (under the key "probabilities") and the argmax labels.
            (predicted_labels, log_probs) = create_model(bert_config,
            is_training, input_ids, input_mask, segment_ids, label_ids, num_labels, use_one_hot_embeddings, is_predicting=True
            )       
            output_spec = tf.estimator.EstimatorSpec(
                mode=mode,
                predictions={"probabilities": log_probs, "labels":predicted_labels},
                scaffold=scaffold_fn)
        return output_spec

    return model_fn

In [25]:
# Directory where the estimator writes its checkpoints and summaries.
OUTPUT_DIR = "./workingBERTNERrisky/output"
# Specify output directory and how often (in steps) to save checkpoints and summaries;
# only the most recent checkpoint is kept.
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    keep_checkpoint_max=1,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

In [26]:
# Load the BERT architecture config and build the model_fn around it. use_tpu=False
# makes model_fn initialize from BERT_INIT_CHKPNT directly instead of via a scaffold.
bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)
model_fn = model_fn_builder(
  bert_config=bert_config,
  num_labels= num_labels,
  init_checkpoint=BERT_INIT_CHKPNT,
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps,
  use_tpu=False,
  use_one_hot_embeddings=False)

# The batch size reaches the input functions through `params`.
estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  config=run_config,
  params={"batch_size": BATCH_SIZE})

## PERFORM TRAINING ##

In [27]:
# Train up to num_train_steps and report the wall-clock duration.
# If model_dir already holds a checkpoint at max_steps, training is skipped
# (as in the recorded output below) and the reported time is near zero.
print('Beginning Training!')
current_time = datetime.now()
# Show INFO-level TensorFlow logs from here on.
tf.logging.set_verbosity(tf.logging.INFO)
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("Training took time ", datetime.now() - current_time)

I0726 03:35:12.763883 139806183552768 estimator.py:360] Skipping training since max_steps has already saved.


Beginning Training!
Training took time  0:00:00.155731


## PERFORM EVALUATION ##

In [28]:
# Path of the TFRecord file that will hold the serialized validation features.
eval_file = os.path.join('./workingBERTNERrisky', "eval.tf_record")
# Create the working directory first; without it the open() below fails when the
# directory does not exist yet.
os.makedirs(os.path.dirname(eval_file), exist_ok=True)
if not os.path.exists(eval_file):
    open(eval_file, 'w').close()

# Featurize the held-out sentences and write them to the eval TFRecord file.
eval_examples = create_examples(x_val)
file_based_convert_examples_to_features(
    eval_examples, MAX_SEQ_LENGTH, tokenizer, eval_file)

4796it [00:00, 427970.17it/s]
0it [00:00, ?it/s]I0726 03:35:18.140251 139806183552768 <ipython-input-19-5c09d3e76962>:130] *** Example ***
I0726 03:35:18.141622 139806183552768 <ipython-input-19-5c09d3e76962>:131] guid: 0
I0726 03:35:18.142499 139806183552768 <ipython-input-19-5c09d3e76962>:133] tokens: [CLS] The four nations on Cha ##vez ' s tour include Argentina , Uruguay , Ecuador , and Bolivia . [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [P

In [63]:
# steps=None tells the estimator to run through the entire eval set.
eval_steps = None

# Keep the final partial batch so that every validation sentence is evaluated.
eval_drop_remainder = False
eval_input_fn = file_based_input_fn_builder(
    input_file=eval_file,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=eval_drop_remainder)

result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

I0717 08:28:14.795304 140289763866368 estimator.py:1145] Calling model_fn.
I0717 08:28:18.770100 140289763866368 estimator.py:1147] Done calling model_fn.
I0717 08:28:18.789358 140289763866368 evaluation.py:255] Starting evaluation at 2019-07-17T08:28:18Z
I0717 08:28:19.230360 140289763866368 monitored_session.py:240] Graph was finalized.
I0717 08:28:19.248621 140289763866368 saver.py:1280] Restoring parameters from ./workingBERTNERrisky/output/model.ckpt-1348
I0717 08:28:20.103464 140289763866368 session_manager.py:500] Running local_init_op.
I0717 08:28:20.167800 140289763866368 session_manager.py:502] Done running local_init_op.
I0717 08:29:02.748021 140289763866368 evaluation.py:275] Finished evaluation at 2019-07-17-08:29:02
I0717 08:29:02.749196 140289763866368 estimator.py:2039] Saving dict for global step 1348: eval_accuracy = 0.99503005, false_negatives = 0.0, false_positives = 0.0, global_step = 1348, loss = 0.01687263, precision = 1.0, recall = 1.0, true_negatives = 484293.0

In [64]:
# Log every evaluation metric and store it in a text file, one "name = value"
# line per metric, ordered by metric name.
output_eval_file = os.path.join("./workingBERTNERrisky", "eval_results.txt")
with tf.gfile.GFile(output_eval_file, "w") as writer:
    tf.logging.info("***** Eval results *****")
    for metric_name, metric_value in sorted(result.items()):
        value_text = str(metric_value)
        tf.logging.info("  %s = %s", metric_name, value_text)
        writer.write("%s = %s\n" % (metric_name, value_text))

I0717 08:29:08.858383 140289763866368 <ipython-input-64-3d6fa7ef3418>:3] ***** Eval results *****
I0717 08:29:08.859773 140289763866368 <ipython-input-64-3d6fa7ef3418>:5]   eval_accuracy = 0.99503005
I0717 08:29:08.861100 140289763866368 <ipython-input-64-3d6fa7ef3418>:5]   false_negatives = 0.0
I0717 08:29:08.861933 140289763866368 <ipython-input-64-3d6fa7ef3418>:5]   false_positives = 0.0
I0717 08:29:08.862548 140289763866368 <ipython-input-64-3d6fa7ef3418>:5]   global_step = 1348
I0717 08:29:08.863182 140289763866368 <ipython-input-64-3d6fa7ef3418>:5]   loss = 0.01687263
I0717 08:29:08.863795 140289763866368 <ipython-input-64-3d6fa7ef3418>:5]   precision = 1.0
I0717 08:29:08.864350 140289763866368 <ipython-input-64-3d6fa7ef3418>:5]   recall = 1.0
I0717 08:29:08.864983 140289763866368 <ipython-input-64-3d6fa7ef3418>:5]   true_negatives = 484293.0
I0717 08:29:08.865566 140289763866368 <ipython-input-64-3d6fa7ef3418>:5]   true_positives = 129595.0


## PREPARE PREDICTIONS DATA ##

In [29]:
def convert_examples_to_features(examples,  max_seq_length, tokenizer, labelmap=label_map, pretokenized=False):
    """Run `convert_single_example` over every example and collect the results.

    Args:
        examples: iterable of examples to convert.
        max_seq_length: forwarded to `convert_single_example`.
        tokenizer: forwarded to `convert_single_example`.
        labelmap: accepted for interface compatibility; not used in this function.
        pretokenized: forwarded to `convert_single_example` as a keyword argument.

    Returns:
        A pair of parallel lists `(features, origin_tokens)`, holding the first and
        second element of each `convert_single_example` result respectively.
    """
    # tqdm wraps the enumerate iterator, so it shows a running count without a total.
    converted = [
        convert_single_example(ex_index, example, max_seq_length, tokenizer, pretokenized=pretokenized)
        for (ex_index, example) in tqdm(enumerate(examples))
    ]
    features = [feature for feature, _ in converted]
    origin_tokens = [origin_token for _, origin_token in converted]
    return features, origin_tokens

In [30]:
def input_fn_builder(features, seq_length, is_training, drop_remainder):
  """Build an `input_fn` closure that serves `features` from memory.

  Args:
    features: sequence of objects exposing `input_ids`, `input_mask`,
      `segment_ids` and `label_ids`.
    seq_length: second dimension used when shaping each tensor.
    is_training: when true, the dataset is repeated and shuffled.
    drop_remainder: forwarded to `Dataset.batch`.

  Returns:
    A function taking `params` (reads `params["batch_size"]`) and returning a
    batched `tf.data.Dataset` of int32 tensors.
  """
  # Pull each field out once, up front, so the closure only captures plain lists.
  all_input_ids = [feature.input_ids for feature in features]
  all_input_mask = [feature.input_mask for feature in features]
  all_segment_ids = [feature.segment_ids for feature in features]
  all_label_ids = [feature.label_ids for feature in features]

  def input_fn(params):
    """Create the dataset for one Estimator call."""
    num_examples = len(features)
    matrix_shape = [num_examples, seq_length]

    def as_int32_matrix(rows):
      return tf.constant(rows, shape=matrix_shape, dtype=tf.int32)

    # Everything is materialised as constants in the graph: fine for a demo,
    # but it does NOT scale to large data sets. Dataset.from_generator() is
    # avoided because it relies on tf.py_func, which is not TPU compatible;
    # the scalable approach is TFRecordReader.
    dataset = tf.data.Dataset.from_tensor_slices({
        "input_ids": as_int32_matrix(all_input_ids),
        "input_mask": as_int32_matrix(all_input_mask),
        "segment_ids": as_int32_matrix(all_segment_ids),
        "label_ids": as_int32_matrix(all_label_ids),
    })

    if is_training:
      dataset = dataset.repeat().shuffle(buffer_size=100)

    return dataset.batch(batch_size=params["batch_size"], drop_remainder=drop_remainder)

  return input_fn

In [31]:
# Reverse lookup table: label id -> label string.
inv_map = {label_id: label_text for label_text, label_id in label_map.items()}


def getPredictionDf(dataframe):
    """Predict one label per word for every row of `dataframe`.

    The frame is expected to have a 'Word' column in which each row holds one
    sentence as a list of words, e.g. ["This", "is", "my", "sentence", "."].

    Returns:
        A list with one dict per sentence: {'sentence': <words>, 'labels': <label strings>}.
    """
    labels = LABELS  # kept from the original cell; not used below
    sentences = dataframe['Word'].tolist()
    predict_examples = create_examples(dataframe, False)
    predict_features, predict_tokens = convert_examples_to_features(
        predict_examples, MAX_SEQ_LENGTH, tokenizer)
    predict_input_fn = input_fn_builder(
        features=predict_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)
    predictions = estimator.predict(predict_input_fn)

    def label_sentence(sentence, token_indexes, predicted_ids):
        # One label per word, read at that word's token index; zip stops at the shorter input.
        word_labels = [
            inv_map[predicted_ids[tok_index]]
            for _, tok_index in zip(sentence, token_indexes)
        ]
        return {'sentence': sentence, 'labels': word_labels}

    return [
        label_sentence(sentence, token_indexes, predicto['labels'])
        for sentence, token_indexes, predicto in zip(sentences, predict_tokens, predictions)
    ]

In [101]:
import string
import re
def simpletokenizer(text):
    """Split raw text into word-level tokens, keeping punctuation and newlines as tokens.

    Args:
        text: the document as a single string.

    Returns:
        A list of non-empty string tokens. Every punctuation character except
        '.' becomes its own token, a '.' followed by a space becomes its own
        token, and each newline is kept as a separate '\\n' token.
    """
    # Keep line breaks as their own token so the caller can restore them later.
    text = text.replace('\n', ' \n ')
    # Detach sentence-ending full stops from the preceding word.
    text = text.replace('. ', ' . ')
    # Undo that split for dotted abbreviations: "U.S ." -> "U.S.".
    # Raw string: '\.' and '\s' are invalid escapes in a normal string literal.
    text = re.sub(r'\.([A-Z])\s\.', r'.\1.', text)
    # Surround every other punctuation character with spaces.
    puncnostop = string.punctuation.replace('.', '')
    text = text.translate(str.maketrans({key: " {0} ".format(key) for key in puncnostop}))
    # Split on single spaces only (not generic whitespace) so the '\n' tokens
    # survive, and drop the empty strings produced by runs of spaces. This is
    # what text_to_word_sequence(filters='', lower=False, split=' ') did, without
    # depending on the deprecated Keras preprocessing API.
    return [token for token in text.split(' ') if token]

def grouper(simpletokens,length,dataframe):
    """Append `simpletokens` to `dataframe` as rows of at most `length` tokens each.

    Args:
        simpletokens: sequence of string tokens.
        length: number of tokens taken from `simpletokens` per row.
        dataframe: frame to extend; each new row stores its token list in the
            'Word' column.

    Returns:
        A new DataFrame holding the rows of `dataframe` followed by one row per
        chunk (index reset). `dataframe` itself is returned unchanged when
        `simpletokens` is empty. The input frame is never modified in place.
    """
    rows = []
    for start in range(0, len(simpletokens), length):
        batch = list(simpletokens[start:start + length])
        # Rows shorter than MAX_SEQ_LENGTH are padded up to it.
        batch.extend(['[PAD]'] * (MAX_SEQ_LENGTH - len(batch)))
        rows.append(batch)
    if not rows:
        return dataframe
    # DataFrame.append returned a new frame (and was removed in pandas 2.0), so
    # the previous code silently discarded every row. Build the new rows once
    # and concatenate a single time instead.
    return pd.concat([dataframe, pd.DataFrame({'Word': rows})], ignore_index=True)

def convertDocStringtoTokens(docstring):
    """Turn a whole document into fixed-length BERT input sequences.

    The text is split into word tokens with `simpletokenizer`, each word is
    split into word-pieces with the module-level `tokenizer`, and the pieces are
    packed into sequences of exactly MAX_SEQ_LENGTH entries, each laid out as
    [CLS] ... [SEP] [PAD]...

    Args:
        docstring: a block of text of any length (e.g. an entire document).

    Returns:
        A tuple `(wordtokens, word_to_tok_map, dataframe)`:
        - wordtokens: the word-level tokens, used later to rebuild the text.
        - word_to_tok_map: for each word, the index of its first word-piece in
          the concatenation of all sequences. A word that produces no
          word-pieces (e.g. a newline token) gets the index of the next slot.
        - dataframe: one row per sequence, with columns 'Word' (token list) and
          'Labels' (placeholder label list of the same length).
    """
    seq_len = MAX_SEQ_LENGTH
    # Start a new sequence 10 tokens short of the maximum so that the tokens of
    # a single entity are unlikely to be split across two input sequences.
    soft_limit = seq_len - 10
    # A sequence must always have room for [CLS] and [SEP] around a word.
    max_word_pieces = max(seq_len - 2, 0)

    # Pad punctuation with whitespace, then split on whitespace.
    wordtokens = simpletokenizer(docstring)

    word_to_tok_map = []
    BERTparent = []
    BERTparentlabels = []

    def _close_sequence(tokens, labels):
        # Terminate with [SEP], then pad tokens and labels up to seq_len entries.
        tokens.append('[SEP]')
        labels.append('[SEP]')
        padding = ['[PAD]'] * (seq_len - len(tokens))
        BERTparent.append(tokens + padding)
        BERTparentlabels.append(labels + padding)

    BERTtokens, BERTlabels = ['[CLS]'], ['[CLS]']
    offset = 0  # position of the current sequence's [CLS] in the flattened output
    for word in wordtokens:
        # The tokenizer returns a list even for one word (running -> 'run', '##ing').
        # A single word longer than a whole sequence is truncated so every row
        # keeps exactly seq_len entries.
        sub_words = tokenizer.tokenize(word)[:max_word_pieces]

        has_words = len(BERTtokens) > 1
        past_soft_limit = len(BERTtokens) >= soft_limit
        # len + pieces + [SEP] must not exceed seq_len, otherwise the offsets
        # of every later sequence would be wrong.
        would_overflow = len(BERTtokens) + len(sub_words) > seq_len - 1
        if has_words and (past_soft_limit or would_overflow):
            _close_sequence(BERTtokens, BERTlabels)
            BERTtokens, BERTlabels = ['[CLS]'], ['[CLS]']
            offset += seq_len

        word_to_tok_map.append(len(BERTtokens) + offset)
        if sub_words:
            # First piece carries the word's placeholder label, the rest are 'X'.
            # Words with no pieces add no label, keeping labels aligned with tokens.
            BERTtokens.extend(sub_words)
            BERTlabels.append('[PAD]')
            BERTlabels.extend(['X'] * (len(sub_words) - 1))

    _close_sequence(BERTtokens, BERTlabels)

    dataframe = pd.DataFrame({'Labels': BERTparentlabels})
    dataframe.insert(0, 'Word', BERTparent)
    return wordtokens, word_to_tok_map, dataframe
    
def getPredictionDocString(docstring):
    """Run NER over a whole document and return its entities and HTML markup.

    Args:
        docstring: a block of text of any length (e.g. an entire document).

    Returns:
        A tuple `(entitydict, markup)`:
        - entitydict: maps each entity string (words joined by single spaces)
          to the label predicted for its first word. A repeated entity string
          keeps the label of its last occurrence.
        - markup: the text rebuilt from the word tokens, with each entity
          wrapped in `<span class="<label in lower case>">...</span>`.
    """
    wordtokens, word_to_tok_map, dataframe = convertDocStringtoTokens(docstring)
    predict_examples = create_examples(dataframe, False, True)
    predict_features, predict_tokens = convert_examples_to_features(predict_examples, MAX_SEQ_LENGTH, tokenizer,pretokenized=True)
    predict_input_fn = input_fn_builder(features=predict_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)
    predictions = estimator.predict(predict_input_fn)

    # Rejoin the predictions of every sequence as though they were one document,
    # so the indexes in word_to_tok_map can be used directly.
    allpredictions = [
        inv_map[labelint]
        for predicto in predictions
        for labelint in predicto['labels']
    ]

    special_labels = ('[SEP]', '[PAD]', '[CLS]')
    non_entity_labels = ('O', 'X') + special_labels
    span_close = '</span>'

    entitylist = []
    labellist = []
    markup = ''
    # Stitch labels back onto words (rather than BERT tokens) and rebuild the
    # text, joining words with whitespace and wrapping entities in spans.
    for word, token_index in zip(wordtokens, word_to_tok_map):
        wordlabel = allpredictions[token_index]
        if wordlabel not in non_entity_labels:
            # An I- label only extends an entity when one is actually open, i.e.
            # the markup currently ends with a closing span. Otherwise it is
            # treated as the start of a new entity; previously this case raised
            # IndexError or cut 7 unrelated characters off the markup.
            continues_entity = (
                wordlabel[0] == 'I'
                and bool(entitylist)
                and markup.endswith(span_close)
            )
            if continues_entity:
                entitylist[-1] = entitylist[-1] + ' ' + word
                markup = markup[:-len(span_close)] + ' ' + word + span_close
            elif word == '\n':
                # A newline word has no BERT token of its own, so the label read
                # here belongs to the following token; just emit the line break.
                markup = markup + word
            else:
                entitylist.append(word)
                labellist.append(wordlabel)
                markup = markup + ' <span class="' + wordlabel.lower() + '">' + word + span_close
        elif wordlabel not in special_labels:
            if word in string.punctuation:
                # NOTE(review): the curly quotes below are not part of
                # string.punctuation, so inside this branch only '.' can match;
                # kept as-is to leave the generated markup unchanged.
                if word in ['.','”']:
                    markup = markup + '. '
                elif word == '“':
                    markup = markup + ' "'
                else:
                    markup = markup + word
            else:
                markup = markup + ' ' + word

    entitydict = dict(zip(entitylist, labellist))
    return entitydict, markup

## MAKE A PREDICTION ##

### ... using a document string ###

Note: this assumes one long document. There is a slow spin-up every time `estimator.predict()` is called, because each call rebuilds the graph and restores the model from the checkpoint before running (see the "Restoring parameters from ..." lines in the log output below).

To classify a batch of documents efficiently you must adjust this code to call predict once or infrequently by sending the 128-length sequences for all docs together as a big batch, then later splitting them apart. 

In [102]:
# Sample multi-paragraph news text used to demonstrate getPredictionDocString; paragraphs are separated by '\n'.
testdoc = 'Despite criticism from Democrats that his comments about the four minority congresswomen are racist, Trump went on an extended diatribe about the lawmakers, saying they were welcome to leave the country if they did not like his policies on issues such as immigration and defending Israel.\n“So these Congresswomen, their comments are helping to fuel the rise of a dangerous, militant hard left,” the Republican president said to roars from the crowd in North Carolina, a state seen as key to his re-election.\nTrump tweeted over the weekend that the four progressive representatives, known as “the squad” - Ilhan Omar of Minnesota, Alexandria Ocasio-Cortez of New York, Rashida Tlaib of Michigan and Ayanna Pressley of Massachusetts - should “go back” where they came from, even though all are U.S. citizens and three are U.S.-born.\nThe aim, one source close to Trump said, was to make Democrats look as far left as possible to moderate voters as he girds for a tough re-election battle in November 2020.\n“He is trying to make them the face of the Democratic Party as we move closer into the 2020 cycle and he’s trying to highlight them as a fringe crowd as much as possible so it turns off your middle-of-the-road voters,” the source said.'

In [103]:
# Run the whole-document pipeline on the sample text.
# `entities` is the entity -> label dict and `markup` is the rebuilt text with
# each entity wrapped in a <span> tag, as returned by getPredictionDocString.
entities, markup = getPredictionDocString(testdoc)
print(entities)
print(markup)

3it [00:00, 8366.30it/s]
0it [00:00, ?it/s]I0726 06:18:08.623424 139806183552768 <ipython-input-19-5c09d3e76962>:130] *** Example ***
I0726 06:18:08.624508 139806183552768 <ipython-input-19-5c09d3e76962>:131] guid: 0
I0726 06:18:08.625475 139806183552768 <ipython-input-19-5c09d3e76962>:133] tokens: [CLS] Despite criticism from Democrats that his comments about the four minority congress ##wo ##men are racist , Trump went on an extended di ##at ##ri ##be about the law ##makers , saying they were welcome to leave the country if they did not like his policies on issues such as immigration and defending Israel . “ So these Congress ##wo ##men , their comments are helping to fuel the rise of a dangerous , militant hard left , ” the Republican president said to roar ##s from the crowd in North Carolina , a state seen as key to his re - election . Trump t ##weet ##ed over the weekend that the four progressive representatives , [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
I0726 

length of tokens is  128
['[CLS]', 'Despite', 'criticism', 'from', 'Democrats', 'that', 'his', 'comments', 'about', 'the', 'four', 'minority', 'congress', '##wo', '##men', 'are', 'racist', ',', 'Trump', 'went', 'on', 'an', 'extended', 'di', '##at', '##ri', '##be', 'about', 'the', 'law', '##makers', ',', 'saying', 'they', 'were', 'welcome', 'to', 'leave', 'the', 'country', 'if', 'they', 'did', 'not', 'like', 'his', 'policies', 'on', 'issues', 'such', 'as', 'immigration', 'and', 'defending', 'Israel', '.', '“', 'So', 'these', 'Congress', '##wo', '##men', ',', 'their', 'comments', 'are', 'helping', 'to', 'fuel', 'the', 'rise', 'of', 'a', 'dangerous', ',', 'militant', 'hard', 'left', ',', '”', 'the', 'Republican', 'president', 'said', 'to', 'roar', '##s', 'from', 'the', 'crowd', 'in', 'North', 'Carolina', ',', 'a', 'state', 'seen', 'as', 'key', 'to', 'his', 're', '-', 'election', '.', 'Trump', 't', '##weet', '##ed', 'over', 'the', 'weekend', 'that', 'the', 'four', 'progressive', 'represent

I0726 06:18:11.704499 139806183552768 estimator.py:1147] Done calling model_fn.
I0726 06:18:12.228865 139806183552768 monitored_session.py:240] Graph was finalized.
I0726 06:18:12.232276 139806183552768 saver.py:1280] Restoring parameters from ./workingBERTNERrisky/output/model.ckpt-1348
I0726 06:18:13.430268 139806183552768 session_manager.py:500] Running local_init_op.
I0726 06:18:13.492712 139806183552768 session_manager.py:502] Done running local_init_op.


{'Alexandria Ocasio': 'B-geo', 'November 2020': 'B-tim', 'Michigan': 'B-geo', 'Massachusetts': 'B-geo', 'Democratic Party': 'B-org', 'New York': 'B-geo', 'Minnesota': 'B-geo', 'Israel': 'B-geo', 'Trump': 'B-per', 'Rashida Tlaib': 'B-per', 'Congresswomen': 'B-org', 'North Carolina': 'B-geo', 'weekend': 'B-tim', 'Ilhan Omar': 'B-per', 'Ayanna Pressley': 'B-per', 'U.S.': 'B-geo', '2020': 'B-tim'}
 Despite criticism from Democrats that his comments about the four minority congresswomen are racist, <span class="b-per">Trump</span> went on an extended diatribe about the lawmakers, saying they were welcome to leave the country if they did not like his policies on issues such as immigration and defending <span class="b-geo">Israel</span>.  these <span class="b-org">Congresswomen</span>, their comments are helping to fuel the rise of a dangerous, militant hard left, ” the Republican president said to roars from the crowd in <span class="b-geo">North Carolina</span>, a state seen as key to his r