This notebook explores sequence labeling using the Twitter NER dataset from the [W-NUT 2016 shared task](https://noisy-text.github.io/2016/ner-shared-task.html#resource).

In [1]:
import numpy as np
from keras.layers import Dense, Input, Embedding, TimeDistributed, Layer, Multiply, Concatenate, Dropout, LSTM, Bidirectional
from keras.models import Model, Sequential
from keras import backend as K
import tensorflow as tf
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [2]:
def load_embeddings(filename, max_vocab_size):

    """ Load pre-trained word embeddings, reserving 0 for padding symbol and 1 for UNK """
    
    vocab={}
    embeddings=[]
    with open(filename) as file:
        
        cols=file.readline().split(" ")
        num_words=int(cols[0])
        size=int(cols[1])
        embeddings.append(np.zeros(size))  # 0 = 0 padding if needed
        embeddings.append(np.zeros(size))  # 1 = UNK
        vocab["_0_"]=0
        vocab["_UNK_"]=1
        
        for idx,line in enumerate(file):

            if idx+2 >= max_vocab_size:
                break

            cols=line.rstrip().split(" ")
            val=np.array(cols[1:])
            word=cols[0]
            
            embeddings.append(val)
            vocab[word]=idx+2

    return np.array(embeddings), vocab

In [3]:
def get_word_ids(sentences, word_vocab, label_vocab):
    
    """ Function to convert a list of sentences (where each sentence is a list of (word, tag) tuples)
    into:
    -- a list of padded sequences of word ids
    -- a list of padded sequence of tag ids
    -- a list of sequence lengths (the original token count for each sentence)
    
    Pads each sequence to the maximum sequence length observed in the sentences input
    
    """
    
    words_ids=[]
    sent_lengths=[]
    tags_ids=[]
    
    output_dim=len(label_vocab)+1
    
    max_length=0
    for sentence in sentences:
        if len(sentence) > max_length:
            max_length=len(sentence)
    
    for sentence in sentences:
        wids=[]
        tids=[]
        
        for word, tag in sentence:
            val = word_vocab[word.lower()] if word.lower() in word_vocab else 1
            wids.append(val)
            y = to_categorical(label_vocab[tag], num_classes=output_dim)
            tids.append(y)
        
        
        for i in range(len(wids),max_length):
            wids.append(0)
            tids.append(to_categorical(0, num_classes=output_dim))
            
        words_ids.append(wids)
        tags_ids.append(tids)
        sent_lengths.append(len(sentence))
 
    return np.array(words_ids), np.array(tags_ids), np.array(sent_lengths)

In [4]:
def read_tsv(filename):
    
    """ Read input in two-column TSV, one line per word, with sentences delimited by a blank line """
    
    sentences=[]
    sentence=[]
    with open(filename) as file:
        for line in file:
            cols=line.rstrip().split("\t")
            if len(cols) < 2:
                if len(sentence) > 0:
                    sentences.append(sentence)
                sentence=[]
                continue
                
            word=cols[0]
            tag=cols[1]
            
            sentence.append((word, tag))
            
        if len(sentence) > 0:
            sentences.append(sentence)
            
    return sentences

In [5]:
def get_tag_vocab(sentences):
    tags={}
    # 0 is for masking
    tid=1
    for sentence in sentences:
        for word, tag in sentence:
            if tag not in tags:
                tags[tag]=tid
                tid+=1
    return tags

In [6]:
data=read_tsv("../data/twitter-ner/ner.train.txt")
devData=read_tsv("../data/twitter-ner/ner.dev.txt")

In [7]:
tag_vocab=get_tag_vocab(data)
rev_tags={}
for t in tag_vocab:
    rev_tags[tag_vocab[t]]=t

print(tag_vocab)

{'O': 1, 'B-facility': 2, 'I-facility': 3, 'B-company': 4, 'B-person': 5, 'B-tvshow': 6, 'B-other': 7, 'I-other': 8, 'B-sportsteam': 9, 'I-person': 10, 'B-geo-loc': 11, 'B-movie': 12, 'I-movie': 13, 'I-tvshow': 14, 'B-product': 15, 'I-company': 16, 'B-musicartist': 17, 'I-musicartist': 18, 'I-geo-loc': 19, 'I-product': 20, 'I-sportsteam': 21}


In [15]:
embeddings, word_vocab=load_embeddings("../data/glove.twitter.27B.100d.50K.txt.w2v", 100000)

In [16]:
trainX, trainY, trainS=get_word_ids(data, word_vocab, tag_vocab)
devX, devY, devS=get_word_ids(devData, word_vocab, tag_vocab)

Let's train a bidirectional LSTM for sequence labeling to make predictions about the NER tag for each word in a sentence.  Explore the effect of the lstm size and dropout rate.

In [17]:
def create_bilstm(embeddings, output_dim, lstm_size=25, dropout_rate=0.25):
    
    vocab_size, word_embedding_dim=embeddings.shape

    word_sequence_input = Input(shape=(None,), dtype='int32')
    sentence_lengths = Input(shape=(None,), dtype='int32')

    word_embedding_layer = Embedding(vocab_size,
                                    word_embedding_dim,
                                    weights=[embeddings],
                                    trainable=False, mask_zero=True)

    embedded_sequences = word_embedding_layer(word_sequence_input)
    bi_lstm = Bidirectional(LSTM(lstm_size, return_sequences=True, activation='relu', dropout=dropout_rate), merge_mode='concat')(embedded_sequences)
    preds = TimeDistributed(Dense(output_dim, activation="softmax"))(bi_lstm)

    model = Model(inputs=[word_sequence_input, sentence_lengths], outputs=preds)

    model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=["acc"])

    return model

In [18]:
def train(model, modelName):
    print (model.summary())

    early_stopping = EarlyStopping(monitor='val_loss',
    min_delta=0,
    patience=10,
    verbose=0, 
    mode='auto')

    checkpoint = ModelCheckpoint(modelName, monitor='val_loss', verbose=0, save_best_only=True, mode='min')
    
    model.fit([trainX, trainS], trainY, 
            validation_data=([devX, devS], devY),
            epochs=30, batch_size=32,
            callbacks=[checkpoint, early_stopping])
    

Let's train a model on the data and save the one that performs best on the validation data in `bilstm_sequence_labeling.hdf5`

In [19]:
model=create_bilstm(embeddings, len(tag_vocab)+1)
train(model, "bilstm_sequence_labeling.hdf5")

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, None, 100)         5000200   
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 50)          25200     
_________________________________________________________________
time_distributed_1 (TimeDist (None, None, 22)          1122      
Total params: 5,026,522
Trainable params: 26,322
Non-trainable params: 5,000,200
_________________________________________________________________
None
Instructions for updating:
Use tf.cast instead.
Train on 1900 sampl

We can explore the performance of the model by predicting the NER tags for a new sequence.

In [20]:
model=create_bilstm(embeddings, len(tag_vocab)+1)
model.load_weights("bilstm_sequence_labeling.hdf5")

In [21]:
def predict(text, model, rev_tags):
    text=text.split(" ")
    wids=[]
    for t in text:
        if t.lower() in word_vocab:
            wids.append(word_vocab[t.lower()])
        else:
            wids.append(0)

    wids=np.array(wids)
    lengths=np.array([len(wids)])


    # lengths=np.expand_dims(lengths, 0)
    preds=model.predict([[wids], [lengths]])
    y_classes = preds.argmax(axis=-1)


    predicted=[rev_tags[t] for t in y_classes[0]]
    for w, t in zip(text, predicted):
        print("%s\t%s" % (w,t))

In [22]:
text="Bill Gates is the founder of Microsoft"
predict(text, model, rev_tags)

Bill	B-person
Gates	B-person
is	O
the	O
founder	B-person
of	I-other
Microsoft	B-product


Q1: You'll notice above that the model gets a token-level validation accuracy around 95 simply due to the high presence of the majority class ("O").  That's not a very helpful metric  in this case. Implement F-score for NER.  Remember, the F-score for NER is based on *chunks*; for more, see section 11.3.2 in: of SLP3 [chapter 11](https://web.stanford.edu/~jurafsky/slp3/11.pdf)

In [23]:
def calculateF1(gold_sequences, predicted_sequences):
    
    """ Function to calculate the precision, recall and F-score over labeled chunks in the gold and predicted
    input sequences.  Each input parameter contains a list of label sequences (one label for each word in the
    sentence). In the following example, `gold_sequences` and `predicted_sequences` both contain two sentences
    (the first has 7 words/tags, and the second has 3 words/tags):
    
    gold_sequences=[["B-PER", "I-PER", "O", "O", "O", "O", "B-ORG"], ["O", "O", "O"]]
    predicted_sequences=[["B-PER", "O", "O", "O", "B-PER", "O", "B-ORG"], ["O", "O", "O"]]
    
    Returns tuple of (precision, recall, F-score)
    
    """
    def get_entities(sequences):
        
        ents=[]

        for s_idx in range(len(sequences)):
            
            sent=sequences[s_idx]
            
            start=None
            startCat=None

            for w_idx in range(len(sent)):
                tag=sent[w_idx]
                parts=tag.split("-")
                BIO="O"
                if len(parts) == 2:
                    BIO=parts[0]
                    cat=parts[1]

                if BIO == "B" or BIO == "O":
                    if start != None:
                        end=w_idx-1

                        ents.append((s_idx, start, end, startCat))

                        start=None
                        startCat=None
                        end=None

                if BIO == "B":
                    start=w_idx
                    startCat=cat

            if start != None:
                ents.append((s_idx, start, len(sent)-1, startCat))

        return ents
        
                    
    gold_ents=get_entities(gold_sequences)
    pred_ents=get_entities(predicted_sequences)

    g_set=set(gold_ents)
    p_set=set(pred_ents)
    
    precision=0
    if len(p_set) > 0:
        precision=float(len(g_set.intersection(p_set)))/len(p_set)
    recall=0
    if len(g_set) > 0:
        recall=float(len(g_set.intersection(p_set)))/len(g_set)
    
    F1=0
    if precision + recall > 0:
        F1=2*precision*recall/(precision+recall)

    return precision, recall, F1

In [24]:
# Example from class on 4/4

precision, recall, F1=calculateF1([["B-PER", "I-PER", "O", "O", "O", "O", "B-ORG"], ["O", "O", "O"]], [["B-PER", "O", "O", "O", "B-PER", "O", "B-ORG"], ["O", "O", "O"]])
print("P: %.3f, R: %.3f, F: %.3f" % (precision, recall, F1))

P: 0.333, R: 0.500, F: 0.400


Keras by default calculates metrics like accuracy at the batch level (averaging the metric across batches).  F-score, however, is a metric properly calculated over an entire dataset; we can incorporate that into learning by defining a callback function that prints out the validation F-score at the end of each epoch.  Once you've implemented `calculateF1` above, execute the following cells to see the validation F-score while training.

In [25]:
class F_score(Callback):
    
    def __init__(self, reverse_tag_vocab):
        self.reverse_tag_vocab=reverse_tag_vocab
        
    def on_epoch_end(self, epoch, logs={}):
        
        valX=self.validation_data[0]
        valS=self.validation_data[1]
        valY=self.validation_data[2]
        
        predictions=self.model.predict([valX, valS])
        y_classes = predictions.argmax(axis=-1)
        truth = valY.argmax(axis=-1)

        preds=[]
        golds=[]

        s,w=y_classes.shape
        for i in range(s):
            sent_preds=[]
            sent_golds=[]
            for j in range(int(valS[i])):
                sent_golds.append(self.reverse_tag_vocab[truth[i,j]])
                sent_preds.append(self.reverse_tag_vocab[y_classes[i,j]])
            preds.append(sent_preds)
            golds.append(sent_golds)
        
        precision, recall, F1=calculateF1(golds, preds)
        print("P: %.3f, R: %.3f, F: %.3f" % (precision, recall, F1))
    
        return

In [26]:
def train(model, modelName):
    print (model.summary())

    early_stopping = EarlyStopping(monitor='val_loss',
    min_delta=0,
    patience=10,
    verbose=0, 
    mode='auto')

    f_score=F_score(rev_tags)
    checkpoint = ModelCheckpoint(modelName, monitor='val_loss', verbose=0, save_best_only=True, mode='min')
    
    model.fit([trainX, trainS], trainY, 
            validation_data=([devX, devS], devY),
            epochs=30, batch_size=32,
            callbacks=[f_score, checkpoint, early_stopping])
    

In [27]:
model=create_bilstm(embeddings, len(tag_vocab)+1)
train(model, "bilstm_sequence_labeling.hdf5")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, None)              0         
_________________________________________________________________
embedding_3 (Embedding)      (None, None, 100)         5000200   
_________________________________________________________________
bidirectional_3 (Bidirection (None, None, 50)          25200     
_________________________________________________________________
time_distributed_3 (TimeDist (None, None, 22)          1122      
Total params: 5,026,522
Trainable params: 26,322
Non-trainable params: 5,000,200
_________________________________________________________________
None
Train on 1900 samples, validate on 240 samples
Epoch 1/30
P: 0.000, R: 0.000, F: 0.000
Epoch 2/30
P: 0.000, R: 0.000, F: 0.000
Epoch 3/30
P: 0.000, R: 0.000, F: 0.000
Epoch 4/30
P: 0.100, R: 0.007, F: 0.014
Epoch 5/30
P: 0.343, R: 0.090, F: 0.142
Epoch 6