In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import csv
import sys
import nltk

### Data Preparation

Data is from here https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/data, download ner_dataset.csv from the ZIP archive.



In [21]:
validation_sentence = 'While speaking on Channels Television on Thursday April 5 2018 Adesina said the fund is not just to intensify the military fight against Boko Haram but to fight other forms of insecurity in the country'

validation_tags = ['O', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'B-TIM', 'I-TIM', 'I-TIM', 'I-TIM', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
                  'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

Below we parse the file to load sentences and tags into different lists. Also, we only want sentences not more than 35 words long (same length as our validation sentence above)

In [22]:
sentences = []
tags = []
max_length = len(validation_tags)

with open('data/ner_dataset.csv', 'rb') as csvfile:
    ner_data = csv.reader(csvfile, delimiter=',')
    sentence = []
    tag = []
    for row in ner_data:
        
        sentence.append(row[1])
        tag.append(row[3].upper())
        
        if row[1] == '.':
            if len(sentence) <= max_length:
                sentences.append(sentence)
                tags.append(tag)
            sentence = []
            tag = []

Below is sample entries of `sentences` and `tags`

In [23]:
print sentences[:2]
print
print tags[:2]

[['Word', 'Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.'], ['Families', 'of', 'soldiers', 'killed', 'in', 'the', 'conflict', 'joined', 'the', 'protesters', 'who', 'carried', 'banners', 'with', 'such', 'slogans', 'as', '"', 'Bush', 'Number', 'One', 'Terrorist', '"', 'and', '"', 'Stop', 'the', 'Bombings', '.']]

[['TAG', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GEO', 'O', 'O', 'O', 'O', 'O', 'B-GEO', 'O', 'O', 'O', 'O', 'O', 'B-GPE', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]


We'll need to create a vocabulary from our sentences i.e a set of unique words. We'll do same for the tags too

In [24]:
unique_tags = list(set(t for tagset in tags for t in tagset))
vocabulary = list(set(word for sentence in sentences for word in sentence))

In [25]:
print unique_tags

['I-ART', 'I-EVE', 'B-EVE', 'B-GPE', 'B-TIM', 'I-TIM', 'B-ORG', 'B-ART', 'O', 'B-GEO', 'I-GPE', 'TAG', 'I-GEO', 'B-PER', 'I-PER', 'I-ORG', 'I-NAT', 'B-NAT']


In [26]:
print vocabulary[:10]
print 'Number of words in vocabulary', len(vocabulary)

['heavily-fortified', 'mid-week', '1,800', 'Pronk', 'woods', 'Safarova', 'Nampo', 'hanging', 'trawling', 'five-nation']
Number of words in vocabulary 33105


In [27]:
train_sentences = sentences[:int(.7 * len(sentences))]
train_tags = tags[:int(.7 * len(tags))]

test_sentences = sentences[int(.7 * len(tags) + 1):]
test_tags = tags[int(.7 * len(tags) + 1):]

In [28]:
len(train_sentences), len(test_sentences), len(sentences)

(31732, 13599, 45332)

### Model Architecture
Simple LSTM network with a softmax at the end

In [42]:
# Parameters
learning_rate = 0.001
batch_size = 32
target_size = len(unique_tags)
display_size = 50

# Network Parameters
n_features = len(vocabulary)
sequence_length = 10
n_hidden = 128 # hidden layer num of features

tf.reset_default_graph()

# tf Graph input
X = tf.placeholder('float', [None, max_length, n_features], name='X')
Y = tf.placeholder('float', [None, max_length, target_size], name='Y')

# Define weights
weights = {
    'out': tf.Variable(tf.random_normal([n_hidden, target_size]))
}
biases = {
    'out': tf.Variable(tf.random_normal([target_size]))
}

In [43]:

cell = tf.contrib.rnn.LSTMCell(n_hidden, state_is_tuple=True)

val, state = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

val = tf.transpose(val, [1, 0, 2])
last = tf.gather(val, int(val.get_shape()[0]) - 1)


prediction = tf.nn.softmax(tf.matmul(last, weights['out']) + biases['out'])
prediction = tf.reshape(prediction, [batch_size, -1, target_size])
cross_entropy = tf.reduce_mean(-tf.reduce_sum(Y * tf.log(prediction), [1, 2]))

# prediction = tf.matmul(last, weights['out']) + biases['out']
# cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=Y))

minimize = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cross_entropy)

mistakes = tf.not_equal(tf.argmax(Y, 1), tf.argmax(prediction, 1))
error = tf.reduce_mean(tf.cast(mistakes, tf.float32))


In [44]:
init = tf.global_variables_initializer()
num_batches = int(len(train_sentences)) / batch_size
epoch = 1
print 'Number of batches:', num_batches

Number of batches: 469


In [45]:
len(train_sentences)

15024

### Run graph using one-hot encoding of words

In [None]:
with tf.Session() as sess:
    sess.run(init)
    for i in range(epoch):        
        
        for j in range(num_batches):
            ptr = 0
            batch_X = []
            batch_Y = []
            for _ in range(batch_size):
                x, y = (train_sentences[ptr: ptr + 1], 
                        train_tags[ptr: ptr + 1])            

                x_one_hot = []

                for s in x[0]:
                    x_one_hot.append(np.eye(len(vocabulary))[vocabulary.index(s)])
                    
                for remainder in range(max_length - len(x_one_hot)):
                    x_one_hot.append([0]*len(vocabulary))
                    
                batch_X.append(x_one_hot)              

                y_one_hot = []

                for t in y[0]:
                    y_one_hot.append(np.eye(target_size)[unique_tags.index(t)])
                    
                for remainder in range(max_length - len(y_one_hot)):
                    y_one_hot.append(np.eye(target_size)[unique_tags.index('O')])
                    
                batch_Y.append(y_one_hot)

                ptr += 1
            
            _, entropy, preds = sess.run([minimize, cross_entropy, prediction],{X: np.array(batch_X).reshape(batch_size, max_length, len(vocabulary)), Y: np.array(batch_Y).reshape(batch_size, max_length, target_size)})
            
            if j % display_size == 0:
                print 'Loss at batch {0}'.format(j), entropy
            
#             valid_x_one_hot = []
            
#             for v in validation_sentence.split(' ')[:10]:
#                 try:
#                     valid_x_one_hot.append(np.eye(len(vocabulary))[vocabulary.index(v)])
#                 except:
#                     valid_x_one_hot.append([0]*len(vocabulary))

#             valid_y_one_hot = []

#             for vt in validation_tags[:10]:
#                 valid_y_one_hot.append(np.eye(target_size)[unique_tags.index(vt)])
            
#             preds = sess.run([prediction],{X: np.array(valid_x_one_hot).reshape(1, 10, len(vocabulary)), Y: np.array(valid_y_one_hot).reshape(1, 10, 18)})
#             print preds

        print "Epoch ",str(i)
    

Loss at batch 0 87.09978
Loss at batch 50 9.664997


### Word Embeddings
We'll use Google's word2vec which you can grab from here https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit.
To load the word embeddings, we'll neeed another tool, `gensim`.

In [10]:
from gensim.models import word2vec, KeyedVectors

Load the word vectors like so. This operations takes a good while on my laptop; core i5.

In [11]:
w2v = KeyedVectors.load_word2vec_format('/Users/h/Projects/Machine-Learning/GoogleNews-vectors-negative300.bin.gz', binary=True)

Below is what `boy` is represented according to the embedding

In [12]:
w2v.word_vec('boy')

array([ 2.35351562e-01,  1.65039062e-01,  9.32617188e-02, -1.28906250e-01,
        1.59912109e-02,  3.61328125e-02, -1.16699219e-01, -7.32421875e-02,
        1.38671875e-01,  1.15356445e-02,  1.87500000e-01, -2.91015625e-01,
        1.70898438e-02, -1.84570312e-01, -2.87109375e-01,  2.54821777e-03,
       -2.19726562e-01,  1.77734375e-01, -1.20605469e-01,  5.39550781e-02,
        3.78417969e-02,  2.49023438e-01,  1.76757812e-01,  2.69775391e-02,
        1.21093750e-01, -3.51562500e-01, -5.83496094e-02,  1.22070312e-01,
        5.97656250e-01, -1.60156250e-01,  1.08398438e-01, -2.40478516e-02,
       -1.16699219e-01,  3.58886719e-02, -2.37304688e-01,  1.15234375e-01,
        5.27343750e-01, -2.18750000e-01, -4.54101562e-02,  3.30078125e-01,
        3.75976562e-02, -5.51757812e-02,  3.26171875e-01,  6.74438477e-03,
        3.71093750e-01,  3.68652344e-02,  6.68945312e-02,  5.17578125e-02,
       -4.76074219e-02, -7.91015625e-02,  4.46777344e-02,  1.67968750e-01,
        5.51757812e-02, -

### Run graph with words represented as word2vec
Same as architecture as pervious except `n_features` is now the dimension of the vector returned by word2vec

In [29]:
# Parameters
learning_rate = 0.001
batch_size = 32
target_size = len(unique_tags)
display_size = 50

# Network Parameters
n_features = 300 # dimension of the vector return by word2vec
sequence_length = max_length
n_hidden = 128 # hidden layer num of features

tf.reset_default_graph()

# tf Graph input
X = tf.placeholder('float', [None, max_length, n_features], name='X')
Y = tf.placeholder('float', [None, max_length, target_size], name='Y')

# Define weights
weights = {
    'out': tf.Variable(tf.random_normal([n_hidden, target_size]))
}
biases = {
    'out': tf.Variable(tf.random_normal([target_size]))
}

In [30]:

cell = tf.contrib.rnn.LSTMCell(n_hidden, state_is_tuple=True)

val, state = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

val = tf.transpose(val, [1, 0, 2])
last = tf.gather(val, int(val.get_shape()[0]) - 1)


prediction = tf.nn.softmax(tf.matmul(last, weights['out']) + biases['out'])
prediction = tf.reshape(prediction, [batch_size, -1, target_size])
cross_entropy = tf.reduce_mean(-tf.reduce_sum(Y * tf.log(prediction), [1, 2]))

# prediction = tf.matmul(last, weights['out']) + biases['out']
# cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=Y))

minimize = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cross_entropy)

mistakes = tf.not_equal(tf.argmax(Y, 1), tf.argmax(prediction, 1))
error = tf.reduce_mean(tf.cast(mistakes, tf.float32))


In [31]:
init = tf.global_variables_initializer()
num_batches = int(len(train_sentences)) / batch_size
epoch = 1
print 'Number of batches:', num_batches

Number of batches: 991


In [32]:
with tf.Session() as sess:
    sess.run(init)
    for i in range(epoch):        
        
        for j in range(num_batches):
            ptr = 0
            batch_X = []
            batch_Y = []
            for _ in range(batch_size):
                x, y = (train_sentences[ptr: ptr + 1], 
                        train_tags[ptr: ptr + 1])            

                x_one_hot = []

                for s in x[0]:
                    try:
                        x_one_hot.append(w2v.word_vec(s))
                    except:
                        #if word isn't in the word2vec, use zeroes
                        x_one_hot.append([0]*n_features)
                    
                for remainder in range(max_length - len(x_one_hot)):
                    #pad sentence remainder with zeroes
                    x_one_hot.append([0]*n_features)
                    
                batch_X.append(x_one_hot)              

                y_one_hot = []

                for t in y[0]:
                    y_one_hot.append(np.eye(target_size)[unique_tags.index(t)])
                    
                for remainder in range(max_length - len(y_one_hot)):
                    y_one_hot.append(np.eye(target_size)[unique_tags.index('O')])
                    
                batch_Y.append(y_one_hot)

                ptr += 1
            
            _, entropy, preds = sess.run([minimize, cross_entropy, prediction],{X: np.array(batch_X).reshape(batch_size, max_length, n_features), Y: np.array(batch_Y).reshape(batch_size, max_length, target_size)})
            
            if j % display_size == 0:
                print 'Loss at batch {0}'.format(j), entropy

        print "Epoch ",str(i)
    

Loss at batch 0 101.00055
Loss at batch 50 15.769457
Loss at batch 100 13.819975
Loss at batch 150 13.336898
Loss at batch 200 12.802603
Loss at batch 250 12.793713
Loss at batch 300 12.767519
Loss at batch 350 12.957819
Loss at batch 400 12.76519
Loss at batch 450 12.762266
Loss at batch 500 12.815669
Loss at batch 550 12.763356
Loss at batch 600 12.761274
Loss at batch 650 12.760481
Loss at batch 700 12.777012
Loss at batch 750 12.76202
Loss at batch 800 12.76059
Loss at batch 850 12.760016
Loss at batch 900 12.79056
Loss at batch 950 12.762553
Epoch  0


Obvious benefit of using word2vec is that the network runs faster, converges quicker too. Runs faster because we've reduced the feature representation from an outrageous dimension in the length of the vocabulary (thousands) to only 300, the dimension of the array returned by word2vec.

### Prediction

In [43]:
with tf.Session() as sess:
    sess.run(init)

    valid_X = []
    
    for word in validation_sentence.split(' '):
        try:
            valid_X.append(w2v.word_vec(word))
        except:
            #if word isn't in the word2vec, use zeroes
            valid_X.append([0]*n_features)

    for remainder in range(max_length - len(valid_X)):
        #pad sentence remainder with zeroes
        valid_X.append([0]*n_features)           

    valid_Y = []

    for t in validation_tags:
        valid_Y.append(np.eye(target_size)[unique_tags.index(t)])

    for remainder in range(max_length - len(valid_Y)):
        valid_Y.append(np.eye(target_size)[unique_tags.index('O')])
            
    preds = sess.run([prediction],{X: np.array(valid_X).reshape(1, max_length, n_features), Y: np.array(valid_Y).reshape(1, max_length, target_size)})
    print preds

InvalidArgumentError: Input to reshape is a tensor with 18 values, but the requested shape requires a multiple of 576
	 [[Node: Reshape = Reshape[T=DT_FLOAT, Tshape=DT_INT32, _device="/job:localhost/replica:0/task:0/cpu:0"](Softmax, Reshape/shape)]]

Caused by op u'Reshape', defined at:
  File "/anaconda2/lib/python2.7/runpy.py", line 174, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/anaconda2/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/anaconda2/lib/python2.7/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/anaconda2/lib/python2.7/site-packages/ipykernel/kernelapp.py", line 478, in start
    self.io_loop.start()
  File "/anaconda2/lib/python2.7/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/anaconda2/lib/python2.7/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/anaconda2/lib/python2.7/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/anaconda2/lib/python2.7/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/anaconda2/lib/python2.7/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/anaconda2/lib/python2.7/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2718, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2822, in run_ast_nodes
    if self.run_code(code, result):
  File "/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-30-6f58bb698d38>", line 11, in <module>
    prediction = tf.reshape(prediction, [batch_size, -1, target_size])
  File "/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/gen_array_ops.py", line 2510, in reshape
    name=name)
  File "/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 768, in apply_op
    op_def=op_def)
  File "/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2336, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1228, in __init__
    self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): Input to reshape is a tensor with 18 values, but the requested shape requires a multiple of 576
	 [[Node: Reshape = Reshape[T=DT_FLOAT, Tshape=DT_INT32, _device="/job:localhost/replica:0/task:0/cpu:0"](Softmax, Reshape/shape)]]


### Things to try
- Use a bidirectional LSTM
- Add dropout
- Replace softmax with Linear-Chain CRF
- Try other word representations; Glove?
- Tune batch size, learning rate
- Add MOAR layers!!!