# Word tagging

Annotate each word of a text with its relevant tag. Example: named entities.



In [1]:
from __future__ import print_function
import tensorflow as tf 
import os 
print(tf.__version__)


0.12.0-rc0


## Dataset

ATIS (Airline Travel Information System) dataset. Available in: https://github.com/mesnilgr/is13/blob/master/data/load.py

### Example:

Input (words)	show	flights	from	Boston	to	New	York	today

Output (labels)	O	O	O	B-dept	O	B-arr	I-arr	B-date




In [3]:
# Read data

import numpy as np 
import pickle

# NOTE(review): absolute, user-specific path; point it at your local copy of atis.pkl.
atis_file = '/Users/jorge/data/training/text/atis/atis.pkl'
# pickle.load can execute arbitrary code: only load this file from a trusted source.
with open(atis_file,'rb') as f:
    # The pickle is unpacked into three objects: the train split, the test split
    # and the lookup dictionaries.
    #train, test, dicts = pickle.load(f, encoding='bytes') #python3
    train, test, dicts = pickle.load(f)


In [4]:
print(dicts.keys())

['labels2idx', 'tables2idx', 'words2idx']


## train / test sets:
    - X: list of input sequences
    - label: List of target labels associated with each word in each sentence.
## Dictionaries
    - labels2idx:  To decode the labels
    - words2idx: To decode the sentences

In [5]:
# Visualize data

#w2idx, _, labels2idx = dicts['words2idx'], dicts['tables2idx'], dicts['labels2idx']
#idx2w  = dict((v,k) for k,v in w2idx.iteritems())
#idx2ne = dict((v,k) for k,v in ne2idx.iteritems())
#idx2la = dict((v,k) for k,v in labels2idx.iteritems())


# Lookup tables stored with the dataset (token -> integer id).
w2idx, ne2idx, labels2idx = dicts[b'words2idx'], dicts[b'tables2idx'], dicts[b'labels2idx']

# Inverse tables (integer id -> token), used to print readable text.
idx2w  = dict((v,k) for k,v in w2idx.items())
idx2la = dict((v,k) for k,v in labels2idx.items())

# Each split is a 3-tuple; only the word ids and the label ids are used here.
train_x, _, train_label = train
test_x,  _,  test_label  = test
wlength = 35  # column width used to right-align the printed tables

# Iterate over explicit (name, sentences, labels) triples rather than building
# variable names as strings and resolving them with eval().
for split_name, split_x, split_label in [('train', train_x, train_label),
                                         ('test', test_x, test_label)]:
    print(split_name)
    # Show the first two sentences of the split, one word/label pair per line.
    for sw, sl in zip(split_x[:2], split_label[:2]):
        print( 'WORD'.rjust(wlength), 'LABEL'.rjust(wlength))
        for wx, la in zip(sw, sl):
            print( idx2w[wx].rjust(wlength), idx2la[la].rjust(wlength))
        print( '\n'+'**'*30+'\n')


train
                               WORD                               LABEL
                                  i                                   O
                               want                                   O
                                 to                                   O
                                fly                                   O
                               from                                   O
                             boston                 B-fromloc.city_name
                                 at                                   O
                    DIGITDIGITDIGIT                  B-depart_time.time
                                 am                  I-depart_time.time
                                and                                   O
                             arrive                                   O
                                 in                                   O
                             denver                   B-to

In [9]:
# Print the words tagged with label id 48 (noted in the original cell as
# b'B-fromloc.city_name') in the first five sentences of each split, to check
# that train and test contain different cities.
label_id = 48

# Explicit (name, sentences, labels) triples instead of eval() on variable names.
for split_name, split_x, split_label in [('train', train_x, train_label),
                                         ('test', test_x, test_label)]:
    print(split_name)
    print('---------')
    for sw, sl in zip(split_x[:5], split_label[:5]):
        for wx, la in zip(sw, sl):
            if la == label_id:
                print( idx2w[wx])
    print('\n')


train
---------
boston
pittsburgh
san
washington
tacoma
pittsburgh


test
---------
charlotte
tacoma
phoenix
phoenix
orlando




## Data transformation
    - Convert the list of sequences of words into an array of words x characteristics.
    - The characteristics are the context of the word in the sentence.
        - For each word in the sentence, generate the context with the previous and the next words in the sentence.
        - For words at the beginning and the end, use padding to complete the context.

In [10]:
# Id used to fill context positions that fall outside the sentence.
# NOTE(review): the vocabulary reported further down has 572 words, so 527 may
# collide with a real word id -- confirm (572 looks like the intended value).
ID_PAD = 527

def context(l, size=3, pad_id=None):
    """Build a fixed-size context window for every element of a sequence.

    The sequence is padded with ``size // 2`` copies of the padding id on each
    side, and one window of ``size`` consecutive ids is taken per element.
    For an odd ``size`` the element sits in the middle of its window; for an
    even ``size`` it sits at position ``size // 2``.

    Args:
        l: iterable of word ids (list, tuple or 1-D array).
        size: number of ids in each window.
        pad_id: id used for padding; defaults to the module-level ID_PAD.

    Returns:
        A list with one window (a list of ``size`` ids) per input element.
        An empty input gives an empty list.
    """
    if pad_id is None:
        # Resolved at call time so that changing ID_PAD later is honoured.
        pad_id = ID_PAD
    words = list(l)
    half = size // 2
    lpadded = half * [pad_id] + words + half * [pad_id]
    return [lpadded[i:(i + size)] for i in range(len(words))]

x = np.array([0, 1, 2, 3, 4], dtype=np.int32)
context(x)

[[527, 0, 1], [0, 1, 2], [1, 2, 3], [2, 3, 4], [3, 4, 527]]

In [11]:
# Flatten the sentences into one row per word: each row is the 10-id context
# window of that word, in sentence order.
X_trn = np.array([window for sentence in train_x
                  for window in context(sentence, size=10)])

X_tst = np.array([window for sentence in test_x
                  for window in context(sentence, size=10)])

print('X trn shape: ', X_trn.shape)
print('X_tst shape: ',X_tst.shape)

X trn shape:  (56590, 10)
X_tst shape:  (9198, 10)


In [12]:
# One target label per word, flattened sentence by sentence.
y_trn = np.array([label for sentence in train_label for label in sentence])
print('y_trn shape: ',y_trn.shape)

y_tst = np.array([label for sentence in test_label for label in sentence])
print('y_tst shape: ',y_tst.shape)

y_trn shape:  (56590,)
y_tst shape:  (9198,)


In [13]:
# Distinct label ids present in the training targets, and size of the word index.
print('Num labels: ',len(np.unique(y_trn)))
print('Num words: ',len(idx2w))

Num labels:  121
Num words:  572


# First model

## Architecture
    - tf.nn.embedding_lookup
    - tf.nn.dynamic_rnn layer
    - Dense layer: tf.nn.relu(tf.matmul(x, W) + b)
    
## Features
    - Dropout
    - Saver
    - Cross entropy with loss regularization
    - Score function

In [14]:
#General parameters
# Directory used by the later cells for the metadata file, checkpoints and TensorBoard files.
LOG_DIR = '/tmp/airline/'

# data attributes
# Width of each context window (second dimension of X_trn).
input_seq_length = X_trn.shape[1]
# Number of distinct word ids plus one extra embedding row.
# NOTE(review): the extra row is presumably reserved for the padding id -- confirm.
input_vocabulary_size = len(set(idx2w)) + 1
# Number of output classes of the dense layer.
# NOTE(review): hard-coded; the training targets contain 121 distinct labels, so
# this presumably matches the size of labels2idx -- confirm.
output_length = 127

#Model parameters
# Dimension of each word embedding vector.
embedding_size=64
# Number of units of the LSTM cell.
num_hidden_lstm = 128


In [23]:
# Save words and labels for embedding visualization
# The embedding projector pairs line i of the metadata file with row i of the
# embedding matrix, so write exactly one line per row, in id order (iterating
# the dict directly gives no ordering guarantee and misses the extra row).
if not os.path.isdir(LOG_DIR):
    os.makedirs(LOG_DIR)  # open() below fails when the directory is missing

with open( os.path.join(LOG_DIR, 'records.tsv'), "w") as record_file:
    for word_id in range(input_vocabulary_size):
        if word_id in idx2w:
            record_file.write(idx2w[word_id].decode('ascii')+'\n')
        else:
            # Embedding row without a vocabulary entry (e.g. the extra padding row).
            record_file.write('<PAD>\n')



In [17]:
from tensorflow.contrib.tensorboard.plugins import projector

# Define the tensorflow graph
# Mini-batch size; also the default batch size of batch_generator below.
batch_size = 256

# Build the whole model inside its own graph so that sessions can be opened on it explicitly.
graph = tf.Graph()
with graph.as_default():
    # graph definition
    # Inputs
    with tf.name_scope('Inputs') as scope:
        # x: one context window of word ids per row; y: one label id per row.
        x = tf.placeholder(tf.int32, shape=[None, input_seq_length], name='x')
        y = tf.placeholder(tf.int64, shape=[None], name='y')
        # Dropout keep probability, fed at run time (0.5 for training, 1 for evaluation in the cells below).
        keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    with tf.name_scope('Embeddings') as scope:
        # Trainable embedding matrix, one row per word id, initialised uniformly in [-1, 1).
        W_embedding = tf.Variable(tf.random_uniform([input_vocabulary_size, embedding_size], -1.0, 1.0) ,name="W")
        # Replaces every word id by its embedding vector; printed shape is (?, 10, 64).
        embedding_layer = tf.nn.embedding_lookup(W_embedding, x)
        print('embedding_layer: ', embedding_layer)

        ## VISUALIZE EMBEDDINGS
        # Use the same LOG_DIR where you stored your checkpoint.
        summary_writer = tf.train.SummaryWriter(LOG_DIR)

        # Format: tensorflow/contrib/tensorboard/plugins/projector/projector_config.proto
        config = projector.ProjectorConfig()

        # You can add multiple embeddings. Here we add only one.
        embedding = config.embeddings.add()
        embedding.tensor_name = W_embedding.name
        # Link this tensor to its metadata file (e.g. labels).
        embedding.metadata_path = os.path.join(LOG_DIR, 'records.tsv')

        # Saves a configuration file that TensorBoard will read during startup.
        projector.visualize_embeddings(summary_writer, config)

        
        
    
    with tf.name_scope('RNN') as scope:
        # Single LSTM cell with a fixed-seed uniform initializer, wrapped so that
        # dropout (keep probability keep_prob) is applied to its outputs.
        cell_1 = tf.nn.rnn_cell.LSTMCell(num_hidden_lstm, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=123))
        cell_1 = tf.nn.rnn_cell.DropoutWrapper(cell_1, output_keep_prob=keep_prob)
        # Run the cell over the window; printed shape of lstm_outputs is (?, 10, 128).
        lstm_outputs, lstm_state = tf.nn.dynamic_rnn(cell_1, embedding_layer, dtype=tf.float32, scope='rnn1')
        print('lstm_outputs: ', lstm_outputs)
 

    #Dense layer from RNN outputs to prediction
    with tf.name_scope('Dense') as scope:
        W_dense = tf.Variable(tf.truncated_normal([num_hidden_lstm, output_length], stddev=0.1), name='W_dense')
        b_dense = tf.Variable(tf.constant(0.1, shape=[output_length]), name='b_dense')
        # Only the output of the last time step of the window feeds the dense layer.
        # NOTE(review): the ReLU output is used below as the logits of the softmax and
        # of the cross entropy, so the logits can never be negative -- confirm this is intended.
        dense_output = tf.nn.relu(tf.matmul(lstm_outputs[:,-1,:], W_dense) + b_dense)
        print('dense_output: ', dense_output)

        
    #Prediction
    # Class probabilities; this is the tensor evaluated when scoring new text.
    y_pred = tf.nn.softmax(dense_output, name='y_pred')

    # Loss function
    with tf.name_scope("xent") as scope:
        # NOTE(review): the loss is not reduced (no reduce_mean/reduce_sum) before it is
        # handed to the optimizer and to the training loop -- confirm this is intended.
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(dense_output, y, name='cross_entropy')
        # The string literal below is disabled code (a no-op expression). It refers to
        # a name `W` that is not defined in this cell, so it would fail if re-enabled as is.
        '''
        # Regularize the loss
        l2_loss = 0.001 * (tf.nn.l2_loss(W) + tf.nn.l2_loss(W_dense) + tf.nn.l2_loss(b_dense)) 
        cross_entropy = tf.add(cross_entropy, l2_loss, name='loss')        
        '''        
        # Summary op; it is not evaluated by the training loop shown below.
        ce_summ = tf.histogram_summary("cross entropy", cross_entropy) #TENSORBOARD


    #Optimizer
    with tf.name_scope("train") as scope:
        # The learning rate is a placeholder so it can be set per training step.
        learning_rate = tf.placeholder(tf.float32, name='learning_rate')
        optimizer = tf.train.AdamOptimizer(learning_rate)
        train_op = optimizer.minimize(cross_entropy, name='train_op')


    #Accuracy
    with tf.name_scope("test") as scope:
        # Fraction of rows whose highest-scoring class equals the target label.
        correct_prediction = tf.equal(tf.argmax(dense_output,1), y)
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name='accuracy')
        # Summary op; it is not evaluated by the training loop shown below.
        accuracy_summary = tf.scalar_summary("accuracy", accuracy) #TENSORBOARD

        
    # Create a saver to save and restore the weights.
    saver = tf.train.Saver()


embedding_layer:  Tensor("Embeddings/embedding_lookup:0", shape=(?, 10, 64), dtype=float32)
lstm_outputs:  Tensor("RNN/rnn1/transpose:0", shape=(?, 10, 128), dtype=float32)
dense_output:  Tensor("Dense/Relu:0", shape=(?, 127), dtype=float32)


In [18]:
#batch generator
def batch_generator(x=X_trn, y=y_trn, batch_size=batch_size, seed=0):
    """Yield shuffled mini-batches of (x, y) with exactly ``batch_size`` rows.

    Args:
        x: 2-D array of inputs, one row per example.
        y: 1-D array of targets aligned with the rows of ``x``.
        batch_size: number of rows in every yielded batch.
        seed: random state passed to the shuffle. The default (0) reproduces
            the same order on every call; pass a different value (e.g. the
            epoch number) to get a different order.

    Yields:
        Tuples ``(x_batch, y_batch)``. Trailing rows that do not fill a
        complete batch are dropped, so nothing is yielded when there are
        fewer than ``batch_size`` rows.
    """
    from sklearn.utils import shuffle
    x_shuffle, y_shuffle = shuffle(x, y, random_state=seed)
    # The "+ 1" keeps the last full batch when the number of rows is an exact
    # multiple of batch_size (the previous bound silently dropped it).
    for i in range(0, x.shape[0] - batch_size + 1, batch_size):
        x_batch = x_shuffle[i:i+batch_size,:]
        y_batch = y_shuffle[i:i+batch_size]
        yield x_batch, y_batch
    
seq = batch_generator(x=X_trn, y=y_trn, batch_size=20)
print(next(seq))

(array([[554, 241, 481, 165, 193, 197, 208, 379, 502,  64],
       [193, 514, 208,  77, 502, 137, 359, 544,  40, 481],
       [232, 331, 237, 358,  13, 193, 208,  77, 502, 137],
       [ 32, 194,  40, 183, 208, 137, 502, 415, 205, 527],
       [232, 331,  13, 277, 353, 194, 208, 452, 375, 195],
       [527, 193, 348, 208, 313, 502, 282,  71, 358, 249],
       [193, 208, 128, 502, 415, 205, 527, 527, 527, 527],
       [358, 481, 174, 353,  65, 524, 435, 527, 527, 527],
       [208, 481,  29, 234, 379, 502, 159, 527, 527, 527],
       [527, 527, 527, 439, 301, 481, 194, 208, 415, 205],
       [481, 265, 193, 208,  64, 502, 137, 358, 248, 435],
       [157,  37,  26, 221, 561,  13, 105, 353, 430, 111],
       [534, 358, 481, 190, 105,  37,  26, 193, 208, 376],
       [527, 527, 527, 383, 276, 530, 194,  73,  77,  40],
       [527, 527, 527, 527, 554, 194,  50, 389,  86,  37],
       [481, 193, 501, 481, 321, 358, 530,  26, 200, 426],
       [527, 527, 527, 527,  13, 190, 105, 193, 358,  3

In [19]:
# Execute the graph to train a network
nEpochs = 100

with tf.Session(graph=graph) as session:
    print('Initializing')
    print('Epoch - Loss(trn) -  Acc(trn)   -   Loss(tst) -   Acc(tst)')
    # tf.initialize_all_variables() is deprecated (TensorFlow logs a warning for
    # it); tf.global_variables_initializer() is the replacement it recommends.
    session.run(tf.global_variables_initializer())
    for epoch in range(nEpochs):
        # Per-batch losses and accuracies collected during this epoch.
        ce_c=[]
        acc_c=[]
        ce_c_tst=[]
        acc_c_tst=[]

        # Training pass: dropout active (keep_prob 0.5), weights updated by train_op.
        batch_list = batch_generator(x=X_trn, y=y_trn, batch_size=batch_size)
        for i, batch in enumerate(batch_list):
            feedDict = {x: batch[0], y: batch[1], keep_prob: 0.5, learning_rate: 0.001} # dictionary of batch data to run the graph
            _, ce, acc = session.run([train_op, cross_entropy, accuracy], feed_dict=feedDict)
            ce_c += [ce]
            acc_c += [acc]

        # Evaluation pass: dropout disabled (keep_prob 1) and no train_op, so
        # the weights are not modified.
        batch_list_tst = batch_generator(x=X_tst, y=y_tst, batch_size=batch_size)
        for x_batch, y_batch in batch_list_tst:
            feedDict = {x: x_batch, y: y_batch, keep_prob: 1} # dictionary of batch data to run the graph
            ce_tst, acc_tst = session.run([cross_entropy, accuracy], feed_dict=feedDict)
            ce_c_tst += [ce_tst]
            acc_c_tst += [acc_tst]

        # One checkpoint per epoch, written as model.ckpt-<epoch>.
        saver.save(session, os.path.join(LOG_DIR, "model.ckpt"), epoch)

        print(epoch, np.mean(ce_c), np.mean(acc_c), np.mean(ce_c_tst), np.mean(acc_c_tst), sep='   -   ')

Initializing
Epoch - Loss(trn) -  Acc(trn)   -   Loss(tst) -   Acc(tst)
Instructions for updating:
Use `tf.global_variables_initializer` instead.
INFO:tensorflow:/tmp/airline/model.ckpt-0 is not in all_model_checkpoint_paths. Manually adding it.
0   -   1.91106   -   0.626361   -   1.53324   -   0.632254
INFO:tensorflow:/tmp/airline/model.ckpt-1 is not in all_model_checkpoint_paths. Manually adding it.
1   -   1.21519   -   0.719793   -   1.06113   -   0.763951
INFO:tensorflow:/tmp/airline/model.ckpt-2 is not in all_model_checkpoint_paths. Manually adding it.
2   -   0.813426   -   0.815522   -   0.709563   -   0.854241
INFO:tensorflow:/tmp/airline/model.ckpt-3 is not in all_model_checkpoint_paths. Manually adding it.
3   -   0.552627   -   0.87873   -   0.540679   -   0.89375
INFO:tensorflow:/tmp/airline/model.ckpt-4 is not in all_model_checkpoint_paths. Manually adding it.
4   -   0.412718   -   0.915583   -   0.433619   -   0.914174
INFO:tensorflow:/tmp/airline/model.ckpt-5 is not i

In [22]:
# Predict. Score new paragraph 

#inv_map = {v: k for k, v in my_map.iteritems()} #python2
# Word -> id table rebuilt from idx2w with text keys, so that plain strings can be looked up.
w2idx = {v.decode('ascii'): k for k, v in idx2w.items()} #python3

def score_paragraph(paragraph, checkpoint_path=None):
    """Predict one label per word of a whitespace-separated paragraph.

    Args:
        paragraph: text to tag; it is split on whitespace and every word must
            be a key of w2idx.
        checkpoint_path: checkpoint to restore. Defaults to the checkpoint
            written by the last of the 100 training epochs
            (``model.ckpt-99`` inside LOG_DIR).

    Returns:
        A list with the predicted label of each word, in order. An empty
        paragraph gives an empty list.

    Raises:
        KeyError: if any word is not in the vocabulary.
    """
    if checkpoint_path is None:
        checkpoint_path = os.path.join(LOG_DIR, "model.ckpt-99")

    #Preprocess data
    p_w = paragraph.split()
    if not p_w:
        # Nothing to score; an empty array would not match the input placeholder.
        return []
    unknown = [w for w in p_w if w not in w2idx]
    if unknown:
        raise KeyError('Words not in the vocabulary: ' + ', '.join(unknown))
    p_w_c = [w2idx[w] for w in  p_w]
    # The window size must match the width the input placeholder was built with.
    x_score = np.array(context(p_w_c, size=input_seq_length))

    with tf.Session(graph=graph) as session:

        saver.restore(session, checkpoint_path)
        feedDict = {x: x_score, keep_prob: 1} # dictionary of batch data to run the graph
        pred_score = session.run(y_pred, feed_dict=feedDict)

    # Most probable class of every word, mapped back to its label.
    response = [idx2la[l] for l in np.argmax(pred_score,axis=1)]
    return response


paragraph = 'i need a business ticket in any flight with departure from alaska to las vegas monday with breakfast'
response = score_paragraph(paragraph)
for wx, la in zip(paragraph.split(), response): print( wx.rjust(wlength), la.rjust(wlength))


                                  i                                   O
                               need                                   O
                                  a                                   O
                           business                                   O
                             ticket                                   O
                                 in                                   O
                                any                                   O
                             flight                                   O
                               with                                   O
                          departure                                   O
                               from                                   O
                             alaska                 B-fromloc.city_name
                                 to                                   O
                                las                   B-toloc.ci