# Word tagging

Annotate each word of a text with its relevant tag. Example: named entities.



In [1]:
from __future__ import print_function

import os

import numpy as np
import tensorflow as tf
print(tf.__version__)


0.12.0


## Dataset transformation

In [None]:
# Load the pickled ATIS corpus: (train split, test split, lookup dictionaries).
import pickle

atis_file = '/home/ubuntu/data/training/text/atis/atis.pkl'
# NOTE(review): unpickling can execute arbitrary code; only open trusted files.
with open(atis_file, 'rb') as f:
    atis_splits = pickle.load(f)
train, test, dicts = atis_splits

# Lookup tables stored next to the data (keys are bytes in this pickle).
w2idx = dicts[b'words2idx']
ne2idx = dicts[b'tables2idx']
labels2idx = dicts[b'labels2idx']

# Inverse tables: id -> word and id -> label.
idx2w = {idx: word for word, idx in w2idx.items()}
idx2la = {idx: label for label, idx in labels2idx.items()}

# Each split is a 3-tuple; only the first and last members are used here.
train_x, _, train_label = train
test_x, _, test_label = test


# The padding id is one more than the largest word id seen in the train split.
# NOTE(review): if the test split contains a larger word id, this value could
# coincide with a real word -- confirm against the full vocabulary.
largest_train_id = np.max([np.max(sentence) for sentence in train_x])
ID_PAD = largest_train_id + 1
print('ID_PAD: ', ID_PAD)

def context(l, size=3, pad_id=None):
    """Return one fixed-width window of word ids per position of a sentence.

    The sentence is padded with ``size // 2`` pad ids on each side and a
    window of ``size`` consecutive ids is taken starting at every original
    position, so the result always contains ``len(l)`` windows.

    For an odd ``size`` the window is centred on the word. For an even
    ``size`` the word sits at offset ``size // 2`` inside its window, i.e. it
    gets one more id of left context than of right context.

    Args:
        l: iterable of word ids for one sentence.
        size: window width.
        pad_id: id used to fill positions outside the sentence. When ``None``
            (the default) the notebook-level ``ID_PAD`` is used, which keeps
            existing two-argument calls working unchanged.

    Returns:
        A list of ``len(l)`` lists, each holding ``size`` ids.
    """
    if pad_id is None:
        # Resolved at call time so the function itself has no hidden
        # definition-time dependency on the notebook state.
        pad_id = ID_PAD
    words = list(l)
    half = size // 2
    padded = [pad_id] * half + words + [pad_id] * half
    return [padded[start:start + size] for start in range(len(words))]


# Build the model inputs and targets for both partitions.
# X: one 10-id context window per word; y: the label of that same word.
X_trn = np.array([window for sentence in train_x
                  for window in context(sentence, size=10)])
X_tst = np.array([window for sentence in test_x
                  for window in context(sentence, size=10)])
print('X trn shape: ', X_trn.shape)
print('X_tst shape: ', X_tst.shape)

# Flatten the per-sentence label sequences into one label per word.
y_trn = np.array([tag for sentence_tags in train_label for tag in sentence_tags])
print('y_trn shape: ', y_trn.shape)

y_tst = np.array([tag for sentence_tags in test_label for tag in sentence_tags])
print('y_tst shape: ', y_tst.shape)

# Distinct labels seen in the train split, and size of the word vocabulary.
print('Num labels: ', len(set(y_trn)))
print('Num words: ', len(idx2w))

# Simple LSTM model

## Architecture
    - tf.nn.embedding_lookup
    - tf.nn.dynamic_rnn layer
    - Dense layer: tf.nn.relu(tf.matmul(x, W) + b)
    
## Features
    - Dropout
    - Saver
    - Cross entropy with loss regularization
    - Score function

In [10]:
#General parameters
# Directory shared by the projector metadata file, the summary writers and the
# model checkpoints written/read by the cells below.
LOG_DIR = '/tmp/tensorboard/airline/embeddings_visualize/'

# data attributes
# Width of every context window (second dimension of X_trn).
input_seq_length = X_trn.shape[1]
# Number of rows of the embedding matrix: one per vocabulary entry plus one
# extra row -- presumably reserved for the padding id; TODO confirm it always
# covers ID_PAD.
input_vocabulary_size = len(set(idx2w)) + 1
# Width of the dense output layer (number of classes the model scores).
# NOTE(review): hard-coded; presumably the number of distinct labels --
# verify against len(labels2idx).
output_length = 127

#Model parameters
# Size of each word embedding vector.
embedding_size=64
# Number of units of the LSTM cell.
num_hidden_lstm = 128


In [11]:
# Save the vocabulary as metadata for the embedding visualization.
# TensorBoard pairs line i of the metadata file with row i of the embedding
# matrix, so the words must be written in id order (not in dictionary
# iteration order) and every one of the input_vocabulary_size rows needs a
# line of its own.
if not os.path.exists(LOG_DIR):
    os.makedirs(LOG_DIR)

with open(os.path.join(LOG_DIR, 'records.tsv'), "w") as record_file:
    for word_id in range(input_vocabulary_size):
        # Rows without a vocabulary entry (the extra padding row) still get a
        # placeholder label so the line count matches the number of rows.
        word = idx2w.get(word_id, '[PAD]')
        if isinstance(word, bytes):
            word = word.decode('ascii')
        record_file.write(word + '\n')



In [12]:
from tensorflow.contrib.tensorboard.plugins import projector

# Define the tensorflow graph

# Build the whole model inside a dedicated graph; the session cells below open
# it explicitly with tf.Session(graph=graph).
graph = tf.Graph()

with graph.as_default():
    # graph definition
    # Inputs
    with tf.name_scope('Inputs') as scope:
        # x: one context window of input_seq_length word ids per example.
        x = tf.placeholder(tf.int32, shape=[None, input_seq_length], name='x')
        # y: one integer class id per example (consumed by the sparse cross entropy).
        y = tf.placeholder(tf.int64, shape=[None], name='y')
        # keep_prob: keep probability handed to the dropout wrapper of the LSTM cell.
        keep_prob = tf.placeholder(tf.float32, name='keep_prob')

    with tf.name_scope('Embeddings') as scope:
        # Trainable lookup table with input_vocabulary_size rows of
        # embedding_size values, initialised uniformly between -1.0 and 1.0.
        W_embedding = tf.Variable(tf.random_uniform([input_vocabulary_size, embedding_size], -1.0, 1.0) ,name="W")
        # Replace every word id of x by its embedding row.
        embedding_layer = tf.nn.embedding_lookup(W_embedding, x)
        print('embedding_layer: ', embedding_layer)

        ## VISUALIZE EMBEDDINGS
        # Use the same LOG_DIR where you stored your checkpoint.
        summary_writer = tf.summary.FileWriter(LOG_DIR)

        # Format: tensorflow/contrib/tensorboard/plugins/projector/projector_config.proto
        config = projector.ProjectorConfig()

        # You can add multiple embeddings. Here we add only one.
        embedding = config.embeddings.add()
        embedding.tensor_name = W_embedding.name
        # Link this tensor to its metadata file (e.g. labels).
        embedding.metadata_path = os.path.join(LOG_DIR, 'records.tsv')

        # Saves a configuration file that TensorBoard will read during startup.
        projector.visualize_embeddings(summary_writer, config)




    with tf.name_scope('RNN') as scope:
        # Single LSTM layer; dropout is applied to the cell outputs only.
        cell_1 = tf.nn.rnn_cell.LSTMCell(num_hidden_lstm, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=123))
        cell_1 = tf.nn.rnn_cell.DropoutWrapper(cell_1, output_keep_prob=keep_prob)
        # lstm_outputs holds the cell output at every position of the window.
        lstm_outputs, lstm_state = tf.nn.dynamic_rnn(cell_1, embedding_layer, dtype=tf.float32, scope='rnn1')
        print('lstm_outputs: ', lstm_outputs)


    # Dense layer from RNN outputs to prediction
    with tf.name_scope('Dense') as scope:
        W_dense = tf.Variable(tf.truncated_normal([num_hidden_lstm, output_length], stddev=0.1), name='W_dense')
        b_dense = tf.Variable(tf.constant(0.1, shape=[output_length]), name='b_dense')
        # Only the output at the last position of the window feeds the classifier.
        # NOTE(review): the ReLU makes every score non-negative before it is
        # used as a logit below -- confirm this is intended.
        dense_output = tf.nn.relu(tf.matmul(lstm_outputs[:,-1,:], W_dense) + b_dense)
        print('dense_output: ', dense_output)


    #Prediction
    # Class probabilities; this is the tensor evaluated when scoring new text.
    y_pred = tf.nn.softmax(dense_output, name='y_pred')

    # Loss function
    with tf.name_scope("xent") as scope:
        # No reduction is applied here, so cross_entropy is passed as-is to
        # the optimizer and fetched as-is by the training loop.
        # NOTE(review): logits and labels are passed positionally; later
        # TensorFlow releases presumably require keyword arguments -- confirm
        # before upgrading.
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(dense_output, y, name='cross_entropy')
        # The summary name contains a space; TensorFlow reports it as illegal
        # and records the summary as "cross_entropy" instead.
        ce_summ = tf.summary.histogram("cross entropy", cross_entropy) #TENSORBOARD


    #Optimizer
    with tf.name_scope("train") as scope:
        # The learning rate is a placeholder, so it must be fed on every training step.
        learning_rate = tf.placeholder(tf.float32, name='learning_rate')
        optimizer = tf.train.AdamOptimizer(learning_rate)
        train_op = optimizer.minimize(cross_entropy, name='train_op')


    #Accuracy
    with tf.name_scope("test") as scope:
        # Fraction of examples whose highest-scoring class equals the label.
        correct_prediction = tf.equal(tf.argmax(dense_output,1), y)
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name='accuracy')
        accuracy_summary = tf.summary.scalar("accuracy", accuracy) #TENSORBOARD


    # Create a saver to save weights.
    saver = tf.train.Saver()


embedding_layer:  Tensor("Embeddings/embedding_lookup:0", shape=(?, 10, 64), dtype=float32)
lstm_outputs:  Tensor("RNN/rnn1/transpose:0", shape=(?, 10, 128), dtype=float32)
dense_output:  Tensor("Dense/Relu:0", shape=(?, 127), dtype=float32)
INFO:tensorflow:Summary name cross entropy is illegal; using cross_entropy instead.


In [None]:
#batch generator
def batch_generator(x=None, y=None, batch_size=256, random_state=0):
    """Yield shuffled mini-batches of ``(x, y)`` that all have the same size.

    Rows of ``x`` and entries of ``y`` are shuffled together, then cut into
    consecutive batches of ``batch_size`` examples. A trailing remainder that
    does not fill a whole batch is dropped, so every yielded batch has
    exactly ``batch_size`` rows.

    Args:
        x: 2-D array of inputs, one example per row. Defaults to the
            notebook-level ``X_trn`` (looked up when the generator runs).
        y: 1-D array-like of targets aligned with ``x``. Defaults to the
            notebook-level ``y_trn``.
        batch_size: number of examples per batch. The default is a literal
            because the notebook-level ``batch_size`` is only defined in a
            later cell and cannot be read when this function is defined.
        random_state: seed of the shuffle. The default ``0`` gives the same
            order on every call; pass ``None`` for a different order each
            time.

    Yields:
        ``(x_batch, y_batch)`` tuples of numpy arrays.
    """
    if x is None:
        x = X_trn
    if y is None:
        y = y_trn
    y = np.asarray(y)

    n_samples = x.shape[0]
    order = np.random.RandomState(random_state).permutation(n_samples)
    x_shuffle, y_shuffle = x[order], y[order]

    # "+ 1" keeps the last full batch when n_samples is an exact multiple of
    # batch_size; without it that batch would be silently skipped.
    for start in range(0, n_samples - batch_size + 1, batch_size):
        stop = start + batch_size
        yield x_shuffle[start:stop, :], y_shuffle[start:stop]
    
# Smoke test: build a generator of 20-example batches over the training data
# and print the first (x_batch, y_batch) pair it yields.
seq = batch_generator(x=X_trn, y=y_trn, batch_size=20)
print(next(seq))

(array([[554, 241, 481, 165, 193, 197, 208, 379, 502,  64],
       [193, 514, 208,  77, 502, 137, 359, 544,  40, 481],
       [232, 331, 237, 358,  13, 193, 208,  77, 502, 137],
       [ 32, 194,  40, 183, 208, 137, 502, 415, 205, 527],
       [232, 331,  13, 277, 353, 194, 208, 452, 375, 195],
       [527, 193, 348, 208, 313, 502, 282,  71, 358, 249],
       [193, 208, 128, 502, 415, 205, 527, 527, 527, 527],
       [358, 481, 174, 353,  65, 524, 435, 527, 527, 527],
       [208, 481,  29, 234, 379, 502, 159, 527, 527, 527],
       [527, 527, 527, 439, 301, 481, 194, 208, 415, 205],
       [481, 265, 193, 208,  64, 502, 137, 358, 248, 435],
       [157,  37,  26, 221, 561,  13, 105, 353, 430, 111],
       [534, 358, 481, 190, 105,  37,  26, 193, 208, 376],
       [527, 527, 527, 383, 276, 530, 194,  73,  77,  40],
       [527, 527, 527, 527, 554, 194,  50, 389,  86,  37],
       [481, 193, 501, 481, 321, 358, 530,  26, 200, 426],
       [527, 527, 527, 527,  13, 190, 105, 193, 358,  3

In [None]:
# Execute the graph to train a network
batch_size = 256
nEpochs = 20

gpu_options = tf.GPUOptions(allow_growth = True)
with tf.Session(graph=graph, config=tf.ConfigProto(gpu_options=gpu_options)) as session:

    # Summary writers. train_writer also records the graph definition.
    train_writer = tf.summary.FileWriter(LOG_DIR + 'train', session.graph, flush_secs=2)
    test_writer  = tf.summary.FileWriter(LOG_DIR + 'test', flush_secs=2)

    try:
        print('Initializing')
        print('Epoch - Loss(trn) -  Acc(trn)   -   Loss(tst) -   Acc(tst)')
        session.run(tf.global_variables_initializer())
        for epoch in range(nEpochs):
            # Per-batch metrics of this epoch; averaged for the report line.
            ce_c = []
            acc_c = []
            ce_c_tst = []
            acc_c_tst = []

            # Training pass: dropout active (keep_prob 0.5), weights updated.
            batch_list = batch_generator(x=X_trn, y=y_trn, batch_size=batch_size)
            for batch in batch_list:
                feedDict = {x: batch[0], y: batch[1], keep_prob: 0.5, learning_rate: 0.001} # dictionary of batch data to run the graph
                _, ce, acc = session.run([train_op, cross_entropy, accuracy], feed_dict=feedDict)
                ce_c.append(ce)
                acc_c.append(acc)

            # Evaluation pass: dropout disabled (keep_prob 1), no train_op.
            batch_list_tst = batch_generator(x=X_tst, y=y_tst, batch_size=batch_size)
            for x_batch, y_batch in batch_list_tst:
                feedDict = {x: x_batch, y: y_batch, keep_prob: 1} # dictionary of batch data to run the graph
                ce_tst, acc_tst = session.run([cross_entropy, accuracy], feed_dict=feedDict)
                ce_c_tst.append(ce_tst)
                acc_c_tst.append(acc_tst)

            # One checkpoint per epoch, named "model.ckpt-<epoch>".
            saver.save(session, os.path.join(LOG_DIR, "model.ckpt"), epoch)

            print(epoch, np.mean(ce_c), np.mean(acc_c), np.mean(ce_c_tst), np.mean(acc_c_tst), sep='   -   ')
    finally:
        # Release the event files even if training is interrupted.
        train_writer.close()
        test_writer.close()

Initializing
Epoch - Loss(trn) -  Acc(trn)   -   Loss(tst) -   Acc(tst)
0   -   1.87054   -   0.629878   -   1.46709   -   0.638504
1   -   1.17752   -   0.731671   -   1.03954   -   0.767411
2   -   0.801249   -   0.822363   -   0.75586   -   0.858371
3   -   0.564598   -   0.880568   -   0.578582   -   0.892076
4   -   0.411807   -   0.915777   -   0.456453   -   0.911161
5   -   0.326596   -   0.934707   -   0.39942   -   0.923996
6   -   0.270338   -   0.947098   -   0.345853   -   0.933817
7   -   0.233067   -   0.954857   -   0.312223   -   0.940848
8   -   0.202159   -   0.961097   -   0.281028   -   0.947879
9   -   0.177632   -   0.966682   -   0.264368   -   0.951116
10   -   0.162337   -   0.969439   -   0.272681   -   0.948326
11   -   0.148046   -   0.972692   -   0.252488   -   0.953348
12   -   0.13442   -   0.975113   -   0.246492   -   0.956808
13   -   0.125712   -   0.976368   -   0.231779   -   0.957366
14   -   0.11624   -   0.977959   -   0.237933   -   0.957143
1

In [None]:
# Predict. Score new paragraph 

# Rebuild the word -> id table with text keys so that plain-string words from a
# new paragraph can be looked up (the values of idx2w are decoded from ASCII
# bytes). This rebinds the notebook-level w2idx.
#inv_map = {v: k for k, v in my_map.iteritems()} #python2
w2idx = {v.decode('ascii'): k for k, v in idx2w.items()} #python3

def score_paragraph(paragraph, checkpoint_path=None):
    """Predict one label per whitespace-separated word of ``paragraph``.

    Args:
        paragraph: text to tag; every word must be a key of ``w2idx``.
        checkpoint_path: checkpoint to restore. When ``None`` (the default)
            the most recent checkpoint recorded in ``LOG_DIR`` is used.

    Returns:
        A list with the predicted label (as stored in ``idx2la``) for each
        word, in order. An empty paragraph gives an empty list.

    Raises:
        KeyError: if a word is not in ``w2idx``.
        IOError: if no checkpoint can be found in ``LOG_DIR``.
    """
    #Preprocess data
    word_ids = [w2idx[w] for w in paragraph.split()]
    if not word_ids:
        # Nothing to score; an empty array would not match the input shape.
        return []
    x_score = np.array(context(word_ids, size=input_seq_length))

    # Training saves "model.ckpt-<epoch>", so a hard-coded step suffix breaks
    # as soon as the number of epochs changes; ask TensorFlow for the latest.
    if checkpoint_path is None:
        checkpoint_path = tf.train.latest_checkpoint(LOG_DIR)
    if checkpoint_path is None:
        raise IOError('No checkpoint found in ' + LOG_DIR)

    with tf.Session(graph=graph) as session:
        saver.restore(session, checkpoint_path)
        feedDict = {x: x_score, keep_prob: 1} # dictionary of batch data to run the graph
        pred_score = session.run(y_pred, feed_dict=feedDict)

    # Highest-probability class of each window, mapped back to its label.
    return [idx2la[l] for l in np.argmax(pred_score, axis=1)]


paragraph = 'i need a business ticket in any flight with departure from alaska to las vegas monday with breakfast'
response = score_paragraph(paragraph)

# Print word / predicted label side by side, right-aligned in equal columns.
# Labels may come back as bytes (they are looked up in the unpickled label
# table), so they are turned into text before measuring and printing.
words = paragraph.split()
labels = [la.decode('ascii') if isinstance(la, bytes) else la for la in response]
# Column width: the longest word or label to be printed.
wlength = max(len(token) for token in words + labels)
for wx, la in zip(words, labels):
    print(wx.rjust(wlength), la.rjust(wlength))
