# Identify tags in airline database

## Minimal code

    - Read dataset
    - transform data
    - Minimal model
        - Embedings
        - Dense
        


In [1]:
from __future__ import print_function

import os 
import numpy as np 

import tensorflow as tf 
print(tf.__version__)


0.12.0


## Dataset

ATIS (Airline Travel Information System) dataset. Available in: https://github.com/mesnilgr/is13/blob/master/data/load.py

### Example:

Input (words)	show	flights	from	Boston	to	New	York	today

Output (labels)	O	O	O	B-dept	O	B-arr	I-arr	B-date




In [2]:
# Read data
import pickle

atis_file = '/home/ubuntu/data/training/text/atis/atis.pkl'
with open(atis_file,'rb') as f:
    #train, test, dicts = pickle.load(f, encoding='bytes') #python3
    train, test, dicts = pickle.load(f)


## train / test sets:
    - X: list of input sequences
    - label: List of target labels asociated to each word in each sentence.
## Dictionaries
    - labels2idx:  To decode the labels
    - words2idx: To decode the sentences

In [3]:
#Dictionaries and train test partition
w2idx, ne2idx, labels2idx = dicts[b'words2idx'], dicts[b'tables2idx'], dicts[b'labels2idx']
    
idx2w  = dict((v,k) for k,v in w2idx.items())
idx2la = dict((v,k) for k,v in labels2idx.items())

train_x, _, train_label = train
test_x,  _,  test_label  = test



# Visualize data
wlength = 35
for e in ['train','test']:
    print(e)
    for sw, sl in zip(eval(e+'_x')[:2], eval(e+'_label')[:2]):
        print( 'WORD'.rjust(wlength), 'LABEL'.rjust(wlength))
        for wx, la in zip(sw, sl): print( idx2w[wx].rjust(wlength), idx2la[la].rjust(wlength))
        print( '\n'+'**'*30+'\n')


train
                               WORD                               LABEL
                                  i                                   O
                               want                                   O
                                 to                                   O
                                fly                                   O
                               from                                   O
                             boston                 B-fromloc.city_name
                                 at                                   O
                    DIGITDIGITDIGIT                  B-depart_time.time
                                 am                  I-depart_time.time
                                and                                   O
                             arrive                                   O
                                 in                                   O
                             denver                   B-to

In [4]:
#Select words for the label 48: b'B-fromloc.city_name' in train and test to check that are different:
for e in ['train','test']:
    print(e)
    print('---------')
    for sw, sl in zip(eval(e+'_x')[:5], eval(e+'_label')[:5]):
        for wx, la in zip(sw, sl): 
            if la==48:
                print( idx2w[wx])
    print('\n')


train
---------
boston
pittsburgh
san
washington
tacoma
pittsburgh


test
---------
charlotte
tacoma
phoenix
phoenix
orlando




## Data transformation
    - Convert the list of sequences of words into an array of words x characteristics.
    - The characteristics are the context of the word in the sentence.
        - For each word in the sentence, generate the context with the previous and the next words in the sentence.
        - For words at the beggining and the end, use padding to complete the context.

In [5]:
# Max value of word coding to assign the ID_PAD
ID_PAD = np.max([np.max(tx) for tx in train_x]) + 1
print('ID_PAD: ', ID_PAD)

def context(l, size=3):
    l = list(l)
    lpadded = size // 2 * [ID_PAD] + l + size // 2 * [ID_PAD]
    out = [lpadded[i:(i + size)] for i in range(len(l))]
    return out

x = np.array([0, 1, 2, 3, 4], dtype=np.int32)
print('Context vectors: ', context(x))

ID_PAD:  572
Context vectors:  [[572, 0, 1], [0, 1, 2], [1, 2, 3], [2, 3, 4], [3, 4, 572]]


In [6]:
# Create train and test X y.
X_trn=[]
for s in train_x:
    X_trn += context(s,size=10)
X_trn = np.array(X_trn)

X_tst=[]
for s in test_x:
    X_tst += context(s,size=10)
X_tst = np.array(X_tst)

print('X trn shape: ', X_trn.shape)
print('X_tst shape: ',X_tst.shape)


y_trn=[]
for s in train_label:
    y_trn += list(s)
y_trn = np.array(y_trn)
print('y_trn shape: ',y_trn.shape)

y_tst=[]
for s in test_label:
    y_tst += list(s)
y_tst = np.array(y_tst)
print('y_tst shape: ',y_tst.shape)


X trn shape:  (56590, 10)
X_tst shape:  (9198, 10)
y_trn shape:  (56590,)
y_tst shape:  (9198,)


In [7]:
print('Num labels: ',len(set(y_trn)))
print('Num words: ',len(set(idx2w)))

Num labels:  121
Num words:  572


# First model

## Architecture
    - tf.nn.embedding_lookup
    - Dense layer: tf.nn.relu(tf.matmul(x, W) + b)
    

In [8]:
#General parameters
LOG_DIR = '/tmp/tensorboard/airline/embeddings/'

# data attributes
input_seq_length = X_trn.shape[1]
input_vocabulary_size = len(set(idx2w)) + 1
output_length = 127

#Model parameters
embedding_size=64


In [9]:
# Define the tensorflow graph

graph = tf.Graph()

with graph.as_default():
    # graph definition
    # Inputs
    with tf.name_scope('Inputs') as scope:
        x = tf.placeholder(tf.int32, shape=[None, input_seq_length], name='x')
        y = tf.placeholder(tf.int64, shape=[None], name='y')

    with tf.name_scope('Embeddings') as scope:
        W_embedding = tf.Variable(tf.random_uniform([input_vocabulary_size, embedding_size], -1.0, 1.0) ,name="W")
        embedding_layer = tf.nn.embedding_lookup(W_embedding, x)
        print('embedding layer: ', embedding_layer)
        flat_embedding = tf.reshape(embedding_layer, [-1,10*embedding_size])
        print('Flat embedding layer: ', flat_embedding)

    
    #Dense layer form RNN outs to prediction
    with tf.name_scope('Dense') as scope:
        W_dense = tf.Variable(tf.truncated_normal([embedding_size*10, output_length], stddev=0.1), name='W_dense')
        b_dense = tf.Variable(tf.constant(0.1, shape=[output_length]), name='b_dense')
        dense_output = tf.nn.relu(tf.matmul(flat_embedding, W_dense) + b_dense)
        print('dense output: ', dense_output)

        
    # Loss function
    with tf.name_scope("xent") as scope:
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(dense_output, y, name='cross_entropy')
        ce_summary = tf.summary.scalar("cross_entropy", tf.reduce_mean(cross_entropy))

    #Optimizer
    with tf.name_scope("train") as scope:
        optimizer = tf.train.AdamOptimizer(0.001)
        train_op = optimizer.minimize(cross_entropy, name='train_op')


    #Accuracy
    with tf.name_scope("test") as scope:
        #Prediction
        y_pred = tf.nn.softmax(dense_output, name='y_pred')
        #Accuracy
        correct_prediction = tf.equal(tf.argmax(dense_output,1), y)
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name='accuracy')
        accuracy_summary = tf.summary.scalar("accuracy", accuracy)


    # Merge all the summaries and write them out to /tmp/mnist_logs
    with tf.name_scope('summaries') as scope:
        merged = tf.summary.merge_all()
        


embedding layer:  Tensor("Embeddings/embedding_lookup:0", shape=(?, 10, 64), dtype=float32)
Flat embedding layer:  Tensor("Embeddings/Reshape:0", shape=(?, 640), dtype=float32)
dense output:  Tensor("Dense/Relu:0", shape=(?, 127), dtype=float32)


In [10]:
#batch generator
def batch_generator(x, y, batch_size=128):
    for i in range(0, x.shape[0]-batch_size, batch_size):
        x_batch = x[i:i+batch_size,:]
        y_batch = y[i:i+batch_size]
        yield x_batch, y_batch
    
seq = batch_generator(X_trn, y_trn, batch_size=20)
print(next(seq))

(array([[572, 572, 572, 572, 572, 232, 542, 502, 196, 208],
       [572, 572, 572, 572, 232, 542, 502, 196, 208,  77],
       [572, 572, 572, 232, 542, 502, 196, 208,  77,  62],
       [572, 572, 232, 542, 502, 196, 208,  77,  62,  10],
       [572, 232, 542, 502, 196, 208,  77,  62,  10,  35],
       [232, 542, 502, 196, 208,  77,  62,  10,  35,  40],
       [542, 502, 196, 208,  77,  62,  10,  35,  40,  58],
       [502, 196, 208,  77,  62,  10,  35,  40,  58, 234],
       [196, 208,  77,  62,  10,  35,  40,  58, 234, 137],
       [208,  77,  62,  10,  35,  40,  58, 234, 137,  62],
       [ 77,  62,  10,  35,  40,  58, 234, 137,  62,  11],
       [ 62,  10,  35,  40,  58, 234, 137,  62,  11, 234],
       [ 10,  35,  40,  58, 234, 137,  62,  11, 234, 481],
       [ 35,  40,  58, 234, 137,  62,  11, 234, 481, 321],
       [ 40,  58, 234, 137,  62,  11, 234, 481, 321, 572],
       [ 58, 234, 137,  62,  11, 234, 481, 321, 572, 572],
       [234, 137,  62,  11, 234, 481, 321, 572, 572, 57

In [11]:
# Execute the graph to train a network
batch_size = 256
nEpochs = 20

gpu_options = tf.GPUOptions(allow_growth = True)
with tf.Session(graph=graph, config=tf.ConfigProto(gpu_options=gpu_options)) as session:

    #Create sumaries writers
    train_writer = tf.summary.FileWriter(LOG_DIR + 'train', session.graph, flush_secs=2)
    test_writer  = tf.summary.FileWriter(LOG_DIR + 'test', flush_secs=2)

        
    print('Initializing')
    print('Epoch - Loss(trn) -  Acc(trn)   -   Loss(tst) -   Acc(tst)')
    tf.global_variables_initializer().run()
    for epoch in range(nEpochs):
        ce_c=[]
        acc_c=[]
        ce_c_tst=[]
        acc_c_tst=[]
        
        batch_list = batch_generator(X_trn, y_trn, batch_size=batch_size)
        for i, batch in enumerate(batch_list):
            feedDict = {x: batch[0], y: batch[1]} # dictionary of batch data to run the graph
            _, ce, acc = session.run([train_op, cross_entropy, accuracy], feed_dict=feedDict)
            ce_c += [ce]
            acc_c += [acc]
        # Sumaries train    
        summary_str_trn = session.run(merged, feedDict)
        train_writer.add_summary(summary_str_trn, epoch)            
            
            
        batch_list_tst = batch_generator(X_tst, y_tst, batch_size=batch_size)
        for x_batch, y_batch in batch_list_tst:
            feedDict = {x: x_batch, y: y_batch} # dictionary of batch data to run the graph
            ce_tst, acc_tst = session.run([cross_entropy, accuracy], feed_dict=feedDict)
            ce_c_tst += [ce_tst]
            acc_c_tst += [acc_tst]
        # Sumaries test    
        summary_str_tst = session.run(merged, feedDict)
        test_writer.add_summary(summary_str_tst, epoch)            
        
        print(epoch, np.mean(ce_c), np.mean(acc_c), np.mean(ce_c_tst), np.mean(acc_c_tst), sep='   -   ')

Initializing
Epoch - Loss(trn) -  Acc(trn)   -   Loss(tst) -   Acc(tst)
0   -   1.45164   -   0.723346   -   0.711422   -   0.858371
1   -   0.401262   -   0.922105   -   0.428747   -   0.913728
2   -   0.235641   -   0.956519   -   0.333876   -   0.938281
3   -   0.169957   -   0.969917   -   0.290212   -   0.945759
4   -   0.136426   -   0.976598   -   0.266624   -   0.952009
5   -   0.116381   -   0.980239   -   0.251763   -   0.955134
6   -   0.103118   -   0.982714   -   0.241503   -   0.957254
7   -   0.0938775   -   0.984234   -   0.234357   -   0.959598
8   -   0.0870477   -   0.985312   -   0.229153   -   0.960938
9   -   0.0817815   -   0.986001   -   0.225312   -   0.96183
10   -   0.0775921   -   0.986584   -   0.222508   -   0.9625
11   -   0.0741804   -   0.987132   -   0.220477   -   0.962612
12   -   0.0713502   -   0.987504   -   0.218989   -   0.962723
13   -   0.0689697   -   0.987698   -   0.217967   -   0.96317
14   -   0.066944   -   0.987945   -   0.217336   -   