**Jupyter notebook avaialble @ [https://github.com/dhiraa/tf-guru/blob/master/dataset/2017-11-11-DatasetHandling.ipynb](https://github.com/dhiraa/tf-guru/blob/master/dataset/2017-11-11-DatasetHandling.ipynb)**

# Handling TextDataset with TensorFlow APIs

### Preparing vocab list with TF APIs

### Dynamic Sequence Lengths

In [1]:
import tensorflow as tf
from tensorflow.contrib import lookup
from tensorflow.python.platform import gfile
import numpy as np
from tqdm import tqdm

  return f(*args, **kwds)


In [2]:
def get_sequence_length(sequence_ids, axis=1, pad_word_id=0):
    '''
    Returns the sequence length, droping out all the padded tokens if the sequence is padded
    
    :param sequence_ids: Tensor(shape=[batch_size, doc_length])
    :param pad_word_id: 0 is default
    :return: Array of Document lengths of size batch_size
    '''
    flag = tf.greater(sequence_ids, pad_word_id)
    used = tf.cast(flag, tf.int32)
    length = tf.reduce_sum(used, axis)
    length = tf.cast(length, tf.int32)
    return length

In [3]:
# MAX_DOC_LENGTHS = 4
# rand_array = np.random.randint(1,MAX_DOC_LENGTHS, size=(3,5,4))

#Assume all negative values are padding
rand_array = np.array([[ 2,  0,  0,  0,  0,  0],
 [ 3,  4,  0,  0,  0,  0],
 [ 5,  6,  4,  0,  0,  0],
 [ 7,  8,  6,  4,  0,  0],
 [ 9, 10,  6, 11, 12, 13],
 [ 0,  0,  0,  0,  0,  0]])

rand_array


array([[ 2,  0,  0,  0,  0,  0],
       [ 3,  4,  0,  0,  0,  0],
       [ 5,  6,  4,  0,  0,  0],
       [ 7,  8,  6,  4,  0,  0],
       [ 9, 10,  6, 11, 12, 13],
       [ 0,  0,  0,  0,  0,  0]])

In [4]:
with tf.Session() as sess:
        length = get_sequence_length(rand_array, axis=1, pad_word_id=0)
        print("Get dynamic sequence lengths: ", sess.run(length))

Get dynamic sequence lengths:  [1 2 3 4 6 0]


In [5]:
with tf.Session() as sess:
        length = get_sequence_length(rand_array, axis=1, pad_word_id=0)
        print("Get dynamic sequence lengths: ", sess.run(length))
        data = np.random.randint(1,6, size=(3,5,4))
        length = get_sequence_length(data, axis=1)
        print("Get dynamic sequence lengths: ", sess.run(length))
    

Get dynamic sequence lengths:  [1 2 3 4 6 0]
Get dynamic sequence lengths:  [[5 5 5 5]
 [5 5 5 5]
 [5 5 5 5]]


In [6]:
! rm vocab_test.tsv

In [5]:
#use tf.contrib.learn.preprocessing.VocabularyProcessor instead!
def naive_vocab_creater(lines, out_file_name):
    final_vocab = ["<PAD>", "<UNK>"]
    vocab = [word for line in lines for word in line.split(" ")]
    vocab = set(vocab)

    try:
        vocab.remove("<UNK>")
    except:
        print("No <UNK> token found")

    vocab = list(vocab)
    final_vocab.extend(vocab)
    return final_vocab

In [6]:
# Assume each line to be an document
lines = ['Some title', 
          'Simple',
         'A longer title', 
         'An even longer title', 
         'This is longer than doc length isnt',
          '']

# Normally this takes the mean length of the words in the dataset documents
MAX_DOCUMENT_LENGTH = 7
#LSTM APIs have seq_length paramaters which can used to capture all the words and ignore
#padding words

# Padding word that is used when a document has less words than the calculated mean length of the words
PADWORD = '<PAD>'
PADWORD_ID = 0


In [7]:
tf.reset_default_graph()


print ('TensorFlow Version: ', tf.__version__)


# Create vocabulary
# min_frequency -> consider a word if and only it repeats for fiven count
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH, 
                                                                     min_frequency=0)
vocab_processor.fit(lines)

word_vocab = []

#Create a file and store the words
with gfile.Open('vocab_test.tsv', 'wb') as f:
    f.write("{}\n".format(PADWORD))#, with latest LSTM APIs, sequence length for each document can be given
    word_vocab.append(PADWORD)
    for word, index in vocab_processor.vocabulary_._mapping.items():
        word_vocab.append(word)
        f.write("{}\n".format(word))
        
VOCAB_SIZE = len(vocab_processor.vocabulary_) + 1
print ('{} words into vocab.tsv'.format(VOCAB_SIZE))

EMBEDDING_SIZE = 3
word_vocab = {id:word for id, word in enumerate(word_vocab)}
word_vocab

TensorFlow Version:  1.4.0
15 words into vocab.tsv


{0: '<PAD>',
 1: '<UNK>',
 2: 'Some',
 3: 'title',
 4: 'Simple',
 5: 'A',
 6: 'longer',
 7: 'An',
 8: 'even',
 9: 'This',
 10: 'is',
 11: 'than',
 12: 'doc',
 13: 'length',
 14: 'isnt'}

In [8]:
! cat vocab_test.tsv

<PAD>
<UNK>
Some
title
Simple
A
longer
An
even
This
is
than
doc
length
isnt


In [9]:
words_chars_vocab = ['<P>', '<U>']

def get_char_vocab(words_vocab):
    '''

    :param words_vocab: List of words
    :return:
    '''
    chars = set()
    for word in words_vocab:
        for char in word:
            chars.add(str(char))
    return sorted(chars)

words_chars_vocab.extend(get_char_vocab(word_vocab.values()))

# Create char2id map
char_2_id_map = {c:i for i,c in enumerate(words_chars_vocab)}

CHAR_VOCAB_SIZE = len(char_2_id_map)
char_2_id_map

{'<': 2,
 '<P>': 0,
 '<U>': 1,
 '>': 3,
 'A': 4,
 'D': 5,
 'K': 6,
 'N': 7,
 'P': 8,
 'S': 9,
 'T': 10,
 'U': 11,
 'a': 12,
 'c': 13,
 'd': 14,
 'e': 15,
 'g': 16,
 'h': 17,
 'i': 18,
 'l': 19,
 'm': 20,
 'n': 21,
 'o': 22,
 'p': 23,
 'r': 24,
 's': 25,
 't': 26,
 'v': 27}

In [10]:
list_char_ids = []
char_ids_feature2 = []

for line in lines:
    for word in line.split():
        word_2_char_ids = [char_2_id_map.get(c, 0) for c in word]
        list_char_ids.append(word_2_char_ids)
    char_ids_feature2.append(list_char_ids)
    list_char_ids = []

In [11]:
char_ids_feature2

[[[9, 22, 20, 15], [26, 18, 26, 19, 15]],
 [[9, 18, 20, 23, 19, 15]],
 [[4], [19, 22, 21, 16, 15, 24], [26, 18, 26, 19, 15]],
 [[4, 21], [15, 27, 15, 21], [19, 22, 21, 16, 15, 24], [26, 18, 26, 19, 15]],
 [[10, 17, 18, 25],
  [18, 25],
  [19, 22, 21, 16, 15, 24],
  [26, 17, 12, 21],
  [14, 22, 13],
  [19, 15, 21, 16, 26, 17],
  [18, 25, 21, 26]],
 []]

In [12]:
def _pad_sequences(sequences, pad_tok, max_length):
    """
    Args:
        sequences: a generator of list or tuple
        pad_tok: the char to pad with

    Returns:
        a list of list where each sublist has same length
    """
    sequence_padded, sequence_length = [], []

    for seq in sequences:
        seq = list(seq)
        seq_ = seq[:max_length] + [pad_tok]*max(max_length - len(seq), 0)
        sequence_padded +=  [seq_]
        sequence_length += [min(len(seq), max_length)]

    return sequence_padded, sequence_length


def pad_sequences(sequences, pad_tok, nlevels, MAX_WORD_LENGTH=6):
    """
    Args:
        sequences: a generator of list or tuple
        pad_tok: the char to pad with
        nlevels: "depth" of padding, for the case where we have characters ids

    Returns:
        a list of list where each sublist has same length

    """
    if nlevels == 1:
        sequence_padded = []
        sequence_length = []
        max_length = max(map(lambda x : len(x.split(" ")), sequences))
        # sequence_padded, sequence_length = _pad_sequences(sequences,
        #                                                   pad_tok, max_length)
        #breaking the code to pad the string instead on its ids
        for seq in sequences:
            current_length = len(seq.split(" "))
            diff = max_length - current_length
            pad_data = pad_tok * diff
            sequence_padded.append(seq + pad_data)
            sequence_length.append(max_length) #assumed

        # print_info(sequence_length)
    elif nlevels == 2:
        # max_length_word = max([max(map(lambda x: len(x), seq))
        #                        for seq in sequences])
        sequence_padded, sequence_length = [], []
        for seq in tqdm(sequences):
            # all words are same length now
            sp, sl = _pad_sequences(seq, pad_tok, MAX_WORD_LENGTH)
            sequence_padded += [sp]
            sequence_length += [sl]

        max_length_sentence = max(map(lambda x : len(x), sequences))
        sequence_padded, _ = _pad_sequences(sequence_padded,
                                            [pad_tok]*MAX_WORD_LENGTH,
                                            max_length_sentence) #TODO revert -1 to pad_tok
        sequence_length, _ = _pad_sequences(sequence_length, 0,
                                            max_length_sentence)

    return sequence_padded, sequence_length

In [13]:
char_ids_feature2, char_seq_length = pad_sequences(char_ids_feature2, nlevels=2, pad_tok=0)
char_ids_feature2 = np.array(char_ids_feature2)

100%|██████████| 6/6 [00:00<00:00, 27265.25it/s]


In [14]:
lines

['Some title',
 'Simple',
 'A longer title',
 'An even longer title',
 'This is longer than doc length isnt',
 '']

In [15]:
print("Character IDs shape: ", char_ids_feature2.shape)
print("Number of sentences: ", len(lines))
print("MAX_DOC_LENGTH: ", max([len(line.split()) for line in lines]))
print("MAX_WORD_LENGTH: ", 8)

Character IDs shape:  (6, 7, 6)
Number of sentences:  6
MAX_DOC_LENGTH:  7
MAX_WORD_LENGTH:  8


In [16]:
# char_ids

In [17]:
WORD_EMBEDDING_SIZE = 3
CHAR_EMBEDDING_SIZE = 3
WORD_LEVEL_LSTM_HIDDEN_SIZE = 3
CHAR_LEVEL_LSTM_HIDDEN_SIZE = 3

In [72]:
tf.reset_default_graph()

 
# can use the vocabulary to convert words to numbers
table = lookup.index_table_from_file(
  vocabulary_file='vocab_test.tsv', 
    num_oov_buckets=0, vocab_size=None, 
    default_value=PADWORD_ID) #id of <PAD> is 0

word2ids = table.lookup(tf.constant(lines[1].split()))
word2ids_1 = table.lookup(tf.constant("Some unknown title".split()))


with tf.Session() as sess:
    #Tables needs to be initialized before useing it
    tf.tables_initializer().run()
    print ("{} --> {}".format(lines[1], word2ids.eval()))
    print ("{} --> {}".format("Some unknown title", word2ids_1.eval()))
    
# string operations
# Array of Docs -> Split it into Tokens/words 
#               -> Convert it into Dense Tensor apending PADWORD
#               -> Table lookup 
#               -> Slice it to MAX_DOCUMENT_LENGTH
titles = tf.constant(lines)
words = tf.string_split(titles)
print(words)

densewords = tf.sparse_tensor_to_dense(words, default_value=PADWORD)
numbers = table.lookup(densewords)

##Following extrasteps are taken care by above 'table.lookup'
# now pad out with zeros and then slice to constant length
# padding = tf.constant([[0,0],[0,MAX_DOCUMENT_LENGTH]])
# # this takes care of documents with zero length also
# padded = tf.pad(numbers, padding)

#if you wanted to clip the document MAX size then it can be done herwwwwwwe!
# sliced = tf.slice(numbers, [0,0], [-1, MAX_DOCUMENT_LENGTH])

seq_length= get_sequence_length(numbers)
with tf.device('/cpu:0'), tf.name_scope("embed-layer"):  

    # layer to take the words and convert them into vectors (embeddings)
    # This creates embeddings matrix of [n_words, EMBEDDING_SIZE] and then
    # maps word indexes of the sequence into
    # [batch_size, MAX_DOCUMENT_LENGTH, EMBEDDING_SIZE].
    word_embeddings = tf.contrib.layers.embed_sequence(numbers,
                                              vocab_size=VOCAB_SIZE,
                                              embed_dim=WORD_EMBEDDING_SIZE,
                                                   initializer=tf.contrib.layers.xavier_initializer(
                                                                   seed=42))

with  tf.name_scope("word_level_lstm_layer"):
    # Create a LSTM Unit cell with hidden size of EMBEDDING_SIZE.
    d_rnn_cell_fw_one = tf.nn.rnn_cell.LSTMCell(WORD_LEVEL_LSTM_HIDDEN_SIZE,
                                                state_is_tuple=True)
    #https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/LSTMStateTuple
    d_rnn_cell_bw_one = tf.nn.rnn_cell.LSTMCell(WORD_LEVEL_LSTM_HIDDEN_SIZE,
                                                state_is_tuple=True)

    (fw_output_one, bw_output_one), output_states = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=d_rnn_cell_fw_one,
        cell_bw=d_rnn_cell_bw_one,
        dtype=tf.float32,
        sequence_length=seq_length,
        inputs=word_embeddings,
        scope="encod_sentence")

    # [BATCH_SIZE, MAX_SEQ_LENGTH, 2*WORD_LEVEL_LSTM_HIDDEN_SIZE) TODO check MAX_SEQ_LENGTH?
    encoded_sentence = tf.concat([fw_output_one,
                                  bw_output_one], axis=-1)

    tf.logging.info('encoded_sentence =====> {}'.format(encoded_sentence))

with tf.variable_scope("char_embed_layer"):
    
        char_ids = tf.convert_to_tensor(char_ids_feature2, np.int64)
        s = tf.shape(char_ids)
        #remove pad words
        char_ids_reshaped = tf.reshape(char_ids, shape=(s[0] * s[1], s[2])) #20 -> char dim
        
        char_embeddings = tf.contrib.layers.embed_sequence(char_ids,
                                                           vocab_size=CHAR_VOCAB_SIZE,
                                                           embed_dim=CHAR_EMBEDDING_SIZE,
                                                           initializer=tf.contrib.layers.xavier_initializer(
                                                               seed=42))

        #[BATCH_SIZE, MAX_SEQ_LENGTH, MAX_WORD_LEGTH, CHAR_EMBEDDING_SIZE]

        tf.logging.info('char_embeddings =====> {}'.format(char_embeddings))

with tf.variable_scope("chars_level_bilstm_layer"):
        # put the time dimension on axis=1
        shape = tf.shape(char_embeddings)

        BATCH_SIZE = shape[0]
        MAX_DOC_LENGTH = shape[1]
        CHAR_MAX_LENGTH = shape[2]

        # [BATCH_SIZE, MAX_SEQ_LENGTH, MAX_WORD_LEGTH, CHAR_EMBEDDING_SIZE]  ===>
        #      [BATCH_SIZE * MAX_SEQ_LENGTH, MAX_WORD_LEGTH, CHAR_EMBEDDING_SIZE]
        char_embeddings = tf.reshape(char_embeddings,
                                     shape=[BATCH_SIZE * MAX_DOC_LENGTH, CHAR_MAX_LENGTH,
                                            CHAR_EMBEDDING_SIZE],
                                     name="reduce_dimension_1")

        tf.logging.info('reshaped char_embeddings =====> {}'.format(char_embeddings))

        # word_lengths = get_sequence_length_old(char_embeddings) TODO working
        word_lengths = get_sequence_length(char_ids_reshaped)

        tf.logging.info('word_lengths =====> {}'.format(word_lengths))

        # bi lstm on chars
        cell_fw = tf.contrib.rnn.LSTMCell(CHAR_LEVEL_LSTM_HIDDEN_SIZE,
                                          state_is_tuple=True)
        cell_bw = tf.contrib.rnn.LSTMCell(CHAR_LEVEL_LSTM_HIDDEN_SIZE,
                                          state_is_tuple=True)

        _output = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=cell_fw,
            cell_bw=cell_bw,
            dtype=tf.float32,
            sequence_length=word_lengths,
            inputs=char_embeddings,
            scope="encode_words")

        # read and concat output
        (char_fw_output_one, char_bw_output_one) , output_state = _output
        ((hidden_fw, output_fw), (hidden_bw, output_bw)) = output_state
        encoded_words = tf.concat([output_fw, output_bw], axis=-1)
        
        char_encoded = tf.concat([char_fw_output_one,
                                  char_bw_output_one], axis=-1)
        lstm_out_encoded_words = encoded_words
        # [BATCH_SIZE, MAX_SEQ_LENGTH, WORD_EMBEDDING_SIZE]
        encoded_words = tf.reshape(encoded_words,
                                   shape=[BATCH_SIZE, MAX_DOC_LENGTH, 2 *
                                          CHAR_LEVEL_LSTM_HIDDEN_SIZE])

        tf.logging.info('encoded_words =====> {}'.format(encoded_words))

with tf.Session() as sess:
    
    sess.run(tf.global_variables_initializer())
    tf.tables_initializer().run()
    print ("titles=", titles.eval(), titles.shape)
    print('--------------------------------------------------------')
    print ("words=", words.eval())
    print('--------------------------------------------------------')
    print ("dense=", densewords.eval(), densewords.shape)
    print('--------------------------------------------------------')
    print ("numbers=", numbers.eval(), numbers.shape)
    print('--------------------------------------------------------')
#     print ("padding=", padding.eval(), padding.shape)
#     print('--------------------------------------------------------')
#     print ("padded=", padded.eval(), padded.shape)
#     print('--------------------------------------------------------')
#     print ("sliced=", sliced.eval(), sliced.shape)
#     print('--------------------------------------------------------')
    

    # [?, self.MAX_DOCUMENT_LENGTH, self.EMBEDDING_SIZE]
#     tf.logging.info('words_embed={}'.format(word_embeddings))
    
# #     tf.logging.info('words_embed={}'.format(word_embeddings.eval()))


    
    tf.logging.info('fw_output_one =====> {}'.format(fw_output_one.get_shape()))
    tf.logging.info('bw_output_one =====> {}'.format(bw_output_one.get_shape()))
    tf.logging.info('forward hidden state =====> {}'.format(output_states[0][0].get_shape()))
    tf.logging.info('forward out state =====> {}'.format(output_states[0][1].get_shape()))
    tf.logging.info('backward hidden state =====> {}'.format(output_states[1][0].get_shape()))
    tf.logging.info('backward out state =====> {}'.format(output_states[1][1].get_shape()))
    
    
    tf.logging.info('encoded_sentence =====> {}'.format(encoded_sentence.get_shape()))
    encoded_senence_out =  encoded_sentence.eval()
    tf.logging.info('encoded_senence_out =====> {}'.format(encoded_senence_out.shape))
    
    length = get_sequence_length(numbers)
    print("Get dynamic sequence lengths: ", sess.run(length))
    #By printing the output of LSTM we can see clearly that it only calculates for given
    #sequence length and rest are appended with zeros
    print("Word Ids: \n", numbers.eval())
    print("encoded_senence_out:\n" , encoded_senence_out)
    
    
    tf.logging.info('char_ids =====> {}'.format(char_ids_reshaped.get_shape()))
    tf.logging.info('char_ids_reshaped =====> {}\n'.format(char_ids_reshaped.eval()))
    
    tf.logging.info('char_embeddings =====> {}'.format(char_embeddings.shape))
    char_embeddings_out = char_embeddings.eval()
    print(char_embeddings_out.shape)
    
    tf.logging.info('word_lengths =====> {}'.format(word_lengths.eval()))
    tf.logging.info('char_encoded =====> {}'.format(char_encoded.get_shape()))
    print("char_encoded:\n", char_encoded.eval())
    
    tf.logging.info('char hidden_fw =====> {}'.format(hidden_fw.get_shape()))
    tf.logging.info('char output_fw =====> {}'.format(output_fw.get_shape()))
    tf.logging.info('char hidden_bw =====> {}'.format(hidden_bw.get_shape()))
    tf.logging.info('char output_bw =====> {}'.format(output_bw.get_shape()))
    
    tf.logging.info('lstm_out_encoded_words =====> {}'.format(lstm_out_encoded_words.get_shape()))
    tf.logging.info('lstm_out_encoded_words =====> {}\n'.format(lstm_out_encoded_words.eval()))
    
    tf.logging.info('encoded_words =====> {}'.format(encoded_words.get_shape()))
    tf.logging.info('encoded_words =====> {}\n'.format(encoded_words.eval()))
    
    
    
    
    

Simple --> [4]
Some unknown title --> [2 0 3]
SparseTensor(indices=Tensor("StringSplit:0", shape=(?, 2), dtype=int64), values=Tensor("StringSplit:1", shape=(?,), dtype=string), dense_shape=Tensor("StringSplit:2", shape=(2,), dtype=int64))
INFO:tensorflow:encoded_sentence =====> Tensor("word_level_lstm_layer/concat:0", shape=(?, ?, 6), dtype=float32)
INFO:tensorflow:char_embeddings =====> Tensor("char_embed_layer/EmbedSequence/embedding_lookup:0", shape=(6, 7, 6, 3), dtype=float32)
INFO:tensorflow:reshaped char_embeddings =====> Tensor("chars_level_bilstm_layer/reduce_dimension_1:0", shape=(?, ?, 3), dtype=float32)
INFO:tensorflow:word_lengths =====> Tensor("chars_level_bilstm_layer/Sum:0", shape=(?,), dtype=int32)
INFO:tensorflow:encoded_words =====> Tensor("chars_level_bilstm_layer/Reshape:0", shape=(?, ?, 6), dtype=float32)
titles= [b'Some title' b'Simple' b'A longer title' b'An even longer title'
 b'This is longer than doc length isnt' b''] (6,)
-------------------------------------

INFO:tensorflow:char output_fw =====> (?, 3)
INFO:tensorflow:char hidden_bw =====> (?, 3)
INFO:tensorflow:char output_bw =====> (?, 3)
INFO:tensorflow:lstm_out_encoded_words =====> (?, 6)
INFO:tensorflow:lstm_out_encoded_words =====> [[-0.00930316 -0.01440556  0.02546123  0.04592736  0.04153579  0.00817145]
 [-0.16451286 -0.05156391 -0.02350709 -0.07512818 -0.10014518  0.1288799 ]
 [ 0.          0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.        ]
 [-0.06948498 -0.02573795 -0.01167641  0.06477174  0.04810657  0.02968378]
 [ 0.          0.          0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.        ]
 [ 0.          0

# Estimators Inputs
- https://www.tensorflow.org/api_docs/python/tf/estimator/inputs

In [None]:
!ls ../../../data/

In [130]:
import itertools

import pandas as pd
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.INFO)

COLUMNS = ["crim", "zn", "indus", "nox", "rm", "age",
           "dis", "tax", "ptratio", "medv"]
FEATURES = ["crim", "zn", "indus", "nox", "rm",
            "age", "dis", "tax", "ptratio"]
LABEL = "medv"


def get_input_fn(data_set, num_epochs=None, shuffle=True):
    return tf.estimator.inputs.pandas_input_fn(
        x=pd.DataFrame({k: data_set[k].values for k in FEATURES}),
        y=pd.Series(data_set[LABEL].values),
        num_epochs=num_epochs,
        shuffle=shuffle)


# def main(unused_argv):
    # Load datasets
training_set = pd.read_csv("../data/boston_train.csv", skipinitialspace=True,
                         skiprows=1, names=COLUMNS)
test_set = pd.read_csv("../data/boston_test.csv", skipinitialspace=True,
                     skiprows=1, names=COLUMNS)

# Set of 6 examples for which to predict median house values
prediction_set = pd.read_csv("../data/boston_predict.csv", skipinitialspace=True,
                           skiprows=1, names=COLUMNS)

# Feature cols
feature_cols = [tf.feature_column.numeric_column(k) for k in FEATURES]

# Build 2 layer fully connected DNN with 10, 10 units respectively.
regressor = tf.estimator.DNNRegressor(feature_columns=feature_cols,
                                    hidden_units=[10, 10],
                                    model_dir="/tmp/boston_model")

# Train
regressor.train(input_fn=get_input_fn(training_set), steps=5000)

# Evaluate loss over one epoch of test_set.
ev = regressor.evaluate(
  input_fn=get_input_fn(test_set, num_epochs=1, shuffle=False))
loss_score = ev["loss"]
print("Loss: {0:f}".format(loss_score))

# Print out predictions over a slice of prediction_set.
y = regressor.predict(
  input_fn=get_input_fn(prediction_set, num_epochs=1, shuffle=False))
# .predict() returns an iterator of dicts; convert to a list and print
# predictions
predictions = list(p["predictions"] for p in itertools.islice(y, 6))
print("Predictions: {}".format(str(predictions)))


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/boston_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f26e1b0ce10>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/boston_model/model.ckpt.
INFO:tensorflow:loss = 126708.0, step = 1
INFO:tensorflow:global_step/sec: 148.62
INFO:tensorflow:loss = 11384.6, step = 101 (0.678 sec)
INFO:tensorflow:global_step/sec: 220.339
INFO:tensorflow:loss = 9351.81, step = 201 (0.452 sec)
INFO:tensorflow:global_step/sec: 220.878
INFO:tensorflow:loss = 121

KeyboardInterrupt: 

# Char Embeddings

In [76]:
# random_3_dim_mat = np.random.randint(1, 5, size=(3,5,4))

#PADDED LENGTH => 5,5,5
#ACtual LENGTH => 4,3,5
random_3_dim_mat = np.array([
        [
        [1, 1, 2, 0],
        [4, 0, 0, 0],
        [2, 4, 2, 3],
        [3, 1, 0, 0],
        [0, 0, 0, 0]],

       [[3, 1, 2, 1],
        [1, 1, 2, 3],
        [4, 2, 2, 1],
        [0, 0, 0, 0], # eg: <PAD> ==> < P A D  > <PAD> <PAD> => 6, 12, 16, 19, 7, 0, 0
        [0, 0, 0, 0]],

       [[1, 3, 4, 2],
        [4, 4, 3, 3],
        [1, 2, 4, 2],
        [4, 2, 2, 1],
        [4, 2, 3, 2]]])

In [77]:
def get_sequence_length(sequence_ids, axis=1, pad_word_id=0):
    '''
    Returns the sequence length, droping out all the padded tokens if the sequence is padded
    
    :param sequence_ids: Tensor(shape=[batch_size, doc_length])
    :param pad_word_id: 0 is default
    :return: Array of Document lengths of size batch_size
    '''
    flag = tf.greater_equal(sequence_ids, pad_word_id)
    used = tf.cast(flag, tf.int32)
    length = tf.reduce_sum(used, axis)
    length = tf.cast(length, tf.int32)
    return length

In [78]:

# print(random_3_dim_mat)
random_3_dim_mat_reshaped = random_3_dim_mat.reshape(15,4)
random_3_dim_mat_reshaped

array([[1, 1, 2, 0],
       [4, 0, 0, 0],
       [2, 4, 2, 3],
       [3, 1, 0, 0],
       [0, 0, 0, 0],
       [3, 1, 2, 1],
       [1, 1, 2, 3],
       [4, 2, 2, 1],
       [0, 0, 0, 0],
       [0, 0, 0, 0],
       [1, 3, 4, 2],
       [4, 4, 3, 3],
       [1, 2, 4, 2],
       [4, 2, 2, 1],
       [4, 2, 3, 2]])

In [81]:
with tf.Session() as sess:
    find_zero_rows = tf.reduce_sum(random_3_dim_mat_reshaped, 1)
    flag_zero_rows = tf.greater_equal(find_zero_rows, 1)
    print(sess.run(flag_zero_rows))
    filtered = tf.boolean_mask(random_3_dim_mat_reshaped, flag_zero_rows)

#     flag = tf.greater_equal(random_3_dim_mat, 1)
# #     print(flag.eval())
#     used = tf.cast(flag, tf.int32)
#     print(used.eval())
#     length = tf.reduce_sum(used, 1)
#     length = length.eval()
#     print(length)
#     length = tf.slice(length, [0, 0], [3, 1])
#     length = tf.cast(length, tf.int32)
#     length = length.eval()
#     print(length)
    
    length = sess.run(get_sequence_length(random_3_dim_mat, axis=1))
    print(length)
    filtered = sess.run(filtered)
    print(filtered)

[ True  True  True  True False  True  True  True False False  True  True
  True  True  True]
[[5 5 5 5]
 [5 5 5 5]
 [5 5 5 5]]
[[1 1 2 0]
 [4 0 0 0]
 [2 4 2 3]
 [3 1 0 0]
 [3 1 2 1]
 [1 1 2 3]
 [4 2 2 1]
 [1 3 4 2]
 [4 4 3 3]
 [1 2 4 2]
 [4 2 2 1]
 [4 2 3 2]]


In [82]:
filtered.shape

(12, 4)

In [36]:
three_dim = np.random.randint(0, 5, size=(3,4,1))
three_dim.flatten()

array([4, 4, 1, 3, 1, 4, 1, 0, 2, 1, 4, 1])

In [28]:
with tf.Session() as sess:
    res = sess.run(tf.nn.top_k([[1,3,4,8,2,9], [10,3,44,8,2,9]], k=3))
    unstacked = sess.run(tf.unstack(three_dim, axis=1))
    print(res)
    print(unstacked)

TopKV2(values=array([[ 9,  8,  4],
       [44, 10,  9]], dtype=int32), indices=array([[5, 3, 2],
       [2, 0, 5]], dtype=int32))
[array([[4],
       [4],
       [1]]), array([[3],
       [4],
       [4]]), array([[3],
       [0],
       [0]]), array([[2],
       [3],
       [3]])]


In [12]:
indices = res.indices
# indices.append(res.indices)

In [18]:
col3 = indices[:, 2:3]

In [19]:
list(map(lambda x:"e", col3))

['e', 'e']

In [None]:
for 

# References: 
- https://medium.com/towards-data-science/how-to-do-text-classification-using-tensorflow-word-embeddings-and-cnn-edae13b3e575
- https://github.com/GoogleCloudPlatform/training-data-analyst/tree/master/blogs/textclassification

In [None]:
# Convert this notebook for Docs
! jupyter nbconvert --to markdown --output-dir ../docs/_posts 2017-11-11-DatasetHandling.ipynb