**Jupyter notebook available @ [https://github.com/dhiraa/tf-guru/blob/master/dataset/2017-11-12-Pandas-DataFramet-to-TFRecord.ipynb](https://github.com/dhiraa/tf-guru/blob/master/dataset/2017-11-12-Pandas-DataFramet-to-TFRecord.ipynb)**

# What are we going to learn?
- Refer to my previous notebook @ [https://dhiraa.github.io/tf-guru/TFRecord.html](https://dhiraa.github.io/tf-guru/TFRecord.html)
- How to prepare text documents for large-scale training with TensorFlow?
- What is the use case?
    - Sequence Tagging
    - Dataset format is adopted from [https://www.clips.uantwerpen.be/conll2003/ner/](https://www.clips.uantwerpen.be/conll2003/ner/). This is a very custom setup for the CoNLL dataset!

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

# Handy Utils

In [2]:
def get_vocab(df: pd.DataFrame, text_col: str):
    '''
    Collect the set of unique tokens found in a text column.

    Each cell is split on a single space character. Cells that are missing
    (NaN/None) or not strings are skipped, so one bad row no longer cuts the
    vocabulary short.

    :param df: Pandas DataFrame
    :param text_col: Text Column
    :return: set of unique tokens across all rows of the column
    '''
    # `.str.split` yields NaN for missing / non-string cells; drop those so
    # only real token sequences are iterated.
    row_wise_tokens = df[text_col].str.split(" ").dropna()

    vocab = set()
    for row in row_wise_tokens:
        vocab.update(row)

    return vocab

In [3]:
def get_char_vocab(words_vocab):
    '''
    Build the character vocabulary of a collection of words.

    :param words_vocab: Iterable of words
    :return: sorted list of the distinct characters used by the words
    '''
    return sorted({letter for token in words_vocab for letter in token})

In [4]:
def get_char_vocab_from_df(df: pd.DataFrame, text_col):
    '''
    Build the character vocabulary of a DataFrame text column.

    :param df: Pandas DataFrame
    :param text_col: Text Column
    :return: result of get_char_vocab() applied to the column's word vocab
    '''
    words_vocab = get_vocab(df, text_col)
    return get_char_vocab(words_vocab)

### Dataset Preparation

- Read the IOB formatted text data into a Pandas DataFrame
- Mark the end of each sentence with `<CUSTOM_END>`, using the blank line that separates sentences in the dataset

In [5]:
# Sentinel value used to fill empty (NaN) cells when the dataset is loaded,
# and as the default sentence-boundary marker of make_seq_pair().
line_seperator = "<CUSTOM_END>"

In [6]:
# Load the space-delimited file without a header row. Blank lines are kept
# (skip_blank_lines=False) and every NaN cell is then replaced with
# `line_seperator`.
# NOTE(review): presumably the blank lines load as all-NaN rows, which is what
# turns them into sentence-boundary rows — confirm against the data file.
df = pd.read_csv("../data/conll-sample-dataset.txt", 
                 delimiter=" ", 
                 header=None, 
                 skip_blank_lines =False).fillna(line_seperator)
columns = ["word", "tag"] # names for the two columns
df.columns = columns # attach them to the DataFrame

In [7]:
# Display the first ten rows of the loaded DataFrame.
df[:10]

Unnamed: 0,word,tag
0,Jean,B-PER
1,Pierre,I-PER
2,lives,O
3,in,O
4,New,B-LOC
5,York,I-LOC
6,.,O
7,<CUSTOM_END>,<CUSTOM_END>
8,The,O
9,European,B-ORG


In [8]:
# df[columns].values

In [19]:
def make_seq_pair(df: pd.DataFrame,
                  word_col,
                  tag_col,
                  line_seperator=line_seperator):
    '''
    Group a one-token-per-row DataFrame into per-sentence examples.

    Rows are consumed in order; a row whose word equals `line_seperator`
    closes the current sentence. A trailing sentence that is not followed by
    a separator row is emitted as well.

    :param df: Pandas DataFrame with one token per row
    :param word_col: name of the column holding the tokens
    :param tag_col: name of the column holding the tags
    :param line_seperator: token value that marks the end of a sentence
    :return: tuple of three parallel lists, one entry per sentence:
             (space-joined words, list of per-word char-id lists,
              space-joined tags)
    '''
    # get the column values
    sequences = df[word_col].values
    labels = df[tag_col].values

    # extract char vocab, to build the char-id feature.
    # The vocab is built from the whole column, so the separator token's
    # characters receive ids too.
    chars_vocab = get_char_vocab_from_df(df, word_col)
    chars_vocab = {k: v for v, k in enumerate(chars_vocab)}
    print("Chars Vocab For reference!")
    print(chars_vocab)

    # Accumulators for the sentence currently being read
    list_text = []
    list_char_ids = []
    list_tag = []

    # [feature1, feature2, label]
    sentence_feature1 = []
    char_ids_feature2 = []
    tag_label = []

    for word, tag in zip(sequences, labels):
        if word != line_seperator:  # collect the sequence data till new line
            list_text.append(word)
            list_char_ids.append([chars_vocab[c] for c in word])
            list_tag.append(tag)
        else:  # separator row: make an example with features and label
            sentence_feature1.append(" ".join(list_text))
            char_ids_feature2.append(list_char_ids)
            tag_label.append(" ".join(list_tag))

            # Make the containers empty for the next sentence
            list_text = []
            list_char_ids = []
            list_tag = []

    # Data that does not end with a separator row would otherwise lose its
    # final sentence.
    if list_text:
        sentence_feature1.append(" ".join(list_text))
        char_ids_feature2.append(list_char_ids)
        tag_label.append(" ".join(list_tag))

    return sentence_feature1, char_ids_feature2, tag_label

In [20]:
# Build the per-sentence (text, char ids, tags) lists from the loaded DataFrame.
preprocessed_dataset = make_seq_pair(df, "word", "tag")

Chars Vocab For reference!
{'.': 0, '<': 1, '>': 2, 'A': 3, 'C': 4, 'D': 5, 'E': 6, 'F': 7, 'J': 8, 'M': 9, 'N': 10, 'O': 11, 'P': 12, 'S': 13, 'T': 14, 'U': 15, 'Y': 16, '_': 17, 'a': 18, 'c': 19, 'd': 20, 'e': 21, 'h': 22, 'i': 23, 'k': 24, 'l': 25, 'm': 26, 'n': 27, 'o': 28, 'p': 29, 'r': 30, 's': 31, 't': 32, 'u': 33, 'v': 34, 'w': 35}


In [21]:
# Unpack the three parallel lists returned by make_seq_pair().
sentence_feature1, char_ids_feature2, tag_label = preprocessed_dataset

In [22]:
print("Example 1:")
# Display the first sentence, its per-word character ids and its tag string.
sentence_feature1[0], char_ids_feature2[0], tag_label[0]

Example 1:


('Jean Pierre lives in New York .',
 [[8, 21, 18, 27],
  [12, 23, 21, 30, 30, 21],
  [25, 23, 34, 21, 31],
  [23, 27],
  [10, 21, 35],
  [16, 28, 30, 24],
  [0]],
 'B-PER I-PER O O B-LOC I-LOC O')

In [13]:
# Print each component of the first example on its own line.
print("Text: {}".format(sentence_feature1[0]))
print("Char Ids {}".format(char_ids_feature2[0]))
print("NER Tags {}".format(tag_label[0]))

Text: Jean Pierre lives in New York .
Char Ids [[8, 21, 18, 27], [12, 23, 21, 30, 30, 21], [25, 23, 34, 21, 31], [23, 27], [10, 21, 35], [16, 28, 30, 24], [0]]
NER Tags B-PER I-PER O O B-LOC I-LOC O


# Time to convert above feature & label pairs into TF Example

### Save as TFRecordFiles

In [14]:
def _int64_feature(value):
    """Wrap an int, or a list of ints, in an int64 tf.train.Feature."""
    values = value if isinstance(value, list) else [value]
    int64_list = tf.train.Int64List(value=values)
    return tf.train.Feature(int64_list=int64_list)


def _float_feature(value):
    """Wrap a float, or a list of floats, in a float tf.train.Feature."""
    values = value if isinstance(value, list) else [value]
    float_list = tf.train.FloatList(value=values)
    return tf.train.Feature(float_list=float_list)


def _bytes_feature(value):
    """Wrapper for inserting bytes features into Example proto.

    Accepts a single value or a list of values. `str` items are encoded to
    bytes with the default encoding; items that are already `bytes` are
    passed through unchanged.
    """
    if not isinstance(value, list):
        value = [value]
    # Encode per element, so a list of str and a bare bytes value both work.
    encoded = [v.encode() if isinstance(v, str) else v for v in value]
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=encoded))

def _validate_text(text):
    """If text is not str or unicode, then try to convert it to str."""
  
    if isinstance(text, str):
        return text
    elif isinstance(text, unicode):
        return text.encode('utf8', 'ignore')
    else:
        return str(text)
    
def make_example(text_sequence, char_ids, labels):
    """Build a tf.train.Example holding a sentence and its tag string.

    The example has two bytes features, "seq/text" and "seq/labels".
    `char_ids` is accepted but is not written to the example.
    """
    feature_map = {
        "seq/text": _bytes_feature(_validate_text(text_sequence)),
        "seq/labels": _bytes_feature(labels),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature_map))



In [15]:
def save_into_tfrecord(preprocessed_dataset, path="/tmp/dummy.tfrecord"):
    """Serialize every example of the dataset into one TFRecord file.

    Args:
        preprocessed_dataset: Three parallel sequences
            (texts, char ids, labels), one entry per example.
        path (str): Destination TFRecord file. Defaults to the location
            this function previously hard-coded.

    Returns:
        str: The path that was written.
    """
    # The writer opens the file itself. Using it as a context manager closes
    # it even when building or serializing an example raises.
    with tf.python_io.TFRecordWriter(path) as writer:
        # get each sequence along with its labels and write it as an Example
        for text_sequence, char_ids, labels in zip(*preprocessed_dataset):
            example_sequence = make_example(text_sequence, char_ids, labels)
            writer.write(example_sequence.SerializeToString())
    print("Wrote to {}".format(path))
    return path

In [16]:
# Write the preprocessed examples to disk and keep the resulting file path.
save_path = save_into_tfrecord(preprocessed_dataset)

Wrote to /tmp/dummy.tfrecord


### Read back from TFRecordFiles

In [17]:
def read_from_tfrecord(filenames):
    """Create graph ops that read one (text, labels) example from TFRecords.

    Args:
        filenames: List of TFRecord file paths fed to the filename queue.

    Returns:
        (text, labels): the parsed 'seq/text' and 'seq/labels' features.
    """
    file_queue = tf.train.string_input_producer(filenames, name='queue')
    _, serialized = tf.TFRecordReader().read(file_queue)

    # Both features are declared as variable-length string features.
    feature_spec = {
        'seq/text': tf.VarLenFeature(tf.string),
        'seq/labels': tf.VarLenFeature(tf.string),
    }
    parsed = tf.parse_single_example(serialized,
                                     features=feature_spec,
                                     name='features')

    text = tf.cast(parsed['seq/text'], tf.string)
    labels = tf.cast(parsed['seq/labels'], tf.string)
    return text, labels

def read_tfrecord(tfrecord_file, num_examples=5):
    """Print the text and labels of examples read from a TFRecord file.

    Args:
        tfrecord_file (str): Path of the TFRecord file to read.
        num_examples (int): How many examples to fetch and print.
            Defaults to 5, the count this function previously hard-coded.
    """
    text, labels = read_from_tfrecord([tfrecord_file])

    with tf.Session() as sess:
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        try:
            for _ in range(num_examples):
                text_retrieved, labels_retrieved = sess.run([text, labels])
                print(text_retrieved.values[0])
                print(labels_retrieved.values[0])
        finally:
            # Always stop and join the queue-runner threads, even when a
            # run() call fails.
            coord.request_stop()
            coord.join(threads)

In [18]:
# Read back and print examples from the file written above.
read_tfrecord(save_path)

b'Jean Pierre lives in New York .'
b'B-PER I-PER O O B-LOC I-LOC O'
b'The European Union is a political and economic union'
b'O B-ORG I-ORG O O O O O O'
b'A French American actor won an oscar'
b'O B-MISC I-MISC O O O O'
b'Jean Pierre lives in New York .'
b'B-PER I-PER O O B-LOC I-LOC O'
b'The European Union is a political and economic union'
b'O B-ORG I-ORG O O O O O O'


# Use it with Dataset API

In [212]:
# Placeholder for the list of TFRecord file paths; fed when the iterator
# is initialised.
filenames = tf.placeholder(tf.string, shape=[None])

# Build dataset iterator
dataset = tf.contrib.data.TFRecordDataset(filenames)

dataset = dataset.repeat(None)  # Infinite iterations
# if shuffle:
#     dataset = dataset.shuffle(buffer_size=10000)
# Group records into batches of 12. No parsing step is applied here, so
# the records are batched as stored in the file.
dataset = dataset.batch(12)
iterator_tf = dataset.make_initializable_iterator()

# Display the iterator object.
iterator_tf

<tensorflow.contrib.data.python.ops.dataset_ops.Iterator at 0x7f6bf4f23f60>

In [215]:
# Create the fetch op once, outside the run() call.
next_batch = iterator_tf.get_next()

# The context manager closes the session even if a run() call raises.
with tf.Session() as sess:
    # Point the iterator at the TFRecord file written earlier.
    sess.run(iterator_tf.initializer,
             feed_dict={filenames: [save_path]})
    res = sess.run(next_batch)
    print(res)

[ b'\nb\n/\n\nseq/labels\x12!\n\x1f\n\x1dB-PER I-PER O O B-LOC I-LOC O\n/\n\x08seq/text\x12#\n!\n\x1fJean Pierre lives in New York .'
 b'\ns\nD\n\x08seq/text\x128\n6\n4The European Union is a political and economic union\n+\n\nseq/labels\x12\x1d\n\x1b\n\x19O B-ORG I-ORG O O O O O O'
 b'\na\n)\n\nseq/labels\x12\x1b\n\x19\n\x17O B-MISC I-MISC O O O O\n4\n\x08seq/text\x12(\n&\n$A French American actor won an oscar'
 b'\nb\n/\n\x08seq/text\x12#\n!\n\x1fJean Pierre lives in New York .\n/\n\nseq/labels\x12!\n\x1f\n\x1dB-PER I-PER O O B-LOC I-LOC O'
 b'\ns\nD\n\x08seq/text\x128\n6\n4The European Union is a political and economic union\n+\n\nseq/labels\x12\x1d\n\x1b\n\x19O B-ORG I-ORG O O O O O O'
 b'\na\n)\n\nseq/labels\x12\x1b\n\x19\n\x17O B-MISC I-MISC O O O O\n4\n\x08seq/text\x12(\n&\n$A French American actor won an oscar'
 b'\nb\n/\n\x08seq/text\x12#\n!\n\x1fJean Pierre lives in New York .\n/\n\nseq/labels\x12!\n\x1f\n\x1dB-PER I-PER O O B-LOC I-LOC O'
 b'\ns\nD\n\x08seq/text\x128\n6\n4Th

In [217]:
# Inspect the Python type of the first element of the fetched batch.
type(res[0])

bytes

In [136]:
# Define the inputs
def setup_input_graph(tfrecord_filenames, batch_size, shuffle=True, scope='train-data'):
    """Return the input function and iterator hook for TFRecord data.

    Args:
        tfrecord_filenames (list): TFRecord file paths, fed to the filenames
                          placeholder when the iterator is initialised.
        batch_size (int): Batch size of training iterator that is returned
                          by the input function.
        shuffle (bool): Whether to shuffle examples (buffer of 10000).
        scope (str): Name scope under which the input ops are created.

    Returns:
        (Input function, IteratorInitializerHook):
            - Function that returns (features, labels) when called.
            - Hook to initialise input iterator.
    """
    # NOTE(review): IteratorInitializerHook is not defined in the visible
    # part of this file; it must be defined or imported before calling.
    iterator_initializer_hook = IteratorInitializerHook()

    def _parse_example(serialized):
        """Decode one serialized Example into (text, labels) string tensors."""
        # Each record stores a single bytes value under each of these keys.
        parsed = tf.parse_single_example(
            serialized,
            features={
                'seq/text': tf.FixedLenFeature([], tf.string),
                'seq/labels': tf.FixedLenFeature([], tf.string),
            })
        return parsed['seq/text'], parsed['seq/labels']

    def inputs():
        """Returns training set as Operations.

        Returns:
            (features, labels) Operations that iterate over the dataset
            on every evaluation
        """
        with tf.name_scope(scope):

            filenames = tf.placeholder(tf.string, shape=[None])

            # Build dataset iterator
            dataset = tf.data.TFRecordDataset(filenames)
            # Parse the records so every element is a (text, labels) pair;
            # without this get_next() yields a single tensor of serialized
            # strings and cannot be unpacked into two values.
            dataset = dataset.map(_parse_example)

            dataset = dataset.repeat(None)  # Infinite iterations
            if shuffle:
                dataset = dataset.shuffle(buffer_size=10000)
            dataset = dataset.batch(batch_size)
            iterator = dataset.make_initializable_iterator()

            # Set runhook to initialize iterator
            iterator_initializer_hook.iterator_initializer_func = \
                lambda sess: sess.run(
                    iterator.initializer,
                    feed_dict={filenames: tfrecord_filenames})

            next_example, next_label = iterator.get_next()

            # Return batched (features, labels)
            return next_example, next_label

    # Return function and hook
    return inputs, iterator_initializer_hook

### References:
- [https://github.com/visipedia/tfrecords/blob/master/create_tfrecords.py](https://github.com/visipedia/tfrecords/blob/master/create_tfrecords.py)
- [https://developers.googleblog.com/2017/09/introducing-tensorflow-datasets.html](https://developers.googleblog.com/2017/09/introducing-tensorflow-datasets.html)