In [3]:
import logging
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.contrib import rnn
from datetime import datetime
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
from typing import List
from nltk.tokenize import word_tokenize
import itertools
import nltk
import collections
import pickle
import re

  from ._conv import register_converters as _register_converters


In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
ENTROPY_PATH = os.path.join('/dataset', 'entropy_2018')
TRAINING_PATH = os.path.join(ENTROPY_PATH, 'training_set.csv')
TEST_PATH = os.path.join(ENTROPY_PATH, 'test_set.csv')

In [6]:
df_train = pd.read_csv(TRAINING_PATH)

In [114]:
x=df_train['sentence']
y=df_train['sentiment']

In [116]:
k = zip(x, y)

In [119]:
next(k)

('NguyenPhong di hok? Voucher co HSD den bao gio vay ad?', 'neutral')

In [47]:
def preprocess_text(doc):
    doc = doc.lower()
    NUMBERS_PATTERN = re.compile(r"[+-]?\d+(?:\.\d+)?")
    doc = re.sub(NUMBERS_PATTERN, '', doc)
    URL_PATTERN = re.compile(
            r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
    doc = re.sub(URL_PATTERN, 'URL', doc)
    return doc

In [48]:
class Text2Vector:
    OUT_OF_VOCAB = 'OUT_OF_VOCAB'
    VOCAB_SIZE = 10000
    def __init__(self):
        self.counts = None
        self.int_to_vocab = None
        self.vocab_to_int = None

    def __tokenize(self, text):
        """

        :param text:
        :return: list
        """
        return word_tokenize(text)

    def doc_to_vec(self, list_documents):
        logging.debug('-- From doc_to_vec')
        assert isinstance(list_documents, list)
        len_list = len(list_documents)
        tokenized_documents = []
        
        list_documents = [preprocess_text(doc) for doc in list_documents]
        for i, doc in enumerate(list_documents):
            if i % 100 == 0:
                logging.debug('--- Tokenizing: {}\{}, len={}'.format(i, len_list, len(doc)))
            tokenized_documents.append(self.__tokenize(doc))

        return [self.__transform(doc) for doc in tokenized_documents]

    def vec_to_doc(self, list_vecs):
        assert isinstance(list_vecs, list) or isinstance(list_vecs, np.ndarray)
        return [self.__invert_transform(vec) for vec in list_vecs]

    def fit(self, list_texts):
        logging.debug('-- From fit')
        if self.counts or self.vocab_to_int or self.int_to_vocab:
            raise Exception('"fit" is a one-time function')
        list_tokenized_texts = [self.__tokenize(text) for text in list_texts]
        all_tokens = itertools.chain(*list_tokenized_texts)
        self.counts = collections.Counter(all_tokens)

        self.int_to_vocab = self.__get_vocab(vocab_size=Text2Vector.VOCAB_SIZE-1) # 1 for PADDING
        self.int_to_vocab = self.int_to_vocab + [Text2Vector.OUT_OF_VOCAB]
        self.vocab_to_int = {word: index for index, word in enumerate(self.int_to_vocab)}

    def __transform(self, list_tokens):
        if not self.vocab_to_int:
            raise Exception('vocab_to_int is None')

        return [self.vocab_to_int[token] if token in self.vocab_to_int else self.vocab_to_int[Text2Vector.OUT_OF_VOCAB] for token in list_tokens]

    def __invert_transform(self, list_ints):
        """

        :param list_ints:
        :return: A document str
        """
        if not self.int_to_vocab:
            raise Exception('vocab_to_int is None')

        return ' '.join([self.int_to_vocab[int_item] for int_item in list_ints])

    def __get_vocab(self, vocab_size=1):
        if not self.counts:
            raise Exception('counts is None')
        return [item[0] for item in self.counts.most_common(n=vocab_size)]

    def get_most_common(self, n=10):
        if not self.counts:
            raise Exception('counts is None')
        return self.counts.most_common(n)

    def export_vocab(self, output_file):
        pd.DataFrame({'word': self.int_to_vocab}).to_csv(output_file, index=False, header=False)
        logging.debug('Exported %s words in vocab into file %s', len(self.int_to_vocab), output_file)

In [37]:
if os.path.exists('text2vec.p'):
    logging.info('Load from saved pickle')
    text2vec = pickle.load(open('text2vec.p', 'rb'))
else:
    logging.info('Fitting')
    text2vec = Text2Vector()
    text2vec.fit(list(df_train['sentence']))
    pickle.dump(text2vec, open('text2vec.p', 'wb' ))

INFO:root:Load from saved pickle


In [92]:

def preprocess_datapoint(doc, label):
    if not isinstance(doc, str):
        doc = doc.decode('utf-8')
        
    if not isinstance(label, str):
        label = label.decode('utf-8')

    LABEL_MAPPING = {
        'positive': 0,
        'neutral': 1,
        'negative': 2
    }
    doc = preprocess_text(doc)
    doc = text2vec.doc_to_vec([doc])[0]
    label = LABEL_MAPPING[label]
    
    return (doc, label)


In [104]:
with tf.Graph().as_default():
    with tf.Session() as sess:
        filenames = [TRAINING_PATH]
        record_defaults = [tf.string, tf.string]
        dataset = tf.contrib.data.CsvDataset(filenames, record_defaults, header=True)
        dataset = dataset.map(lambda doc, label: tuple(tf.py_func(preprocess_datapoint, [doc, label], [tf.int64, tf.int64])))
        dataset = dataset.padded_batch(4, padded_shapes=(1, 1))

#         dataset = dataset.batch(4)
        iterator = dataset.make_one_shot_iterator()
        next_element = iterator.get_next()
        x = sess.run(next_element)
        y = sess.run(next_element)

DEBUG:root:-- From doc_to_vec
DEBUG:root:--- Tokenizing: 0\1, len=26
DEBUG:root:-- From doc_to_vec
DEBUG:root:--- Tokenizing: 0\1, len=80
DEBUG:root:-- From doc_to_vec
DEBUG:root:--- Tokenizing: 0\1, len=54
DEBUG:root:-- From doc_to_vec
DEBUG:root:--- Tokenizing: 0\1, len=195


DataLossError: Attempted to pad to a smaller size than the input element.
	 [[Node: IteratorGetNext = IteratorGetNext[output_shapes=[[?,1], [?,1]], output_types=[DT_INT64, DT_INT64], _device="/job:localhost/replica:0/task:0/device:CPU:0"](OneShotIterator)]]

Caused by op 'IteratorGetNext', defined at:
  File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.5/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.5/dist-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelapp.py", line 486, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.5/dist-packages/tornado/platform/asyncio.py", line 127, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.5/asyncio/base_events.py", line 345, in run_forever
    self._run_once()
  File "/usr/lib/python3.5/asyncio/base_events.py", line 1312, in _run_once
    handle._run()
  File "/usr/lib/python3.5/asyncio/events.py", line 125, in _run
    self._callback(*self._args)
  File "/usr/local/lib/python3.5/dist-packages/tornado/platform/asyncio.py", line 117, in _handle_events
    handler_func(fileobj, events)
  File "/usr/local/lib/python3.5/dist-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/usr/local/lib/python3.5/dist-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tornado/stack_context.py", line 276, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/local/lib/python3.5/dist-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2662, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2785, in _run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2903, in run_ast_nodes
    if self.run_code(code, result):
  File "/usr/local/lib/python3.5/dist-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-104-4596c5fb3e1b>", line 11, in <module>
    next_element = iterator.get_next()
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/data/ops/iterator_ops.py", line 373, in get_next
    name=name)), self._output_types,
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/gen_dataset_ops.py", line 1666, in iterator_get_next
    output_shapes=output_shapes, name=name)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 3417, in create_op
    op_def=op_def)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/framework/ops.py", line 1743, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

DataLossError (see above for traceback): Attempted to pad to a smaller size than the input element.
	 [[Node: IteratorGetNext = IteratorGetNext[output_shapes=[[?,1], [?,1]], output_types=[DT_INT64, DT_INT64], _device="/job:localhost/replica:0/task:0/device:CPU:0"](OneShotIterator)]]


In [98]:
isinstance(y, collections.Sequence)

True

In [22]:
text2vec.vec_to_doc(text2vec.doc_to_vec(['duc tri nguyen']))

['OUT_OF_VOCAB tri nguyen']

In [None]:
def build_input_v2():
    """
    Return tensor input
    """
    SENTENCE_MAX_LENGTH = 150
    BATCH_SIZE = 2
    
    filenames = [TRAINING_PATH]
    record_defaults = [tf.string, tf.string]
    dataset = tf.contrib.data.CsvDataset(filenames, record_defaults, header=True)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.map(preprocess_text)
    iterator = dataset.make_one_shot_iterator()
    tf_X, tf_y = iterator.get_next()

    return tf_X, tf_y

In [None]:

### -----------------------------------------------------------------
###           INGRADIENTS: atomic elements
### -----------------------------------------------------------------
def build_input_v1():
    """
    Return tensor input
    """
    SENTENCE_MAX_LENGTH = 150
    tf_X = tf.placeholder(dtype=tf.int32, name='tf_X', shape=[None, SENTENCE_MAX_LENGTH])
    tf_y = tf.placeholder(dtype=tf.int32, name='tf_y', shape=[None])
    return tf_X, tf_y

def build_inference_v1(tf_X):
    def project(tf_X):
        with tf.device('/cpu:0'), tf.variable_scope('embedding'):
            VOCAB_SIZE = 10000
            EMBEDDING_SIZE = 300

            tf_word_embeddings = tf.get_variable(name='word_embeddings', dtype=tf.float32,
                                              shape=[VOCAB_SIZE, EMBEDDING_SIZE],
                                              initializer=tf.truncated_normal_initializer(stddev=5e-2))
            tf_projected_sentences = tf.nn.embedding_lookup(params=tf_word_embeddings, ids=tf_X)
            return tf_projected_sentences
    
    tf_projected_sens = project(tf_X)
    tf_projected_sens = tf.expand_dims(tf_projected_sens, axis=3)
    
    with tf.variable_scope('convolution_layer'):
        tf_after_conv = tf.layers.conv2d(inputs=tf_projected_sens, filters=10, kernel_size=(5, 5), strides=(2, 2), padding='SAME', name='conv1')
        tf_after_conv = tf.layers.conv2d(inputs=tf_after_conv, filters=20, kernel_size=(3, 3), strides=(2, 2), padding='SAME', name='conv2')
    
    with tf.variable_scope('softmax'):
        tf_flatten = tf.layers.flatten(tf_after_conv)
        tf_logits = tf.layers.dense(inputs=tf_flatten, units=3, activation=tf.nn.relu)
    
    return tf_logits
    
def build_loss_v1(tf_logits, tf_y):
    tf_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf_y, logits=tf_logits)
    tf_aggregated_loss = tf.reduce_mean(tf_losses)

    tf.summary.scalar(name='loss', tensor=tf_aggregated_loss)
    return tf_aggregated_loss

def build_optimize_v1(tf_loss):
    """
    Return tensor optimizer and global step
    """
    tf_global_step = tf.get_variable(name='global_step', dtype=tf.int32, shape=(), initializer=tf.zeros_initializer())
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.05).minimize(tf_loss, global_step=tf_global_step)
    return optimizer, tf_global_step

def build_predict(tf_logit):
    """
    Convert from tensor logit to tensor one hot
    """
    pass


def training_block(graph, tf_X, tf_y, tf_optimizer, tf_global_step, training_generator, test_generator):
    
    with graph.as_default() as gr:
        tf_all_summary = tf.summary.merge_all()
        
        current_dir = os.getcwd()
        experiment_name = datetime.strftime(datetime.now(), '%Y-%m-%dT%H:%M:%S')
        tf_train_writer = tf.summary.FileWriter(logdir=os.path.join(current_dir, 'summary', 'train_' + experiment_name), graph=graph)
        tf_test_writer = tf.summary.FileWriter(logdir=os.path.join(current_dir, 'summary', 'test_' + experiment_name), graph=graph)
        
        with tf.Session().as_default() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            SUMMARY_STEP = 10
            EVALUATION_STEP = 10
            for X, y in training_generator:
                feed_dict = {tf_X: X, tf_y: y}
                _, global_step = sess.run([tf_optimizer, tf_global_step], feed_dict=feed_dict)
                
                if global_step % SUMMARY_STEP == 0:
                    logging.debug('Collect summary data at step: %s', global_step)
                    train_summary_data = sess.run(tf_all_summary, feed_dict=feed_dict)
                    tf_train_writer.add_summary(train_summary_data, global_step=global_step)
                    
                if global_step % EVALUATION_STEP == 0:
                    logging.debug('Evaluate at step: %s', global_step)
                    X_test, y_test = next(test_generator)
                    
                    test_summary_data = sess.run(tf_all_summary, feed_dict={
                        tf_X: X_test,
                        tf_y: y_test
                    })
                    tf_test_writer.add_summary(test_summary_data, global_step=global_step)

def get_training_generator():
    X = np.random.randint(1000, size=(512, 150))
    y = np.random.randint(3, size=(512))
    for i in range(1000):
        yield X, y


In [None]:
graph = tf.Graph()
with graph.as_default():
    tf_X, tf_y = build_input_v1()
    tf_logit = build_inference_v1(tf_X)
    tf_loss = build_loss_v1(tf_logit, tf_y)
    tf_optimizer, tf_global_step = build_optimize_v1(tf_loss)
    
    training_block(graph=graph, tf_X=tf_X, tf_y=tf_y, training_generator=get_training_generator(), 
                   test_generator=get_training_generator(),
                   tf_optimizer=tf_optimizer,
                   tf_global_step=tf_global_step)
    
    

In [None]:
def build_inference_v2(tf_X):
    """
    ```Thoi kho qua, de thu sau
    Return tensor logit
    tf_X: [batch_size, sentence_max_length]
    """
    
    with tf.device('/cpu:0'), tf.variable_scope('embedding'):
        VOCAB_SIZE = 10000
        EMBEDDING_SIZE = 300
        
        tf_word_embeddings = tf.get_variable(name='word_embeddings', dtype=tf.float32,
                                          shape=[VOCAB_SIZE, EMBEDDING_SIZE],
                                          initializer=tf.truncated_normal_initializer(stddev=5e-2))
        tf_projected_sentences = tf.nn.embedding_lookup(params=tf_word_embeddings, ids=tf_X)
        list_tf_word_embeddings = tf.unstack(tf_projected_sentences, axis=1)
        
    with tf.variable_scope('LSTM'):
        STATE_SIZE = 200
        lstm_cell = rnn.BasicLSTMCell(STATE_SIZE, forget_bias=1.0)
        # Each output has shape of [batch_size, state_size]
        list_outputs, _ = rnn.static_rnn(cell=lstm_cell, inputs=word_embeddings, dtype=tf.float32)
    
    with tf.variable_scope('Attention'):
        ATTENTION_SIZE = 200
        tf_output = tf.stack(list_outputs, axis=2) # [batch_size, state_size, sentence_max_length]
        
#         tf_attention_weights =
        
        tf_after_attention = tf.layers.dense(tf_output, units=ATTENTION_SIZE, activation=tf.nn.relu)
        
    with tf.variable_scope('Fully-Connected'):
        tf_logit = tf.layers.dense(tf_after_attention, units=ATTENTION_SIZE, activation=tf.nn.relu)