In [1]:
import logging
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.contrib import rnn
from datetime import datetime
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
from typing import List
from nltk.tokenize import word_tokenize
import itertools
import nltk
import collections
import pickle
import re
from text2vector import Text2Vector
from dataset import Dataset

  from ._conv import register_converters as _register_converters


In [2]:
# Fetch NLTK's 'punkt' tokenizer data (reported as already up-to-date when present).
# Presumably required by `word_tokenize` imported above — TODO confirm which code path needs it.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Root folder of the Entropy 2018 data; the train/test CSV files sit directly inside it.
ENTROPY_PATH = os.path.join('/dataset', 'entropy_2018')
TRAINING_PATH, TEST_PATH = (
    os.path.join(ENTROPY_PATH, csv_name)
    for csv_name in ('training_set.csv', 'test_set.csv')
)

In [5]:
# Compiled once at definition time instead of on every call.
_URL_PATTERN = re.compile(
    r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')
_NUMBERS_PATTERN = re.compile(r"[+-]?\d+(?:\.\d+)?")


def preprocess_text(doc):
    """
    Normalise a raw document before tokenisation.

    Steps, in order:
      1. lower-case the text;
      2. replace every URL with the literal token 'URL';
      3. delete numbers (optional sign, integer part, optional decimal part).

    URLs are replaced *before* numbers are removed. Stripping digits first would
    mangle URLs whose path or host contains digits (e.g. 'example.com/123' would
    become 'example.com/'), so they would no longer be recognised as URLs.

    NOTE: artifacts cached from an earlier preprocessing order (e.g. 'text2vec.p',
    'dataset.p') should be regenerated so training data and inference agree.

    Args:
        doc: the raw document as a `str`.

    Returns:
        The normalised document as a `str`.
    """
    doc = doc.lower()
    doc = _URL_PATTERN.sub('URL', doc)
    doc = _NUMBERS_PATTERN.sub('', doc)
    return doc


In [10]:

# --- Vocabulary model: reuse the pickled Text2Vector if it exists, otherwise fit and cache it ---
if os.path.exists('text2vec.p'):
    logging.info('Load text2vector object from saved pickle')
    # NOTE: unpickling can execute arbitrary code; only load cache files created by this notebook.
    # `with` guarantees the file handle is closed even if unpickling fails.
    with open('text2vec.p', 'rb') as text2vec_file:
        text2vec_model = pickle.load(text2vec_file)
else:
    logging.info('Fitting')
    df_train = pd.read_csv(TRAINING_PATH)
    docs = df_train['sentence'].map(preprocess_text)
    text2vec_model = Text2Vector()
    text2vec_model.fit(docs)
    # Closing the file via `with` ensures the pickle is fully flushed to disk
    # before any later cell (or run) tries to read it back.
    with open('text2vec.p', 'wb') as text2vec_file:
        pickle.dump(text2vec_model, text2vec_file)


# --- Digitized dataset: reuse the pickled version if it exists, otherwise build it from the CSV ---
if os.path.exists('dataset.p'):
    logging.info('Load dataset from pickle')
    dataset = Dataset.from_pickle_file('dataset.p')
else:
    logging.info('Load dataset from CSV')
    # Sentiment label string -> integer class id.
    LABEL_MAPPING = {
        'positive': 0,
        'neutral': 1,
        'negative': 2
    }

    def digitize_datapoint(datapoint):
        """Turn one (document, label) pair into (vector from text2vec_model, integer label)."""
        doc, label = datapoint
        doc = preprocess_text(doc)
        return text2vec_model.doc_to_vec([doc])[0], LABEL_MAPPING[label]

    text_dataset = Dataset.from_csv(TRAINING_PATH)
    dataset = text_dataset.map(digitize_datapoint)
    dataset.save('dataset.p')

# Shuffle, then batch two examples at a time; the first component is padded to length 150
# using the vocabulary id of the PADDING token (second component is left unpadded).
dataset = dataset.shuffle(10000)
dataset = dataset.padded_batch(batch_size=2, list_lengths=(150, None), padded_value=text2vec_model.vocab_to_int[Text2Vector.PADDING])

INFO:root:Load text2vector object from saved pickle
INFO:root:Load dataset from pickle


In [38]:
# Debug view: raw CSV rows next to their digitized counterparts.
# Shuffling stays disabled on both sides (presumably so the two iterators remain aligned — verify).
text_db = Dataset.from_csv(TRAINING_PATH)
# text_db = text_db.shuffle(10000)
db = Dataset.from_pickle_file('dataset.p')
# db = db.shuffle(10000)
padding_id = text2vec_model.vocab_to_int[Text2Vector.PADDING]
db = db.padded_batch(
    batch_size=1,
    list_lengths=(150, None),
    padded_value=padding_id,
)

In [39]:
# One iterator over the raw text dataset and one over the digitized, padded dataset.
iter1, iter2 = text_db.get_iterator(), db.get_iterator()

In [48]:
# Display the next raw (sentence, label) item from the CSV-backed iterator.
next(iter1)

[("Xin chân thành cảm ơn nhà mạng  MobiFone July Six added 2 new photos — with trương thuật and 5 others. Photos from July Six's post",
  'neutral')]

In [49]:
# Take the next padded batch and decode its first vector back into tokens,
# to compare visually against the raw sentence shown by the text iterator.
padded_batch = next(iter2)
text2vec_model.vec_to_doc([padded_batch[0][0]])

["xin chân thành cảm ơn nhà mạng mobifone july OUT_OF_VOCAB added new photos — with trương thuật and others . photos from july OUT_OF_VOCAB 's post PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADDING PADD

In [None]:
# Experiment: read the training CSV through tf.data and pull two batches.
# NOTE(review): `preprocess_datapoint` is not defined in the visible part of this notebook —
# this cell raises NameError on a fresh kernel unless it is defined elsewhere.
with tf.Graph().as_default(), tf.Session() as sess:

    def _digitize(doc, label):
        # Run the Python-side preprocessing inside the graph; both outputs are declared int64.
        return tuple(tf.py_func(preprocess_datapoint, [doc, label], [tf.int64, tf.int64]))

    dataset = (
        tf.contrib.data.CsvDataset([TRAINING_PATH], [tf.string, tf.string], header=True)
        .map(_digitize)
        .padded_batch(4, padded_shapes=(1, 1))
    )
    # Alternative kept from the original cell: dataset = dataset.batch(4)

    next_element = dataset.make_one_shot_iterator().get_next()
    x = sess.run(next_element)
    y = sess.run(next_element)

In [None]:
# Check whether the fetched batch is a sequence type.
# `collections.Sequence` is a deprecated alias (removed in Python 3.10); the ABC lives in
# `collections.abc`. The explicit import guarantees the submodule is loaded.
import collections.abc
isinstance(y, collections.abc.Sequence)

In [None]:
# Round-trip sanity check: text -> vector -> text.
# Uses `text2vec_model`, the object created by the loading cell; the bare name `text2vec`
# is never defined in this notebook and would raise NameError on a fresh kernel.
text2vec_model.vec_to_doc(text2vec_model.doc_to_vec(['duc tri nguyen']))

In [None]:
def build_input_v2():
    """
    Build a tf.data input pipeline over the training CSV and return its output tensors.

    Each CSV record is read as two string fields (the header row is skipped). The first
    field is cleaned with `preprocess_text`, then records are grouped into batches of
    BATCH_SIZE.

    Returns:
        (tf_X, tf_y): the two string tensors produced by a one-shot iterator —
        the cleaned documents and their (unchanged) labels.
    """
    BATCH_SIZE = 2

    def _clean_record(doc, label):
        # `preprocess_text` operates on a single Python str, so it cannot be mapped directly
        # over (doc, label) tensors; run it through py_func on each record instead.
        def _clean(raw_doc):
            # py_func hands string tensors over as bytes.
            return preprocess_text(raw_doc.decode('utf-8')).encode('utf-8')

        cleaned = tf.py_func(_clean, [doc], tf.string)
        # py_func loses static shape information; restore the scalar shape for batching.
        cleaned.set_shape([])
        return cleaned, label

    filenames = [TRAINING_PATH]
    record_defaults = [tf.string, tf.string]
    dataset = tf.contrib.data.CsvDataset(filenames, record_defaults, header=True)
    # Clean per record first, then batch, so the label column is preserved.
    dataset = dataset.map(_clean_record)
    dataset = dataset.batch(BATCH_SIZE)
    iterator = dataset.make_one_shot_iterator()
    tf_X, tf_y = iterator.get_next()

    return tf_X, tf_y

In [None]:

### -----------------------------------------------------------------
###           INGREDIENTS: atomic elements
### -----------------------------------------------------------------
def build_input_v1(sentence_max_length=150):
    """
    Create the feed placeholders for the model.

    Args:
        sentence_max_length: fixed number of token ids per sentence (second dimension
            of the input placeholder). Defaults to 150.

    Returns:
        (tf_X, tf_y):
            tf_X — int32 placeholder named 'tf_X' with shape [None, sentence_max_length];
            tf_y — int32 placeholder named 'tf_y' with shape [None].
    """
    tf_X = tf.placeholder(dtype=tf.int32, name='tf_X', shape=[None, sentence_max_length])
    tf_y = tf.placeholder(dtype=tf.int32, name='tf_y', shape=[None])
    return tf_X, tf_y

def build_inference_v1(tf_X, vocab_size=10000, embedding_size=300, num_classes=3):
    """
    Build the inference graph: embedding lookup -> two strided convolutions -> dense logits.

    Args:
        tf_X: rank-2 integer tensor of token ids, [batch_size, sentence_length]
            (the embedded result is expanded at axis 3 to form a 4-D conv input).
        vocab_size: number of rows in the embedding table. Defaults to 10000.
        embedding_size: width of each embedding vector. Defaults to 300.
        num_classes: number of output classes / logits. Defaults to 3.

    Returns:
        tf_logits: float tensor of unnormalised class scores, [batch_size, num_classes].
    """
    def project(tf_X):
        # Embedding table is pinned to the CPU.
        with tf.device('/cpu:0'), tf.variable_scope('embedding'):
            tf_word_embeddings = tf.get_variable(name='word_embeddings', dtype=tf.float32,
                                                 shape=[vocab_size, embedding_size],
                                                 initializer=tf.truncated_normal_initializer(stddev=5e-2))
            tf_projected_sentences = tf.nn.embedding_lookup(params=tf_word_embeddings, ids=tf_X)
            return tf_projected_sentences

    tf_projected_sens = project(tf_X)
    # Add a trailing channel axis so the embedded sentence can be fed to conv2d.
    tf_projected_sens = tf.expand_dims(tf_projected_sens, axis=3)

    with tf.variable_scope('convolution_layer'):
        tf_after_conv = tf.layers.conv2d(inputs=tf_projected_sens, filters=10, kernel_size=(5, 5), strides=(2, 2), padding='SAME', name='conv1')
        tf_after_conv = tf.layers.conv2d(inputs=tf_after_conv, filters=20, kernel_size=(3, 3), strides=(2, 2), padding='SAME', name='conv2')

    with tf.variable_scope('softmax'):
        tf_flatten = tf.layers.flatten(tf_after_conv)
        # Logits must stay linear: a ReLU here would clamp every negative score to 0 before
        # the softmax cross-entropy, distorting the loss and blocking gradients for those units.
        tf_logits = tf.layers.dense(inputs=tf_flatten, units=num_classes, activation=None)

    return tf_logits
    
def build_loss_v1(tf_logits, tf_y):
    """
    Mean sparse softmax cross-entropy between `tf_logits` and integer labels `tf_y`.

    Also registers the resulting scalar as a summary named 'loss'.

    Returns:
        The scalar loss tensor.
    """
    tf_aggregated_loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf_y, logits=tf_logits)
    )
    tf.summary.scalar(name='loss', tensor=tf_aggregated_loss)
    return tf_aggregated_loss

def build_optimize_v1(tf_loss, learning_rate=0.05):
    """
    Create the training op and the global step counter.

    Args:
        tf_loss: scalar loss tensor to minimise.
        learning_rate: step size for plain gradient descent. Defaults to 0.05.

    Returns:
        (optimizer, tf_global_step): the op returned by `minimize` (running it performs one
        update and increments the step) and the int32 global-step variable.
    """
    # trainable=False keeps the step counter out of the trainable-variables collection,
    # so it is bookkeeping only and never considered a model parameter.
    tf_global_step = tf.get_variable(name='global_step', dtype=tf.int32, shape=(),
                                     initializer=tf.zeros_initializer(), trainable=False)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(tf_loss, global_step=tf_global_step)
    return optimizer, tf_global_step

def build_predict(tf_logit):
    """
    Convert from tensor logit to tensor one hot.

    NOTE: not implemented yet — the body is a placeholder, so calling this
    currently returns None.
    """
    pass


def training_block(graph, tf_X, tf_y, tf_optimizer, tf_global_step, training_generator, test_generator):
    """
    Run the training loop and write TensorBoard summaries.

    For every (X, y) batch from `training_generator` one optimisation step is executed.
    Every SUMMARY_STEP steps the merged summaries are evaluated on the training batch and
    written under 'summary/train_<timestamp>'; every EVALUATION_STEP steps one batch is
    pulled from `test_generator`, evaluated, and written under 'summary/test_<timestamp>'.

    Args:
        graph: the tf.Graph containing all the tensors/ops below.
        tf_X, tf_y: placeholders fed with each batch.
        tf_optimizer: training op to run once per batch.
        tf_global_step: tensor holding the current step count.
        training_generator: iterable yielding (X, y) training batches.
        test_generator: iterator yielding (X, y) evaluation batches; if it runs out,
            evaluation is skipped for the remaining steps instead of aborting training.
    """
    SUMMARY_STEP = 10
    EVALUATION_STEP = 10

    with graph.as_default():
        tf_all_summary = tf.summary.merge_all()

        current_dir = os.getcwd()
        experiment_name = datetime.strftime(datetime.now(), '%Y-%m-%dT%H:%M:%S')
        tf_train_writer = tf.summary.FileWriter(logdir=os.path.join(current_dir, 'summary', 'train_' + experiment_name), graph=graph)
        tf_test_writer = tf.summary.FileWriter(logdir=os.path.join(current_dir, 'summary', 'test_' + experiment_name), graph=graph)

        try:
            # Using the session itself as the context manager (rather than `.as_default()`)
            # makes sure it is closed, and its resources released, when training ends.
            with tf.Session(graph=graph) as sess:
                sess.run(tf.global_variables_initializer())
                sess.run(tf.local_variables_initializer())
                for X, y in training_generator:
                    feed_dict = {tf_X: X, tf_y: y}
                    _, global_step = sess.run([tf_optimizer, tf_global_step], feed_dict=feed_dict)

                    if global_step % SUMMARY_STEP == 0:
                        logging.debug('Collect summary data at step: %s', global_step)
                        train_summary_data = sess.run(tf_all_summary, feed_dict=feed_dict)
                        tf_train_writer.add_summary(train_summary_data, global_step=global_step)

                    if global_step % EVALUATION_STEP == 0:
                        # A bare next() would raise StopIteration and kill the whole loop
                        # once the test generator is exhausted.
                        test_batch = next(test_generator, None)
                        if test_batch is None:
                            logging.debug('Test generator exhausted, skip evaluation at step: %s', global_step)
                        else:
                            logging.debug('Evaluate at step: %s', global_step)
                            X_test, y_test = test_batch

                            test_summary_data = sess.run(tf_all_summary, feed_dict={
                                tf_X: X_test,
                                tf_y: y_test
                            })
                            tf_test_writer.add_summary(test_summary_data, global_step=global_step)
        finally:
            # Flush pending events and release the file handles even if training fails.
            tf_train_writer.close()
            tf_test_writer.close()

def get_training_generator():
    """
    Mock batch generator producing random data.

    A single random batch is drawn once — 512 sequences of 150 integer ids in [0, 1000)
    with one label in [0, 3) per sequence — and that same batch is yielded 1000 times.
    """
    n_steps, batch_size, seq_len = 1000, 512, 150
    features = np.random.randint(1000, size=(batch_size, seq_len))
    labels = np.random.randint(3, size=batch_size)
    step = 0
    while step < n_steps:
        yield features, labels
        step += 1


In [None]:
# Assemble the v1 model in a fresh graph and train it.
# Both generators come from get_training_generator(), i.e. random mock data.
graph = tf.Graph()
with graph.as_default():
    tf_X, tf_y = build_input_v1()
    tf_logit = build_inference_v1(tf_X)
    tf_loss = build_loss_v1(tf_logit, tf_y)
    tf_optimizer, tf_global_step = build_optimize_v1(tf_loss)

    train_batches = get_training_generator()
    test_batches = get_training_generator()
    training_block(
        graph=graph,
        tf_X=tf_X,
        tf_y=tf_y,
        tf_optimizer=tf_optimizer,
        tf_global_step=tf_global_step,
        training_generator=train_batches,
        test_generator=test_batches,
    )
    
    

In [None]:
def build_inference_v2(tf_X):
    """
    ```Thoi kho qua, de thu sau
    Return tensor logit
    tf_X: [batch_size, sentence_max_length]
    """
    
    with tf.device('/cpu:0'), tf.variable_scope('embedding'):
        VOCAB_SIZE = 10000
        EMBEDDING_SIZE = 300
        
        tf_word_embeddings = tf.get_variable(name='word_embeddings', dtype=tf.float32,
                                          shape=[VOCAB_SIZE, EMBEDDING_SIZE],
                                          initializer=tf.truncated_normal_initializer(stddev=5e-2))
        tf_projected_sentences = tf.nn.embedding_lookup(params=tf_word_embeddings, ids=tf_X)
        list_tf_word_embeddings = tf.unstack(tf_projected_sentences, axis=1)
        
    with tf.variable_scope('LSTM'):
        STATE_SIZE = 200
        lstm_cell = rnn.BasicLSTMCell(STATE_SIZE, forget_bias=1.0)
        # Each output has shape of [batch_size, state_size]
        list_outputs, _ = rnn.static_rnn(cell=lstm_cell, inputs=word_embeddings, dtype=tf.float32)
    
    with tf.variable_scope('Attention'):
        ATTENTION_SIZE = 200
        tf_output = tf.stack(list_outputs, axis=2) # [batch_size, state_size, sentence_max_length]
        
#         tf_attention_weights =
        
        tf_after_attention = tf.layers.dense(tf_output, units=ATTENTION_SIZE, activation=tf.nn.relu)
        
    with tf.variable_scope('Fully-Connected'):
        tf_logit = tf.layers.dense(tf_after_attention, units=ATTENTION_SIZE, activation=tf.nn.relu)