In [None]:
import logging
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.contrib import rnn
from datetime import datetime
# getLogger() with no name returns the root logger, so the DEBUG threshold set
# here also governs the module-level logging.info(...) calls used further down.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
from typing import List
from nltk.tokenize import word_tokenize
import itertools
import nltk
import collections
import pickle
import re

from text2vector import Text2Vector
from dataset import Dataset
from random import shuffle
import ingradient
import utils

# Fetch NLTK's 'punkt' resource every time this cell runs. Nothing in the
# visible cells uses it directly; presumably Text2Vector tokenizes with
# word_tokenize and needs it -- TODO confirm.
nltk.download('punkt')

In [None]:
# Folder holding the Entropy 2018 CSV files; the three paths below are the
# files located directly inside it.
ENTROPY_PATH = os.path.join('/dataset', 'entropy_2018')
TRAINING_PATH, TEST_PATH, SAMPLE_PATH = (
    os.path.join(ENTROPY_PATH, csv_name)
    for csv_name in ('training_set.csv', 'test_set.csv', 'sample.csv')
)

In [None]:
# Compiled once when the cell runs rather than on every call: preprocess_text
# is applied to each document individually (e.g. through Series.map).
_NUMBERS_PATTERN = re.compile(r"[+-]?\d+(?:\.\d+)?")
_URL_PATTERN = re.compile(
        r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))')


def preprocess_text(doc):
    """Normalize one raw document before it is vectorized.

    Steps, applied in this order:

    1. lower-case the whole text;
    2. delete every number (optional sign, digits, optional decimal part);
    3. replace every URL-looking substring with the literal token ``URL``.

    Numbers are removed *before* URL detection, so digits that were part of a
    URL are already gone when the URL pattern runs.

    Args:
        doc: the text to clean (must support ``str.lower``).

    Returns:
        The cleaned text as a new string.
    """
    doc = doc.lower()
    doc = _NUMBERS_PATTERN.sub('', doc)
    return _URL_PATTERN.sub('URL', doc)

# Obtain the Text2Vector model: fit it on the preprocessed training sentences
# and cache it as 'text2vec.p' on the first run, reload that cache afterwards.
if not os.path.exists('text2vec.p'):
    logging.info('Fitting')
    df_train = pd.read_csv(TRAINING_PATH)
    # Clean every sentence with the same routine that is used when digitizing.
    docs = df_train['sentence'].map(preprocess_text)
    text2vec_model = Text2Vector()
    text2vec_model.fit(docs)
    text2vec_model.save('text2vec.p')
else:
    logging.info('Load text2vector object from saved pickle')
    text2vec_model = Text2Vector.load('text2vec.p')


# Sentiment label (as it appears in the CSV) -> integer class id.
# Defined once and shared by the training and the test split.
LABEL_MAPPING = {
    'positive': 0,
    'neutral': 1,
    'negative': 2
}


def digitize_datapoint(datapoint):
    """Convert one ``(doc, label)`` pair into ``(vector, class id)``.

    The document is cleaned with ``preprocess_text`` and passed to
    ``text2vec_model.doc_to_vec`` as a one-element list; the first (only)
    result is kept. The label is looked up in ``LABEL_MAPPING``.

    Raises:
        KeyError: if ``label`` is not a key of ``LABEL_MAPPING``.
    """
    doc, label = datapoint
    doc = preprocess_text(doc)
    return text2vec_model.doc_to_vec([doc])[0], LABEL_MAPPING[label]


def _load_or_digitize_dataset(split_name, csv_path, pickle_path):
    """Return the digitized dataset for one split, using a cache file.

    If ``pickle_path`` exists it is loaded with ``Dataset.load``. Otherwise the
    CSV at ``csv_path`` is read, every datapoint is run through
    ``digitize_datapoint``, and the result is saved to ``pickle_path`` so the
    next run can skip the conversion.

    Args:
        split_name: word used in the log messages ('training' or 'test').
        csv_path: location of the raw CSV file.
        pickle_path: location of the cache file.
    """
    if os.path.exists(pickle_path):
        # NOTE(review): the '.p' cache is presumably a pickle -- only load
        # cache files produced by this notebook.
        logging.info('Load %s dataset from pickle', split_name)
        return Dataset.load(pickle_path)

    logging.info('Load %s dataset from CSV', split_name)
    digitized = Dataset.from_csv(csv_path)
    digitized = digitized.map(digitize_datapoint)
    digitized.save(pickle_path)
    return digitized


training_dataset = _load_or_digitize_dataset('training', TRAINING_PATH, 'training_dataset.p')
test_dataset = _load_or_digitize_dataset('test', TEST_PATH, 'test_dataset.p')
    
BATCH_SIZE = 128

# Training pipeline: shuffle -> padded batches of BATCH_SIZE -> repeat.
# list_lengths=(150, None) presumably pads the first element of every datapoint
# to length 150 and leaves the second one alone -- TODO confirm against
# Dataset.padded_batch.
training_dataset = (
    training_dataset
    .shuffle(10000)
    .padded_batch(
        batch_size=BATCH_SIZE,
        list_lengths=(150, None),
        padded_value=text2vec_model.vocab_to_int[Text2Vector.PADDING],
    )
    .repeat(300)
)

# Test pipeline: same three steps, but with batches of 1000 datapoints.
test_dataset = (
    test_dataset
    .shuffle(10000)
    .padded_batch(
        batch_size=1000,
        list_lengths=(150, None),
        padded_value=text2vec_model.vocab_to_int[Text2Vector.PADDING],
    )
    .repeat(300)
)


In [None]:
# Build the model in a dedicated tf.Graph and start training on it.
# The builder calls run in this fixed order inside `graph.as_default()`, so
# everything they create is registered on `graph`.
graph = tf.Graph()
with graph.as_default():
    # Input tensors (features and labels) as returned by the ingradient helper.
    tf_X, tf_y = ingradient.build_input_v1()
    tf_logit = ingradient.build_inference_v1(tf_X)
    tf_predict = ingradient.build_predict_v1(tf_logit)
    # Return value is discarded: this call is made only for whatever it adds
    # to the graph (presumably an accuracy op/summary -- TODO confirm).
    ingradient.build_accuracy_v1(tf_predict, tf_y)
    tf_loss = ingradient.build_loss_v1(tf_logit, tf_y)
    tf_optimizer, tf_global_step = ingradient.build_optimize_v1(tf_loss)
    logging.info('Total parameters: %s', utils.count_trainable_variables())
    # Hand the graph, its input tensors, the optimizer/global step and one
    # iterator per (already batched and repeated) dataset to the training loop.
    ingradient.training_block(graph=graph, tf_X=tf_X, tf_y=tf_y, training_generator=training_dataset.get_iterator(), 
                   test_generator=test_dataset.get_iterator(),
                   tf_optimizer=tf_optimizer,
                   tf_global_step=tf_global_step)

In [None]:
# NOTE(review): no visible cell binds the name `dataset` -- the import at the
# top only brings in the `Dataset` class -- so on a fresh kernel this line is
# expected to raise NameError. Presumably `training_dataset` or `test_dataset`
# was meant; confirm and rename, or delete this leftover cell.
dataset.get_data_length()