In [1]:
import tensorflow as tf
import numpy as np
import time

from sklearn.feature_extraction.text import TfidfTransformer

tf.logging.set_verbosity(tf.logging.INFO)

In [2]:
VOCAB_SIZE = 20000
N_CLASS = 2
BATCH_SIZE = 32
N_EPOCH = 2
LR = 5e-3

In [3]:
def sparse_tfidf(X):
    t0 = time.time()
    count = np.zeros((len(X), VOCAB_SIZE))
    for i, indices in enumerate(X):
        for idx in indices:
            count[i, idx] += 1
    print("%.2f secs ==> Document-Term Matrix"%(time.time()-t0))

    t0 = time.time()
    X = TfidfTransformer().fit_transform(count)
    print("%.2f secs ==> TF-IDF transform"%(time.time()-t0))
    return X

In [4]:
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=VOCAB_SIZE)
X_train = sparse_tfidf(X_train)
X_test = sparse_tfidf(X_test)

4.00 secs ==> Document-Term Matrix
10.13 secs ==> TF-IDF transform
3.57 secs ==> Document-Term Matrix
10.00 secs ==> TF-IDF transform


In [5]:
def next_train_batch(X, y):
    for i in range(0, X.shape[0], BATCH_SIZE):
        yield X[i : i+BATCH_SIZE].toarray(), y[i: i+BATCH_SIZE]

def next_test_batch(X):
    for i in range(0, X.shape[0], BATCH_SIZE):
        yield X[i : i+BATCH_SIZE].toarray()

def train_input_fn(X_train, y_train):
    dataset = tf.data.Dataset.from_generator(
        lambda: next_train_batch(X_train, y_train),
        (tf.float32, tf.int64),
        (tf.TensorShape([None, VOCAB_SIZE]), tf.TensorShape([None])))
    dataset = dataset.shuffle(X_train.shape[0])
    iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()

def predict_input_fn(X_test):
    dataset = tf.data.Dataset.from_generator(
        lambda: next_test_batch(X_test),
        tf.float32,
        tf.TensorShape([None, VOCAB_SIZE]))
    iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()

In [6]:
def forward(inputs):
    return tf.layers.dense(inputs, N_CLASS)

def model_fn(features, labels, mode, params):
    logits = forward(features)
    
    if mode == tf.estimator.ModeKeys.PREDICT:
        preds = tf.argmax(logits, -1)
        return tf.estimator.EstimatorSpec(mode, predictions=preds)
    
    if mode == tf.estimator.ModeKeys.TRAIN:
        global_step = tf.train.get_global_step()

        loss_op = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=labels))

        train_op = tf.train.AdamOptimizer(LR).minimize(loss_op, global_step=global_step)
        
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss_op, train_op=train_op)

In [7]:
estimator = tf.estimator.Estimator(model_fn)

for _ in range(N_EPOCH):
    estimator.train(lambda: train_input_fn(X_train, y_train))
    y_pred = np.array(list(estimator.predict(lambda: predict_input_fn(X_test))))
    print("\nValidation Accuracy: %.4f\n" % (y_pred==y_test).mean())

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/sx/fv0r97j96fz8njp14dt5g7940000gn/T/tmpm4x6u01o', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1217c2898>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /var/folders/sx/fv0r97j96fz8njp14dt5g794000