In [259]:
from sklearn.cross_validation import train_test_split
import tensorflow as tf
import pandas as pd
import numpy as np
import functools
import csv

In [260]:
tf.logging.set_verbosity(tf.logging.INFO)

In [261]:
# Load Data
with open("/Users/dennybritz/Downloads/reddit_2008_01.txt") as f:
    reader = csv.reader(f)
    data_raw = [x[0] + " EOS" for x in reader]

In [262]:
# Preprocess data
vocab = tf.contrib.learn.preprocessing.text.VocabularyProcessor(max_document_length=25, min_frequency=1000)
vocab.fit(data_raw)

<tensorflow.contrib.learn.python.learn.preprocessing.text.VocabularyProcessor at 0x11ea85208>

In [263]:
VOCAB_SIZE = len(vocab.vocabulary_)
x = np.array(list(vocab.transform(data_raw)))
x_lengths = np.array([len(_) for _ in vocab._tokenizer(data_raw)])
y = np.array([_[1:] for _ in x])

In [264]:
len(vocab.vocabulary_)

1306

In [265]:
x_train, x_test, x_len_train, x_len_test, y_train, y_test = train_test_split(x, x_lengths, y, test_size=10000)

In [266]:
print("Training:")
print(x_train.shape)
print(y_train.shape)

print("\nTesting:")
print(x_test.shape)
print(y_test.shape)

Training:
(380367, 25)
(380367, 24)

Testing:
(10000, 25)
(10000, 24)


In [270]:
BATCH_SIZE = 8
def create_input_fn(x, x_len, y):
    def input_fn():
        x_batch, x_batch_len, y_batch = tf.train.batch(
            [x, x_len, y],
            batch_size=BATCH_SIZE,
            enqueue_many=True)
        return {
            "x": x_batch,
            "x_len": x_batch_len,
        }, y_batch
    return input_fn

In [271]:
train_input_fn = create_input_fn(x_train, x_len_train, y_train)
dev_input_fn = create_input_fn(x_test, x_len_test, y_test)

In [276]:
NUM_UNITS = 128
EMBEDDING_DIM = 128
def simple_rnn(x_dict, y_batch, mode):
    x_batch = x_dict["x"]
    x_batch_len = x_dict["x_len"]
    y_t = tf.unpack(y_batch,  axis=1)
    with tf.variable_scope("embedding"):
        W = tf.get_variable("W", initializer=tf.random_normal_initializer(), shape=[VOCAB_SIZE, EMBEDDING_DIM])
        x_embedded = tf.nn.embedding_lookup(W, x_batch)
    with tf.variable_scope("rnn"):
        cell = tf.nn.rnn_cell.GRUCell(NUM_UNITS)
        x_list = tf.unpack(x_embedded, axis=1)
        outputs, state = tf.nn.rnn(cell, x_list, dtype=tf.float32, sequence_length=x_batch_len)
    with tf.variable_scope("output"):
        W = tf.get_variable("W", initializer=tf.random_normal_initializer(), shape=[NUM_UNITS, VOCAB_SIZE])
        b = tf.get_variable("b", initializer=tf.zeros_initializer(shape=[VOCAB_SIZE]))
        losses = []
        for t, o in enumerate(outputs[:-1]):
            logits = tf.nn.xw_plus_b(o, W, b)
            probs = tf.nn.softmax(logits)
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y_t[t])
            eligible_exampels = tf.to_float(tf.less(tf.to_int64(t), x_batch_len))
            loss_filtered = loss * eligible_exampels
            losses.append(loss_filtered)
        avg_loss = tf.reduce_mean(tf.pack(losses))
    train_op = tf.contrib.layers.optimize_loss(
        loss=avg_loss,
        global_step=tf.contrib.framework.get_global_step(),
        learning_rate=0.001,
        clip_gradients=10.0,
        optimizer="Adam")
    if mode == tf.contrib.learn.ModeKeys.TRAIN:
        return probs, avg_loss, train_op
    else:
        return probs, avg_loss, None

In [277]:
estimator = tf.contrib.learn.Estimator(model_fn=simple_rnn)



In [278]:
vmon = tf.contrib.learn.monitors.ValidationMonitor(input_fn=dev_input_fn, every_n_steps=50)

In [279]:
estimator.fit(input_fn=train_input_fn, steps=1000)

INFO:tensorflow:Create CheckpointSaver
INFO:tensorflow:Step 1: loss = 7.55074
INFO:tensorflow:Step 101: loss = 3.46353
INFO:tensorflow:Step 201: loss = 3.97625
INFO:tensorflow:Saving checkpoints for 300 into /var/folders/d3/jc_t7rns3_qf61016_0zhrh800c0pj/T/tmpzyxcff4r/model.ckpt.
INFO:tensorflow:Step 301: loss = 3.3402
INFO:tensorflow:Step 401: loss = 3.92536
INFO:tensorflow:Step 501: loss = 2.74856
INFO:tensorflow:Saving checkpoints for 600 into /var/folders/d3/jc_t7rns3_qf61016_0zhrh800c0pj/T/tmpzyxcff4r/model.ckpt.
INFO:tensorflow:Step 601: loss = 4.35647
INFO:tensorflow:Step 701: loss = 3.52634
INFO:tensorflow:Step 801: loss = 3.8674
INFO:tensorflow:Saving checkpoints for 900 into /var/folders/d3/jc_t7rns3_qf61016_0zhrh800c0pj/T/tmpzyxcff4r/model.ckpt.
INFO:tensorflow:Step 901: loss = 3.77058
INFO:tensorflow:Saving checkpoints for 1000 into /var/folders/d3/jc_t7rns3_qf61016_0zhrh800c0pj/T/tmpzyxcff4r/model.ckpt.
INFO:tensorflow:Loss for final step: 3.4964.


Estimator(params=None)

In [186]:
estimator.evaluate(input_fn=train_input_fn, steps=100)



{'global_step': 10, 'loss': 2.5806007}