# Reddit Neural Bot Trainer
-----
#### ToDo
- Subredding embeddings

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import glob
import random
import time

import numpy as np
import tensorflow as tf

# Data

In [2]:
MAX_COMMENT_LENGTH = 20
BATCH_SIZE = 100

# x = tf.Variable([1.0, 2.0])

# init = tf.global_variables_initializer()

# sess = tf.Session()
# sess.run(init)
# v = sess.run(x)    
# print(v) # will show you your variable.

In [3]:
import pickle
meta = pickle.load( open( "metadata", "rb" ) )

vocab = meta["vocab"]
word_to_ix = meta["w2idx"]
ix_to_word = meta["idx2w"]

def to_eng(ids):
    return ' '.join([ix_to_word[id] if id != 0 else '' for id in ids])

In [4]:
proto_files = glob.glob('./*.tfrecords')
random.shuffle(proto_files)

In [5]:
filename_queue = tf.train.string_input_producer(proto_files)  #num_epochs=

In [6]:
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(
  serialized_example,
  # Defaults are not specified since both keys are required.
  features={
      #'subredddit_id': tf.FixedLenFeature([], tf.int64),
      'question': tf.FixedLenFeature([MAX_COMMENT_LENGTH], tf.int64),
      'answer': tf.FixedLenFeature([MAX_COMMENT_LENGTH], tf.int64),
  })


# normal_rv = tf.Variable( tf.truncated_normal([2,3],stddev = 0.1))

# #initialize the variable
# init_op = tf.global_variables_initializer()
# print(normal_rv)
# print(features["comment"])

# #run the graph
# with tf.Session() as sess:
#     sess.run(init_op) #execute init_op
#     #print the random values that we sample
#     print (sess.run(normal_rv))
#     print (sess.run(features["comment"]))

In [7]:
min_after_dequeue = 10000
capacity = min_after_dequeue + 3 * BATCH_SIZE

comment, replies = tf.train.shuffle_batch(
    [features['question'], features['answer']],
    batch_size=BATCH_SIZE, capacity=capacity, min_after_dequeue=min_after_dequeue)

# Model

In [8]:
LEARNING_RATE = 0.01
SEQ_MAX_LEN = MAX_COMMENT_LENGTH
RNN_HIDDEN_SIZE = 1024
LAYERS = 3
CHAR_EMB_SIZE = 128
VOCAB_SIZE = len(vocab)
#SUBREDDIT_EMB_SIZE = ?

### Encoding

In [9]:
inner_cell = tf.contrib.rnn.BasicLSTMCell(RNN_HIDDEN_SIZE)
enc_cell = tf.contrib.rnn.MultiRNNCell([inner_cell] * LAYERS)

In [10]:
char_embeddings = tf.get_variable("embedding", [VOCAB_SIZE, CHAR_EMB_SIZE])
emb_comment = tf.nn.embedding_lookup(char_embeddings, comment)

In [11]:
_, thought_vector = tf.nn.dynamic_rnn(
    enc_cell, emb_comment, swap_memory=True, dtype=tf.float32)

### Decoding

In [12]:
reply_input = tf.concat(  # Add GO token to start
    [tf.zeros(shape=(BATCH_SIZE, 1), dtype=tf.int64), replies[:, :SEQ_MAX_LEN-1]], axis=1)
emb_reply_input = tf.nn.embedding_lookup(char_embeddings, reply_input)

In [13]:
dec_cell = tf.contrib.rnn.OutputProjectionWrapper(enc_cell, VOCAB_SIZE)

In [14]:
with tf.variable_scope("decoder"):
    dec_out, _ = tf.nn.dynamic_rnn(
        dec_cell, emb_reply_input, initial_state=thought_vector, swap_memory=True, dtype=tf.float32)

In [15]:
xent = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=dec_out, labels=replies)

In [16]:
loss = tf.reduce_sum(xent, axis=[1])
ave_loss = tf.reduce_mean(loss)

### Sampling

In [17]:
SAMPLE_TEMP = 0.7

def loop_fn(time, cell_output, cell_state, loop_state):
    if cell_output is None:  # time == 0
        next_cell_state = thought_vector  # state from the encoder
        next_input = tf.zeros([BATCH_SIZE], dtype=tf.int64)  # GO symbol
        next_input = tf.nn.embedding_lookup(char_embeddings, next_input)
        emit_output = tf.zeros([], dtype=tf.int64)
    else:
        next_cell_state = cell_state
        sample = tf.squeeze(tf.multinomial(cell_output / SAMPLE_TEMP, 1))
        print(sample)
        emb_sample = tf.nn.embedding_lookup(char_embeddings, sample)
        next_input = emb_sample
        emit_output = sample
    elements_finished = time >= tf.constant(SEQ_MAX_LEN, shape=(BATCH_SIZE,))
    finished = tf.reduce_all(elements_finished)
    print(next_input)
    next_input = tf.cond(
        finished,
        lambda: tf.zeros([BATCH_SIZE, CHAR_EMB_SIZE], dtype=tf.float32),
        lambda: next_input)
    print(next_input)
    next_loop_state = None
    return elements_finished, next_input, next_cell_state, emit_output, next_loop_state

with tf.variable_scope("decoder", reuse=True):
    outputs_ta, _, _ = tf.nn.raw_rnn(dec_cell, loop_fn, swap_memory=True)
    sample = outputs_ta.stack()

Tensor("decoder_1/rnn/embedding_lookup:0", shape=(100, 128), dtype=float32)
Tensor("decoder_1/rnn/cond/Merge:0", shape=(100, 128), dtype=float32)
Tensor("decoder_1/rnn/while/Squeeze:0", shape=(100,), dtype=int64)
Tensor("decoder_1/rnn/while/embedding_lookup:0", shape=(100, 128), dtype=float32)
Tensor("decoder_1/rnn/while/cond/Merge:0", shape=(100, 128), dtype=float32)


# Training

In [18]:
lr = tf.placeholder_with_default(LEARNING_RATE, [], name="lr")
tvars = tf.trainable_variables()
grads, _ = tf.clip_by_global_norm(tf.gradients(ave_loss, tvars), 1.0)
optimizer = tf.train.RMSPropOptimizer(lr)
train_op = optimizer.apply_gradients(zip(grads, tvars))

In [19]:
from functools import reduce
import operator

def print_shapes():
    train_vars = tf.trainable_variables()
    
    lines = ['']
    lines.append('Trainable Variables:')
    lines.append('====================')
    total_params = 0
    for var in train_vars:
        n_param = reduce(operator.mul, var.get_shape().as_list(), 1)
        total_params += n_param
        lines.append('%20s %8d %s' % (var.get_shape().as_list(), n_param, var.name))
    lines.append('Total trainable parameters: %d' % total_params)
    
    lines.append('')
    lines.append('Other Varaibles:')
    lines.append('================')
    total_params = 0
    for var in tf.global_variables():
        if var in train_vars: continue
        n_param = reduce(operator.mul, var.get_shape().as_list(), 1)
        total_params += n_param
        lines.append('%20s %8d %s' % (var.get_shape().as_list(), n_param, var.name))
    lines.append('Total non-trainable parameters: %d' % total_params)
    
    return '\n'.join(lines)

print(print_shapes())


Trainable Variables:
         [6000, 128]   768000 embedding:0
        [1152, 4096]  4718592 rnn/multi_rnn_cell/cell_0/basic_lstm_cell/weights:0
              [4096]     4096 rnn/multi_rnn_cell/cell_0/basic_lstm_cell/biases:0
        [2048, 4096]  8388608 rnn/multi_rnn_cell/cell_1/basic_lstm_cell/weights:0
              [4096]     4096 rnn/multi_rnn_cell/cell_1/basic_lstm_cell/biases:0
        [2048, 4096]  8388608 rnn/multi_rnn_cell/cell_2/basic_lstm_cell/weights:0
              [4096]     4096 rnn/multi_rnn_cell/cell_2/basic_lstm_cell/biases:0
        [1152, 4096]  4718592 decoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/weights:0
              [4096]     4096 decoder/rnn/multi_rnn_cell/cell_0/basic_lstm_cell/biases:0
        [2048, 4096]  8388608 decoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/weights:0
              [4096]     4096 decoder/rnn/multi_rnn_cell/cell_1/basic_lstm_cell/biases:0
        [2048, 4096]  8388608 decoder/rnn/multi_rnn_cell/cell_2/basic_lstm_cell/weights:0

In [20]:
sess = tf.Session()
init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
sess.run(init_op)
coord = tf.train.Coordinator()
threads = tf.train.start_queue_runners(sess=sess, coord=coord)
#coord.join(threads)
#sess.close()

In [21]:
CHECKPOINT_PATH = './checkpoints/'

saver = tf.train.Saver()
latest_checkpoint = tf.train.latest_checkpoint(CHECKPOINT_PATH)
if latest_checkpoint:
    saver.restore(sess, latest_checkpoint)

In [22]:
l_ave = b_ave = d_ave = 0

UPDATE_EVERY = 100

for step in range(500):
    start_time = time.time()
    _, l = sess.run([train_op, ave_loss], {
        lr: 0.0001
    })
    duration = time.time() - start_time

    l_ave += l
    b_ave += l / SEQ_MAX_LEN / np.log(2.)
    d_ave += duration
    
    #print("|", end="")
    if step % UPDATE_EVERY == 0:
        print()
        l_ave = l_ave / UPDATE_EVERY if step else l_ave
        b_ave = b_ave / UPDATE_EVERY if step else b_ave
        d_ave = d_ave / UPDATE_EVERY if step else d_ave
        
        print(step)
        print(l_ave, "(", b_ave, ")\t|", "%.3f sec" % d_ave)
        c, r = sess.run([comment, sample])
        for i in range(20):
            print(to_eng(c[i]), "-->", to_eng(r[:, i]))

        l_ave = b_ave = d_ave = 0
        print()
        saver.save(sess, CHECKPOINT_PATH + "checkpoint", global_step=0)
        #print('-'*24 + '|' + '-'*24 + '|' + '-'*24 + '|' + '-'*24 + '|')


0
173.994567871 ( 12.5510550105 )	| 9.798 sec
whats the old guy from the offices name creep unk           --> studying apps picking mirror refugee ryan saudi americans feels wells hah ongoing exact sends stanford donate slightly ahead gordon wore
oops the more u know unk               --> baseball knife reply throws dollars fargo amazed colors career 34 podcast wordpress guard reported height took lucky punk pen race
told him its all in your head  how rude           --> gender 😜 warmonger pose mentally rachel loses desktop ppl senator by injuries technical donations bs 20 sniff normal on apartment
allowing unk to keep 40 of their flow too much not enough depends on who you ask last take today --> transfer ring lgbt saying values laura context explanation apply pres fav small looks incident watched pose replace 32 remote lineup
does unk unk have anything left to say             --> reliable listened launched gray creator athletes celebrating msg cuban patient min tall tickets lowkey se

InvalidArgumentError: indices[69,13] = 6001 is not in [0, 6000)
	 [[Node: embedding_lookup_1 = Gather[Tindices=DT_INT64, Tparams=DT_FLOAT, _class=["loc:@embedding"], validate_indices=true, _device="/job:localhost/replica:0/task:0/cpu:0"](embedding/read, concat)]]
	 [[Node: embedding_lookup_1/_79 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/gpu:0", send_device="/job:localhost/replica:0/task:0/cpu:0", send_device_incarnation=1, tensor_name="edge_1268_embedding_lookup_1", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"]()]]

Caused by op 'embedding_lookup_1', defined at:
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\site-packages\ipykernel\kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\site-packages\tornado\ioloop.py", line 887, in start
    handler_func(fd_obj, events)
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\site-packages\tornado\stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\site-packages\tornado\stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\site-packages\ipykernel\kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\site-packages\ipykernel\ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\site-packages\IPython\core\interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\site-packages\IPython\core\interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\site-packages\IPython\core\interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-232f35714382>", line 3, in <module>
    emb_reply_input = tf.nn.embedding_lookup(char_embeddings, reply_input)
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\ops\embedding_ops.py", line 111, in embedding_lookup
    validate_indices=validate_indices)
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\ops\gen_array_ops.py", line 1359, in gather
    validate_indices=validate_indices, name=name)
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 763, in apply_op
    op_def=op_def)
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\framework\ops.py", line 2327, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "c:\users\ericc\appdata\local\programs\python\python35\lib\site-packages\tensorflow\python\framework\ops.py", line 1226, in __init__
    self._traceback = _extract_stack()

InvalidArgumentError (see above for traceback): indices[69,13] = 6001 is not in [0, 6000)
	 [[Node: embedding_lookup_1 = Gather[Tindices=DT_INT64, Tparams=DT_FLOAT, _class=["loc:@embedding"], validate_indices=true, _device="/job:localhost/replica:0/task:0/cpu:0"](embedding/read, concat)]]
	 [[Node: embedding_lookup_1/_79 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/gpu:0", send_device="/job:localhost/replica:0/task:0/cpu:0", send_device_incarnation=1, tensor_name="edge_1268_embedding_lookup_1", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/gpu:0"]()]]
