In [1]:
import tensorflow as tf
from tensorflow.contrib import rnn
import pandas as pd
import numpy as np
import collections

In [2]:
# Hyperparameters
LSTM_SIZE = 128                      # number of units in the LSTM cell
batch_size = test_batch_size = 500   # sentences per training step / per accuracy-evaluation batch

In [3]:
# Directory that holds ptb.train.txt / ptb.valid.txt / ptb.test.txt (relative to the working directory).
data_path = "data/ptb"

def read_data(filename):
    """Build a word -> integer-id vocabulary from a whitespace-tokenised text file.

    Every newline in the file is counted as one ``<eos>`` token. Ids are
    assigned in ascending order of ``(frequency, word)``, so the rarest token
    receives id 0 and ties are broken alphabetically.

    Args:
        filename: path of the text file to scan.

    Returns:
        dict mapping each distinct token to a unique id in
        ``range(number_of_distinct_tokens)``. A file without any token yields
        an empty dict.
    """
    with open(filename, "r") as file:
        # Surround the marker with blanks so it stays a token of its own even
        # when a line does not start/end with a space ("a b\nc" must not
        # produce the fused token "b<eos>c").
        data = file.read().replace("\n", " <eos> ").split()

    counter = collections.Counter(data)
    sorted_counter = sorted(counter.items(), key=lambda x: (x[1], x[0]))

    # enumerate() also copes with an empty vocabulary, where unpacking
    # zip(*sorted_counter) would raise.
    return {word: idx for idx, (word, _) in enumerate(sorted_counter)}
def file2id(filename, w2id):
    """Convert a text file into per-sentence lists of word ids.

    Each line of the file is one sentence; an ``<eos>`` token is appended to
    every sentence before lookup. Tokens that are not keys of ``w2id`` are
    silently dropped.

    Args:
        filename: path of the text file, one sentence per line.
        w2id: dict mapping token -> integer id.

    Returns:
        ``(wordIds, maxLen)`` where ``wordIds`` is a list of id lists (one per
        line) and ``maxLen`` is the length of the longest id list, or 0 when
        the file contains no lines.
    """
    with open(filename, "r") as file:
        lines = file.read().split("\n")

    # A newline-terminated file leaves an empty string after the last "\n";
    # it is not a sentence and would otherwise become a lone "<eos>" sample.
    if lines and lines[-1] == "":
        lines.pop()

    data = [(sentence + " <eos>").split() for sentence in lines]

    wordIds = [[w2id[word] for word in sentence if word in w2id] for sentence in data]
    maxLen = max(map(len, wordIds), default=0)

    return wordIds, maxLen
        
        
def load_data():
    """Load the train/valid/test splits as padded lists of word ids.

    The vocabulary is built from the training file only; an extra ``<pad>``
    entry is appended with the next free id. All three splits are padded to a
    common length so they can be fed through the same fixed-length graph.

    Returns:
        ``(train_data, valid_data, test_data, id2w, maxLen)`` where the three
        data items are lists of equal-length id lists, ``id2w`` maps id ->
        token (including ``<pad>``), and ``maxLen`` is the padded sentence
        length.
    """
    train_path = data_path + "/ptb.train.txt"
    valid_path = data_path + "/ptb.valid.txt"
    test_path = data_path + "/ptb.test.txt"

    w2id = read_data(train_path)
    padChar = len(w2id)
    w2id["<pad>"] = padChar

    train_data, train_len = file2id(train_path, w2id)
    valid_data, valid_len = file2id(valid_path, w2id)
    test_data, test_len = file2id(test_path, w2id)

    # Pad to the longest sentence of ANY split: pad() cannot shorten a
    # sentence, so a valid/test sentence longer than every training sentence
    # would otherwise stay over-length and make the split ragged.
    maxLen = max(train_len, valid_len, test_len)

    id2w = {idx: word for word, idx in w2id.items()}

    train_data = pad(maxLen, train_data, padChar)
    valid_data = pad(maxLen, valid_data, padChar)
    test_data = pad(maxLen, test_data, padChar)

    return train_data, valid_data, test_data, id2w, maxLen

def pad(maxLen, words, padChar):
    """Right-pad every sentence in ``words`` with ``padChar`` up to ``maxLen`` items.

    Sentences that already have ``maxLen`` or more items are returned
    unchanged in content (they are never truncated). A new list is built for
    every sentence; the inputs are not modified.
    """
    padded = []
    for sentence in words:
        missing = maxLen - len(sentence)
        # A non-positive repeat count produces an empty filler list.
        filler = [padChar] * missing
        padded.append(sentence + filler)
    return padded

In [4]:
# Read the three corpus splits (already padded to maxLen ids per sentence) and the id -> word table.
train_data, valid_data, test_data, id2w, maxLen = load_data()
# Vocabulary size; id2w includes the "<pad>" entry added by load_data().
words = len(id2w)

In [5]:
# Interactive session: installs itself as the default session, which the later
# Tensor.eval() / Operation.run() calls rely on.
sess = tf.InteractiveSession()
# One batch of padded sentences, shape [batch, maxLen, 1]; each entry is a word id stored as a float.
# NOTE(review): the raw ids are fed to the LSTM as scalar inputs (no embedding lookup) — confirm this is intended.
x = tf.placeholder(tf.float32, [None, maxLen, 1])

# Split along axis 1 into a Python list of maxLen tensors of shape [batch, 1], one per sentence position.
flat_x = tf.unstack(x, maxLen, 1)

In [6]:
# Single LSTM layer with LSTM_SIZE units.
lstmCell = rnn.BasicLSTMCell(LSTM_SIZE, forget_bias=1)
# Unroll the cell over the list of per-position inputs; `outputs` is a list with one
# tensor per position. The final state is discarded.
outputs, _ = rnn.static_rnn(lstmCell, flat_x, dtype="float32")

In [7]:
def weight_var(shape):
    """Return a new tf.Variable of the given shape drawn from a truncated normal (stddev 0.1)."""
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)
def bias(shape):
    """Return a new tf.Variable of the given shape with every element set to 0.1."""
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

# Output projection from the LSTM output (LSTM_SIZE) to one logit per vocabulary entry.
# weight_var()/bias() already return tf.Variable objects; wrapping them in another
# tf.Variable would create a second copy of every parameter and leave the inner
# copy as an unused trainable variable.
W_fc = weight_var([LSTM_SIZE, words])
b_fc = bias([words])

In [8]:
# Next-word prediction head: the LSTM output at position i is projected to
# vocabulary logits and scored against the word id at position i + 1.

# Running sum over positions of the batch-mean cross-entropy (kept as shape [1]).
loss = tf.zeros([1])

correct_list = []   # per position: bool tensor [batch], True where the arg-max equals the label
predictions = []    # per position: softmax distribution [batch, words] over the NEXT word
for i in range(len(outputs) - 1):
    output = outputs[i]

    pred = tf.matmul(output, W_fc) + b_fc

    predictions.append(tf.nn.softmax(pred))

    # flat_x[i+1] has shape [batch, 1]; flatten it to [batch] so it lines up
    # element-wise with the arg-max below and with the sparse-label loss.
    labels = tf.reshape(flat_x[i+1], [-1])
    labels = tf.cast(labels, tf.int32)

    # No trailing comma here: a 1-tuple would be converted to a [1, batch]
    # tensor and tf.equal would broadcast it against the labels into a
    # [batch, batch] matrix instead of a per-example comparison.
    choice = tf.argmax(predictions[i], 1)
    correct = tf.equal(choice, tf.cast(labels, tf.int64))

    correct_list.append(correct)

    cross_entro = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pred, labels=labels))

    loss = loss + cross_entro

In [9]:
# Accuracy: stack the per-position correctness tensors, cast the booleans to 0.0/1.0 and average over all entries.
accur = tf.reduce_mean(tf.cast(tf.stack(correct_list), tf.float32))

In [14]:
# Adam with global-norm gradient clipping.
# Note: this cell creates the optimizer's own variables, so it has to be
# executed before the global_variables_initializer() cell.
opt = tf.train.AdamOptimizer(1e-4)

# compute_gradients() yields (gradient, variable) pairs; separate them so the
# gradients can be clipped jointly.
grads_and_vars = opt.compute_gradients(loss)
gradients, params = zip(*grads_and_vars)

# Rescale all gradients together so their global norm does not exceed 5.
clipped_grad, _ = tf.clip_by_global_norm(gradients, 5)

# Re-pair the clipped gradients with their variables and build the update op.
train = opt.apply_gradients(zip(clipped_grad, params))

In [11]:
# Initialise every variable created so far, using the default (interactive) session.
tf.global_variables_initializer().run()

In [12]:
def generate_data(dataset, batch_size = 1):
    """Draw a random batch of rows from ``dataset``.

    Rows are picked uniformly at random with replacement, so the same row may
    appear more than once in a batch.

    Args:
        dataset: indexable collection of equal-length rows.
        batch_size: number of rows to draw.

    Returns:
        float32 numpy array with ``batch_size`` rows.
    """
    picks = np.random.randint(0, len(dataset), [batch_size])

    batch = []
    for row_index in picks:
        batch.append(dataset[row_index])

    return np.array(batch, dtype="float32")

In [13]:
#Training!
for iter in range(500):
    if iter % 100 == 0:
        train_loss = loss.eval(feed_dict={x: generate_data(train_data, batch_size).reshape(batch_size, maxLen, 1)})
        test_loss = loss.eval(feed_dict={x: generate_data(test_data, len(test_data)).reshape(len(test_data), maxLen, 1)})
        
        train_accur = accur.eval(feed_dict={x: generate_data(train_data, test_batch_size).reshape(test_batch_size, maxLen, 1)})
        
        print("Iter %05d | Training loss %d, accur %f | Test loss %d"%(iter, train_loss, train_accur, test_loss))
    train.run(feed_dict={x: generate_data(train_data, batch_size).reshape(batch_size, maxLen, 1)})

Iter 00000 | Training loss 756, accur 0.000000 | Test loss 756
Iter 00100 | Training loss 718, accur 0.000000 | Test loss 718
Iter 00200 | Training loss 679, accur 0.001644 | Test loss 679
Iter 00300 | Training loss 620, accur 0.007048 | Test loss 619
Iter 00400 | Training loss 541, accur 0.006685 | Test loss 541


In [23]:
#Generate a sentence! With a really dumb method :(
gen_sent = np.array([np.random.randint(len(id2w))] * maxLen)
print(id2w[gen_sent[0]], end=" ")
for i in range(1, maxLen-1):
    curr = predictions[i].eval(feed_dict={x: gen_sent.reshape(1, maxLen, 1)})[0]
    
    choice = np.random.choice(range(len(curr)), p=curr)
    
    gen_sent[i] = choice
    
    print(id2w[choice], end=" ")
    
    if id2w[choice] == "<eos>":
        break

tax-loss buddy promising badly satisfied cananea seized abc N midland governor theme n't programming latest it arias a cambria dax stateswest hard-disk up widespread winter wilfred banponce poles subsidy overhaul <unk> kitchen aspect affiliated he more boomers component investors above wright dreams corner sentencing manitoba convex entry group jim jailed impression spent suggested air depends useful enabling glasnost intensify parents david withdrawal reality intermediate broken N gaubert carriers in reading solutions organization clark music sustain others jobs and association to cohen popularity 