In [1]:
import json, os, re, shutil, sys, time
import collections, itertools
import unittest
from IPython.display import display, HTML

# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf
assert(tf.__version__.startswith("1."))

# utils.pretty_print_matrix uses Pandas. Configure float format here.
import pandas as pd
pd.set_option('float_format', lambda f: "{0:.04f}".format(f))

# RNNLM Model
import rnnlm
reload(rnnlm)

<module 'rnnlm' from 'rnnlm.pyc'>

In [2]:
import collections

class Vocabulary(object):

  UNK_TOKEN = "<unk>"

  def __init__(self, tokens, size=None):
    self.unigram_counts = collections.Counter(tokens)
    # leave space for "<s>", "</s>", and "<unk>"
    top_counts = self.unigram_counts.most_common(None if size is None else (size - 1))
    vocab = ([self.UNK_TOKEN] +
             [w for w,c in top_counts])

    # Assign an id to each word, by frequency
    self.id_to_word = dict(enumerate(vocab))
    self.word_to_id = {v:k for k,v in self.id_to_word.iteritems()}
    self.size = len(self.id_to_word)
    if size is not None:
        assert(self.size <= size)

    # For convenience
    self.wordset = set(self.word_to_id.iterkeys())

    # Store special IDs
    self.UNK_ID = self.word_to_id[self.UNK_TOKEN]

  def words_to_ids(self, words):
    return [self.word_to_id.get(w, self.UNK_ID) for w in words]

  def ids_to_words(self, ids):
    return [self.id_to_word[i] for i in ids]

  def ordered_words(self):
    """Return a list of words, ordered by id."""
    return self.ids_to_words(range(self.size))


In [3]:
from os import listdir
import re

def canonicalize_digits(word):
    if any([c.isalpha() for c in word]): return word
    word = re.sub("\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "") # remove thousands separator
    return word

def canonicalize_word(word):
    word = word.lower()
    return canonicalize_digits(word) # try to canonicalize numbers

def canonicalize_words(words):
    current = []
    for word in words.split(" "):   
        if word[-1] in (".", ',', '?', ';', '!'):
            punk = word[-1]
            current.append(punk)
            word = word[0:-1]

        word = canonicalize_word(word)
        current.append(word)
    return current

In [4]:
def pretty_timedelta(fmt="%d:%02d:%02d", since=None, until=None):
    """Pretty-print a timedelta, using the given format string."""
    since = since or time.time()
    until = until or time.time()
    delta_s = until - since
    hours, remainder = divmod(delta_s, 3600)
    minutes, seconds = divmod(remainder, 60)
    return fmt % (hours, minutes, seconds)


In [7]:
def load_train_data():
    train_data_dir = 'train_data'
    y = []
    X = []
    all_tokens = []
    author_to_id = {}
    for author_id, author in enumerate(listdir(train_data_dir)):
        author_to_id[author] = author_id
        author_path = "%s/%s" % (train_data_dir, author)
        for file_name in listdir(author_path):
            full_path = "%s/%s" % (author_path, file_name)
            y.append(author_id)            
            with open(full_path, "r") as f:
                current = canonicalize_words(f.read())
                all_tokens += current
                X.append(np.array(current))
                
    vocab = Vocabulary(all_tokens)

    # replace words with ids
    for i, x in enumerate(X):
        X[i] = np.array(vocab.words_to_ids(x))

    return vocab, np.array(X), np.array(y), author_to_id

In [8]:
vocab, X_train, y_train, author_to_id = load_train_data()
num_classes = len(np.unique(y_train))
print vocab.size
print author_to_id

33323
{'thomas_jefferson': 0, 'john_adams': 1, 'alexander_hamilton': 3, 'george_washington': 6, 'james_madison': 2, 'james_monroe': 4, 'john_jay': 5}


In [9]:
def load_eval_data(vocab):
    eval_data_dir = "unknown_data"
    eval_X = {}
    eval_y = {}
    for file_name in listdir(eval_data_dir):
        full_path = "%s/%s" % (eval_data_dir, file_name)
        with open(full_path, "r") as f:
            current = vocab.words_to_ids(canonicalize_words(f.read()))

        expanded_X = np.array(current)
        id = file_name.split("_")[2].split(".")[0]
        eval_X[id] = np.array([expanded_X])
        # working with the assumption that James Madison wrote all the disputed papers
        eval_y[id] = np.array([author_to_id['james_madison']])

    return eval_X, eval_y

eval_X, eval_y = load_eval_data(vocab)
print eval_y['18']
print eval_X['18']

[2]
[[ 179    1 1677 ...,  253    6  469]]


In [10]:
print y_train
print X_train

[0 0 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 5 5 5 5 5 6 6]
[array([   11,    12,     8, ...,     6, 16924,  1444])
 array([ 1444,     4,  1432, ...,     3,     1, 17362])
 array([   48,    36,    16, ..., 27132,     6,  3617])
 array([   1,  155,  587, ...,    6,  116, 1274])
 array([  11,   36,  628, ...,    6,  488, 1274])
 array([  1, 425, 222, ...,   3,   6, 509])
 array([ 196,  628,   10, ...,    6,  145, 1274])
 array([ 196, 6450,    1, ...,    6,  587, 1274])
 array([   1,  425, 1031, ...,    6,  739, 1274])
 array([ 179,    1,  610, ...,    6, 3140, 1274])
 array([7863,    1,  132, ...,    6,   45, 1274])
 array([  39,   17,  608, ...,    6,  951, 1274])
 array([   1,   96,  342, ...,  638, 2642, 1274])
 array([   7, 6268,    1, ...,  117,    6, 4120])
 array([  11,   12,   20, ...,    6, 1387, 1274])
 array([   8, 1827, 10

In [11]:
def batch_generator(publications, authors, batch_size, max_time):
    """Convert ids to data-matrix form."""
    for i, ids in enumerate(publications):
        # Clip to multiple of max_time for convenience
        clip_len = ((len(ids)-1) / batch_size) * batch_size
        
        input_w = ids[:clip_len]     # current word
        target_y = ids[1:clip_len+1]  # next word
        # Reshape so we can select columns
        input_w = input_w.reshape([batch_size,-1])

        # Yield batches
        for j in xrange(0, input_w.shape[1], max_time):
            this_w = input_w[:,j:j+max_time]
            yield this_w, np.full_like(this_w, authors[i])


In [14]:
for i, (w, y) in enumerate(batch_generator(X_train, y_train, 10, 5)):
    print y
    print w
    break

[[0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]]
[[   11    12     8   539     3]
 [    4  3248     4 15631    31]
 [   28  1208    38 31402     3]
 [32160   298   187  1972    10]
 [  582    52   919 13039    13]
 [   21    52     3   556     2]
 [    2   199     5  2329   381]
 [    2   381    13    17  2109]
 [  857     8 26980     2    51]
 [    3     1    99     5     1]]


In [90]:
def run_epoch(lm, session, batch_iterator,
              train=False, verbose=False,
              tick_s=10, learning_rate=0.1):
    start_time = time.time()
    tick_time = start_time  # for showing status
    total_cost = 0.0  # total cost, summed over all words
    total_batches = 0
    total_words = 0

    if train:
        train_op = lm.train_step_
        use_dropout = True
#        predictions_op = tf.no_op()
        loss = lm.loss_
    else:
        train_op = tf.no_op()
        use_dropout = False  # no dropout at test time
#        predictions_op = lm.predictions_
        loss = lm.loss_

    for i, (w, y) in enumerate(batch_iterator):
        cost = 0.0
        # At first batch in epoch, get a clean intitial state.
        if i == 0:
            h = session.run(lm.initial_h_, {lm.input_w_: w})
            
#        print y
#        print y[:,max_time-1].reshape(50,1)
        feed_dict = {lm.input_w_: w,
                     lm.target_y_: y,
                     lm.initial_h_: h,
                     lm.learning_rate_: learning_rate,
                     lm.use_dropout_: use_dropout}
        
#        _, h, cost, = session.run([train_op, lm.final_h_, lm.loss_], feed_dict)  
        _, h, logits, target_y, cost = session.run([train_op, lm.final_h_, lm.logits_last_, lm.target_y_last_, loss], feed_dict)  
#        print "logits shape", logits.shape
#        print "target_y shape", target_y.shape
#        print target_y
        
#        break
        total_cost += cost
        total_batches = i + 1
        total_words += w.size  # w.size = batch_size * max_time

        ##
        # Print average loss-so-far for epoch
        # If using train_loss_, this may be an underestimate.
        if verbose and (time.time() - tick_time >= tick_s):
            avg_cost = total_cost / total_batches
            avg_wps = total_words / (time.time() - start_time)
            print "[batch %d]: seen %d words at %d wps, loss = %.3f" % (
                i, total_words, avg_wps, avg_cost)
            tick_time = time.time()  # reset time ticker

    return total_cost / total_batches

In [16]:
def score_dataset(lm, session, ids, authors, name="Data"):
    # For scoring, we can use larger batches to speed things up.
    bi = batch_generator(ids, authors, batch_size=100, max_time=100)
    cost = run_epoch(lm, session, bi, 
                     learning_rate=1.0, train=False, 
                     verbose=False, tick_s=3600)
    print "%s: avg. loss: %.03f  (perplexity: %.02f)" % (name, cost, np.exp(cost))
    return cost

In [17]:
# Training parameters
max_time = 20
batch_size = 50
learning_rate = 0.5
num_epochs = 4

# Model parameters
model_params = dict(V=vocab.size, 
                    H=100, 
                    num_classes=num_classes,
                    softmax_ns=7,
                    num_layers=1)

TF_SAVEDIR = "tf_saved"
checkpoint_filename = os.path.join(TF_SAVEDIR, "rnnlm")
trained_filename = os.path.join(TF_SAVEDIR, "rnnlm_trained")

In [91]:
reload(rnnlm)
# Will print status every this many seconds
print_interval = 5

# Clear old log directory
shutil.rmtree("tf_summaries", ignore_errors=True)

lm = rnnlm.RNNLM(**model_params)
lm.BuildCoreGraph()
lm.BuildTrainGraph()

# Explicitly add global initializer and variable saver to LM graph
with lm.graph.as_default():
    initializer = tf.global_variables_initializer()
    saver = tf.train.Saver()
    
# Clear old log directory
shutil.rmtree(TF_SAVEDIR, ignore_errors=True)
if not os.path.isdir(TF_SAVEDIR):
    os.makedirs(TF_SAVEDIR)

with tf.Session(graph=lm.graph) as session:
    # Seed RNG for repeatability
    tf.set_random_seed(42)

    session.run(initializer)

    for epoch in xrange(1,num_epochs+1):
        t0_epoch = time.time()
        bi = batch_generator(X_train, y_train, batch_size, max_time)
        print "[epoch %d] Starting epoch %d" % (epoch, epoch)
        # Run a training epoch.

        run_epoch(lm, session, bi, train=True, verbose=True, tick_s=100, learning_rate=learning_rate)
    
        print "[epoch %d] Completed in %s" % (epoch, pretty_timedelta(since=t0_epoch))
    
        # Save a checkpoint
        saver.save(session, checkpoint_filename, global_step=epoch)
    
        ##
        # score_dataset will run a forward pass over the entire dataset
        # and report perplexity scores. This can be slow (around 1/2 to 
        # 1/4 as long as a full epoch), so you may want to comment it out
        # to speed up training on a slow machine. Be sure to run it at the 
        # end to evaluate your score.
        print ("[epoch %d]" % epoch),
        score_dataset(lm, session, eval_X['18'], eval_y['18'], name="Federalist Paper 18")
        #score_dataset(lm, session, test_ids, name="Test set")
        print ""
    
    # Save final model
    saver.save(session, trained_filename)

[epoch 1] Starting epoch 1
[epoch 1] Completed in 0:00:35
[epoch 1] Federalist Paper 18: avg. loss: 10.523  (perplexity: 37176.74)

[epoch 2] Starting epoch 2
[epoch 2] Completed in 0:00:35
[epoch 2] Federalist Paper 18: avg. loss: 8.955  (perplexity: 7743.93)

[epoch 3] Starting epoch 3
[epoch 3] Completed in 0:00:35
[epoch 3] Federalist Paper 18: avg. loss: 6.770  (perplexity: 871.13)

[epoch 4] Starting epoch 4
[epoch 4] Completed in 0:00:33
[epoch 4] Federalist Paper 18: avg. loss: 7.401  (perplexity: 1637.64)

