In [1]:
import json, os, re, shutil, sys, time
import collections, itertools
import unittest
from IPython.display import display, HTML

# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf
assert(tf.__version__.startswith("1."))

# utils.pretty_print_matrix uses Pandas. Configure float format here.
import pandas as pd
pd.set_option('float_format', lambda f: "{0:.04f}".format(f))

# RNNLM Model
import rnnlm
reload(rnnlm)

<module 'rnnlm' from 'rnnlm.pyc'>

In [2]:
import collections

class Vocabulary(object):

  UNK_TOKEN = "<unk>"

  def __init__(self, tokens, size=None):
    self.unigram_counts = collections.Counter(tokens)
    # leave space for "<s>", "</s>", and "<unk>"
    top_counts = self.unigram_counts.most_common(None if size is None else (size - 1))
    vocab = ([self.UNK_TOKEN] +
             [w for w,c in top_counts])

    # Assign an id to each word, by frequency
    self.id_to_word = dict(enumerate(vocab))
    self.word_to_id = {v:k for k,v in self.id_to_word.iteritems()}
    self.size = len(self.id_to_word)
    if size is not None:
        assert(self.size <= size)

    # For convenience
    self.wordset = set(self.word_to_id.iterkeys())

    # Store special IDs
    self.UNK_ID = self.word_to_id[self.UNK_TOKEN]

  def words_to_ids(self, words):
    return [self.word_to_id.get(w, self.UNK_ID) for w in words]

  def ids_to_words(self, ids):
    return [self.id_to_word[i] for i in ids]

  def ordered_words(self):
    """Return a list of words, ordered by id."""
    return self.ids_to_words(range(self.size))


In [7]:
from os import listdir
import re

def canonicalize_digits(word):
    if any([c.isalpha() for c in word]): return word
    word = re.sub("\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "") # remove thousands separator
    return word

def canonicalize_word(word):
    word = word.lower()
    return canonicalize_digits(word) # try to canonicalize numbers

def canonicalize_words(words):
    current = []
    for word in words.split(" "):   
        if word and word[-1] in (".", ',', '?', ';', '!'):
            punk = word[-1]
            current.append(punk)
            word = word[0:-1]

        word = canonicalize_word(word)
        current.append(word)
    return current

In [4]:
def pretty_timedelta(fmt="%d:%02d:%02d", since=None, until=None):
    """Pretty-print a timedelta, using the given format string."""
    since = since or time.time()
    until = until or time.time()
    delta_s = until - since
    hours, remainder = divmod(delta_s, 3600)
    minutes, seconds = divmod(remainder, 60)
    return fmt % (hours, minutes, seconds)


In [28]:
def load_train_data():
    train_data_dir = 'train_data'
    y = []
    X = []
    all_tokens = []
    author_to_id = {}
    for author_id, author in enumerate(listdir(train_data_dir)):
        author_to_id[author] = author_id
        author_path = "%s/%s" % (train_data_dir, author)
        for file_name in listdir(author_path):
            full_path = "%s/%s" % (author_path, file_name)
            print full_path, author_id
            y.append(author_id)            
            with open(full_path, "r") as f:
                current = canonicalize_words(f.read())
                all_tokens += current
                X.append(np.array(current))
                
    vocab = Vocabulary(all_tokens)

    # replace words with ids
    for i, x in enumerate(X):
        X[i] = np.array(vocab.words_to_ids(x))

    return vocab, np.array(X), np.array(y), author_to_id

In [30]:
vocab, X_train, y_train, author_to_id = load_train_data()
num_classes = len(np.unique(y_train))
print vocab.size
print author_to_id

train_data/thomas_paine/paine_rights_of_man_gutenberg_clean.txt 0
train_data/thomas_paine/paine_american_crisis_gutenberg_clean.txt 0
train_data/thomas_paine/paine_common_sense_gutenberg_clean.txt 0
train_data/thomas_jefferson/jefferson_writings_of_vol_2_gutenberg_clean.txt 1
train_data/thomas_jefferson/jefferson_writings_of_vol_4_gutenberg_clean.txt 1
train_data/thomas_jefferson/jefferson_writings_of_vol_1_gutenberg_clean.txt 1
train_data/thomas_jefferson/jefferson_writings_of_vol_3_gutenberg_clean.txt 1
train_data/thomas_jefferson/jefferson_state_of_the_unions_gutenberg_clean.txt 1
train_data/thomas_jefferson/jefferson_writings_of_vol_5_gutenberg_clean.txt 1
train_data/thomas_jefferson/jefferson_writings_of_vol_7_gutenberg_clean.txt 1
train_data/thomas_jefferson/jefferson_misc_letters_addresses_clean.txt 1
train_data/thomas_jefferson/jefferson_writings_of_vol_6_gutenberg_clean.txt 1
train_data/john_adams/adams_state_of_the_unions_gutenberg_clean.txt 2
train_data/james_madison/federal

In [31]:
def load_eval_data(vocab):
    eval_data_dir = "unknown_data"
    eval_X = {}
    eval_y = {}
    for file_name in listdir(eval_data_dir):
        full_path = "%s/%s" % (eval_data_dir, file_name)
        with open(full_path, "r") as f:
            current = vocab.words_to_ids(canonicalize_words(f.read()))

        expanded_X = np.array(current)
        id = file_name.split("_")[2].split(".")[0]
        eval_X[id] = np.array([expanded_X])
        # working with the assumption that James Madison wrote all the disputed papers
        eval_y[id] = np.array([author_to_id['james_madison']])

    return eval_X, eval_y

eval_X, eval_y = load_eval_data(vocab)
print eval_y['18']
print eval_X['18']

[3]
[[ 185    1 3753 ...,  359    5  495]]


In [10]:
print y_train
print X_train

[0 0 0 1 1 1 1 1 1 1 1 1 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4
 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 5 6 6 6 6 6 7 7 8]
[array([   214,      3, 110275, ...,      3,    107, 117020])
 array([42483,   344, 22831, ...,     1,   135, 14900])
 array([   279, 116136,   3396, ...,   6758,      3,  21317])
 array([   47,   192,     3, ...,   591,   516, 38425])
 array([   13,    58,    19, ...,  2333,   120, 96859])
 array([61541,     1,   135, ...,   432,     6, 38425])
 array([   13,   789,    27, ...,     5,  3499, 45432])
 array([   11,    12,     8, ...,     5, 51014,  1028])
 array([ 56122,    132,   1821, ...,     14,    613, 118907])
 array([   13,   156,     8, ...,   169, 15270, 57131])
 array([  1028,      4,    712, ...,      3,      2, 125230])
 array([   13,   696,     4, ...,   527,     6, 27168])
 array([   13,    31,    18, ..., 77518,     5,  

In [11]:
def batch_generator(publications, authors, batch_size, max_time):
    """Convert ids to data-matrix form."""
    for i, ids in enumerate(publications):
        # Clip to multiple of max_time for convenience
        clip_len = ((len(ids)-1) / batch_size) * batch_size
        
        input_w = ids[:clip_len]     # current word
        target_y = ids[1:clip_len+1]  # next word
        # Reshape so we can select columns
        input_w = input_w.reshape([batch_size,-1])

        # Yield batches
        for j in xrange(0, input_w.shape[1], max_time):
            this_w = input_w[:,j:j+max_time]
            yield this_w, np.full_like(this_w, authors[i])


In [12]:
for i, (w, y) in enumerate(batch_generator(X_train, y_train, 10, 5)):
    print y
    print w
    break

[[0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]]
[[   214      3 110275     36    262]
 [    30  38231      1  15231    243]
 [     1    348      6  41141      6]
 [   423      1   5339     52      3]
 [  1719    132   1606      7      2]
 [  5258    393      2      1   4648]
 [   522      2  97611    423    508]
 [  3039      6      1   1505      3]
 [    17  96856      3  77690      2]
 [    78      8    401 126840      1]]


In [13]:
def run_epoch(lm, session, batch_iterator,
              train=False, verbose=False,
              tick_s=10, learning_rate=0.1):
    start_time = time.time()
    tick_time = start_time  # for showing status
    total_cost = 0.0  # total cost, summed over all words
    total_batches = 0
    total_words = 0

    if train:
        train_op = lm.train_step_
        use_dropout = True
    else:
        train_op = tf.no_op()
        use_dropout = False  # no dropout at test time

    for i, (w, y) in enumerate(batch_iterator):
        cost = 0.0
        # At first batch in epoch, get a clean intitial state.
        if i == 0:
            h = session.run(lm.initial_h_, {lm.input_w_: w})
            
        feed_dict = {lm.input_w_: w,
                     lm.target_y_: y,
                     lm.initial_h_: h,
                     lm.learning_rate_: learning_rate,
                     lm.use_dropout_: use_dropout}
        
        _, h, cost = session.run([train_op, lm.final_h_, lm.loss_], feed_dict)  

        total_cost += cost
        total_batches = i + 1
        total_words += w.size  # w.size = batch_size * max_time

        ##
        # Print average loss-so-far for epoch
        # If using train_loss_, this may be an underestimate.
        if verbose and (time.time() - tick_time >= tick_s):
            avg_cost = total_cost / total_batches
            avg_wps = total_words / (time.time() - start_time)
            print "[batch %d]: seen %d words at %d wps, loss = %.3f" % (
                i, total_words, avg_wps, avg_cost)
            tick_time = time.time()  # reset time ticker

    return total_cost / total_batches

In [14]:
def score_dataset(lm, session, ids, authors, max_time, name="Data"):
    # For scoring, we can use larger batches to speed things up.
    bi = batch_generator(ids, authors, batch_size=100, max_time=max_time)
    cost = run_epoch(lm, session, bi, 
                     learning_rate=1.0, train=False, 
                     verbose=False, tick_s=3600)
    print "%s: avg. loss: %.03f  (perplexity: %.02f)" % (name, cost, np.exp(cost))
    return cost

In [26]:
def predict_paper(lm, session, batch_iterator):
    for i, (w, y) in enumerate(batch_iterator):        
        feed_dict = {lm.input_w_: w,
                     lm.target_y_: y}
        
        cost, truths, logits, predictions = session.run([lm.loss_, lm.target_y_last_, lm.logits_last_, lm.predictions_], feed_dict)  
        print "predictions:", predictions.reshape(-1)
        print "truths:", truths.reshape(-1)
        print "logits:", logits


In [32]:
# Training parameters
max_time = 15
batch_size = 40
learning_rate = 0.1
num_epochs = 1

# Model parameters
model_params = dict(V=vocab.size, 
                    H=100, 
                    num_classes=num_classes,
                    num_layers=1)

TF_SAVEDIR = "tf_saved"
checkpoint_filename = os.path.join(TF_SAVEDIR, "rnnlm")
trained_filename = os.path.join(TF_SAVEDIR, "rnnlm_trained")

In [None]:
reload(rnnlm)
# Will print status every this many seconds
print_interval = 5

# Clear old log directory
shutil.rmtree("tf_summaries", ignore_errors=True)

lm = rnnlm.RNNLM(**model_params)
lm.BuildCoreGraph()
lm.BuildTrainGraph()
lm.BuildClassifierGraph()

# Explicitly add global initializer and variable saver to LM graph
with lm.graph.as_default():
    initializer = tf.global_variables_initializer()
    saver = tf.train.Saver()
    
# Clear old log directory
shutil.rmtree(TF_SAVEDIR, ignore_errors=True)
if not os.path.isdir(TF_SAVEDIR):
    os.makedirs(TF_SAVEDIR)

with tf.Session(graph=lm.graph) as session:
    # Seed RNG for repeatability
    tf.set_random_seed(42)

    session.run(initializer)

    for epoch in xrange(1,num_epochs+1):
        t0_epoch = time.time()
        bi = batch_generator(X_train, y_train, batch_size, max_time)
        print "[epoch %d] Starting epoch %d" % (epoch, epoch)
        # Run a training epoch.

        run_epoch(lm, session, bi, train=True, verbose=True, tick_s=100, learning_rate=learning_rate)
    
        print "[epoch %d] Completed in %s" % (epoch, pretty_timedelta(since=t0_epoch))
    
        # Save a checkpoint
        saver.save(session, checkpoint_filename, global_step=epoch)
    
        ##
        # score_dataset will run a forward pass over the entire dataset
        # and report perplexity scores. This can be slow (around 1/2 to 
        # 1/4 as long as a full epoch), so you may want to comment it out
        # to speed up training on a slow machine. Be sure to run it at the 
        # end to evaluate your score.
        print ("[epoch %d]" % epoch),
        score_dataset(lm, session, eval_X['18'], eval_y['18'], max_time, name="Federalist Paper 18")
        #score_dataset(lm, session, test_ids, name="Test set")
        print ""

    bi = batch_generator(eval_X['18'], eval_y['18'], batch_size, max_time)
    predict_paper(lm, session, bi)
    # Save final model
    saver.save(session, trained_filename)

[epoch 1] Starting epoch 1
[batch 3866]: seen 2310600 words at 23102 wps, loss = 0.015


In [25]:
print author_to_id['james_madison']
print author_to_id['george_washington']
print author_to_id

3
7
{'thomas_jefferson': 1, 'john_adams': 2, 'alexander_hamilton': 4, 'benjamin_franklin': 8, 'george_washington': 7, 'thomas_paine': 0, 'james_madison': 3, 'james_monroe': 5, 'john_jay': 6}
