In [1]:
import json, os, re, shutil, sys, time
import collections, itertools
import unittest
from IPython.display import display, HTML

# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf
assert(tf.__version__.startswith("1."))

# utils.pretty_print_matrix uses Pandas. Configure float format here.
import pandas as pd
pd.set_option('float_format', lambda f: "{0:.04f}".format(f))

# RNNLM Model
import rnnlm
reload(rnnlm)

<module 'rnnlm' from 'rnnlm.pyc'>

Vocabulary class holds the vocabulary and the mapping between words and ids for the words.

In [2]:
import collections
'''Vocabulary class, nearly identical to that used in a4'''
class Vocabulary(object):

  UNK_TOKEN = "<unk>"

  def __init__(self, tokens, size=None):
    self.unigram_counts = collections.Counter(tokens)
    # leave space for "<unk>"
    top_counts = self.unigram_counts.most_common(None if size is None else (size - 1))
    vocab = ([self.UNK_TOKEN] +
             [w for w,c in top_counts])

    # Assign an id to each word, by frequency
    self.id_to_word = dict(enumerate(vocab))
    self.word_to_id = {v:k for k,v in self.id_to_word.iteritems()}
    self.size = len(self.id_to_word)
    if size is not None:
        assert(self.size <= size)

    # For convenience
    self.wordset = set(self.word_to_id.iterkeys())

    # Store special IDs
    self.UNK_ID = self.word_to_id[self.UNK_TOKEN]

  def words_to_ids(self, words):
    return [self.word_to_id.get(w, self.UNK_ID) for w in words]

  def ids_to_words(self, ids):
    return [self.id_to_word[i] for i in ids]

  def ordered_words(self):
    """Return a list of words, ordered by id."""
    return self.ids_to_words(range(self.size))


These functions are used to massage the cleaned data into indivdual words. Punctuation at the end of a word is split into its own distince word. Also accomplishes some minor data cleaning, removing nonsense characters.

In [3]:
from os import listdir
import re

def canonicalize_digits(word):
    if any([c.isalpha() for c in word]): return word
    word = re.sub("\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "") # remove thousands separator
    return word

def canonicalize_word(word):
    word = word.lower()
    return canonicalize_digits(word) # try to canonicalize numbers

def replace_all(text, dic):
    for i, j in dic.iteritems():
        text = text.replace(i, j)
    return text

def canonicalize_words(words):
    current = []
    rep_dict = {'\n':' '
                ,'\xc2':' '
                ,'\xa0':' '
                ,'\xc2':' '
                ,'\xc3':' '
                ,'\xa9':' '
                ,'\xef':' '
                ,'\xbb':' '
                ,'\xbf':' '
               }
    for word in replace_all(words, rep_dict).split(' '):   
        if word and word[-1] in ('.', ',', '?', ';', '!'):
            punk = word[-1]
            current.append(punk)
            word = word[0:-1]

        word = canonicalize_word(word)
        current.append(word)
    return current

In [4]:
def pretty_timedelta(fmt="%d:%02d:%02d", since=None, until=None):
    """Pretty-print a timedelta, using the given format string."""
    since = since or time.time()
    until = until or time.time()
    delta_s = until - since
    hours, remainder = divmod(delta_s, 3600)
    minutes, seconds = divmod(remainder, 60)
    return fmt % (hours, minutes, seconds)

load_train_data reads in everything in train_data directory from which it generates the vocab (of type Vocabulary defined above), the author_to_id map (which maps author names to ids) and two arrays, one contains the text of each file (as a list of word ids) under "train_data". The other holds the author id for each of the texts.

In [5]:
def load_train_data():
    train_data_dir = 'train_data'
    y = []
    X = []
    all_tokens = []
    author_to_id = {}
    for author_id, author in enumerate(listdir(train_data_dir)):
        author_to_id[author] = author_id
        author_path = "%s/%s" % (train_data_dir, author)
        print author, author_id

        for file_name in listdir(author_path):
            full_path = "%s/%s" % (author_path, file_name)
            y.append(author_id)            
            with open(full_path, "r") as f:
                current = canonicalize_words(f.read())
                all_tokens += current
                X.append(np.array(current))
                
    vocab = Vocabulary(all_tokens)

    # replace words with ids
    for i, x in enumerate(X):
        X[i] = np.array(vocab.words_to_ids(x))

    return vocab, np.array(X), np.array(y), author_to_id

Here we load the training data, note resulting number of classes (authors) and display some useful information.

In [6]:
vocab, X_train, y_train, author_to_id = load_train_data()
num_classes = len(np.unique(y_train))
print "vocab.size", vocab.size
print author_to_id

thomas_paine 0
thomas_jefferson 1
john_adams 2
james_madison 3
alexander_hamilton 4
james_monroe 5
john_jay 6
george_washington 7
benjamin_franklin 8
vocab.size 142863
{'thomas_jefferson': 1, 'john_adams': 2, 'alexander_hamilton': 4, 'benjamin_franklin': 8, 'george_washington': 7, 'thomas_paine': 0, 'james_madison': 3, 'james_monroe': 5, 'john_jay': 6}


Load the eval data in the same format as the training data. But each federalist paper here ends up in its own dictionary entry so that they can be scored/classified/attributed separately. Each is assumed to be written by James Madison.

In [7]:
def load_eval_data(vocab):
    eval_data_dir = "unknown_data"
    eval_X = {}
    eval_y = {}
    for file_name in listdir(eval_data_dir):
        full_path = "%s/%s" % (eval_data_dir, file_name)
        with open(full_path, "r") as f:
            current = vocab.words_to_ids(canonicalize_words(f.read()))

        expanded_X = np.array(current)
        id = file_name.split("_")[2].split(".")[0]
        eval_X[id] = np.array([expanded_X])
        # working with the assumption that James Madison wrote all the disputed papers
        eval_y[id] = np.array([author_to_id['james_madison']])

    return eval_X, eval_y

eval_X, eval_y = load_eval_data(vocab)
print eval_y['18']
print eval_X['18']

[3]
[[ 185    2 3794 ...,  365    5  497]]


Cut up the publications based on batch_size, and max_time. To reduce how much code had to be changed from a4 this expands the author id for each document to be an author id for each word in the document. So if you had publication[1] = [1 2 3] and authors[1] = 1, this would output (assuming a batch of 1 and max time of 3) w = [1 2 3] and y = [1 1 1]. In the end we'll ignore all the loss for everything except the last word, but all the matrix functions and multiplications could work as is if I kept expanded the author to be associated with each word (since that is what the sequence math was doing, each word had a corresponding target word). Note: this also randomly shuffles the batches so that an given author's data is mixed through out the training process.

In [8]:
import random
# maybe worth seeding here. or in the batch_generator function

def batch_generator(publications, authors, batch_size, max_time):
    """Convert ids to data-matrix form."""
    all_w = []
    all_y = []
    for i, ids in enumerate(publications):
        # Clip to multiple of max_time for convenience
        clip_len = ((len(ids)-1) / batch_size) * batch_size
        
        input_w = ids[:clip_len]     # current word
        target_y = ids[1:clip_len+1]  # next word
        # Reshape so we can select columns
        input_w = input_w.reshape([batch_size,-1])

        for j in xrange(0, input_w.shape[1], max_time):
            this_w = input_w[:,j:j+max_time]
            all_w.append(this_w)
            all_y.append(np.full_like(this_w, authors[i]))

    # Yield batches in random order     
    data = range(0, len(all_y)-1)
    random.shuffle(data)   

    for k in data:
        yield all_w[k], all_y[k]
            

In [9]:
# Sanity Check
for i, (w, y) in enumerate(batch_generator(X_train, y_train, 10, 5)):
    print y
    print w 
    break

[[1 1 1 1 1]
 [1 1 1 1 1]
 [1 1 1 1 1]
 [1 1 1 1 1]
 [1 1 1 1 1]
 [1 1 1 1 1]
 [1 1 1 1 1]
 [1 1 1 1 1]
 [1 1 1 1 1]
 [1 1 1 1 1]]
[[     2    454     39     49      8]
 [    10     35      2   4870      3]
 [107567      5  41765     11     12]
 [    93     26      1     72      6]
 [   213     21  29107      1    129]
 [     1    695      2    340   3371]
 [ 55802    106    751      9      2]
 [     7     16      1    121     11]
 [  7357     13     64    354     27]
 [     1    243     11     12    574]]


Runs the epoch. Mostly the same as a4, but there is no test phase (instead there is no a separate prediction phase) that didn't makes as much sense to me to have in this function, so it has its own function

In [10]:
def run_epoch(lm, session, batch_iterator,
              train=False, verbose=False,
              tick_s=10, learning_rate=0.1):
    start_time = time.time()
    tick_time = start_time  # for showing status
    total_cost = 0.0  # total cost, summed over all words
    total_batches = 0
    total_words = 0

    if train:
        train_op = lm.train_step_
        use_dropout = True
    else:
        train_op = tf.no_op()
        use_dropout = False  # no dropout at test time

    for i, (w, y) in enumerate(batch_iterator):
        cost = 0.0
        # At first batch in epoch, get a clean intitial state.
        if i == 0:
            h = session.run(lm.initial_h_, {lm.input_w_: w})
            
        feed_dict = {lm.input_w_: w,
                     lm.target_y_: y,
                     lm.initial_h_: h,
                     lm.learning_rate_: learning_rate,
                     lm.use_dropout_: use_dropout}
        
        _, h, cost = session.run([train_op, lm.final_h_, lm.loss_], feed_dict)  

        total_cost += cost
        total_batches = i + 1
        total_words += w.size  # w.size = batch_size * max_time

        ##
        # Print average loss-so-far for epoch
        # If using train_loss_, this may be an underestimate.
        if verbose and (time.time() - tick_time >= tick_s):
            avg_cost = total_cost / total_batches
            avg_wps = total_words / (time.time() - start_time)
            print "[batch %d]: seen %d words at %d wps, loss = %.3f" % (
                i, total_words, avg_wps, avg_cost)
            tick_time = time.time()  # reset time ticker

    return total_cost / total_batches

In [11]:
def score_dataset(lm, session, ids, authors, max_time, name="Data"):
    # For scoring, we can use larger batches to speed things up. Same as a4
    bi = batch_generator(ids, authors, batch_size=100, max_time=max_time)
    cost = run_epoch(lm, session, bi, 
                     learning_rate=1.0, train=False, 
                     verbose=False, tick_s=3600)
    print "%s: avg. loss: %.03f  (perplexity: %.02f)" % (name, cost, np.exp(cost))
    return cost

Functions used to predict a batch iterators data. Used post training to test the unknown federalist papers.

In [12]:
from collections import defaultdict

def print_prediction_results(predictions, author_to_id):
    print "Truth:", 'james_madison'
    counts = defaultdict(float)
    for p in predictions:
        counts[p] += 1

    print "Prediction Summary:"
    for id, count in counts.iteritems():
        print "%s: %.2f" % (id_to_author(author_to_id, id), count/len(predictions))
    print ""

def id_to_author(author_to_id, id):
    for author, author_id in author_to_id.iteritems():
        if id == author_id:
            return author
    
def predict_paper(lm, session, batch_iterator, authors, paper_name):
    total_predictions = np.array([])
    print "Predicting for %s" % paper_name
    for i, (w, y) in enumerate(batch_iterator):        
        feed_dict = {lm.input_w_: w,
                     lm.target_y_: y}
        
        cost, truths, logits, predictions = session.run([lm.loss_, lm.target_y_last_, lm.logits_last_, lm.predictions_], feed_dict)  
        total_predictions = np.append(total_predictions, predictions.reshape(-1))

    print_prediction_results(total_predictions, authors)
    
print_prediction_results([1, 2, 1, 2], author_to_id)

Truth: james_madison
Prediction Summary:
thomas_jefferson: 0.50
john_adams: 0.50



In [13]:
# Training parameters
max_time = 15
batch_size = 40
learning_rate = 0.1
num_epochs = 10

# Model parameters
model_params = dict(V=vocab.size, 
                    H=100, 
                    num_classes=num_classes,
                    num_layers=1)

TF_SAVEDIR = "tf_saved"
checkpoint_filename = os.path.join(TF_SAVEDIR, "rnnlm")
trained_filename = os.path.join(TF_SAVEDIR, "rnnlm_trained")

Nearly the same as a4, but instead of score

In [14]:
reload(rnnlm)
# Will print status every this many seconds
print_interval = 5

# Clear old log directory
shutil.rmtree("tf_summaries", ignore_errors=True)

lm = rnnlm.RNNLM(**model_params)
lm.BuildCoreGraph()
lm.BuildTrainGraph()
lm.BuildClassifierGraph()

# Explicitly add global initializer and variable saver to LM graph
with lm.graph.as_default():
    initializer = tf.global_variables_initializer()
    saver = tf.train.Saver()
    
# Clear old log directory
shutil.rmtree(TF_SAVEDIR, ignore_errors=True)
if not os.path.isdir(TF_SAVEDIR):
    os.makedirs(TF_SAVEDIR)

with tf.Session(graph=lm.graph) as session:
    # Seed RNG for repeatability
    tf.set_random_seed(42)

    session.run(initializer)

    for epoch in xrange(1,num_epochs+1):
        t0_epoch = time.time()
        bi = batch_generator(X_train, y_train, batch_size, max_time)
        print "[epoch %d] Starting epoch %d" % (epoch, epoch)
        # Run a training epoch.

        run_epoch(lm, session, bi, train=True, verbose=True, tick_s=100, learning_rate=learning_rate)
    
        print "[epoch %d] Completed in %s" % (epoch, pretty_timedelta(since=t0_epoch))
    
        # Save a checkpoint
        saver.save(session, checkpoint_filename, global_step=epoch)
    
        ##
        # score_dataset will run a forward pass over the entire dataset
        # and report perplexity scores. This can be slow (around 1/2 to 
        # 1/4 as long as a full epoch), so you may want to comment it out
        # to speed up training on a slow machine. Be sure to run it at the 
        # end to evaluate your score.
        print ("[epoch %d]" % epoch),
        score_dataset(lm, session, eval_X['18'], eval_y['18'], max_time, name="Federalist Paper 18")
        #score_dataset(lm, session, test_ids, name="Test set")

        for key in eval_X:
            prediction_bi = batch_generator(eval_X[key], eval_y[key], batch_size, max_time)
            predict_paper(lm, session, prediction_bi, author_to_id, "Federalist Paper %s" % key)        
        print ""

    # Save final model
    saver.save(session, trained_filename)

[epoch 1] Starting epoch 1
[batch 4916]: seen 2922960 words at 29225 wps, loss = 1.424
[epoch 1] Completed in 0:02:02
[epoch 1] Federalist Paper 18: avg. loss: 1.484  (perplexity: 4.41)
Predicting for Federalist Paper 20
Truth: james_madison
Prediction Summary:
thomas_jefferson: 0.72
james_madison: 0.19
alexander_hamilton: 0.09

Predicting for Federalist Paper 58
Truth: james_madison
Prediction Summary:
thomas_jefferson: 0.59
james_madison: 0.25
alexander_hamilton: 0.16

Predicting for Federalist Paper 49
Truth: james_madison
Prediction Summary:
thomas_jefferson: 0.49
james_madison: 0.26
alexander_hamilton: 0.25

Predicting for Federalist Paper 55
Truth: james_madison
Prediction Summary:
thomas_jefferson: 0.67
james_madison: 0.19
alexander_hamilton: 0.14

Predicting for Federalist Paper 54
Truth: james_madison
Prediction Summary:
thomas_jefferson: 0.55
james_madison: 0.24
alexander_hamilton: 0.21

Predicting for Federalist Paper 57
Truth: james_madison
Prediction Summary:
thomas_jeffer

In [15]:
print author_to_id['james_madison']
print author_to_id['george_washington']
print author_to_id

3
7
{'thomas_jefferson': 1, 'john_adams': 2, 'alexander_hamilton': 4, 'benjamin_franklin': 8, 'george_washington': 7, 'thomas_paine': 0, 'james_madison': 3, 'james_monroe': 5, 'john_jay': 6}
