In [442]:
import json, os, re, shutil, sys, time
import collections, itertools
import unittest
from IPython.display import display, HTML

# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf
assert(tf.__version__.startswith("1."))

# utils.pretty_print_matrix uses Pandas. Configure float format here.
import pandas as pd
pd.set_option('float_format', lambda f: "{0:.04f}".format(f))

# RNNLM Model
import rnnlm
reload(rnnlm)

# Other imports
from datetime import datetime
import pickle

Vocabulary class holds the vocabulary and the mapping between words and ids for the words.

In [443]:
import collections
'''Vocabulary class, nearly identical to that used in a4'''
class Vocabulary(object):

  UNK_TOKEN = "<unk>"

  def __init__(self, tokens, size=None):
    self.unigram_counts = collections.Counter(tokens)
    # leave space for "<unk>"
    top_counts = self.unigram_counts.most_common(None if size is None else (size - 1))
    vocab = ([self.UNK_TOKEN] +
             [w for w,c in top_counts])

    # Assign an id to each word, by frequency
    self.id_to_word = dict(enumerate(vocab))
    self.word_to_id = {v:k for k,v in self.id_to_word.iteritems()}
    self.size = len(self.id_to_word)
    if size is not None:
        assert(self.size <= size)

    # For convenience
    self.wordset = set(self.word_to_id.iterkeys())

    # Store special IDs
    self.UNK_ID = self.word_to_id[self.UNK_TOKEN]

  def words_to_ids(self, words):
    return [self.word_to_id.get(w, self.UNK_ID) for w in words]

  def ids_to_words(self, ids):
    return [self.id_to_word[i] for i in ids]

  def ordered_words(self):
    """Return a list of words, ordered by id."""
    return self.ids_to_words(range(self.size))


These functions are used to massage the cleaned data into indivdual words. Punctuation at the end of a word is split into its own distince word. Also accomplishes some minor data cleaning, removing nonsense characters.

In [444]:
from os import listdir
import re

def canonicalize_digits(word):
    if any([c.isalpha() for c in word]): return word
    word = re.sub("\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "") # remove thousands separator
    return word

def canonicalize_word(word):
    word = word.lower()
    return canonicalize_digits(word) # try to canonicalize numbers

def replace_all(text, dic):
    for i, j in dic.iteritems():
        text = text.replace(i, j)
    return text

def canonicalize_words(words):
    current = []
    rep_dict = {'\n':' '
                ,'\xc2':' '
                ,'\xa0':' '
                ,'\xc2':' '
                ,'\xc3':' '
                ,'\xa9':' '
                ,'\xef':' '
                ,'\xbb':' '
                ,'\xbf':' '
                ,'\xa6':' '
                ,'\xb9':' '
                ,'\xa3':' '
                ,'\xbd':' '
                ,'\xb4':' '
                ,'\xcb':' '
                ,'\x9a':' '
                ,'\x86':' '
                ,'\xcf':' '
                ,'\x84':' '
                ,'\xce':' '
                ,'\x87':' '
                ,'\xe2':' '
                ,'\x80':' '
                ,'\x94':' '
               }
    for word in replace_all(words, rep_dict).split(' '):   
        if word:
            if word[-1] in ('.', ',', '?', ';', '!'):
                punk = word[-1]
                current.append(punk)
                word = word[0:-1]

            word = canonicalize_word(word)
            current.append(word)
    return current

In [445]:
def pretty_timedelta(fmt="%d:%02d:%02d", since=None, until=None):
    """Pretty-print a timedelta, using the given format string."""
    since = since or time.time()
    until = until or time.time()
    delta_s = until - since
    hours, remainder = divmod(delta_s, 3600)
    minutes, seconds = divmod(remainder, 60)
    return fmt % (hours, minutes, seconds)

load_train_data reads in everything in train_data directory from which it generates the vocab (of type Vocabulary defined above), the author_to_id map (which maps author names to ids) and two arrays, one contains the text of each file (as a list of word ids) under "train_data". The other holds the author id for each of the texts.

In [446]:
def load_train_data(train_data_dir):
    y = []
    X = []
    all_tokens = []
    author_to_id = {}
    for author_id, author in enumerate(listdir(train_data_dir)):
        author_to_id[author] = author_id
        author_path = "%s/%s" % (train_data_dir, author)
        print author, author_id

        for file_name in listdir(author_path):
            full_path = "%s/%s" % (author_path, file_name)
            y.append(author_id)            
            with open(full_path, "r") as f:
                current = canonicalize_words(f.read())
                all_tokens += current
                X.append(np.array(current))
                
    vocab = Vocabulary(all_tokens)

    # replace words with ids
    for i, x in enumerate(X):
        # X[i] = np.array(x) # This line can be used to make sure your words are useful 
        X[i] = np.array(vocab.words_to_ids(x))

    return vocab, np.array(X), np.array(y), author_to_id


def id_to_author(author_to_id, id):
    for author, author_id in author_to_id.iteritems():
        if id == author_id:
            return author

Here we load the training data, note resulting number of classes (authors) and display some useful information.

In [447]:
train_data_dir = './train_data_small'
vocab, X_train, y_train, author_to_id = load_train_data(train_data_dir)
num_classes = len(np.unique(y_train))
print "vocab.size", vocab.size
print author_to_id

james_madison 0
alexander_hamilton 1
john_jay 2
vocab.size 3560
{'james_madison': 0, 'john_jay': 2, 'alexander_hamilton': 1}


Load the eval data in the same format as the training data. But each federalist paper here ends up in its own dictionary entry so that they can be scored/classified/attributed separately. Each is assumed to be written by James Madison.

In [448]:
def load_eval_data(vocab, eval_data_dir):
    eval_X = {}
    eval_y = {}
    
    for author_id, author in enumerate(listdir(eval_data_dir)):
        author_path = "%s/%s" % (eval_data_dir, author)

        for file_name in listdir(author_path):
            full_path = "%s/%s" % (author_path, file_name)
            
            with open(full_path, "r") as f:
                current = vocab.words_to_ids(canonicalize_words(f.read()))
                
            expanded_X = np.array(current)
            id = file_name.split("_")[2].split(".")[0]
            eval_X[id] = np.array([expanded_X])
            eval_y[id] = np.array([author_to_id[author]])
                
    return eval_X, eval_y

eval_X, eval_y = load_eval_data(vocab, "unknown_data")
print eval_y['18']
print eval_X['18']

test_X, test_y = load_eval_data(vocab, "test_data")
print "Who wrote Federalist paper 5 (John Jay should be answer): %s" % id_to_author(author_to_id, test_y['5'])

[0]
[[294   1 175 ...,  93   6 383]]
Who wrote Federalist paper 5 (John Jay should be answer): john_jay


Cut up the publications based on batch_size, and max_time. To reduce how much code had to be changed from a4 this expands the author id for each document to be an author id for each word in the document. So if you had publication[1] = [1 2 3] and authors[1] = 1, this would output (assuming a batch of 1 and max time of 3) w = [1 2 3] and y = [1 1 1]. In the end we'll ignore all the loss for everything except the last word, but all the matrix functions and multiplications could work as is if I kept expanded the author to be associated with each word (since that is what the sequence math was doing, each word had a corresponding target word). Note: this also randomly shuffles the batches so that an given author's data is mixed through out the training process.

In [449]:
def slice_up_words(words, window_size=10, step_size=1):
    clip_len = ((len(words)-1) / window_size) * window_size
    words = words[:clip_len]
    slices = []
    num_words = len(words)
    for index in range(0, num_words, step_size):
        slices.append(words[index:index+window_size])
    return slices
        
slice_up_words(["hello", "I", "am", "mr", ".", "anderson", "what", "is", "your", "name", "?"], 3, 3)

[['hello', 'I', 'am'], ['mr', '.', 'anderson'], ['what', 'is', 'your']]

In [450]:
import random
# maybe worth seeding here. or in the batch_generator function

def shape_data_for_batching(publications, authors, max_time):
    """Convert ids to data-matrix form."""
    all_w = None
    all_y = []
    for i, ids in enumerate(publications):
        ids = np.array(slice_up_words(ids, max_time, max_time))
        y = np.full_like(ids, authors[i])
        
        if all_w is not None:
            all_w = np.append(all_w, ids, 0)
            all_y = np.append(all_y, y, 0)
            
        else:
            all_w = ids
            all_y = y

    # Yield batches in random order     
    index = range(0, len(all_y)-1)
    random.shuffle(index)   

    all_w = [all_w[i] for i in index]
    all_y = [all_y[i] for i in index]

    return all_w, all_y

def batch_generator(X_shaped, y_shaped, batch_size):
    clip_len = ((len(X_shaped)-1) / batch_size) * batch_size
    X_shaped = X_shaped[:clip_len]
    y_shaped = y_shaped[:clip_len]  
    for j in xrange(0, len(X_shaped), batch_size):
        this_x = X_shaped[j:j+batch_size]
        this_y = y_shaped[j:j+batch_size]
        yield np.array(this_x), np.array(this_y)


In [451]:
# Sanity Check
X_shaped, y_shaped = shape_data_for_batching(X_train, y_train, 5)

In [452]:
for i, (w, y) in enumerate(batch_generator(X_shaped, y_shaped, 10)):
    print y
    print w 
    break

[[2 2 2 2 2]
 [2 2 2 2 2]
 [0 0 0 0 0]
 [1 1 1 1 1]
 [2 2 2 2 2]
 [1 1 1 1 1]
 [1 1 1 1 1]
 [0 0 0 0 0]
 [1 1 1 1 1]
 [2 2 2 2 2]]
[[   3  602    5    2  471]
 [ 970    7    1   68    3]
 [1360  317    7   18  120]
 [2179    3    9  174    2]
 [  15    9  131   75 3180]
 [2758 3526   65 2767    5]
 [   1  262  292   66    6]
 [   2  531  273    4    1]
 [   5   22    1  237    3]
 [  17   90    2   32 1465]]


Runs the epoch. Mostly the same as a4, but there is no test phase (instead there is no a separate prediction phase) that didn't makes as much sense to me to have in this function, so it has its own function

In [453]:
def run_epoch(lm, session, batch_iterator,
              train=False, verbose=False,
              tick_s=10, learning_rate=0.1):
    start_time = time.time()
    tick_time = start_time  # for showing status
    total_cost = 0.0  # total cost, summed over all words
    total_batches = 0
    total_words = 0

    if train:
        train_op = lm.train_step_
        use_dropout = True
    else:
        train_op = tf.no_op()
        use_dropout = False  # no dropout at test time

    for i, (w, y) in enumerate(batch_iterator):
        cost = 0.0
        # At first batch in epoch, get a clean intitial state.
        if i == 0:
            h = session.run(lm.initial_h_, {lm.input_w_: w})
            
        feed_dict = {lm.input_w_: w,
                     lm.target_y_: y,
                     lm.initial_h_: h,
                     lm.learning_rate_: learning_rate,
                     lm.use_dropout_: use_dropout}
        
        _, h, cost = session.run([train_op, lm.final_h_, lm.loss_], feed_dict)  

        total_cost += cost
        total_batches = i + 1
        total_words += w.size  # w.size = batch_size * max_time

        ##
        # Print average loss-so-far for epoch
        # If using train_loss_, this may be an underestimate.
        if verbose and (time.time() - tick_time >= tick_s):
            avg_cost = total_cost / total_batches
            avg_wps = total_words / (time.time() - start_time)
            print "[batch %d]: seen %d words at %d wps, loss = %.3f" % (
                i, total_words, avg_wps, avg_cost)
            tick_time = time.time()  # reset time ticker

    return total_cost / total_batches

In [454]:
def score_dataset(lm, session, ids, authors, max_time, name="Data"):
    # For scoring, we can use larger batches to speed things up. Same as a4
    bi = batch_generator(ids, authors, batch_size=100, max_time=max_time)
    cost = run_epoch(lm, session, bi, 
                     learning_rate=1.0, train=False, 
                     verbose=False, tick_s=3600)
    print "%s: avg. loss: %.03f  (perplexity: %.02f)" % (name, cost, np.exp(cost))
    return cost

Functions used to predict a batch iterators data. Used post training to test the unknown federalist papers.

In [455]:
from collections import defaultdict

def get_prediction_results(predictions, author_to_id, print_results=True):
    '''
    Takes the predictions for a set of text batches and calculates the percentage
    of patches predicted for each author.
    '''
    counts = defaultdict(float)
    for p in predictions:
        counts[p] += 1

    # getting prediction percentages to return for later viewing
    predictions_num = len(predictions)
    results_dict = {author_id:float(count)/predictions_num for author_id, count in counts.iteritems()}
    
    if print_results:
        # prints results if indicated
        print "Prediction Summary:"
        for author_id, prediction in results_dict.iteritems():
            print "%s: %.2f" % (id_to_author(author_to_id, author_id), prediction)
        print ""

    return results_dict
    
def predict_paper(lm, session, batch_iterator, authors, paper_name, print_results=True):
    '''
    Splits given paper into batches and predicts the author for each batch.
    Passes these predictions to get_prediction_results() to tally the batches for each author
    and arrive at a final prediction percentage.
    '''
    total_predictions = np.array([])
    if print_results:
        print "Predicting for %s" % paper_name
    for i, (w, y) in enumerate(batch_iterator):        
        if i == 0 and print_results:
            print "Truth:", id_to_author(authors, y[0][0])
            
        feed_dict = {lm.input_w_: w,
                     lm.target_y_: y}
        
        cost, truths, logits, predictions = session.run([lm.loss_, lm.target_y_last_, lm.logits_last_, lm.predictions_], feed_dict)  
        total_predictions = np.append(total_predictions, predictions.reshape(-1))

    # gets final predictions, by author, for the given paper
    results_dict = get_prediction_results(total_predictions, authors, print_results=print_results)
    return results_dict

def test_papers(lm, session, papers, labels, authors, batch_size, print_results=True):
    '''
    Predicts authorship for given papers.
    Returns a dictionary of dictionaries containing the predictions by author for each paper.
    '''
    full_results = dict()
    for key in papers:
        prediction_bi = batch_generator(papers[key], labels[key], batch_size)
        paper_results = predict_paper(lm, session, prediction_bi, authors, "Federalist Paper %s" % key, print_results=print_results)
        full_results[key] = paper_results
    
    return full_results

def create_predictions_dataframe(epoch_predictions):
    '''
    Input: List of predictions for each epoch, where an epochs predictions is a dictionary of dictionaries containing
    the predictions for each paper by author.
    Output: Pandas dataframe of results for each author by paper, with a flag indicating the epoch.
    '''
    df_list = []
    for epoch in range(len(epoch_predictions)):
        epoch_num = epoch + 1 # we indexed our epochs from 1, but this list is 0-indexed of course
        df = pd.DataFrame(epoch_predictions[epoch])
        df['epoch'] = epoch_num
        df_list.append(df)

    return pd.concat(df_list)

def save_results(save_dir):
    '''
    Saves results dataframe to csv, pickles the author_to_id dictionary, and creates
    a text file with relevant settings that this model was trained with.
    This function is lazy and calls the objects from out of scope rather than passing, but
    it will do for now.
    '''
    # Creating directory
    shutil.rmtree(RESULTS_SAVE_DIR, ignore_errors=True)
    if not os.path.isdir(RESULTS_SAVE_DIR):
        os.makedirs(RESULTS_SAVE_DIR)
    df.to_csv(save_dir + '/results.csv')
    pickle.dump(author_to_id, open(save_dir + '/author_to_id.p', 'wb'))
    save_parms = {'max_time':max_time, 'batch_size':batch_size, 'learning_rate':learning_rate,
                  'num_epochs':num_epochs, 'train_data_dir':train_data_dir}
    with open(save_dir + '/parms.txt', 'w') as f:
        for parm, val in save_parms.iteritems():
            f.write(parm + ' : ' + str(val) + '\n')
    
get_prediction_results([1, 2, 1, 2], author_to_id)

Prediction Summary:
alexander_hamilton: 0.50
john_jay: 0.50



{1: 0.5, 2: 0.5}

In [456]:
# Training parameters
max_time = 15
batch_size = 50
learning_rate = 0.08
num_epochs = 300

save_predictions = True # flag to determine if we save the predictions on unkown papers in each epoch

# Model parameters
model_params = dict(V=vocab.size, 
                    H=100, 
                    num_classes=num_classes,
                    num_layers=1)

TF_SAVEDIR = "tf_saved"
checkpoint_filename = os.path.join(TF_SAVEDIR, "rnnlm")
trained_filename = os.path.join(TF_SAVEDIR, "rnnlm_trained")


X_train_shaped, y_train_shaped = shape_data_for_batching(X_train, y_train, max_time)

X_test_shaped = {}
y_test_shaped = {}
for key in test_X:
    X_test_shaped[key], y_test_shaped[key] = shape_data_for_batching(test_X[key], test_y[key], max_time)

X_eval_shaped = {}
y_eval_shaped = {}
for key in eval_X:
    X_eval_shaped[key], y_eval_shaped[key] = shape_data_for_batching(eval_X[key], eval_y[key], max_time)


Nearly the same as a4, but instead of scoring the data we look at prediction results after each epoch.

In [457]:
reload(rnnlm)

# Will print status every this many seconds
print_interval = 5

# Clear old log directory
shutil.rmtree("tf_summaries", ignore_errors=True)

lm = rnnlm.RNNLM(**model_params)
lm.BuildCoreGraph()
lm.BuildTrainGraph()
lm.BuildClassifierGraph()

# Explicitly add global initializer and variable saver to LM graph
with lm.graph.as_default():
    initializer = tf.global_variables_initializer()
    saver = tf.train.Saver()
    
# Clear old log directory
shutil.rmtree(TF_SAVEDIR, ignore_errors=True)
if not os.path.isdir(TF_SAVEDIR):
    os.makedirs(TF_SAVEDIR)

with tf.Session(graph=lm.graph) as session:
    # Seed RNG for repeatability
    tf.set_random_seed(42)

    session.run(initializer)

    # List for saving the unkown paper predictions from each epoch
    if save_predictions:
        epoch_predictions = []
        
    for epoch in xrange(1,num_epochs+1):
        t0_epoch = time.time()
        bi = batch_generator(X_train_shaped, y_train_shaped, batch_size)
        print "[epoch %d] Starting epoch %d" % (epoch, epoch)
        # Run a training epoch.

        run_epoch(lm, session, bi, train=True, verbose=True, tick_s=100, learning_rate=learning_rate)
    
        print "[epoch %d] Completed in %s" % (epoch, pretty_timedelta(since=t0_epoch))
    
        # Save a checkpoint
        saver.save(session, checkpoint_filename, global_step=epoch)
    
        ##
        # score_dataset will run a forward pass over the entire dataset
        # and report perplexity scores. This can be slow (around 1/2 to 
        # 1/4 as long as a full epoch), so you may want to comment it out
        # to speed up training on a slow machine. Be sure to run it at the 
        # end to evaluate your score.
        print ("[epoch %d]" % epoch),
        #score_dataset(lm, session, eval_X['18'], eval_y['18'], max_time, name="Federalist Paper 18")
        #score_dataset(lm, session, test_ids, name="Test set")

        # test three of the federalist papers whose author is known after each epoch
        # test_predictions = test_papers(lm, session, X_test_shaped, y_test_shaped, author_to_id, batch_size, print_results=True)
        
        if save_predictions:
            # testing against all unkown federalist papers
            unk_predictions = test_papers(lm, session, X_eval_shaped, y_eval_shaped, author_to_id, batch_size, print_results=False)
            epoch_predictions.append(unk_predictions)

    # Testing final model against all unkown federalist papers
    final_test = test_papers(lm, session, X_eval_shaped, y_eval_shaped, author_to_id, batch_size, print_results=True)    
    # Save final model
    saver.save(session, trained_filename)
    
    if save_predictions:
        df = create_predictions_dataframe(epoch_predictions)
    


[epoch 1] Starting epoch 1
[epoch 1] Completed in 0:00:00
[epoch 1] [epoch 2] Starting epoch 2
[epoch 2] Completed in 0:00:00
[epoch 2] [epoch 3] Starting epoch 3
[epoch 3] Completed in 0:00:00
[epoch 3] [epoch 4] Starting epoch 4
[epoch 4] Completed in 0:00:00
[epoch 4] [epoch 5] Starting epoch 5
[epoch 5] Completed in 0:00:00
[epoch 5] [epoch 6] Starting epoch 6
[epoch 6] Completed in 0:00:00
[epoch 6] [epoch 7] Starting epoch 7
[epoch 7] Completed in 0:00:00
[epoch 7] [epoch 8] Starting epoch 8
[epoch 8] Completed in 0:00:00
[epoch 8] [epoch 9] Starting epoch 9
[epoch 9] Completed in 0:00:00
[epoch 9] [epoch 10] Starting epoch 10
[epoch 10] Completed in 0:00:00
[epoch 10] [epoch 11] Starting epoch 11
[epoch 11] Completed in 0:00:00
[epoch 11] [epoch 12] Starting epoch 12
[epoch 12] Completed in 0:00:00
[epoch 12] [epoch 13] Starting epoch 13
[epoch 13] Completed in 0:00:00
[epoch 13] [epoch 14] Starting epoch 14
[epoch 14] Completed in 0:00:00
[epoch 14] [epoch 15] Starting epoch 15

## SAVING: Uncomment and run to save the current model results to disk
By default, results will be placed in their own directory within `/nn_saved_results`, which is named with the current datetime stamp. This is to prevent overwrites when we are pushing to the directory.

In [458]:
RESULTS_SAVE_DIR = './nn_saved_results/' + datetime.now().strftime('%Y-%m-%d_%H_%M_%S')
save_results(RESULTS_SAVE_DIR)

### Viewing results

In [459]:
df

Unnamed: 0,18,19,20,49,50,51,52,53,54,55,56,57,58,62,63,72,epoch
0.0000,0.6933,0.7133,0.5600,0.5200,0.5200,0.5100,0.5200,0.6267,0.5200,0.4800,0.5100,0.5333,0.6133,0.5467,0.5150,0.5600,1
1.0000,0.1600,0.1000,0.2300,0.1900,0.2000,0.3200,0.2600,0.2400,0.2800,0.2800,0.2700,0.3000,0.2133,0.2867,0.2600,0.3100,1
2.0000,0.1467,0.1867,0.2100,0.2900,0.2800,0.1700,0.2200,0.1333,0.2000,0.2400,0.2200,0.1667,0.1733,0.1667,0.2250,0.1300,1
0.0000,0.6000,0.6733,0.5100,0.4800,0.6000,0.5400,0.5300,0.5933,0.5100,0.5100,0.5300,0.5400,0.5800,0.5133,0.5200,0.5100,2
1.0000,0.2933,0.2000,0.2800,0.2900,0.2400,0.3500,0.3200,0.3067,0.3200,0.3400,0.3000,0.3200,0.3533,0.3400,0.3450,0.3700,2
2.0000,0.1067,0.1267,0.2100,0.2300,0.1600,0.1100,0.1500,0.1000,0.1700,0.1500,0.1700,0.1400,0.0667,0.1467,0.1350,0.1200,2
0.0000,0.6067,0.6400,0.5500,0.4800,0.5200,0.5300,0.5200,0.6133,0.5400,0.5500,0.5100,0.5800,0.6267,0.4867,0.5750,0.5600,3
1.0000,0.2200,0.1533,0.2600,0.3100,0.3000,0.3200,0.3100,0.2600,0.2900,0.2900,0.2800,0.2467,0.2467,0.3400,0.2750,0.2800,3
2.0000,0.1733,0.2067,0.1900,0.2100,0.1800,0.1500,0.1700,0.1267,0.1700,0.1600,0.2100,0.1733,0.1267,0.1733,0.1500,0.1600,3
0.0000,0.5667,0.6467,0.4900,0.4800,0.5400,0.5300,0.5100,0.5133,0.5200,0.4900,0.4300,0.5133,0.5067,0.4933,0.4750,0.4100,4
