In [1]:
import numpy as np
import tensorflow as tf  #TF 1.1.0rc1
tf.logging.set_verbosity(tf.logging.ERROR)
import matplotlib.pyplot as plt
from tsc_model import Model,sample_batch

In [2]:
import collections

class Vocabulary(object):

  UNK_TOKEN = "<unk>"

  def __init__(self, tokens, size=None):
    self.unigram_counts = collections.Counter(tokens)
    # leave space for "<s>", "</s>", and "<unk>"
    top_counts = self.unigram_counts.most_common(None if size is None else (size - 1))
    vocab = ([self.UNK_TOKEN] +
             [w for w,c in top_counts])

    # Assign an id to each word, by frequency
    self.id_to_word = dict(enumerate(vocab))
    self.word_to_id = {v:k for k,v in self.id_to_word.iteritems()}
    self.size = len(self.id_to_word)
    if size is not None:
        assert(self.size <= size)

    # For convenience
    self.wordset = set(self.word_to_id.iterkeys())

    # Store special IDs
    self.UNK_ID = self.word_to_id[self.UNK_TOKEN]

  def words_to_ids(self, words):
    return [self.word_to_id.get(w, self.UNK_ID) for w in words]

  def ids_to_words(self, ids):
    return [self.id_to_word[i] for i in ids]

  def ordered_words(self):
    """Return a list of words, ordered by id."""
    return self.ids_to_words(range(self.size))


In [3]:
def slice_up_words(words, window_size=10):
    slices = []
    for index, word in enumerate(words):
        slice = words[index:index+window_size]
        if len(slice) == window_size:
            slices.append(slice)
        else:
            break
    return slices
        

slice_up_words(["hello", "I", "am", "mr", ".", "anderson", "what", "is", "your", "name", "?"], 3)

[['hello', 'I', 'am'],
 ['I', 'am', 'mr'],
 ['am', 'mr', '.'],
 ['mr', '.', 'anderson'],
 ['.', 'anderson', 'what'],
 ['anderson', 'what', 'is'],
 ['what', 'is', 'your'],
 ['is', 'your', 'name'],
 ['your', 'name', '?']]

In [36]:
from os import listdir
import re

def canonicalize_digits(word):
    if any([c.isalpha() for c in word]): return word
    word = re.sub("\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "") # remove thousands separator
    return word

def canonicalize_word(word):
    word = word.lower()
    return canonicalize_digits(word) # try to canonicalize numbers

def canonicalize_words(words):
    current = []
    for word in words.split(" "):   
        if word[-1] in (".", ',', '?', ';', '!'):
            punk = word[-1]
            current.append(punk)
            word = word[0:-1]

        word = canonicalize_word(word)
        current.append(word)
    return current
      
def load_train_data(window_size):
    train_data_dir = 'train_data'
    y = []
    X = []
    all_tokens = []
    author_to_id = {}
    for author_id, author in enumerate(listdir(train_data_dir)):
        author_to_id[author] = author_id
        author_path = "%s/%s" % (train_data_dir, author)
        for file_name in listdir(author_path):
            full_path = "%s/%s" % (author_path, file_name)
            y.append(author_id)            
            with open(full_path, "r") as f:
                current = canonicalize_words(f.read())
                all_tokens += current
                X.append(current)
                
    vocab = Vocabulary(all_tokens)

    for i, x in enumerate(X):
        X[i] = vocab.words_to_ids(x)

    expanded_y = np.array([])
    expanded_X = None
    for i, x in enumerate(X):
        slices = slice_up_words(x, window_size)
        if expanded_X is None:
            expanded_X = np.array(slices)
        else:
            expanded_X = np.append(expanded_X, np.array(slices), axis=0)
        expanded_y = np.append(expanded_y, np.array([y[i]] * len(slices)))

    return vocab, expanded_X, expanded_y, author_to_id

In [37]:
num_words_per_x = 15
vocab, X_train, y_train, author_to_id = load_train_data(num_words_per_x)
N,sl = X_train.shape
num_classes = len(np.unique(y_train))
print vocab.size
print author_to_id

8626
{'james_madison': 0, 'john_jay': 2, 'alexander_hamilton': 1}


In [41]:
def load_eval_data(window_size, vocab):
    eval_data_dir = "unknown_data"
    eval_X = {}
    eval_y = {}
    for file_name in listdir(eval_data_dir):
        full_path = "%s/%s" % (eval_data_dir, file_name)
        with open(full_path, "r") as f:
            current = vocab.words_to_ids(canonicalize_words(f.read()))

        slices = slice_up_words(current, window_size)
        expanded_X = np.array(slices)
        id = file_name.split("_")[2].split(".")[0]
        eval_X[id] = expanded_X
        # working with the assumption that James Madison wrote all the disputed papers
        eval_y[id] = np.array([author_to_id['james_madison']] * len(slices))

    return eval_X, eval_y

eval_X, eval_y = load_eval_data(num_words_per_x, vocab)
print eval_y

{'20': array([0, 0, 0, ..., 0, 0, 0]), '58': array([0, 0, 0, ..., 0, 0, 0]), '49': array([0, 0, 0, ..., 0, 0, 0]), '55': array([0, 0, 0, ..., 0, 0, 0]), '54': array([0, 0, 0, ..., 0, 0, 0]), '57': array([0, 0, 0, ..., 0, 0, 0]), '56': array([0, 0, 0, ..., 0, 0, 0]), '51': array([0, 0, 0, ..., 0, 0, 0]), '50': array([0, 0, 0, ..., 0, 0, 0]), '53': array([0, 0, 0, ..., 0, 0, 0]), '52': array([0, 0, 0, ..., 0, 0, 0]), '19': array([0, 0, 0, ..., 0, 0, 0]), '62': array([0, 0, 0, ..., 0, 0, 0]), '63': array([0, 0, 0, ..., 0, 0, 0]), '18': array([0, 0, 0, ..., 0, 0, 0])}


In [None]:
#Set these directories
#direc = './data'
#summaries_dir = '.'

"""Load the data"""
#ratio = np.array([0.8,0.9]) #Ratios where to split the training and validation set
#X_train,X_val,X_test,y_train,y_val,y_test = load_data(direc,ratio,dataset='ChlorineConcentration')
#N,sl = X_train.shape
#num_classes = len(np.unique(y_train))

In [53]:
"""Hyperparamaters"""
num_epochs = 1
batch_size = 50
dropout = 0.8
config = {    'num_layers' :    3,               #number of layers of stacked RNN's
              'hidden_size' :   120,             #memory cells in a layer
              'max_grad_norm' : 5,             #maximum gradient norm during training
              'batch_size' :    batch_size,
              'learning_rate' : .005,
              'sl':             sl,
              'num_classes':    num_classes}

max_iterations = int(np.floor((num_epochs * N) / batch_size) + 1)


In [6]:
epochs = np.floor(batch_size*max_iterations / N)
print('Train %.0f samples in approximately %d epochs in %d iterations' %(N,epochs, max_iterations))

Train 176004 samples in approximately 0 epochs in 353 iterations


In [7]:
#Instantiate a model
model = Model(config)

Finished computation graph


In [54]:
"""Session time"""
sess = tf.Session() #Depending on your use, do not forget to close the session
#writer = tf.summary.FileWriter(summaries_dir, sess.graph)  #writer for Tensorboard
sess.run(model.init_op)

cost_train_ma = -np.log(1/float(num_classes)+1e-9)  #Moving average training cost
acc_train_ma = 0.0
try:
    for i in range(max_iterations):
        X_batch, y_batch = sample_batch(X_train,y_train,batch_size)
        #Next line does the actual training
        cost_train, acc_train,_ = sess.run([model.cost,model.accuracy, model.train_op],feed_dict = {model.input: X_batch,model.labels: y_batch,model.keep_prob:dropout})
        if i%100 == 1:
            print i
            print cost_train
            print acc_train
        cost_train_ma = cost_train_ma*0.99 + cost_train*0.01
        acc_train_ma = acc_train_ma*0.99 + acc_train*0.01


except KeyboardInterrupt:
  pass
  
epoch = float(i)*batch_size/N
#print('Trained %.1f epochs, accuracy is %5.3f and cost is %5.3f'%(epoch,acc_val,cost_val))
print('Trained %.1f epochs'%(epoch))

#now run in your terminal:
# $ tensorboard --logdir = <summaries_dir>
# Replace <summaries_dir> with your own dir


1
1.32169
0.62
101
0.7466
0.74
201
0.782717
0.66
301
0.65979
0.8
401
0.602412
0.78
501
0.887314
0.6
601
0.689751
0.74
701
0.696101
0.78
801
0.882722
0.62
901
0.765375
0.68
1001
0.77599
0.66
1101
0.99102
0.56
1201
0.775643
0.66
1301
0.710101
0.76
1401
0.72777
0.66
1501
0.608596
0.78
1601
0.924865
0.62
1701
0.833966
0.66
1801
0.612032
0.76
1901
0.698124
0.74
2001
0.68245
0.76
2101
0.724238
0.7
2201
0.895442
0.58
2301
0.634508
0.76
2401
0.78423
0.68
2501
0.634059
0.76
2601
0.923063
0.6
2701
0.796587
0.64
2801
0.866879
0.66
2901
0.603574
0.82
3001
1.02338
0.62
3101
0.768898
0.66
3201
0.792293
0.66
3301
0.900301
0.58
3401
0.607615
0.78
3501
0.628267
0.78
Trained 1.0 epochs


In [71]:
#Evaluate validation performance
# This is classifying batchs from disputed paper #20

total_acc = 0
for key, value in eval_X.iteritems():
    print "Evaluating federalist paper #%s" % key
    for i in range(1000):
        X_batch, y_batch = sample_batch(value,eval_y[key],batch_size)
        predictions, acc_val = sess.run([model.predictions, model.accuracy],feed_dict = {model.input: X_batch, model.labels: y_batch, model.keep_prob:1.0})
    #    print "Predictions:", predictions
    #    print "Suspected Actual: ", y_batch
    #    print "accuracy: %5.3f" % acc_val
        total_acc += acc_val
        if i != 0 and i % 100 == 0:
            print "Accuracy at iteration %d: %.2f" % (i, total_acc/i)
print total_acc


Evaluating federalist paper #20
Accuracy at iteration 100: 0.00
Accuracy at iteration 200: 0.00
Accuracy at iteration 300: 0.00
Accuracy at iteration 400: 0.00
Accuracy at iteration 500: 0.00
Accuracy at iteration 600: 0.00
Accuracy at iteration 700: 0.00
Accuracy at iteration 800: 0.00
Accuracy at iteration 900: 0.00
Evaluating federalist paper #58
Accuracy at iteration 100: 0.00
Accuracy at iteration 200: 0.00
Accuracy at iteration 300: 0.00
Accuracy at iteration 400: 0.00
Accuracy at iteration 500: 0.00
Accuracy at iteration 600: 0.00
Accuracy at iteration 700: 0.00
Accuracy at iteration 800: 0.00
Accuracy at iteration 900: 0.00
Evaluating federalist paper #49
Accuracy at iteration 100: 0.00
Accuracy at iteration 200: 0.00
Accuracy at iteration 300: 0.00
Accuracy at iteration 400: 0.00
Accuracy at iteration 500: 0.00
Accuracy at iteration 600: 0.00
Accuracy at iteration 700: 0.00
Accuracy at iteration 800: 0.00
Accuracy at iteration 900: 0.00
Evaluating federalist paper #55
Accuracy