In [1]:
import numpy as np
import tensorflow as tf  #TF 1.1.0rc1
tf.logging.set_verbosity(tf.logging.ERROR)
import matplotlib.pyplot as plt
from tsc_model import Model,sample_batch

In [2]:
import collections

class Vocabulary(object):
  """Bidirectional word <-> integer-id mapping built from a token stream.

  Id 0 is always the unknown-word token; the remaining ids follow the
  ordering returned by ``collections.Counter.most_common``.

  Attributes set by ``__init__``:
    unigram_counts: ``collections.Counter`` over every token passed in.
    id_to_word:     dict mapping id -> word.
    word_to_id:     dict mapping word -> id.
    size:           number of entries in the vocabulary (including "<unk>").
    wordset:        set of all words in the vocabulary.
    UNK_ID:         id assigned to ``UNK_TOKEN``.
  """

  UNK_TOKEN = "<unk>"

  def __init__(self, tokens, size=None):
    """Build the vocabulary.

    Args:
      tokens: iterable of hashable tokens (typically strings).
      size: optional upper bound on the vocabulary size, counting the
        "<unk>" entry. ``None`` keeps every distinct token.
    """
    self.unigram_counts = collections.Counter(tokens)
    # Rank words by frequency. A literal "<unk>" in the corpus is dropped here
    # so it cannot be listed twice (which would inflate `size` and move UNK_ID).
    ranked = [w for w, _ in self.unigram_counts.most_common()
              if w != self.UNK_TOKEN]
    if size is not None:
      # leave space for "<unk>"
      ranked = ranked[:max(size - 1, 0)]
    vocab = [self.UNK_TOKEN] + ranked

    # Assign an id to each word, by frequency
    self.id_to_word = dict(enumerate(vocab))
    # .items() instead of .iteritems(): works on both Python 2 and Python 3.
    self.word_to_id = {word: idx for idx, word in self.id_to_word.items()}
    self.size = len(self.id_to_word)
    if size is not None:
      assert self.size <= size

    # For convenience
    self.wordset = set(self.word_to_id)

    # Store special IDs
    self.UNK_ID = self.word_to_id[self.UNK_TOKEN]

  def words_to_ids(self, words):
    """Map each word to its id; words not in the vocabulary map to UNK_ID."""
    return [self.word_to_id.get(w, self.UNK_ID) for w in words]

  def ids_to_words(self, ids):
    """Map each id back to its word. Raises KeyError for an unknown id."""
    return [self.id_to_word[i] for i in ids]

  def ordered_words(self):
    """Return a list of words, ordered by id."""
    return self.ids_to_words(range(self.size))


In [3]:
def slice_up_words(words, window_size=10):
    """Return every full-length sliding window over ``words``.

    Windows start at consecutive positions (stride 1). Collection stops at
    the first start position whose window would be shorter than
    ``window_size``, so a sequence shorter than the window yields ``[]``.
    """
    windows = []
    start = 0
    while start < len(words):
        window = words[start:start + window_size]
        if len(window) != window_size:
            # Ran off the end of the sequence: no later start can fit either.
            break
        windows.append(window)
        start += 1
    return windows
        

# Sanity check: an 11-token sentence with window_size=3 yields 9 overlapping windows.
slice_up_words(["hello", "I", "am", "mr", ".", "anderson", "what", "is", "your", "name", "?"], 3)

[['hello', 'I', 'am'],
 ['I', 'am', 'mr'],
 ['am', 'mr', '.'],
 ['mr', '.', 'anderson'],
 ['.', 'anderson', 'what'],
 ['anderson', 'what', 'is'],
 ['what', 'is', 'your'],
 ['is', 'your', 'name'],
 ['your', 'name', '?']]

In [4]:
from os import listdir
import re

def canonicalize_digits(word):
    """Replace every digit in a purely non-alphabetic token with "DG".

    Tokens containing at least one alphabetic character are returned
    unchanged. Otherwise each digit becomes the two characters "DG", and if
    the result starts with "DG" any commas (thousands separators) are removed.
    """
    if any(c.isalpha() for c in word):
        return word
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    word = re.sub(r"\d", "DG", word)
    if word.startswith("DG"):
        word = word.replace(",", "") # remove thousands separator
    return word

def canonicalize_word(word):
    """Lower-case ``word`` and then pass it through ``canonicalize_digits``."""
    lowered = word.lower()
    canonical = canonicalize_digits(lowered) # try to canonicalize numbers
    return canonical

def load_train_data(window_size, train_data_dir='train_data'):
    """Load the per-author text files and turn them into fixed-length windows.

    Expected layout: ``<train_data_dir>/<author>/<file>``. Every sub-directory
    is one class; its id is its position in the sorted directory listing.

    Each file is split on single spaces, one trailing punctuation mark
    (. , ? ; !) is split off as its own token following the word, and words
    are normalised with ``canonicalize_word``. The token ids of every file
    are then cut into overlapping windows with ``slice_up_words``.

    NOTE(review): splitting on " " only means newlines and tabs stay embedded
    in tokens -- confirm whether ``content.split()`` was intended.

    Args:
      window_size: number of tokens per training example.
      train_data_dir: root directory holding one sub-directory per author.

    Returns:
      (vocab, X, y) where ``vocab`` is a ``Vocabulary`` over all tokens,
      ``X`` is a 2-D array with one row of ``window_size`` token ids per
      window, and ``y`` is a 1-D float64 array holding the author id of each
      row (float64 is the dtype this function has always returned).
    """
    import os.path  # local import: only `listdir` is imported at file level

    trailing_punctuation = (".", ',', '?', ';', '!')

    # os.listdir order is arbitrary; sort so author ids are reproducible, and
    # ignore stray files (they would make the inner listdir call fail).
    authors = sorted(name for name in listdir(train_data_dir)
                     if os.path.isdir(os.path.join(train_data_dir, name)))

    y = []
    X = []
    all_tokens = []
    for author_id, author in enumerate(authors):
        author_path = os.path.join(train_data_dir, author)
        for file_name in sorted(listdir(author_path)):
            full_path = os.path.join(author_path, file_name)
            with open(full_path, "r") as f:
                content = f.read()

            current = []
            for word in content.split(" "):
                if not word:
                    # Consecutive/leading/trailing spaces produce empty
                    # strings; indexing word[-1] on them would raise.
                    continue
                punctuation = None
                if word[-1] in trailing_punctuation:
                    punctuation = word[-1]
                    word = word[:-1]
                if word:
                    current.append(canonicalize_word(word))
                if punctuation is not None:
                    # Emit the mark after its word to keep reading order.
                    current.append(punctuation)

            y.append(author_id)
            X.append(current)
            all_tokens.extend(current)

    vocab = Vocabulary(all_tokens)

    # Accumulate in Python lists and convert once; np.append inside the loop
    # copies the whole array every iteration and breaks on files that are
    # shorter than one window (empty 1-D array appended to a 2-D array).
    windows = []
    labels = []
    for author_id, tokens in zip(y, X):
        slices = slice_up_words(vocab.words_to_ids(tokens), window_size)
        windows.extend(slices)
        labels.extend([author_id] * len(slices))

    if windows:
        expanded_X = np.array(windows)
    else:
        expanded_X = np.empty((0, max(window_size, 0)), dtype=int)
    expanded_y = np.array(labels, dtype=np.float64)

    return vocab, expanded_X, expanded_y

# Number of tokens in each training example (the sliding-window length).
num_words_per_x = 15
vocab, X_train, y_train = load_train_data(window_size=num_words_per_x)
# N = number of training windows, sl = tokens per window.
N, sl = X_train.shape
num_classes = np.unique(y_train).size


In [None]:
# NOTE(review): everything in this cell is commented out and the cell is shown
# unexecuted. `load_data` is not imported anywhere in the visible notebook; the
# training arrays used further down come from load_train_data(...) instead.
#Set these directories
#direc = './data'
#summaries_dir = '.'

"""Load the data"""
#ratio = np.array([0.8,0.9]) #Ratios where to split the training and validation set
#X_train,X_val,X_test,y_train,y_val,y_test = load_data(direc,ratio,dataset='ChlorineConcentration')
#N,sl = X_train.shape
#num_classes = len(np.unique(y_train))

In [5]:
"""Hyperparamaters"""
num_epochs = 2
batch_size = 50
dropout = 0.8  # fed to model.keep_prob in the training loop
config = dict(
    num_layers=3,             # number of layers of stacked RNN's
    hidden_size=120,          # memory cells in a layer
    max_grad_norm=5,          # maximum gradient norm during training
    batch_size=batch_size,
    learning_rate=.005,
    sl=sl,
    num_classes=num_classes,
)

# Enough mini-batches to cover num_epochs passes over the N samples, plus one.
max_iterations = int(np.floor(num_epochs * N / batch_size)) + 1


In [6]:
# Whole passes over the N samples covered by max_iterations mini-batches.
epochs = np.floor(max_iterations * batch_size / N)
print(
    'Train %.0f samples in approximately %d epochs in %d iterations'
    % (N, epochs, max_iterations)
)

Train 176004 samples in approximately 2 epochs in 7041 iterations


In [7]:
#Instantiate a model
# `Model` is imported from tsc_model; `config` is the hyperparameter dict built above.
model = Model(config)

Finished computation graph


In [9]:
"""Session time"""
sess = tf.Session() #Depending on your use, do not forget to close the session
#writer = tf.summary.FileWriter(summaries_dir, sess.graph)  #writer for Tensorboard
sess.run(model.init_op)

# Moving averages of the training metrics. The cost average starts at
# -log(1/num_classes) (presumably the loss of a uniform guess), accuracy at 0.
cost_train_ma = -np.log(1/float(num_classes)+1e-9)  #Moving average training cost
acc_train_ma = 0.0
iterations_done = 0  # training steps that actually completed
try:
    for i in range(max_iterations):
        X_batch, y_batch = sample_batch(X_train, y_train, batch_size)
        #Next line does the actual training
        cost_train, acc_train, _ = sess.run(
            [model.cost, model.accuracy, model.train_op],
            feed_dict={model.input: X_batch,
                       model.labels: y_batch,
                       model.keep_prob: dropout})
        iterations_done += 1
        if i%100 == 1:
            # print(...) with a single argument behaves the same on Python 2
            # and Python 3, matching the other print calls in this notebook.
            print(i)
            print(cost_train)
            print(acc_train)
        cost_train_ma = cost_train_ma*0.99 + cost_train*0.01
        acc_train_ma = acc_train_ma*0.99 + acc_train*0.01

except KeyboardInterrupt:
    # Deliberate: interrupting the kernel stops training early but still
    # falls through to the summary below.
    pass

# Use the completed-step counter rather than the loop index: the index is one
# short after a full run and undefined if interrupted before the first step.
epoch = float(iterations_done)*batch_size/N
#print('Trained %.1f epochs, accuracy is %5.3f and cost is %5.3f'%(epoch,acc_val,cost_val))
print('Trained %.1f epochs'%(epoch))

#now run in your terminal:
# $ tensorboard --logdir=<summaries_dir>
# Replace <summaries_dir> with your own dir


1
0.755934
0.72
101
0.920956
0.6
201
0.668028
0.72
301
0.717797
0.76
401
0.720618
0.74
501
0.622645
0.76
601
0.698615
0.78
701
0.642542
0.76
801
0.809578
0.62
901
0.894874
0.64
1001
0.714788
0.68
1101
0.697955
0.76
1201
0.795292
0.64
1301
0.812237
0.68
1401
0.973783
0.64
1501
0.799212
0.64
1601
0.839658
0.6
1701
0.731584
0.76
1801
0.655206
0.7
1901
0.897379
0.66
2001
0.786713
0.68
2101
0.754485
0.7
2201
0.522647
0.84
2301
0.756294
0.72
2401
0.799331
0.62
2501
0.816572
0.64
2601
0.828382
0.68
2701
0.634791
0.74
2801
0.7755
0.7
2901
0.711901
0.72
3001
0.760971
0.7
3101
0.734863
0.74
3201
0.780146
0.68
3301
0.794957
0.62
3401
0.661646
0.72
3501
0.732163
0.7
3601
0.830321
0.62
3701
0.612894
0.74
3801
0.818073
0.62
3901
0.812762
0.64
4001
0.619059
0.78
4101
0.76134
0.66
4201
0.711861
0.68
4301
0.684261
0.76
4401
0.832739
0.66
4501
0.61486
0.82
4601
0.841107
0.66
4701
0.573759
0.8
4801
0.767972
0.64
4901
0.894589
0.64
5001
0.730092
0.7
5101
0.655538
0.76
5201
0.638532
0.76
5301
0.539794
0.82

In [None]:
    # NOTE(review): leftover fragment. It is indented like a loop body but this
    # cell has no enclosing loop, so it cannot run as-is. It reads X_val / y_val,
    # which are only assigned in a commented-out line of the visible notebook,
    # and `writer`, whose creation is commented out as well.
    if i%100 == 1:
    #Evaluate validation performance
      X_batch, y_batch = sample_batch(X_val,y_val,batch_size)
      cost_val, summ, acc_val = sess.run([model.cost,model.merged,model.accuracy],feed_dict = {model.input: X_batch, model.labels: y_batch, model.keep_prob:1.0})
      print('At %5.0f/%5.0f: COST %5.3f/%5.3f(%5.3f) -- Acc %5.3f/%5.3f(%5.3f)' %(i,max_iterations,cost_train,cost_val,cost_train_ma,acc_train,acc_val,acc_train_ma))
      #Write information to TensorBoard
      writer.add_summary(summ, i)
      writer.flush()