In [1]:
import numpy as np
from sklearn import metrics
import pandas

import tensorflow as tf
from tensorflow.contrib import learn

# Document length handed to the text processors (ByteProcessor below).
# NOTE: the word-embedding section further down rebinds this name to 30.
MAX_DOCUMENT_LENGTH = 100
# Number of units given to the character-level RNN cell.
HIDDEN_SIZE = 20

In [2]:
# For debugging: uncomment to show the dbpedia_data directory path (under TF_EXP_BASE_DIR, if set)
# import os
# data_dir = os.path.join(os.getenv('TF_EXP_BASE_DIR', ''), 'dbpedia_data')
# data_dir

In [57]:
%%time

# If this doesn't run you can download it from here:
#https://drive.google.com/drive/folders/0Bz8a_Dbh9Qhbfll6bVpmNUtUcFdjYmF2SEpmZUZUcVNiMUw1TWN6RDV3a0JHT3kxLVhVR2M

# Prepare training and testing data
# The 'small' variant of the dataset is requested to keep the run short.
dbpedia = learn.datasets.load_dataset('dbpedia', size='small')

# For each split: column 1 of the data frame becomes the documents and the
# targets become a Series of labels.
x_train, y_train = (pandas.DataFrame(dbpedia.train.data)[1],
                    pandas.Series(dbpedia.train.target))
x_test, y_test = (pandas.DataFrame(dbpedia.test.data)[1],
                  pandas.Series(dbpedia.test.target))

# Number of training examples.
print(len(y_train))


560
CPU times: user 6.88 s, sys: 141 ms, total: 7.02 s
Wall time: 7.16 s


In [62]:
# Inspect a single test example: its document text, then its integer label.
idx = 25
print(x_test[idx])
print(y_test[idx])

 The British Rail Class 421 (or 4Cig) electrical multiple units were built at BR York Works between 1964 and 1972. Units were built in two batches and were initially introduced on services on the Brighton Main Line. Later units were introduced on services to Portsmouth. These units replaced older Southern Railway-designed units such as the 5Bel Brighton Belle units and 4Cor units.
6


In [53]:
# Show the first lines of the class-name file that ships with the dataset.
# NOTE(review): labels look like 1-based line numbers in this file
# (label 6 above vs. the 6th name listed) — confirm.
!head dbpedia_data/dbpedia_csv/classes.txt

Company
EducationalInstitution
Artist
Athlete
OfficeHolder
MeanOfTransportation
Building
NaturalPlace
Village
Animal


# Character Embeddings


In [4]:
%%time
# Process character vocabulary
# Each document is turned into a fixed-length row of byte values.
# NOTE: x_train / x_test are overwritten with the encoded arrays, so this cell
# must not be re-run without first re-running the data-loading cell.
char_processor = learn.preprocessing.ByteProcessor(MAX_DOCUMENT_LENGTH)
x_train, x_test = (np.array(list(char_processor.fit_transform(x_train))),
                   np.array(list(char_processor.transform(x_test))))

CPU times: user 62.9 ms, sys: 8.92 ms, total: 71.8 ms
Wall time: 83.6 ms


In [5]:
# Show the byte-encoded test matrix and its (documents, MAX_DOCUMENT_LENGTH) shape.
print(x_test)
print(x_test.shape)

[[ 32  84  89 ..., 105 114 105]
 [ 32  80  97 ..., 110  32  69]
 [ 32  78  73 ..., 102  32  65]
 ..., 
 [ 32  83 105 ..., 114 116  32]
 [ 32  68  97 ..., 101 100  32]
 [ 32  76  97 ..., 114 110  97]]
(70, 100)


In [6]:
%%time
def char_rnn_model(x, y):
    """Byte-level recurrent classifier, usable as an Estimator model_fn.

    Args:
      x: batch of byte ids, one row per document.
      y: batch of integer class labels.

    Returns:
      A (predictions, loss, train_op) tuple. predictions is a dict holding the
      arg-max class index under 'class' and the logistic-regression output
      under 'prob'.
    """
    # Labels -> one-hot rows of depth 15 (on value 1, off value 0).
    targets = tf.one_hot(y, 15, 1, 0)

    # One-hot every byte over 256 values, then split along axis 1 so the RNN
    # receives a Python list with one tensor per character position.
    char_steps = tf.unpack(learn.ops.one_hot_matrix(x, 256), axis=1)
    print(len(char_steps))

    # Recurrent cell; the commented-out lines are alternatives to experiment with.
    rnn_cell = tf.nn.rnn_cell.GRUCell(HIDDEN_SIZE)
    #rnn_cell = tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE,state_is_tuple=False)
    #rnn_cell = tf.nn.rnn_cell.BasicRNNCell(HIDDEN_SIZE)

    # Unroll over all positions; only the final state is used downstream.
    _, final_encoding = tf.nn.rnn(rnn_cell, char_steps, dtype=tf.float32)
    print(final_encoding)

    # Fully connected logistic-regression layer on top of the final state.
    class_scores, loss = learn.models.logistic_regression(final_encoding, targets)

    # Adam update of the loss, tracked by the global step.
    train_op = tf.contrib.layers.optimize_loss(
        loss,
        tf.contrib.framework.get_global_step(),
        optimizer='Adam',
        learning_rate=0.01)

    predictions = {'class': tf.argmax(class_scores, 1), 'prob': class_scores}
    return predictions, loss, train_op


CPU times: user 11 µs, sys: 9 µs, total: 20 µs
Wall time: 39.8 µs


In [10]:
%%time
# Build model
# Wrap char_rnn_model in a tf.contrib.learn Estimator.
classifier = learn.Estimator(model_fn=char_rnn_model)

# Train and predict
# Fit for 200 steps, then collect the 'class' entry of every prediction dict.
classifier.fit(x_train, y_train, steps=200)
y_predicted = [p['class'] for p in classifier.predict(x_test, as_iterable=True)]
# Accuracy of the predicted classes against the test labels.
score = metrics.accuracy_score(y_test, y_predicted)
print('Accuracy: {0:f}'.format(score))



100
Tensor("RNN/GRUCell_99/add:0", shape=(?, 20), dtype=float32)
100
Tensor("RNN/GRUCell_99/add:0", shape=(?, 20), dtype=float32)
Accuracy: 0.285714
CPU times: user 6min 7s, sys: 1min 2s, total: 7min 9s
Wall time: 3min 28s


# Word Embeddings
Example adjusted from:
- https://github.com/tensorflow/tensorflow/blob/r0.11/tensorflow/examples/skflow/text_classification.py

In [22]:
# What about using word embeddings rather than char embeddings?
# NOTE: this rebinds MAX_DOCUMENT_LENGTH (100 in the first cell) to 30, so any
# cell executed afterwards sees the word-level value.
MAX_DOCUMENT_LENGTH, EMBEDDING_SIZE = 30, 100

# Prepare training and testing data
# The raw text is reloaded because the character section replaced
# x_train / x_test with byte arrays.
dbpedia = learn.datasets.load_dataset('dbpedia', size='small')

x_train, y_train = (pandas.DataFrame(dbpedia.train.data)[1],
                    pandas.Series(dbpedia.train.target))
x_test, y_test = (pandas.DataFrame(dbpedia.test.data)[1],
                  pandas.Series(dbpedia.test.target))

# Process vocabulary
# The processor is fitted on the training documents and reused for the test set.
vocab_processor = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
x_train, x_test = (np.array(list(vocab_processor.fit_transform(x_train))),
                   np.array(list(vocab_processor.transform(x_test))))
n_words = len(vocab_processor.vocabulary_)
print('Total words: %d' % n_words)

Total words: 7552


In [44]:
def word_rnn_model(x, y):
    """Word-level recurrent classifier, usable as an Estimator model_fn.

    Args:
      x: batch of word indexes, one row per document.
      y: batch of integer class labels.

    Returns:
      A (predictions, loss, train_op) tuple. predictions is a dict holding the
      arg-max class index under 'class' and the logistic-regression output
      under 'prob'.
    """
    # Embedding lookup: an [n_words, EMBEDDING_SIZE] matrix maps the word
    # indexes to [batch_size, sequence_length, EMBEDDING_SIZE].
    embedded_words = learn.ops.categorical_variable(
        x, n_classes=n_words, embedding_size=EMBEDDING_SIZE, name='words')

    # Drop the document-length dimension: a list of tensors
    # [batch_size, EMBEDDING_SIZE], one per word position.
    word_steps = tf.unpack(embedded_words, axis=1)

    # The active cell is an LSTM with EMBEDDING_SIZE units; the commented-out
    # lines are alternatives to experiment with.
    #rnn_cell = tf.nn.rnn_cell.GRUCell(EMBEDDING_SIZE)
    #rnn_cell = tf.nn.rnn_cell.BasicLSTMCell(EMBEDDING_SIZE,state_is_tuple=False)
    rnn_cell = tf.nn.rnn_cell.LSTMCell(EMBEDDING_SIZE, state_is_tuple=False)
    #rnn_cell = tf.nn.rnn_cell.BasicRNNCell(HIDDEN_SIZE)

    # Unroll the network over every entry of word_steps; only the final state
    # is kept.
    # NOTE(review): with state_is_tuple=False the state is presumably the
    # concatenated LSTM cell/hidden state rather than the hidden state alone —
    # confirm against the TensorFlow docs.
    _, final_encoding = tf.nn.rnn(rnn_cell, word_steps, dtype=tf.float32)

    # Final state -> logistic regression over one-hot targets
    # (depth 15, on value 1, off value 0).
    one_hot_target = tf.one_hot(y, 15, 1, 0)
    class_scores, loss = learn.models.logistic_regression(
        final_encoding, one_hot_target)

    # Create a training op.
    train_op = tf.contrib.layers.optimize_loss(
        loss,
        tf.contrib.framework.get_global_step(),
        optimizer='Adam',
        learning_rate=0.01)

    predictions = {'class': tf.argmax(class_scores, 1), 'prob': class_scores}
    return predictions, loss, train_op

In [45]:
%%time
# Build model
# Wrap word_rnn_model in a tf.contrib.learn Estimator.
classifier = learn.Estimator(model_fn=word_rnn_model)

# Train and predict
# Fit for 200 steps, then collect the 'class' entry of every prediction dict.
classifier.fit(x_train, y_train, steps=200)
y_predicted = [p['class'] for p in classifier.predict(x_test, as_iterable=True)]
# Accuracy of the predicted classes against the test labels.
score = metrics.accuracy_score(y_test, y_predicted)
print('Accuracy: {0:f}'.format(score))



Accuracy: 0.600000
CPU times: user 3min 46s, sys: 32.2 s, total: 4min 19s
Wall time: 1min 51s


In [48]:
%%time
# shorter syntax for basic cell configurations
# this example hides much of the configurability
# and hides the process of cell creation, so...
# I wouldn't start with this one

def input_op_fn(x):
    """Turn a batch of word indexes into a per-position list of embeddings.

    Args:
      x: batch of word indexes, one row per document.

    Returns:
      A list of tensors [batch_size, EMBEDDING_SIZE], one per word position.
    """
    # Embedding lookup: an [n_words, EMBEDDING_SIZE] matrix maps the word
    # indexes to [batch_size, sequence_length, EMBEDDING_SIZE].
    embedded_words = learn.ops.categorical_variable(
        x, n_classes=n_words, embedding_size=EMBEDDING_SIZE, name='words')
    # Unpacking along axis 1 removes the document-length dimension.
    return tf.unpack(embedded_words, axis=1)

# Single-layer, unidirectional GRU classifier; input_op_fn supplies the
# embedded word sequence.
classifier = learn.TensorFlowRNNClassifier(
    rnn_size=EMBEDDING_SIZE,
    n_classes=15,
    cell_type='gru',
    input_op_fn=input_op_fn,
    num_layers=1,
    bidirectional=False,
    sequence_length=None,
    optimizer='Adam',
    learning_rate=0.01,
    continue_training=True)

# Train and predict
# Here predict() output is passed straight to the accuracy metric.
classifier.fit(x_train, y_train, steps=200)
y_predicted = classifier.predict(x_test)
score = metrics.accuracy_score(y_test, y_predicted)
print('Accuracy: {0:f}'.format(score))



Accuracy: 0.814286
CPU times: user 52.7 s, sys: 7.11 s, total: 59.8 s
Wall time: 45.6 s
