 ### Table of Contents Generation
 This notebook explores automatically generating a table of contents from text.
 The toy dataset used to play with this problem was pulled from:
 http://jmcauley.ucsd.edu/data/amazon/
 
 This dataset contains product reviews (this notebook uses the Musical Instruments subset) and the titles of those reviews. The input to the model will be the block of review text and the output will be the short title of the review. This should give us a decent idea of what is possible for table-of-contents generation. The current plan for the architecture is to name any section of text based on the locations of certain words in that text. So the actual input will be one-hot encoded text tokens, and the output will be the locations of the words that should be used for the ToC entry.

In [1]:
import json
import numpy as np
from keras.preprocessing.text import text_to_word_sequence
from collections import Counter
from time import time
    
# Number of entries in the vocabulary, counting the slot that
# train_word_index reserves for the "UNK" (unknown word) token.
vocab_size = 200
# Reviews with more tokens than this are dropped by filter_text; it is also
# the length of the model's input sequence and of its output vector.
max_sequence_len = 200
# Review titles with more tokens than this are dropped by filter_text.
max_title_len = 5

Using TensorFlow backend.


In [2]:
# Load the review dump. The file is parsed as one JSON object per line.
# Reading inside a `with` block closes the file handle as soon as the
# contents are in memory instead of leaving it open until garbage collection.
with open("../Downloads/reviews_Musical_Instruments_5.json") as review_file:
    data_string = review_file.read()
raw_data = data_string.split("\n")
# Skip empty lines (such as the one produced by a trailing newline).
data = [json.loads(d) for d in raw_data if d]

In [3]:
def create_long_string(obj):
    """Concatenate all review titles and bodies into one corpus string.

    Args:
        obj: iterable of review dicts, each with a 'summary' and a
            'reviewText' entry.

    Returns:
        A single string in which every summary and every review text is
        followed by one space, in the order the reviews were given.
    """
    pieces = []
    for review in obj:
        pieces.append(review['summary'] + " ")
        pieces.append(review['reviewText'] + " ")
    return "".join(pieces)

In [4]:
def train_word_index(text, num_words):
    """Build the vocabulary lookup tables from the most frequent words.

    Args:
        text: corpus string to tokenise (lower-cased, punctuation stripped).
        num_words: maximum number of distinct words to keep, not counting
            the unknown-word entry.

    Returns:
        A pair (number_to_word, word_to_number). Index 0 is reserved for
        the "UNK" token; the kept words are numbered from 1 in order of
        decreasing frequency.
    """
    sequence = text_to_word_sequence(text, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',lower=True,split=" ")
    word_counts = Counter(sequence)
    top_words = word_counts.most_common(num_words)
    # Real words start at 1 so that 0 stays free for unknown words.
    word_index = dict((rank + 1, str(word)) for rank, (word, _count) in enumerate(top_words))
    word_index[0] = "UNK"
    # .items() exists on both Python 2 and Python 3, whereas .iteritems()
    # is Python-2-only.
    return word_index, dict((word, number) for number, word in word_index.items())

In [5]:
def create_one_hot_tensor(sequence,word_number_index, max_length):
    """One-hot encode a token sequence into a zero-padded tensor.

    Args:
        sequence: list of word tokens.
        word_number_index: dict mapping word -> vocabulary index; words
            missing from it are encoded as index 0 (the UNK word).
        max_length: number of sequence positions in the output; positions
            past the end of `sequence` stay all-zero.

    Returns:
        Array of shape (1, max_length, len(word_number_index)) with a single
        1 per encoded token.

    Raises:
        IndexError: if `sequence` has more than `max_length` tokens.
    """
    # Unknown words fall back to 0, the UNK index.
    number_sequence = [word_number_index.get(word, 0) for word in sequence]
    # Force an integer dtype: an empty list would otherwise become a float64
    # array, which NumPy rejects as an index.
    a = np.array(number_sequence, dtype=int)
    b = np.zeros((1,max_length, len(word_number_index)))
    b[0,np.arange(len(number_sequence)), a] = 1
    return b

In [6]:
def one_hot_tensor_to_words(tensor, number_word_index):
    """Decode one-hot rows back into a space-separated string of words.

    Args:
        tensor: sequence of one-hot rows, one row per position.
        number_word_index: dict mapping vocabulary index -> word.

    Returns:
        The decoded words, each followed by a space. Rows whose sum is zero
        (padding) are skipped.
    """
    decoded = []
    for row in tensor:
        if not np.sum(row):
            continue
        # The first non-zero entry of the row is the word's vocabulary index.
        hot_index = np.argwhere(row)[0][0]
        decoded.append(number_word_index[hot_index] + " ")
    return "".join(decoded)

In [7]:
def filter_text(text, title, max_sequence_len, max_title_len):
    """Tokenise a review and its title, keeping the pair only if usable.

    A pair is kept when the review has between 1 and `max_sequence_len`
    tokens, the title has between 1 and `max_title_len` tokens, and every
    title token also occurs somewhere in the review.

    Returns:
        [review_tokens, title_tokens] when the pair is kept, otherwise [].
    """
    punctuation = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    sequence = text_to_word_sequence(text, filters=punctuation, lower=True, split=" ")
    title_seq = text_to_word_sequence(title, filters=punctuation, lower=True, split=" ")

    review_len_ok = 1 <= len(sequence) <= max_sequence_len
    title_len_ok = 1 <= len(title_seq) <= max_title_len
    if not (review_len_ok and title_len_ok):
        return []

    # Every title word must appear in the review so that it can be pointed
    # at by position.
    if all(word in sequence for word in title_seq):
        return [sequence, title_seq]
    return []

In [8]:
def create_output_vector(text_seq, title_seq, max_sequence_len):
    """Build the target vector marking where the title words sit in the text.

    For each title word, the position of its first occurrence in `text_seq`
    is set to a score in (0.9, 1.0]. The first title word scores 1.0 and each
    following word scores 0.1/len(title_seq) less, so the order of the title
    words can be recovered by sorting the scores.

    Args:
        text_seq: list of review tokens.
        title_seq: list of title tokens, each of which must occur in
            `text_seq`.
        max_sequence_len: length of the output vector.

    Returns:
        Array of shape (1, max_sequence_len). An empty title yields an
        all-zero vector.

    Raises:
        ValueError: if a title token does not occur in `text_seq`.
    """
    output = np.zeros((1,max_sequence_len))
    num_words = len(title_seq)
    if num_words == 0:
        # No title words to mark; also avoids dividing by zero below.
        return output
    order_change_value = 0.1/num_words
    for position, word in enumerate(title_seq):
        output[0,text_seq.index(word)] = 1.0 - position*order_change_value
    return output

In [29]:
def create_title(output_vector, input_sequence):
    """Turn a vector of per-position scores into a title string.

    Positions are taken in order of decreasing score for as long as the
    score exceeds 0.1, and the word at each position is appended to the
    title.

    Args:
        output_vector: 2-D array-like whose first row holds one score per
            sequence position. It is not modified.
        input_sequence: list of tokens the scores refer to.

    Returns:
        The selected words, each followed by a space ("" if none qualify).
    """
    # The score vector can be longer than the text (padded positions). Only
    # positions that hold a real token may be picked, otherwise indexing
    # input_sequence would raise IndexError.
    scores = np.array(output_vector[0])[:len(input_sequence)]
    title = ""
    while scores.size and np.amax(scores) > 0.1:
        best = np.argmax(scores)
        title += input_sequence[best] + " "
        # Zero the score so the same position is not selected twice.
        scores[best] = 0
    return title

In [10]:
def shape_data(data, word_number_index,max_sequence_len,max_title_len):
    """Convert raw reviews into model input and target arrays.

    Each review is passed through filter_text; reviews that survive are
    one-hot encoded (input) and paired with the position vector of their
    title words (target).

    Args:
        data: list of review dicts with 'reviewText' and 'summary' entries.
        word_number_index: dict mapping word -> vocabulary index.
        max_sequence_len: maximum review length in tokens; also the sequence
            dimension of both returned arrays.
        max_title_len: maximum title length in tokens.

    Returns:
        (X, y) where X has shape
        (n_kept, max_sequence_len, len(word_number_index)) and y has shape
        (n_kept, max_sequence_len). If no review passes the filter, both
        arrays have n_kept == 0.
    """
    X = []
    y = []
    for example in data:
        sequence = filter_text(example['reviewText'], example['summary'], max_sequence_len, max_title_len)
        if sequence:
            text_seq, title_seq = sequence
            X.append(create_one_hot_tensor(text_seq, word_number_index, max_sequence_len))
            y.append(create_output_vector(text_seq, title_seq, max_sequence_len))
    if not X:
        # np.concatenate raises ValueError on an empty list, so return
        # correctly shaped empty arrays instead.
        return (np.zeros((0, max_sequence_len, len(word_number_index))),
                np.zeros((0, max_sequence_len)))
    return np.concatenate(X, axis=0), np.concatenate(y, axis=0)

In [11]:
# Fit the vocabulary on the first 1000 reviews only. vocab_size-1 words are
# requested because train_word_index adds an "UNK" entry at index 0.
text = create_long_string(data[:1000])
number_word_index, word_number_index = train_word_index(text,vocab_size-1)
# Time the conversion of all reviews into model inputs (X) and targets (y).
start = time()
X, y = shape_data(data, word_number_index,max_sequence_len,max_title_len)
print "This took", time()-start, "seconds"

This took 1.43782091141 seconds


In [12]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K

# Training hyper-parameters.
batch_size = 2
epochs = 3

# Split the data into train and test sets: the first 80% of the examples
# are used for training and the remaining 20% for testing (no shuffling).
x_train, y_train, x_test, y_test = X[:int(len(X)*0.8)],y[:int(len(y)*0.8)],X[int(len(X)*0.8):],y[int(len(y)*0.8):]

# Add a trailing axis of size 1 so each example is a
# (max_sequence_len, vocab_size, 1) array -- presumably the channel axis
# that Conv2D expects; confirm against the Keras image data format.
x_train = x_train.reshape(x_train.shape[0], max_sequence_len, vocab_size, 1)
x_test = x_test.reshape(x_test.shape[0], max_sequence_len, vocab_size, 1)
input_shape = (max_sequence_len, vocab_size, 1)

print(input_shape)
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

# Two convolution layers followed by pooling, then a dense layer; dropout
# is applied after the pooling and after the dense layer.
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3),
                 activation='relu',
                 input_shape=input_shape))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
# One sigmoid output per sequence position, matching the
# (n, max_sequence_len) target vectors built by create_output_vector.
model.add(Dense(max_sequence_len, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])

(200, 200, 1)
('x_train shape:', (1083, 200, 200, 1))
(1083, 'train samples')
(271, 'test samples')


In [13]:
# Train the model. Note that the held-out split (x_test, y_test) is used
# both as validation data here and for the final evaluation below.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))
# score[0] is the loss and score[1] the accuracy metric requested when the
# model was compiled.
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

Train on 1083 samples, validate on 271 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
('Test loss:', 0.051420163321978934)
('Test accuracy:', 0.98767528190823939)


In [14]:
# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")

Saved model to disk


In [25]:
def generate_text_title(text,model,max_sequence_len,word_number_index,vocab_size,verbose=True):
    """Generate a title for `text` with the trained model.

    Args:
        text: raw input text.
        model: trained model exposing predict().
        max_sequence_len: maximum number of tokens the model accepts.
        word_number_index: dict mapping word -> vocabulary index.
        vocab_size: size of the one-hot dimension the model was built with.
        verbose: when True (the default), print the raw model scores, the
            input text and the generated title.

    Returns:
        The generated title string, or an "ERROR: ..." message string when
        the text has more than `max_sequence_len` tokens.
    """
    sequence = text_to_word_sequence(text, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',lower=True,split=" ")
    if len(sequence) > max_sequence_len:
        return "ERROR: The input text is too long for this model"

    X = create_one_hot_tensor(sequence,word_number_index, max_sequence_len)
    # Add the trailing axis of size 1 that the model's input shape uses.
    X = X.reshape(X.shape[0], max_sequence_len, vocab_size, 1)
    y = model.predict(X)
    # Single-argument print calls produce the same output on Python 2 and
    # Python 3.
    if verbose:
        print(y)
    title = create_title(y, sequence)
    if verbose:
        print("Input Text:")
        print(text + " \n\n")
        print("The model produced the title: " + title)
    # Return the title so callers can use it rather than only reading the
    # printed output.
    return title

In [33]:
text = "Hans Rosling unveils data visuals that untangle the complex risk factors of one of the world's deadliest (and most misunderstood) diseases: HIV. By following the data, he suggests a surprising key to ending the epidemic."
generate_text_title(text,model,max_sequence_len,word_number_index,vocab_size)

[[ 0.14211692  0.17014553  0.12502606  0.14736106  0.1954691   0.0853577
   0.05704629  0.0351056   0.0380179   0.0285765   0.02389583  0.02320454
   0.01747099  0.01997231  0.01662601  0.01796689  0.01021424  0.01059234
   0.00967774  0.00745477  0.01204208  0.00477566  0.00465875  0.00706493
   0.00467281  0.00472105  0.00409616  0.00523411  0.003194    0.00346611
   0.0040099   0.00335662  0.00268254  0.00556993  0.00208265  0.00272021
   0.0023888   0.00202976  0.0020385   0.00224964  0.00167014  0.00223631
   0.00383977  0.00130097  0.00303147  0.00317695  0.00224009  0.00164138
   0.00134431  0.00158017  0.00152349  0.00128965  0.0016482   0.00102297
   0.00112634  0.0009228   0.00172286  0.00198497  0.001576    0.00119431
   0.00093759  0.00108547  0.00158062  0.00084628  0.00128042  0.00079848
   0.00123767  0.00102759  0.00142596  0.00101647  0.00090521  0.00174489
   0.00109962  0.00148904  0.00157936  0.00117396  0.00096152  0.00087056
   0.00111546  0.00130588  0.00199301  