In [63]:
import numpy as np
import codecs
#import glob
from sklearn.model_selection import train_test_split
import pandas as pd
from nltk import word_tokenize
import h5py
import os
from keras.preprocessing import sequence
from keras.layers import Embedding, Input, Dense, LSTM, TimeDistributed
from keras.models import Model

In [10]:
data = codecs.open("data/kaggle_trainset.txt", 'r', 'utf-8').read().split('\n')
data = data[:20800]
data = [s.lower() for s in data]

labels = codecs.open("data/kaggle_train_labels.txt", 'r', 'utf-8').read().split('\n')
labels = labels[:20800]
labels = np.array([int(i) for i in labels])

train, dev, train_lab, dev_lab = train_test_split(data, labels, test_size=0.33, random_state=42)

In [56]:
# using the mean length of documents as max_doc_length for now
max_doc_length = int(np.round(np.mean([len(paragraph) for paragraph in train])))
print(max_doc_length)


4514


In [57]:
num_time_steps = max_doc_length

In [27]:
# padding example
mock_sequences = np.array([[[0], [2], [4]],[[1], [3], [5], [7], [9]]])

seq = sequence.pad_sequences(mock_sequences, maxlen=4, dtype='int32', padding='pre', truncating='pre', value=0.0)
seq

array([[[0],
        [0],
        [2],
        [4]],

       [[3],
        [5],
        [7],
        [9]]], dtype=int32)

In [40]:
word_to_idx = {}
for i in train+dev: 
    # print(i)
    sent = word_tokenize(i.lower())
    for word in sent:
        if word not in word_to_idx:
            word_to_idx[word] = len(word_to_idx)+1 # making the first id is 1, so that I can pad with zeroes.

#print(word_to_idx)

# Do I need an unknown token when just doing word-to-index (without any counts and such)?
# Yeah I think so...
# maybe use keras tokenizer and texts_to_sequences function, which skips unknown words. 
# however, an unknown token may be prefered?

vocab_size = len(word_to_idx)
print(vocab_size)

idx_to_word = {v: k for k, v in word_to_idx.items()}

232263


In [None]:
# save Dict file containing the mapping from word ID to word (e.g. train.dict)
f = open("dict.txt","w+")
f.write( str(idx_to_word) )
f.close()
# use tool from lstmvis to transform txt to .Dict file.

In [42]:
trainTextsSeq_list = []
for input_sequence in train:
    inputs = [word_to_idx[w] for w in word_tokenize(input_sequence.lower())]
    trainTextsSeq_list.append(inputs)
trainTextsSeq = np.array(trainTextsSeq_list)

In [46]:
# padding with max doc lentgh (mean length at the moment)
seq = sequence.pad_sequences(trainTextsSeq, maxlen=max_doc_length, dtype='int32', padding='post', truncating='post', value=0.0)


In [48]:
trainTextsSeq_flatten = np.array(seq).flatten()
hf = h5py.File("train.hdf5", "w")
hf.create_dataset('words', data=trainTextsSeq_flatten)
hf.close()

In [58]:
# Reshape y_train: 
y_train_tiled = np.tile(train_lab, (num_time_steps,1))
y_train_tiled = y_train_tiled.reshape(len(train_lab), num_time_steps , 1)

In [67]:
y_train_tiled.shape

(13936, 4514, 1)

In [71]:
# num_cells: number of LSTM cells
num_cells = 100 # 100 for now, probably test best parameter through cross-validation
num_samples = len(train_lab)
embedding_size = 100 # also just for now..
num_epochs = 100
num_batch = 32 # also find optimal through cross-validation

# max_doc_length vectors of size embedding_size
myInput = Input(shape=(max_doc_length,), name='input') 
x = Embedding(output_dim=embedding_size, input_dim=vocab_size, input_length=max_doc_length)(myInput)
lstm_out = LSTM(num_cells, return_sequences=True)(x)
predictions = TimeDistributed(Dense(2, activation='softmax'))(lstm_out)
model = Model(inputs=myInput, outputs=predictions)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

From tutorial:
Next, we need to reshape the document class labels. In many other usages such as POS tagging, the model emits a value for each input token, while in document classification, the model typically emits a value for each document. To make the class labels consistent with such model, we need to have a vector for each time step. We can reshape the original vector of class labels by repetition. That is, for instance in binary classification, if the sample is positive, all underlying words are labeled as 1, otherwise 0. Towards this goal, we can use tile function as follows:

In [80]:
print(seq.shape)
print(y_train_tiled.shape)
model.fit({'input': seq}, y_train_tiled, epochs=20, verbose=1)


(13936, 4514)
(13936, 4514, 1)
Epoch 1/20


KeyboardInterrupt: 

In [None]:
model.layers.pop();
model.summary()
# Save the states via predict
inp = model.input
out = model.layers[-1].output
model_RetreiveStates = Model(inp, out)
states_model = model_RetreiveStates.predict(trainTextsSeq, batch_size=num_batch)
print(states_model.shape)

In [None]:
# Flatten first and second dimension for LSTMVis
states_model_flatten = states_model.reshape(num_samples * num_time_steps, num_cells)

hf = h5py.File("states.hdf5", "w")
hf.create_dataset('states1', data=states_model_flatten)
hf.close()