In [2]:
%load_ext autoreload 
%autoreload 2

import numpy as np

from lstm_examples import util

# Seed NumPy's global RNG so the np.random.permutation shuffle below is repeatable.
# NOTE(review): nothing in the visible cells seeds the Keras/TensorFlow backend,
# so weight initialisation and training metrics may still differ between runs.
np.random.seed(1701)

# Loading data

In [4]:
# Load the raw sentences (text) and their labels (y) from the training CSV.
# n_classes is the width of the softmax output layer built further down.
# NOTE(review): assumes the labels are integer ids in 0..n_classes-1, as the
# sparse categorical loss used later requires — confirm against train.csv.
n_classes = 4
text, y = util.read_data('data/train.csv')

In [5]:
# Shuffle sentences. The same permutation is applied to text and labels so
# that text[i] and y[i] keep referring to the same example.
assert len(text) == len(y), "text and labels must have the same length"
shuffle_idx = np.random.permutation(len(text))
text = [text[i] for i in shuffle_idx]
y = y[shuffle_idx]

# Train and test split (just take a thousand for speed)
n_train, n_test = 1000, 500
# Slicing past the end would silently yield smaller (or empty) splits,
# so fail loudly if the dataset cannot supply both of them.
assert len(text) >= n_train + n_test, "not enough samples for the requested split"
text_train = text[:n_train]
text_test = text[n_train : n_train+n_test]
y_train = y[:n_train]
y_test = y[n_train : n_train+n_test]

In [6]:
# Sanity check: show the first shuffled training sentence and its label.
print(text_train[0])
print("\nClass: ", y_train[0])

AFP - How to create sustainable employment for impoverished, mostly illiterate African populations residing primarily in rural areas is the challenge the African Union aims to tackle at a summit opening Wednesday in Burkina Faso.

Class:  0


# Text processing with spaCy

In [7]:
import spacy

# Load spaCy's English pipeline once; it is reused to parse every sentence below.
# NOTE(review): 'en' is a shortcut name — newer spaCy releases expect the full
# package name (e.g. 'en_core_web_sm'); confirm against the installed version.
nlp = spacy.load('en')

In [8]:
# Parse all text.
# nlp.pipe streams the texts through the pipeline in batches, which is faster
# than calling nlp() once per sentence; it yields one Doc per input text, in
# input order, so the resulting lists line up with text_train / text_test.
text_train_parsed = list(nlp.pipe(text_train))
text_test_parsed = list(nlp.pipe(text_test))

In [9]:
# Convert text to integer symbols.
# One table is shared by both splits: the training pass adds words to it
# (lookup_add), the test pass only looks words up (lookup).
symbol_table = util.SymbolTable()

def preprocess_text(parsed_text, symbol_table, init=True):
    """Turn parsed documents into lists of symbols, one list per document.

    Each token's text is stripped of surrounding whitespace and lower-cased
    before being passed to the symbol table. Tokens are visited sentence by
    sentence, and the sentences of a document are flattened into one list.

    Args:
        parsed_text: iterable of documents; each exposes ``.sents``, an
            iterable of sentences, each of which iterates over tokens that
            have a ``.text`` attribute.
        symbol_table: object providing ``lookup_add(word)`` and
            ``lookup(word)``.
        init: if truthy, map words with ``symbol_table.lookup_add``;
            otherwise map them with ``symbol_table.lookup``.

    Returns:
        A list with one entry per document, each entry being the list of
        values returned by the symbol table for that document's tokens.
    """
    if init:
        to_symbol = symbol_table.lookup_add
    else:
        to_symbol = symbol_table.lookup

    encoded_docs = []
    for doc in parsed_text:
        doc_symbols = []
        for sentence in doc.sents:
            for token in sentence:
                normalized = token.text.strip().lower()
                doc_symbols.append(to_symbol(normalized))
        encoded_docs.append(doc_symbols)
    return encoded_docs

In [10]:
# Training docs go first with init=True so every training word is added to
# the shared symbol table.
symbols_train = preprocess_text(text_train_parsed, symbol_table, init=True)
# Test docs reuse that table in lookup-only mode (init=False).
# NOTE(review): what lookup returns for a word never seen in training is
# decided by util.SymbolTable — confirm it is a valid integer id.
symbols_test = preprocess_text(text_test_parsed, symbol_table, init=False)

In [11]:
# Compare the first 15 tokens of one parsed sentence with their integer symbols.
print(text_train_parsed[100][:15])
print(symbols_train[100][:15])
print("\nClass: ", y_train[100])

Congress is now poised to hand President Bush an election-year tax cut victory
[1541, 21, 264, 1454, 5, 1542, 150, 882, 555, 1543, 3, 683, 1544, 1277, 1209]

Class:  2


# Converting to matrix format

In [12]:
from keras.preprocessing import sequence

# Every symbol sequence is forced to this length so the data fits one matrix.
MAX_LENGTH = 50

# Shorter sequences are padded with 0 at the front (visible in the x_train
# output further down).
# NOTE(review): longer sequences are cut to MAX_LENGTH; with Keras' default
# settings the tokens dropped are presumably the leading ones — confirm that
# losing the start of a sentence is intended.
x_train = sequence.pad_sequences(symbols_train, maxlen=MAX_LENGTH)
x_test = sequence.pad_sequences(symbols_test, maxlen=MAX_LENGTH)

Using TensorFlow backend.


# Building the graph

In [13]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import LSTM
from keras.optimizers import Adam

# Width shared by the word embeddings and the LSTM hidden state.
d = 100

# Embedding -> LSTM -> softmax over the n_classes labels.
# NOTE(review): the Embedding's first argument must be larger than the biggest
# id appearing in x_train / x_test; confirm that num_symbols() accounts for
# the 0 that pad_sequences uses as padding.
model = Sequential([
    Embedding(symbol_table.num_symbols(), output_dim=d),
    LSTM(d),
    Dense(n_classes, activation='softmax'),
])

# The sparse loss takes integer class labels directly (no one-hot encoding).
model.compile(
    optimizer=Adam(lr=0.0005),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train!

In [17]:
# Inspect the padded training matrix; the leading zeros in each row are padding.
x_train

array([[   0,    0,    0, ...,   32,   33,   34],
       [   0,    0,    0, ...,   56,   57,   34],
       [   0,    0,    0, ...,   67,   68,   69],
       ..., 
       [   0,    0,    0, ...,  928, 7537,   34],
       [   0,    0,    0, ..., 1292,  338,   34],
       [   0,    0,    0, ...,  186,  187,   34]], dtype=int32)

In [19]:
# NOTE(review): this repeats the inspection cell above, and both carry
# execution counts (17, 19) later than the fit cell below (14) — the notebook
# was run out of order; consider removing one and re-running top to bottom.
x_train

array([[   0,    0,    0, ...,   32,   33,   34],
       [   0,    0,    0, ...,   56,   57,   34],
       [   0,    0,    0, ...,   67,   68,   69],
       ..., 
       [   0,    0,    0, ...,  928, 7537,   34],
       [   0,    0,    0, ..., 1292,  338,   34],
       [   0,    0,    0, ...,  186,  187,   34]], dtype=int32)

In [14]:
# Train for 20 epochs in mini-batches of 32, scoring (x_test, y_test) after
# every epoch.
# NOTE(review): the test split doubles as validation data here, so the reported
# validation accuracy is not an unbiased estimate if it is used for tuning.
model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=20,
    validation_data=(x_test, y_test),
)

Train on 1000 samples, validate on 500 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1621c1f98>