In [None]:
import datetime
import pickle
import sys
import timeit

import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Single source of truth for the artifact directory: change it here and every
# path below follows. The resulting path strings are identical to the previous
# hard-coded literals.
OUTPUT_DIR = 'output'

# Pickled inputs read by the "Reading data" cell (presumably written by an
# earlier preprocessing step -- confirm against the pipeline).
OUTPUT_TRAIN_DF_FILE = OUTPUT_DIR + '/train-df.pkl'
OUTPUT_TEST_DF_FILE = OUTPUT_DIR + '/test-df.pkl'
OUTPUT_TOK_FILE = OUTPUT_DIR + '/tok.pkl'
OUTPUT_X_TRAIN_FILE = OUTPUT_DIR + '/encoded-X-train.pkl'
OUTPUT_X_TEST_FILE = OUTPUT_DIR + '/encoded-X-test.pkl'
OUTPUT_Y_TRAIN_FILE = OUTPUT_DIR + '/encoded-y-train.pkl'
OUTPUT_Y_TEST_FILE = OUTPUT_DIR + '/encoded-y-test.pkl'

# Artifacts written by this notebook: the trained model and the per-row results.
OUTPUT_MODEL_FILE = OUTPUT_DIR + '/lstm-rnn-model.h5'
OUTPUT_RESULTS_FILE = OUTPUT_DIR + '/lstm-rnn-results.pkl'

TEST_SIZE = 0.2    # not referenced by the cells visible in this notebook
TIMESTEPS = 200    # sequence length fed to the Embedding layer
BATCH_SIZE = 4     # mini-batch size used for training and prediction
OUTPUT_SIZE = 32   # Embedding output dimension

In [None]:
# Remember when the run began and announce it. Inside a notebook kernel
# sys.argv[0] is whatever launched the kernel, not the notebook's own name.
script_start_time = datetime.datetime.now()
script_name = sys.argv[0]
print('{} started at {}'.format(script_name, script_start_time))

In [None]:
print('Reading data...', end='')
start_time = timeit.default_timer()
# NOTE(security): every load in this cell unpickles data, which can execute
# arbitrary code. Only point these paths at files produced by a trusted step.
train_df = pd.read_pickle(OUTPUT_TRAIN_DF_FILE)
test_df = pd.read_pickle(OUTPUT_TEST_DF_FILE)
with open(OUTPUT_TOK_FILE, 'rb') as handle:
    tok = pickle.load(handle)
# np.load refuses pickled payloads unless allow_pickle=True (the default became
# False in NumPy 1.16.3). The label files must be pickles because the next
# cells call y_train.unique(); plain .npy content still loads with the flag set.
X_train = np.load(OUTPUT_X_TRAIN_FILE, allow_pickle=True)
X_test = np.load(OUTPUT_X_TEST_FILE, allow_pickle=True)
y_train = np.load(OUTPUT_Y_TRAIN_FILE, allow_pickle=True)
y_test = np.load(OUTPUT_Y_TEST_FILE, allow_pickle=True)
print(' done in {:.2f}s'.format(timeit.default_timer() - start_time), flush=True)

In [None]:
# One more than the number of entries in the tokenizer's word_index; used as
# the Embedding layer's input_dim. Presumably the extra slot accounts for
# index 0 being reserved (e.g. padding) -- confirm against how tok was built.
VOCAB_SIZE = 1 + len(tok.word_index)

In [None]:
print('Encoding training and testing labels...', end='')
start_time = timeit.default_timer()
# Fit ONE encoder on the labels of both splits so that a given class maps to
# the same integer (and the same one-hot column) in train and test. Fitting a
# separate encoder per split silently mis-aligns the classes whenever the two
# splits do not contain exactly the same label set.
# le_train and le_test are kept as two names for the same encoder because
# later cells reference both.
le_train = le_test = LabelEncoder().fit(
    np.concatenate([np.asarray(y_train), np.asarray(y_test)])
)
# Pin the one-hot width; otherwise to_categorical infers it from the largest
# index present, which may differ between the splits.
num_classes = len(le_train.classes_)

# NOTE: y_train / y_test are overwritten with their one-hot form, so this cell
# is not re-runnable without first re-running the data-loading cell.
y_train = to_categorical(le_train.transform(y_train), num_classes=num_classes)
y_test = to_categorical(le_test.transform(y_test), num_classes=num_classes)
print(' done in {:.2f}s'.format(timeit.default_timer() - start_time), flush=True)

In [None]:
def create_model():
    """Build, compile and train the Embedding -> LSTM -> softmax classifier.

    Reads the notebook-level globals ``VOCAB_SIZE``, ``OUTPUT_SIZE``,
    ``TIMESTEPS``, ``BATCH_SIZE``, ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Despite its name, the function also fits the model (one epoch),
    using the test split as validation data.

    Returns:
        The fitted ``Sequential`` model.
    """
    # One output unit per one-hot column of y_train rather than a hard-coded 3,
    # so the output layer always matches the encoded labels.
    num_classes = y_train.shape[1]

    model = Sequential()
    model.add(Embedding(input_dim=VOCAB_SIZE, output_dim=OUTPUT_SIZE, input_length=TIMESTEPS))
    # Stateless LSTM: each row of X is treated as an independent sequence.
    # stateful=True would carry the hidden state of sample i in one batch into
    # sample i of the next batch, which is meaningless for unrelated samples
    # that fit() shuffles, and it forces every dataset passed to fit/predict to
    # have a sample count divisible by BATCH_SIZE.
    model.add(LSTM(units=32, return_sequences=False))
    model.add(Dense(units=num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
    # summary() prints the table itself and returns None; wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()
    model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=1, batch_size=BATCH_SIZE)
    return model

In [None]:
# Build and fit the network, reporting the wall-clock time it took.
print('Training model...', end='')
start_time = timeit.default_timer()
model = create_model()
elapsed_seconds = timeit.default_timer() - start_time
print(' done in {:.2f}s'.format(elapsed_seconds), flush=True)

In [None]:
# Persist the fitted model to disk.
model.save(filepath=OUTPUT_MODEL_FILE)

In [None]:
print('Making predictions...', end='')
start_time = timeit.default_timer()
# Training split: class probabilities -> most likely class index -> label.
y_train_prob = model.predict(X_train, batch_size=BATCH_SIZE)
y_train_pred = y_train_prob.argmax(axis=-1)
y_train_actual = y_train.argmax(axis=-1)  # undo the one-hot encoding
train_df.loc[:, 'PredictedReason'] = le_train.inverse_transform(y_train_pred)
train_df.loc[:, 'ActualReason'] = le_train.inverse_transform(y_train_actual)

# Test split.
y_test_prob = model.predict(X_test, batch_size=BATCH_SIZE)
y_test_pred = y_test_prob.argmax(axis=-1)
y_test_actual = y_test.argmax(axis=-1)
# The model was fitted on y_train, so its output columns follow le_train's
# class order. Predictions must therefore be decoded with le_train even for
# the test split; decoding them with le_test yields wrong labels whenever the
# two encoders disagree.
test_df.loc[:, 'PredictedReason'] = le_train.inverse_transform(y_test_pred)
# The ground truth was encoded with le_test, so it is decoded with le_test.
test_df.loc[:, 'ActualReason'] = le_test.inverse_transform(y_test_actual)
print(' done in {:.2f}s'.format(timeit.default_timer() - start_time), flush=True)

In [None]:
# Stack the annotated train and test frames, then select the columns in
# train_df's order.
cols = [column for column in train_df.columns.values]
results_df = pd.concat([train_df, test_df]).loc[:, cols]

In [None]:
# Write the combined train + test results to disk as a pickle.
results_df.to_pickle(path=OUTPUT_RESULTS_FILE)