In [None]:
# Location of the pickled DataFrame that is loaded with pd.read_pickle.
INPUT_RELEVANT_DF_FILE = 'output/relevant_df.pkl'

# Fraction of the samples held out as the test split.
TEST_SIZE = 0.2
# Fixed sequence length: default padding length and Embedding input length.
TIMESTEPS = 200
# TODO: Find a way to calculate this dynamically and pass to the model
VOCAB_SIZE = 9896
# NOTE: 4 or 1
BATCH_SIZE = 1
# Number of training epochs.
EPOCHS = 1
# Width of the embedding vectors and number of LSTM units.
OUTPUT_SIZE = 32
# Keyword arguments forwarded verbatim to model.compile().
COMPILE_PARAMS = {
    'loss': 'categorical_crossentropy',
    'optimizer': 'adam',
    'metrics': ['categorical_accuracy'],
}

In [None]:
import pandas as pd
import numpy as np

# Load the pickled DataFrame named in the configuration cell.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a trusted source.
relevant_df = pd.read_pickle(INPUT_RELEVANT_DF_FILE)

In [None]:
from sklearn.preprocessing import LabelBinarizer
# NOTE: If need to reimplement, https://github.com/scikit-learn/scikit-learn/blob/7389dba/sklearn/preprocessing/label.py#L163
    
# Read the label column instead of pop()-ing it: pop() removes the column from
# relevant_df, so running this cell a second time would raise a KeyError.
labels = relevant_df['Reason'].values

# Binarise the string labels; `lb` is kept so class indices can be mapped back
# to label names via lb.classes_.
lb = LabelBinarizer()
lb.fit(labels)
y = lb.transform(labels)

In [None]:
from sklearn.model_selection import train_test_split

# Split the raw texts and their binarised labels into train / test parts.
# The text column is read (not popped) so that re-running this cell does not
# fail with a KeyError on an already-removed column.
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    relevant_df['ReadCodeText'].values,
    y,
    test_size=TEST_SIZE,
    random_state=1337
)

In [None]:
from sklearn.base import BaseEstimator
from sklearn.pipeline import TransformerMixin
from keras.preprocessing.text import Tokenizer

class ReadCodesToSequences(Tokenizer, BaseEstimator, TransformerMixin):
    """Keras ``Tokenizer`` exposed through the scikit-learn transformer API.

    ``fit`` learns the vocabulary from the texts and ``transform`` converts
    each text into its sequence of integer token ids, so the tokenizer can be
    used as a pipeline step.

    All keyword arguments are forwarded unchanged to ``Tokenizer.__init__``.

    NOTE(review): because ``__init__`` only takes ``**kwargs``, scikit-learn's
    ``get_params()`` presumably cannot see the tokenizer options, so
    ``clone()`` would drop them -- verify before using this in a grid search.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def fit(self, X, y=None):
        """Build the vocabulary from the texts in ``X``; ``y`` is ignored."""
        self.fit_on_texts(X)
        return self

    def transform(self, X, y=None):
        """Return the token-id sequences for the texts in ``X`` as an array.

        Texts generally tokenise to different lengths.  Building a NumPy
        array from such a ragged list without ``dtype=object`` is deprecated
        and raises ``ValueError`` on recent NumPy versions, so an object
        array is requested explicitly in that case.  When every sequence has
        the same length the plain numeric array is returned, as before.
        """
        sequences = self.texts_to_sequences(X)
        is_ragged = len({len(sequence) for sequence in sequences}) > 1
        if is_ragged:
            return np.array(sequences, dtype=object)
        return np.array(sequences)
    
# Tokenizer step for the pipeline. filters='' and lower=False are passed so no
# characters are stripped and tokens are not lower-cased; num_words limits the
# vocabulary to VOCAB_SIZE.
readcodes2sequences = ReadCodesToSequences(filters='', lower=False, num_words=VOCAB_SIZE)

In [None]:
from keras.preprocessing.sequence import pad_sequences

class Padder(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer wrapping Keras ``pad_sequences``.

    Parameters
    ----------
    max_len : int
        Value handed to ``pad_sequences`` as ``maxlen``; defaults to
        ``TIMESTEPS``.
    """

    def __init__(self, max_len=TIMESTEPS):
        self.max_len = max_len

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` passed through ``pad_sequences`` with ``maxlen=max_len``."""
        return pad_sequences(X, maxlen=self.max_len)
    
# Padding step for the pipeline, using the default max_len (TIMESTEPS).
padder = Padder()

In [None]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Dense, LSTM, Flatten
from keras.wrappers.scikit_learn import KerasClassifier

def create_model(n_classes=3):
    """Build and compile the Embedding -> LSTM -> softmax classifier.

    Parameters
    ----------
    n_classes : int, optional
        Number of units in the softmax output layer.  It has to match the
        number of columns of the binarised targets; defaults to 3.

    Returns
    -------
    keras.models.Sequential
        The model, compiled with ``COMPILE_PARAMS``.

    Notes
    -----
    The batch size is fixed through ``batch_input_shape``, so fitting and
    predicting must use ``BATCH_SIZE``.
    NOTE(review): ``stateful=True`` carries the LSTM state over from one batch
    to the next -- confirm this is intended for independent samples.
    """
    model = Sequential()
    model.add(Embedding(input_dim=VOCAB_SIZE, output_dim=OUTPUT_SIZE, input_length=TIMESTEPS, batch_input_shape=(BATCH_SIZE, TIMESTEPS)))
    model.add(LSTM(units=OUTPUT_SIZE, batch_input_shape=(BATCH_SIZE, TIMESTEPS, OUTPUT_SIZE), stateful=True, return_sequences=False))
    model.add(Dense(units=n_classes, activation='softmax'))
    model.compile(**COMPILE_PARAMS)
    # summary() prints the table itself and returns None; wrapping it in
    # print() would emit an extra "None" line.
    model.summary()
    return model

# scikit-learn wrapper around the Keras model so it can be the final pipeline
# step. Epochs and batch size come from the configuration constants rather
# than being hard-coded here.
sklearn_model = KerasClassifier(
    build_fn=create_model,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1
)

In [None]:
from sklearn.pipeline import make_pipeline

# Chain tokenisation, padding and the Keras classifier into a single estimator
# that takes raw text as input.
pipeline = make_pipeline(readcodes2sequences, padder, sklearn_model)

In [None]:
# Fit every pipeline step on the training split, then predict the held-out texts.
pipeline.fit(X_train, y_train)
y_preds = pipeline.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Collapse the binarised test labels back to one class index per sample.
y_true = np.argmax(y_test, axis=1)

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens to
# be symmetric, but keeping the documented order avoids wrong results if an
# asymmetric metric is dropped in here later.
accuracy = accuracy_score(y_true, y_preds)
print('Accuracy: {:.2f} %'.format(100 * accuracy))

In [None]:
# Inspect a single held-out sample: its text, the predicted class
# probabilities and the predicted vs. actual label.
idx = 52
readcodes = X_test[idx]
separator = '-' * 50

print('Flagging period {}'.format(idx))
print(separator)
# Collapse any run of whitespace to a single space for display.
print(' '.join(readcodes.split()))
print(separator)
# predict_proba expects a collection of texts, hence the one-element list.
print('Probability array: ', pipeline.predict_proba([readcodes]))
predicted_label = lb.classes_[y_preds[idx]]
actual_label = lb.classes_[y_true[idx]]
print('Predicted class: ', predicted_label)
print('Actual class: ', actual_label)

In [None]:
from lime.lime_text import LimeTextExplainer

# `bow` is an option of the LimeTextExplainer constructor, not of
# explain_instance(); passing it to explain_instance() raises a TypeError.
# bow=False makes LIME treat each word occurrence by position rather than as a
# bag of words.
explainer = LimeTextExplainer(class_names=lb.classes_.tolist(), bow=False)

# Explain the sample selected above using the full text -> probability pipeline.
explanation = explainer.explain_instance(
    readcodes,
    pipeline.predict_proba,
    top_labels=3,
    distance_metric='cosine',
)

In [None]:
# Render the LIME explanation in the notebook output.
explanation.show_in_notebook()

In [None]:
# Display the raw text of the sample that was explained above.
X_test[idx]