# Useful Functions 

In [1]:
import numpy as np
from __future__ import print_function
from keras.utils import to_categorical
from sklearn.metrics import classification_report

def make_labels(data):
    a2c = {'EAP': 0, 'HPL' : 1, 'MWS' : 2}
    labels = np.array([a2c[a] for a in data.author])
    labels = to_categorical(labels)
    return labels

def get_text_only(data):
    return data["text"]

def calc_metrics(x, y_true, y_pred):
    return classification_report(y_true, y_pred)

from sklearn.metrics import confusion_matrix

def calc_confusion_matrix(y_true, y_pred):
    return confusion_matrix(np.argmax(y_true, axis=1), np.argmax(y_pred, axis=1))

import pandas as pd

# loads the training data
def load_training_data():
    return pd.read_csv("train.csv")
def load_test_data():
    return pd.read_csv("test.csv")

from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from keras.utils import to_categorical

# A function to execute a machine learning pipeline
# load_func : the function that loads training data
# preprocessing_func_arr : an array of functions that preprocess the data. Executed in order and the output feeds into the input of the next
# create_model_func : a function that creates a model to train and test.
def run(load_func, preprocess_func_arr, create_model_func, verbosity=2, preprocess_debug=False):
    print("Loading data")
    full_data = load_func()
    
    print("Getting labels")
    labels = make_labels(full_data)    
    
    print("Preprocessing")
    data = get_text_only(full_data)
    for func in preprocess_func_arr:
        data = func(data)
        if preprocess_debug is True:
            print(data[0])
    
    input_dim = max([max(x) for x in data]) + 1

    print("Creating model")
    model = create_model_func(input_dim)
    
    print("Model Summary", model.summary)
    
    x_train, x_test, y_train, y_test = train_test_split(data, labels, test_size=0.2)
    
    print("Training model")
    model.fit(x_train, y_train,
                 batch_size=16,
                 validation_data=(x_test, y_test),
                 epochs=64,
                 verbose=verbosity,
                 callbacks=[EarlyStopping(patience=2, monitor='val_loss')])
    print("Training complete")
    
    print("Testing model")
    y_pred = model.predict_classes(x_test)
    y_pred = to_categorical(y_pred, num_classes=3)  
    
    print("Test results")  
    print("accuracy", accuracy_score(y_test, y_pred))
    print("metrics")
    print(calc_metrics(x_test, y_test, y_pred))
    print("confusion matrix")
    print(calc_confusion_matrix(y_test, y_pred))
    
    print("Generating Kaggle test results - test_results.csv")
    test_data = load_test_data()
    data = test_data["text"]
    for func in preprocess_func_arr:
        data = func(data)
    out = model.predict(data)
    print(out.shape)

Using TensorFlow backend.


# Model creations functions

In [2]:
from keras.layers import Dense, GlobalAveragePooling1D, Embedding, Conv1D
from keras.models import Sequential

# This is the best performing of the three models, it has the lowest loss of all three.
# All three have similar accuracies.
def embedding_and_pooling_model(input_dim, embedding_dims=500, optimizer='adam'):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

def embedding_conv1d_pooling_model(input_dim, embedding_dims=100, optimizer='adam'):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
    model.add(Conv1D(16, 8, activation="relu"))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

from keras.optimizers import RMSprop
from keras.layers import Dropout, MaxPooling1D

def deeper_with_multiple_convolutions(input_dim, embedding_dims=100, optimizer=RMSprop(lr=0.003)):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
    model.add(Conv1D(128, 8, activation="relu"))
    model.add(MaxPooling1D(2))
    model.add(Conv1D(128, 8, activation="relu"))
    model.add(MaxPooling1D(2))
    model.add(Dropout(0.5))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

# Preprocessing functions

In [3]:
from keras.preprocessing.text import Tokenizer

# converts the sentences in text into a sequence of numbers
# input: an array of strings
# output: an array of  sequences of numbers
def convert_to_sequences_remove_chars(text):
    tokenizer = Tokenizer(split=" ", char_level=False)
    tokenizer.fit_on_texts(text);
    return tokenizer.texts_to_sequences(text)

# converts the sentences in text into a sequence of numbers
# input: an array of strings
# output: an array of  sequences of numbers
def convert_to_sequences_leave_chars(text):
    tokenizer = Tokenizer(filters="", split=" ", char_level=False)
    tokenizer.fit_on_texts(text);
    return tokenizer.texts_to_sequences(text)

from keras.preprocessing.sequence import pad_sequences

# Pads the input arrays to be of equal length
# input: An array of sequences of numbers
# output: an array of sequences of numbers
def pad_data(text):
    maxlen = np.amax([len(x) for x in text], axis=0)
    return pad_sequences(sequences=text, maxlen=maxlen) 

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

# removes stop words from the sentences in text
# input: an array of strings
# output: an array of strings
def remove_stops(text):
    stops = set(stopwords.words("english"))
    return [" ".join([word for word in nltk.word_tokenize(words) if word not in stops]) for words in text]

# We had seen a submission where the person treated punctuation as distinct words, we thought this would be worth trying
#  seeing as some authors may have different patterns of punctuation
# input: an array of strings
# output: an array of strings
def convert_punctuation_to_words(texts):
    chars = "~!@#$%^&*()_+`-=,./;'<>?:\""
  
    for c in chars:
        texts = [text.replace(c, " " + c + " ") for text in texts]
    return texts

from nltk.stem import PorterStemmer

# Uses the Porter stemmer to stem each word in the texts
# input: an array of strings
# output: an array of strings
def stem_texts_porter(texts):
    stmr = PorterStemmer()
    return [" ".join([stmr.stem(word) for word in nltk.word_tokenize(text)]) for text in texts]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Forrest\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
preprocessors = [
    convert_punctuation_to_words,
    remove_stops,
    convert_to_sequences_remove_chars,
    pad_data
]

run(load_training_data, preprocessors, embedding_and_pooling_model)

Loading data
Getting labels
Preprocessing
Creating model
Model Summary <bound method Container.summary of <keras.models.Sequential object at 0x00000266C24F4CC0>>
Training model
Train on 15663 samples, validate on 3916 samples
Epoch 1/64


InternalError: Blas GEMM launch failed : a.shape=(16, 500), b.shape=(500, 3), m=16, n=3, k=500
	 [[Node: dense_1/MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](global_average_pooling1d_1/Mean, dense_1/kernel/read)]]
	 [[Node: metrics/acc/Mean/_113 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_446_metrics/acc/Mean", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

Caused by op 'dense_1/MatMul', defined at:
  File "c:\users\forrest\appdata\local\programs\python\python36\Lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "c:\users\forrest\appdata\local\programs\python\python36\Lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "c:\users\forrest\envs\ml\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "c:\users\forrest\envs\ml\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "c:\users\forrest\envs\ml\lib\site-packages\ipykernel\kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "c:\users\forrest\envs\ml\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "c:\users\forrest\envs\ml\lib\site-packages\tornado\ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "c:\users\forrest\envs\ml\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "c:\users\forrest\envs\ml\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "c:\users\forrest\envs\ml\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "c:\users\forrest\envs\ml\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "c:\users\forrest\envs\ml\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "c:\users\forrest\envs\ml\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "c:\users\forrest\envs\ml\lib\site-packages\ipykernel\kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "c:\users\forrest\envs\ml\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "c:\users\forrest\envs\ml\lib\site-packages\ipykernel\ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "c:\users\forrest\envs\ml\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "c:\users\forrest\envs\ml\lib\site-packages\IPython\core\interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "c:\users\forrest\envs\ml\lib\site-packages\IPython\core\interactiveshell.py", line 2856, in run_ast_nodes
    if self.run_code(code, result):
  File "c:\users\forrest\envs\ml\lib\site-packages\IPython\core\interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-4-90e1d2f25425>", line 8, in <module>
    run(load_training_data, preprocessors, embedding_and_pooling_model)
  File "<ipython-input-1-4968ce1e8ffa>", line 57, in run
    model = create_model_func(input_dim)
  File "<ipython-input-2-621b3a09c04d>", line 10, in embedding_and_pooling_model
    model.add(Dense(3, activation='softmax'))
  File "c:\users\forrest\envs\ml\lib\site-packages\keras\models.py", line 489, in add
    output_tensor = layer(self.outputs[0])
  File "c:\users\forrest\envs\ml\lib\site-packages\keras\engine\topology.py", line 603, in __call__
    output = self.call(inputs, **kwargs)
  File "c:\users\forrest\envs\ml\lib\site-packages\keras\layers\core.py", line 843, in call
    output = K.dot(inputs, self.kernel)
  File "c:\users\forrest\envs\ml\lib\site-packages\keras\backend\tensorflow_backend.py", line 1057, in dot
    out = tf.matmul(x, y)
  File "c:\users\forrest\envs\ml\lib\site-packages\tensorflow\python\ops\math_ops.py", line 1891, in matmul
    a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
  File "c:\users\forrest\envs\ml\lib\site-packages\tensorflow\python\ops\gen_math_ops.py", line 2436, in _mat_mul
    name=name)
  File "c:\users\forrest\envs\ml\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "c:\users\forrest\envs\ml\lib\site-packages\tensorflow\python\framework\ops.py", line 2956, in create_op
    op_def=op_def)
  File "c:\users\forrest\envs\ml\lib\site-packages\tensorflow\python\framework\ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InternalError (see above for traceback): Blas GEMM launch failed : a.shape=(16, 500), b.shape=(500, 3), m=16, n=3, k=500
	 [[Node: dense_1/MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](global_average_pooling1d_1/Mean, dense_1/kernel/read)]]
	 [[Node: metrics/acc/Mean/_113 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_446_metrics/acc/Mean", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
