In [4]:
from nltk.tokenize import TreebankWordTokenizer
import os
import gensim
from sklearn.model_selection import train_test_split
import time
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation, Conv1D, Dropout, GlobalMaxPooling1D, MaxPooling1D, GlobalAveragePooling1D
from keras.optimizers import SGD
import gensim
from keras.models import model_from_json
import pickle
import pandas as pd
from keras.optimizers import Adam

from model_template import Model
from glove_keras_cnn import GloveKerasCnn

In [6]:
load = True  
maxlen = 100
batch_size = 32
embedding_dims = 300
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 4
num_classes = 3

In [4]:
def pad_trunc(data, maxlen):
    new_data = []
    # Create a vector of 0s the length of our word vectors
    zero_vector = []
    for _ in range(len(data[0][0])):
        zero_vector.append(0.0)
    for sample in data:
        if len(sample) > maxlen:
            temp = sample[:maxlen]
        elif len(sample) < maxlen:
            temp = sample
            # Append the appropriate number 0 vectors to the list
            additional_elems = maxlen - len(sample)
            for _ in range(additional_elems):
                temp.append(zero_vector)
        else:
            temp = sample
        new_data.append(temp)
    return new_data

In [9]:
filename = 'train.csv'
df = pd.read_csv(filename, index_col='id')
authors = list(df.author.unique())
lookup = {a: _ for _, a in enumerate(authors)}
y_numbers = [lookup[i] for i in df.author]
y_vecs = []
for y in y_numbers:
    base_vec = np.zeros(num_classes, dtype='int')
    base_vec[y] = 1
    y_vecs.append(base_vec)
df['y'] = y_vecs

In [64]:
authors[0]

'EAP'

In [19]:
EAP_test_string = """Once upon a midnight dreary, while I pondered, weak and weary, 
Over many a quaint and curious volume of forgotten lore, 
While I nodded, nearly napping, suddenly there came a tapping, 
As of some one gently rapping, rapping at my chamber door.""".replace("\n", "")

In [20]:
HPL_test_string = """In this luminous Company I was tolerated more because of my Years 
than for my Wit or Learning; being no Match at all for the rest. My Friendship for the 
celebrated Monsieur Voltaire was ever a Cause of Annoyance to the Doctor; who was deeply 
orthodox, and who us'd to say of the French Philosopher.""".replace("\n", "")

In [21]:
MWS_test_string = """A few seconds ago they had all been active and healthy beings, 
so full of employment they could not afford to mend his calèche unless tempted by 
some extraordinary reward; now the men declared themselves cripples and invalids, the 
children were orphans, the women helpless widows, and they would all die of hunger if 
his Eccellenza did not bestow a few grani.""".replace("\n", "")

In [None]:
GLOVE_DIR = "/media/D/data/glove/"
GLOVE_W2V_FILE = "glove.840B.300d.w2vformat.txt"
GLOVE_W2V_PATH = os.path.join(GLOVE_DIR, GLOVE_W2V_FILE)
glove_model = gensim.models.KeyedVectors.load_word2vec_format(GLOVE_W2V_PATH)
wv = glove_model.wv
tokenizer = TreebankWordTokenizer()

In [38]:
vectorized_data = []
for sentence in [EAP_test_string, HPL_test_string, MWS_test_string]:
    sample_vecs = []
    for token in tokenizer.tokenize(sentence):
        try:
            sample_vecs.append(wv[token])
        except KeyError:
            # print(token, "not in wv")
            pass
    vectorized_data.append(sample_vecs)
pickle.dump(vectorized_data, open("glove_vectorized_test_sentences", "wb"))

In [39]:
vectorized_data = pickle.load(open("glove_vectorized_test_sentences", "rb"))
vectorized_data = pad_trunc(vectorized_data, maxlen)

In [40]:
len(vectorized_data[2])

100

In [42]:
test = np.asarray(vectorized_data)
test.shape

(3, 100, 300)

In [43]:
new_test = np.reshape(test, (len(test), maxlen, embedding_dims))

In [44]:
new_test.shape

(3, 100, 300)

In [45]:
cnn = glove_keras_cnn()
model = cnn.load()

In [48]:
predictions = model.predict(new_test)

In [61]:
np.argmax(predictions, axis=1)

array([0, 0, 2])

Sandbox to try out different CNN model structures

In [7]:
from glove_keras_cnn import *

In [8]:
X = pickle.load(open("X-glove-encoding", "rb"))
y = pickle.load(open("y-glove-encoding", "rb"))
vectorized_query = pickle.load(open("glove_vectorized_test_sentences", "rb"))

In [None]:
cnn = GloveKerasCnn()
X_train, X_dev, Y_train, Y_dev = train_test_split(X, y, test_size=0.2, random_state=707)
X_train = cnn.pad_trunc(X_train, maxlen)
X_dev = cnn.pad_trunc(X_dev, maxlen)
X_train = np.reshape(X_train, (len(X_train), maxlen, embedding_dims))
Y_train = np.reshape(Y_train, (len(Y_train), num_classes))
X_dev = np.reshape(X_dev, (len(X_dev), maxlen, embedding_dims))
Y_dev = np.reshape(Y_dev, (len(Y_dev), num_classes))
model = cnn.create()
model = cnn.train(model, X_train, Y_train, X_dev, Y_dev)

In [22]:
padded_query = cnn.pad_trunc(vectorized_query, maxlen)
padded_query = np.asarray(padded_query)
predictions = cnn.model.predict(padded_query)

In [24]:
np.argmax(predictions, axis=1)

array([0, 0, 2])

In [55]:
stacked_filters_1 = 256
stacked_filters_2 = 128
stacked_kernel_size_1 = 5
stacked_kernel_size_2 = 3
stacked_hidden_dims_1 = 250
stacked_hidden_dims_2 = 50
stacked_epochs = 5

class GloveKerasStackedCnn(GloveKerasCnn):
    def __init__(self, wv=None):
        GloveKerasCnn.__init__(self)
        if wv:
            self.embedding = wv
        self.mname = "stacked_cnn_model.json"
        self.wname = "stacked_cnn_weights.h5"

    def create(self):
        model = Sequential()
        model.add(Conv1D(64, 3, padding='valid', activation='relu', input_shape=(maxlen, embedding_dims)))
        model.add(Conv1D(64, 3, activation='relu'))
        model.add(MaxPooling1D(5))
        model.add(Conv1D(128, 3, activation='relu'))
        model.add(Conv1D(128, 3, activation='relu'))
        model.add(GlobalAveragePooling1D())
        model.add(Dropout(0.5))
        model.add(Dense(num_classes))
        model.add(Activation('softmax'))
        self.model = model
        return model

    def train(self, model, X_train, Y_train, X_dev, Y_dev):
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        model.fit(X_train, Y_train, batch_size=batch_size, epochs=stacked_epochs, validation_data=(X_dev, Y_dev))
        return model

    def predict(self, query, vectorize=False):
        if not self.model:
            print("No model available for prediction")
        if vectorize:
            if isinstance(query, str):
                query = [query]
            vectorized_query = self.vectorize(query)
            pickle.dump(vectorized_query, open("glove_vectorized_test_sentences", "wb"))
        else:
            vectorized_query = query
        vectorized_query = self.pad_trunc(vectorized_query, maxlen)
        vectorized_query = np.asarray(vectorized_query)
        vectorized_query = np.reshape(vectorized_query, (len(query), maxlen, embedding_dims)) # Should be redundant, to ensure compliance
        predictions = self.model.predict(vectorized_query)
        return predictions

In [56]:
cnn2 = GloveKerasStackedCnn()
print("splitting test set")
X_train, X_dev, Y_train, Y_dev = train_test_split(X, y, test_size=0.2, random_state=2684)
print("padding")
X_train = cnn2.pad_trunc(X_train, maxlen)
X_dev = cnn2.pad_trunc(X_dev, maxlen)
print("reshaping data")
X_train = np.reshape(X_train, (len(X_train), maxlen, embedding_dims))
Y_train = np.reshape(Y_train, (len(Y_train), num_classes))
X_dev = np.reshape(X_dev, (len(X_dev), maxlen, embedding_dims))
Y_dev = np.reshape(Y_dev, (len(Y_dev), num_classes))

splitting test set
padding
reshaping data


In [61]:
cnn2 = GloveKerasStackedCnn()
print("creating model")
model = cnn2.create()
print("training model")
model = cnn2.train(model, X_train, Y_train, X_dev, Y_dev)

padded_query = cnn2.pad_trunc(vectorized_query, maxlen)
padded_query = np.asarray(padded_query)
predictions = cnn2.model.predict(padded_query)

creating model
training model
Train on 15663 samples, validate on 3916 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [62]:
print(predictions)
print(np.argmax(predictions, axis=1))

[[3.0235273e-01 6.9760913e-01 3.8170085e-05]
 [9.9995458e-01 4.5402543e-05 6.9700423e-09]
 [3.2605074e-04 2.5730924e-04 9.9941671e-01]]
[1 0 2]


In [64]:
cnn2.save(model, save_weights=True)

In [65]:
double_filters_1 = 128
double_filters_2 = 256
double_kernel_size_1 = 5
double_kernel_size_2 = 3
double_hidden_dims_1 = 250
double_hidden_dims_2 = 50
double_epochs = 10

class GloveKerasDoubleCnn(GloveKerasCnn):
    def __init__(self, wv=None):
        GloveKerasCnn.__init__(self)
        if wv:
            self.embedding = wv
        self.mname = "double_cnn_model.json"
        self.wname = "double_cnn_weights.h5"

    def create(self):
        model = Sequential()
        model.add(Conv1D(double_filters_1, 3, padding='valid', activation='relu', input_shape=(maxlen, embedding_dims)))
        model.add(MaxPooling1D(3))
        model.add(Conv1D(double_filters_2, 3, activation='relu'))
        model.add(GlobalMaxPooling1D())
        model.add(Dropout(0.5))
        model.add(Dense(num_classes))
        model.add(Activation('sigmoid'))
        self.model = model
        return model

    def train(self, model, X_train, Y_train, X_dev, Y_dev):
        adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, amsgrad=False)
        model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
        model.fit(X_train, Y_train, batch_size=batch_size, epochs=double_epochs, validation_data=(X_dev, Y_dev))
        return model

    def predict(self, query, vectorize=False):
        if not self.model:
            print("No model available for prediction")
        if vectorize:
            if isinstance(query, str):
                query = [query]
            vectorized_query = self.vectorize(query)
            pickle.dump(vectorized_query, open("glove_vectorized_test_sentences", "wb"))
        else:
            vectorized_query = query
        vectorized_query = self.pad_trunc(vectorized_query, maxlen)
        vectorized_query = np.asarray(vectorized_query)
        vectorized_query = np.reshape(vectorized_query, (len(query), maxlen, embedding_dims)) # Should be redundant, to ensure compliance
        predictions = self.model.predict(vectorized_query)
        return predictions

In [66]:
cnn3 = GloveKerasDoubleCnn()
print("splitting test set")
X_train, X_dev, Y_train, Y_dev = train_test_split(X, y, test_size=0.2, random_state=115)
print("padding")
X_train = cnn3.pad_trunc(X_train, maxlen)
X_dev = cnn3.pad_trunc(X_dev, maxlen)
print("reshaping data")
X_train = np.reshape(X_train, (len(X_train), maxlen, embedding_dims))
Y_train = np.reshape(Y_train, (len(Y_train), num_classes))
X_dev = np.reshape(X_dev, (len(X_dev), maxlen, embedding_dims))
Y_dev = np.reshape(Y_dev, (len(Y_dev), num_classes))


splitting test set
padding
reshaping data


In [67]:
cnn3 = GloveKerasDoubleCnn()
print("creating model")
model = cnn3.create()
print("training model")
model = cnn3.train(model, X_train, Y_train, X_dev, Y_dev)

padded_query = cnn3.pad_trunc(vectorized_query, maxlen)
padded_query = np.asarray(padded_query)
predictions = cnn3.model.predict(padded_query)
print(predictions)

creating model
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_41 (Conv1D)           (None, 98, 128)           115328    
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 32, 128)           0         
_________________________________________________________________
conv1d_42 (Conv1D)           (None, 30, 256)           98560     
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 256)               0         
_________________________________________________________________
dropout_13 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 3)                 771       
_________________________________________________________________
activation_13 (Activation)   (None, 3)                 0     

In [31]:
padded_query = np.reshape(padded_query, (3, maxlen, embedding_dims)) # Should be redundant, to ensure compliance
predictions = cnn3.model.predict(padded_query)

In [32]:
predictions

array([[2.0883083e-03, 6.3081980e-03, 9.7662210e-05],
       [1.3642669e-02, 2.6145577e-04, 1.0693073e-04],
       [4.2319298e-05, 1.8084049e-03, 3.3995770e-03]], dtype=float32)

In [68]:
cnn3.save(model, save_weights=True)