Approach to emulate:

1.Input Layer

2.Embedding layer

3.BLSTM layer, with element-wise sum of forward/backward pass outputs\

Classification should have dropout  applied on the embedding layer, LSTM layer and penultimate layer. as Well as L2 regularization

4.Attention Layer

5.Output Layer

In [None]:
import os
os.environ["KERAS_BACKEND"]='theano'
os.environ["KERAS_BACKEND"]='tensorflow'
import keras
keras.backend.backend()
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, GRU, Input, TimeDistributed
from keras.models import Sequential, Model, load_model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers
import lmdb
from lmdb_embeddings.reader import LmdbEmbeddingsReader
import lmdb_embeddings.exceptions as exceptions
from keras.utils import to_categorical, np_utils
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
import string
import re
import spacy
import tensorflow as tf
nlp=spacy.load('en_core_web_sm')

In [None]:
#Theano
class AttLayer(Layer):
    def __init__(self, **kwargs):
        self.init = initializers.get('normal')
        #self.input_spec = [InputSpec(ndim=3)]
        super(AttLayer, self).__init__(** kwargs)

    def build(self, input_shape):
        assert len(input_shape)==3
        self.W = self.add_weight(name='kernel', 
        shape=(input_shape[-1],),
        initializer='normal',
        trainable=True)
        super(AttLayer, self).build(input_shape)  
        
    def call(self, x, mask=None):
        eij = K.tanh(K.dot(x, self.W))

        ai = K.exp(eij)
        weights = ai/K.sum(ai, axis=1).dimshuffle(0,'x')

        weighted_input = x*weights.dimshuffle(0,1,'x')
        return weighted_input.sum(axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

In [19]:
#Tensorflow
class AttLayer(Layer):
    def __init__(self, **kwargs):
        self.init = initializers.get('normal')
        #self.input_spec = [InputSpec(ndim=3)]
        super(AttLayer, self).__init__(** kwargs)

    def build(self, input_shape):
        assert len(input_shape)==3
        self.W = self.add_weight(name='kernel', 
        shape=(input_shape[-1],),
        initializer='normal',
        trainable=True)
        super(AttLayer, self).build(input_shape)  
        
    def call(self, x, mask=None):
        eij = K.tanh(np.dot(x, self.W))
        ai = K.exp(eij)
        product=tf.expand_dims(K.sum(ai, axis=1), 1)
        weights = ai/product 
        weighted_input = x*weights
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

In [None]:
classifier=Sequential()
classifier.add(Bidirectional(GRU(units=25, return_sequences=True), input_shape=(6,300)))
classifier.add(AttLayer())
classifier.add(Dropout(0.3))
classifier.add(Dense(units=4, activation='softmax'))
classifier.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

In [None]:
classifier=load_model('models/emotions_blstm_att_tf.h5', custom_objects={'AttLayer': AttLayer()})

In [20]:
embeddings=LmdbEmbeddingsReader('data/lmdb_databases')
encoder=LabelEncoder()

In [82]:
data=pd.read_csv('data/isear_plus_semeval.csv')

In [17]:
negative = ['not', 'no', 'neither', 'nor', 'but', 'however', 'although', 'nonetheless', 'despite', 'except',
                         'even though', 'yet']
stop = list(set(stopwords.words('english')))
for neg in negative:
    for stopword in stop:
        if stopword==neg:
            stop.remove(stopword)
rm=['don\'t', 'shouldn\'t', 'doesn\'t', 'didn\'t']
for r in rm:
    stop.remove(r)
exclude = set(string.punctuation)
exclude.add('\n')

In [21]:
def clean(doc):
    lemma=WordNetLemmatizer()
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = "".join([ch for ch in stop_free if ch not in exclude])
    re.sub(r'\n', '', punc_free)
    normalized = " ".join([lemma.lemmatize(word) for word in punc_free.split()])
    return normalized

In [22]:
def vec_words(li):
    total_vecs=[]
    for word in li:
        try:
            vector = embeddings.get_word_vector(word)
        except exceptions.MissingWordError:
            # 'google' is not in the database.
            vector= np.zeros(300, dtype='float32')
        total_vecs.append(vector)
    return np.array(total_vecs)

In [23]:
def transform_y(y):
    encoder.fit(y)
    y=encoder.transform(y)
    y_1=np_utils.to_categorical(y)
    #y_1=np.reshape(y_1, (-1, 4, 1))
    return y_1

In [24]:
def word_splits(series):
    word_splits=series.str.split(' ')
    return word_splits

In [None]:
def transform_6(X,y=None):
    X=pd.Series(X).apply(clean).apply(input_duplicator_train)
    splits=word_splits(X)
    numbers_series=splits.apply(vec_words)
    num_docs=len(numbers_series)
    X_1=[]
    y_1=[]
    for index in range(0, num_docs):
        doc=numbers_series.iloc[index]
        print(len(doc))
        for i in range(6, len(doc)):
            X_1.append(doc[i-6:i])
            if y is not None:
                y_1.append(y.iloc[index])
                y_1=transform_y(y_1)
    if y is not None:
        return np.array(X_1), np.array(y_1)
    else:
        return np.array(X_1)

In [None]:
def input_duplicator_train(text):
    splits=text.split(' ')
    while len(splits)<7:
        orig_doc=splits.copy()
        for word in orig_doc:
            splits.append(word)
    return ' '.join(splits)

In [None]:
def input_duplicator(text):
    X=pd.Series(text).apply(clean)
    splits=word_splits(X)
    numbers_series=splits
    num_docs=len(numbers_series)
    for index, doc in enumerate(numbers_series):
        while len(doc)<7:
            orig_doc=doc.copy()
            orig_doc=list(orig_doc)
            doc=list(doc)
            for word in orig_doc:
                doc.append(word)
                #doc=np.insert(doc,(len(doc)),word, axis=0)
                #doc=np.append(doc, word, axis=1)
            modified=True
        numbers_series.iloc[index]=np.array(doc)
    X_1 = []
    if num_docs>1:
        for index in range(0, num_docs):
            doc=numbers_series.iloc[index]
            for i in range(6, len(doc)):
                X_1.append(doc[i-6:i])
    else:
        doc=numbers_series.iloc[0]
        print(doc.shape)
        for i in range(6, len(doc)):
                X_1.append(doc[i-6:i])
    return X_1

In [25]:
def predict(text):
    X=transform(text)
    prediction=classifier.predict(X)
    prediction=np.mean(prediction, axis=0)
    return prediction

In [None]:
X_1, y_1=transform(data['1'], data['0'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_1, y_1, test_size = 0.2, random_state = 0)

In [None]:
classifier.fit(X_train, y_train, epochs=5)

In [None]:
#Previous best with theano
classifier.evaluate(X_test, y_test)
#Theano training is many, many times slower than tf

In [None]:
#With Tensorflow
classifier.evaluate(X_test, y_test)
#Comparable results to without attention. This needs better implementation, closer to the papers.

In [None]:
predict('''I do not happy''')

In [None]:
classifier.save('models/emotions_blstm_att_tf.h5')

<h2>Hierachical Attention Network With Buckets</h2>

In [None]:
#Sentence Segmentation
text='Sentence #one... I hope it picks this up. Sentence LMFAO two! Sentence three?'
tokens=nlp(text)
for s in tokens.sents:
    print(s)

In [98]:
from bucketed_sequence import BucketedSequence
from keras.preprocessing.sequence import pad_sequences
from absl import app

UNK = np.zeros(300)
#FLAGS = flags.FLAGS

'''flags.DEFINE_integer('batch_size', 64, 'Batch size')
flags.DEFINE_integer('epochs', 20, 'Number of epochs to train')
flags.DEFINE_integer('lstm_units', 50, 'Number of LSTM units in RNN')
flags.DEFINE_integer('dense_breadth', 64, 'Number of neurons in the dense ' +
                     'layer')

flags.DEFINE_integer('dataset_size', 4726, 'Size of training dataset')
flags.DEFINE_integer('val_size', 1182, 'Size of validation set')
flags.DEFINE_integer('buckets', 4, 'Number of buckets to use (run with ' +
                     '0 to disable)')'''

'''flags.DEFINE_integer('seqlen_mean', 50, 'Sequence length mean (drawn ' +
                     'from normal distribution)')
flags.DEFINE_integer('seqlen_stddev', 200, 'Sequence length standard ' +
                     'deviation (drawn from normal distribution)')'''

batch_size=64
epochs=100
lstm_units=25
dense_breadth=64
buckets=4

In [101]:
# Set up a simple network (GRU + Dense)
inp = Input(shape=(None, 300), dtype="float32", name="in")
lstm = Bidirectional(GRU(lstm_units, return_sequences=False,
            name="lstm"))(inp)
dense = Dense(dense_breadth, kernel_initializer='normal',
              activation='relu')(lstm)
outputs = Dense(4, kernel_initializer='normal')(dense)
model = Model(inputs=inp, outputs=outputs)
model.compile(optimizer="adam", loss="categorical_crossentropy", 
              metrics=['acc'])

In [44]:
def pad(seqs, maxlen):
    # NOTE: prepends data
    padded = np.array(pad_sequences(seqs, maxlen=maxlen, value=UNK, 
                                    dtype=seqs[0].dtype))
    return np.vstack([np.expand_dims(x, axis=0) for x in padded])

In [107]:
def main(model):
    # Prepare data
    X,y=transform(data['1'], data['0'])
    len_train=[x.shape[0] for x in X[:4000]]
    len_val=[x.shape[0] for x in X[4000:]]
    sequence_lengths = [x.shape[0] for x in X]
    X = pad(X, np.max(sequence_lengths))
    X_train=X[:4000]
    X_test=X[4000:]
    y_train=y[:4000]
    y_test=y[4000:]
    if buckets > 0:
        # Create Sequence objects
        train_generator = BucketedSequence(buckets, batch_size,
                                           len_train, X_train, y_train)
        val_generator = BucketedSequence(buckets, batch_size,
                                         len_val, X_test, y_test)

        model.fit_generator(train_generator, epochs=epochs,
                            validation_data=val_generator,
                            shuffle=False, verbose=True)
    else:
        # No bucketing
        model.fit(x=x_train, y=y_train, epochs=epochs,
                  validation_data=(x_val, y_val),
                  batch_size=batch_size, verbose=True, shuffle=True)

In [104]:
X,y=transform(data['1'], data['0'])
len_train=[x.shape[0] for x in X[:4000]]
len_val=[x.shape[0] for x in X[4000:]]
sequence_lengths = [x.shape[0] for x in X]
X = pad(X, np.max(sequence_lengths))
X_train=X[:4000]
X_test=X[4000:]
y_train=y[:4000]
y_test=y[4000:]

In [106]:
X_train.shape

(4000, 54, 300)

In [36]:
def transform(X,y=None):
    X=pd.Series(X).apply(clean)
    splits=word_splits(X)
    numbers_series=splits.apply(vec_words)
    num_docs=len(numbers_series)
    X_1=[]
    y_1=[]
    for index in range(0, num_docs):
        doc=numbers_series.iloc[index]
        X_doc=[]
        for word in doc:
            X_doc.append(word)
        X_1.append(np.array(X_doc))
        if y is not None:
            y_1.append(y.iloc[index])
        
    if y is not None:
        y_1=transform_y(y_1)
        return np.array(X_1), np.array(y_1)
    else:
        return np.array(X_1)

In [77]:
sequence_lengths = [x.shape[0] for x in X]
padded_x = pad(X, 54)
padded_x.shape

(5908, 54, 300)

In [111]:
bucket_sizes, bucket_ranges = np.histogram(sequence_lengths,
                                                   bins=4)

In [119]:
import math
bucket_ranges
x_seq=X_train
input_shape = (1,) if len(x_seq.shape) == 2 else x_seq.shape[2:]
output_shape = (1,) if len(y.shape) == 1 else y.shape[1:]
actual_buckets = [bucket_ranges[i+1] 
                          for i,bs in enumerate(bucket_sizes) if bs > 0]
actual_bucketsizes = [bs for bs in bucket_sizes if bs > 0]
bucket_seqlen = [int(math.ceil(bs)) for bs in actual_buckets]
num_actual = len(actual_buckets)
print('Training with %d non-empty buckets' % num_actual)
#print(bucket_seqlen)
#print(actual_bucketsizes)
bins = [(np.ndarray([bs, bsl] + list(input_shape), dtype=x_seq.dtype),
              np.ndarray([bs] + list(output_shape), dtype=y.dtype)) 
             for bsl,bs in zip(bucket_seqlen, actual_bucketsizes)]
assert len(bins) == num_actual

Training with 4 non-empty buckets


In [133]:
bucket_seqlen

[15, 28, 41, 54]

In [134]:
actual_bucketsizes

[4716, 1082, 97, 13]

In [138]:
[14,4716]+list(input_shape)

[14, 4716, 300]

In [137]:
list(input_shape)

[300]

In [108]:
main(model)

Training with 4 non-empty buckets
Training with 4 non-empty buckets
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
13/65 [=====>........................] - ETA: 1s - loss: 11.7223 - acc: 0.18 - ETA: 1s - loss: 12.3175 - acc: 0.22 - ETA: 0s - loss: 12.0759 - acc: 0.20 - ETA: 0s - loss: 12.0410 - acc: 0.2159

KeyboardInterrupt: 

In [37]:
X, y=transform(data['1'], data['0'])

In [None]:
#sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
#sentence_input = Input(shape=(None,), dtype='int32')
#embedded_sequences = embedding_layer(sentence_input)
#l_lstm = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
sentence_input= Input(shape=(None, 300))
l_lstm = Bidirectional(GRU(100, return_sequences=True))(sentence_input)
l_dense = TimeDistributed(Dense(200))(l_lstm)
l_att = AttLayer()(l_dense)
sentEncoder = Model(sentence_input, l_att)
 
#review_input = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH), dtype='int32')
review_input = Input(shape=(7,None), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)
l_lstm_sent = Bidirectional(GRU(100, return_sequences=True))(review_encoder)
l_dense_sent = TimeDistributed(Dense(200))(l_lstm_sent)
l_att_sent = AttLayer()(l_dense_sent)
preds = Dense(2, activation='softmax')(l_att_sent)
model = Model(review_input, preds)

In [None]:
han=Sequential()
han.add(Bidirectional(GRU(units=100, return_sequences=True), input_shape=(6,300)))
han.add(TimeDistributed(Dense(200)))
han.add(AttLayer())
classifier.add(Dropout(0.3))
classifier.add(Dense(units=4, activation='softmax'))
classifier.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

<h1>anger, fear, joy, sadness</h1>