# Hierarchal RNN

This kernel is a somewhat improved version of [Keras - Bidirectional LSTM baseline](https://www.kaggle.com/CVxTz/keras-bidirectional-lstm-baseline-lb-0-051) along with some additional documentation of the steps. (NB: this notebook has been re-run on the new test set.)

In [1]:
# Fast Text
# Increase the glove Embedding
# Use Fast Text to generate the embedding

In [1]:
# https://github.com/richliao/textClassifier/blob/master/textClassifierHATT.py
# https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf
# https://github.com/EdGENetworks/attention-networks-for-classification

In [40]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,CuDNNLSTM
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.layers import Conv1D, MaxPooling1D,Merge, GRU, TimeDistributed
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.layers.normalization import BatchNormalization

from keras.optimizers import Adam,SGD

import fastText

from nltk import tokenize
from keras.preprocessing.text import text_to_word_sequence

from keras.engine.topology import Layer
from keras import initializers
from keras import backend as K
from keras.engine import InputSpec
from keras.initializers import zero
from keras.initializers import RandomNormal
import tensorflow as tf

from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.class_weight import compute_sample_weight

import gc

import matplotlib.pyplot as plt
%matplotlib inline  

We include the GloVe word vectors in our input files. To include these in your kernel, simple click 'input files' at the top of the notebook, and search 'glove' in the 'datasets' section.

In [2]:
path = 'data/'
EMBEDDING_FILE=f'wv/glove.6B.300d.txt'
TRAIN_DATA_FILE=f'{path}train.csv'
TEST_DATA_FILE=f'{path}test.csv'

Set some basic config parameters:

In [51]:
MAX_SENT_LENGTH = 512
MAX_SENTS = 20
EMBEDDING_DIM = 300

Read in our data and replace missing values:

In [59]:
def normalize(s):
    """
    Given a text, cleans and normalizes it. Feel free to add your own stuff.
    """
    s = s.lower()
    # Replace ips
    s = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', ' _ip_ ', s)
    # Isolate punctuation
    s = re.sub(r'([\'\"\.\(\)\!\?\-\\\/\,])', r' \1 ', s)
    # Remove some special characters
    
    s = re.sub(r'([\;\:\|•«\n「」¤]\xa0)', ' ', s)
    # Replace numbers and symbols with language
#     s = s.replace('&', ' and ')
#     s = s.replace('@', ' at ')
#     s = s.replace('0', ' zero ')
#     s = s.replace('1', ' one ')
#     s = s.replace('2', ' two ')
#     s = s.replace('3', ' three ')
#     s = s.replace('4', ' four ')
#     s = s.replace('5', ' five ')
#     s = s.replace('6', ' six ')
#     s = s.replace('7', ' seven ')
#     s = s.replace('8', ' eight ')
#     s = s.replace('9', ' nine ')
    return s

In [60]:
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

train["comment_text"].fillna("_empty_",inplace=True)
list_sentences_train = train["comment_text"].apply(lambda x:normalize(x)).values
test["comment_text"].fillna("_empty_",inplace=True)
list_sentences_test = test["comment_text"].apply(lambda x:normalize(x)).values

list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values


In [61]:
tokenizer = Tokenizer(oov_token='_oov_')
tokenizer.fit_on_texts(list(list_sentences_train))

In [62]:
MAX_NB_WORDS = len(tokenizer.word_index)+1

In [63]:
# Only for Option 3

list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

X_t = pad_sequences(list_tokenized_train, maxlen=MAX_SENT_LENGTH)
X_te = pad_sequences(list_tokenized_test, maxlen=MAX_SENT_LENGTH)

In [10]:
# Only for Option 1 and 2
reviews = []
labels = []
texts = []

for i in list_sentences_train:
    sentences = tokenize.sent_tokenize(i)
    reviews.append(sentences)
    
# Zero paddings 
data = np.zeros((len(list_sentences_train), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
data.shape

for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences):
        if j< MAX_SENTS:
            # same as split + lower + punctuation removal
#             wordTokens = text_to_word_sequence(sent)
            wordTokens = sent.lower().split(' ')
#             k=0
            for k , word in enumerate(wordTokens):
                if k<MAX_SENT_LENGTH :
                    try :
                        data_i = tokenizer.word_index[word]
                    except KeyError:
#                         print(word)
                        data_i = 0
                    data[i,j,k] = data_i
#                     k=k+1                    

del list_sentences_train
gc.collect()

t_reviews = []

# Test Set 
for i in list_sentences_test:
    sentences = tokenize.sent_tokenize(i)
    t_reviews.append(sentences)
    
# Zero paddings 
t_data = np.zeros((len(list_sentences_test), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

for i, sentences in enumerate(t_reviews):
    for j, sent in enumerate(sentences):
        if j< MAX_SENTS:
            # same as split + lower + punctuation removal
#             wordTokens = text_to_word_sequence(sent)
            wordTokens = sent.lower().split(' ')
#             k=0
            for k , word in enumerate(wordTokens):
                if k<MAX_SENT_LENGTH :
                    try :
                        data_i = tokenizer.word_index[word]
                    except KeyError:
#                         print(word)
                        data_i = 0
                    t_data[i,j,k] = data_i
#                     k=k+1                    

del list_sentences_test
gc.collect()

0

In [11]:
# from nltk.corpus import stopwords
# cachedStop =  stopwords.words('english')
# pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
# def cleanwords(sent):
#     return ' '.join([word.lower() for word in sent.lower().split() if word not in cachedStop ])
    # return pattern.sub('', sent.lower())

# def cleanchars(sent):
#     return sent.translate(translator)


Standard keras preprocessing, to turn each comment into a list of word indexes of equal length (with truncation or padding as needed).

Read the glove word vectors (space delimited strings) into a dictionary from word->vector.

In [9]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))

In [33]:
len(embeddings_index.values())

400000

In [34]:
set([e.shape for e in embeddings_index.values()])
print(len([e.shape for e in embeddings_index.values() if e.shape[0] == 300]))

400000


Use these vectors to create our embedding matrix, with random initialization for words that aren't in GloVe. We'll use the same mean and stdev of embeddings the GloVe has when generating the random init.

In [10]:
embed_i = [e for e in embeddings_index.values() if e.shape[0] == 300]
# embed_i = embeddings_index.values()
all_embs = np.stack(embed_i)
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

(-0.0039050116, 0.38177028)

In [11]:
word_index = tokenizer.word_index
embedding_matrix = np.random.normal(emb_mean, emb_std, (MAX_NB_WORDS, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS: continue # greater than max word features
    embedding_vector = embeddings_index.get(word) # out of word vocabulary
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

Simple bidirectional LSTM with two fully connected layers. We add some dropout to the LSTM since even 2 epochs is enough to overfit.

In [17]:
# Only for Option 1 and 2

VALIDATION_SPLIT = 0.1

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = y[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

print('Number of positive and negative reviews in traing and validation set')
# print y_train.sum(axis=0)
# print y_val.sum(axis=0)

del data
gc.collect()

Number of positive and negative reviews in traing and validation set


169

In [18]:
x_train.shape,y_train.shape,x_val.shape,y_val.shape

((143614, 20, 512), (143614, 6), (15957, 20, 512), (15957, 6))

In [19]:
# from keras.layers import Conv1D, MaxPooling1D,Merge, GRU, RNN
# RNN??

## Option 1

In [19]:
embedding_layer = Embedding(MAX_NB_WORDS,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=True)

sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(LSTM(100))(embedded_sequences)
sentEncoder = Model(sentence_input, l_lstm)

review_input = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)
l_lstm_sent = Bidirectional(LSTM(100))(review_encoder)
preds = Dense(6, activation='sigmoid')(l_lstm_sent)
model = Model(review_input, preds)

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print("model fitting - Hierachical LSTM")
print(model.summary())


model fitting - Hierachical LSTM
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 15, 500)           0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 15, 200)           38832800  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 200)               240800    
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 1206      
Total params: 39,074,806
Trainable params: 39,074,806
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
model.fit(x_train, y_train, validation_data=(x_val, y_val),epochs=1, batch_size=32)

In [20]:
# Experiment
# x = K.placeholder(shape=(2, 3))
# y = K.placeholder(shape=(3, 4))
# xy = tf.keras.backend.dot(x, y)
# xy

# import numpy as np
# x = np.zeros([500,200])
# x.shape[-1]

# init = initializers.get('normal')
# w = init((200,))
# K.expand_dims(w).shape

# init

#batch, time(max_len),word_dim
# x = tf.placeholder(np.float32,(16,500,200))
# W1 = tf.placeholder(np.float32,(200,500))
# y = tf.keras.backend.dot(x,W1)
# y.shape

<tf.Tensor 'MatMul:0' shape=(2, 4) dtype=float32>

## Option 2

In [20]:
# building Hierachical Attention network

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=True)

class AttLayer(Layer):
    def __init__(self, **kwargs):
        self.init = initializers.get('normal')
        self.input_spec = [InputSpec(ndim=3)]
        self.attention_size = 50
        super(AttLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape)==3
        self.W = tf.Variable(tf.random_normal([input_shape[-1], self.attention_size], stddev=0.1))
        self.B = tf.Variable(tf.random_normal([self.attention_size], stddev=0.1))
        self.U = tf.Variable(tf.random_normal([self.attention_size], stddev=0.1))
        self.trainable_weights = [self.W,self.B,self.U]
        super(AttLayer, self).build(input_shape)  # be sure you call this somewhere!

    def call(self, x, mask=None):
        #  the shape of `v` is (B,T,D)*(D,A)=(B,T,A), where A=attention_size
        v = tf.tanh(tf.tensordot(x, self.W, axes=1) + self.B)
        vu = tf.tensordot(v, self.U, axes=1)  # (B,T) shape
        alphas = tf.nn.softmax(vu)         # (B,T) shape
        output = tf.reduce_sum(x * tf.expand_dims(alphas, -1), 1)
        
        return output
#         eij = tf.squeeze(tf.keras.backend.dot(x, tf.keras.backend.expand_dims(self.W,-1)), axis=-1)
        
#         ai = tf.exp(eij)
#         weights = ai/K.sum(ai, axis=1).dimshuffle(0,'x')
#         weights = tf.keras.backend.expand_dims(ai/tf.keras.backend.sum(ai, axis=1),-1)
        # replace dimshuffle with tf.expand_dims()
        
#         weighted_input = x*weights
#         return tf.keras.backend.sum(weighted_input,axis=1)
#         return weighted_input.sum(axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(GRU(50, return_sequences=True))(embedded_sequences)
# returns 500 vectors and applies a dense layer of 200 to each
l_dense = TimeDistributed(Dense(100))(l_lstm)
l_att = AttLayer()(l_dense)
# output (batch,500,100) --> Highlight the essence of the word
sentEncoder = Model(sentence_input, l_att)


review_input = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH), dtype='int32')
# returns 15 vectors and applies a dense layer of 200 to each vector : (15,500) * (500,100)
review_encoder = TimeDistributed(sentEncoder)(review_input)
l_lstm_sent = Bidirectional(GRU(50, return_sequences=True))(review_encoder)
l_dense_sent = TimeDistributed(Dense(100))(l_lstm_sent)
l_att_sent = AttLayer()(l_dense_sent)
preds = Dense(6, activation='sigmoid')(l_att_sent)
model = Model(review_input, preds)

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print("model fitting - Hierachical attention network")
model.summary()

model fitting - Hierachical attention network
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 20, 512)           0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, 20, 100)           58008500  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 20, 100)           45300     
_________________________________________________________________
time_distributed_3 (TimeDist (None, 20, 100)           10100     
_________________________________________________________________
att_layer_2 (AttLayer)       (None, 100)               5100      
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 606       
Total params: 58,069,606
Trainable params: 58,069,606
Non-trainable params: 0
__________________

In [21]:
# model.fit(x_val, y_val,epochs=1, batch_size=16)
model.fit(x_train, y_train, validation_data=(x_val, y_val),epochs=2, batch_size=32)

Train on 143614 samples, validate on 15957 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f3fd4fd3d68>

In [23]:
# Option 1 and 2

y_test = model.predict(t_data, batch_size=32, verbose=1)
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission[list_classes] = y_test
sample_submission.to_csv('lstm_attention_baseline_g300.csv', index=False)



## Option 3

In [64]:
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=True)

class AttLayer(Layer):
    def __init__(self, **kwargs):
        self.init = initializers.get('normal')
        self.input_spec = [InputSpec(ndim=3)]
        self.attention_size = 50
        super(AttLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape)==3
        self.W = tf.Variable(tf.random_normal([input_shape[-1], self.attention_size], stddev=0.1))
        self.B = tf.Variable(tf.random_normal([self.attention_size], stddev=0.1))
        self.U = tf.Variable(tf.random_normal([self.attention_size], stddev=0.1))
        self.trainable_weights = [self.W,self.B,self.U]
        super(AttLayer, self).build(input_shape)  # be sure you call this somewhere!

    def call(self, x, mask=None):
        #  the shape of `v` is (B,T,D)*(D,A)=(B,T,A), where A=attention_size
        v = tf.tanh(tf.tensordot(x, self.W, axes=1) + self.B)
        vu = tf.tensordot(v, self.U, axes=1)  # (B,T) shape
        alphas = tf.nn.softmax(vu)         # (B,T) shape
        output = tf.reduce_sum(x * tf.expand_dims(alphas, -1), 1)
        
        return output

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

def AttentionModel():
    sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sentence_input)
    l_lstm = Bidirectional(CuDNNLSTM(50, return_sequences=True))(embedded_sequences)
    # returns 500 vectors and applies a dense layer of 200 to each
    # l_dense = TimeDistributed(Dense(100))(l_lstm)
    l_att = AttLayer()(l_lstm)
    l_dense = Dense(50)(l_att)
    l_dense = BatchNormalization()(l_dense)
    l_dense = Activation('sigmoid')(l_dense)
    l_dense = Dropout(0.1)(l_dense)
    preds = Dense(6, activation='sigmoid')(l_dense)
    model = Model(sentence_input, preds)

    adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0)
    sgd = SGD(lr=0.1, decay=0, momentum=0.9, nesterov=True)
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    return model

print("model fitting - Hierachical attention network")
model = AttentionModel()
model.summary()

model fitting - Hierachical attention network
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        (None, 512)               0         
_________________________________________________________________
embedding_15 (Embedding)     (None, 512, 300)          57888300  
_________________________________________________________________
bidirectional_11 (Bidirectio (None, 512, 100)          140800    
_________________________________________________________________
att_layer_11 (AttLayer)      (None, 100)               5100      
_________________________________________________________________
dense_21 (Dense)             (None, 50)                5050      
_________________________________________________________________
batch_normalization_11 (Batc (None, 50)                200       
_________________________________________________________________
activation_11 (Activation)   (

In [50]:
# WIP
from sklearn.model_selection import KFold,StratifiedKFold
folds = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=16).split(X_t, y))

valid_pred = np.zeros(X_t.shape[0])

for j, (train_idx, valid_idx) in enumerate(folds):
    X_train_cv = X_t[train_idx]
    y_train_cv = y[train_idx]
    X_holdout = X_t[valid_idx]
    Y_holdout= y[valid_idx]
    model = AttentionModel()
    

In [65]:
# # class_weight = {}
# y = train[list_classes].values
# class_weight = []
# for i in range(6):
#     weight = compute_class_weight('balanced',np.unique(y[:,i]),y[:,i])
# #     class_weight[i] = {0:weight[0],1:weight[1]}
#     class_weight.append({0:weight[0],1:weight[1]})
# y_sample = compute_sample_weight('balanced',y)
y_sample = np.max(y,axis=1)+1

In [67]:
model.fit(X_t, y, batch_size=64, epochs=2,validation_split=0.1,sample_weight=y_sample);

Train on 143613 samples, validate on 15958 samples
Epoch 1/2


ResourceExhaustedError: OOM when allocating tensor with shape[192961,300]
	 [[Node: training_5/Adam/mul_3 = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](Adam_10/beta_2/read, training_5/Adam/Variable_16/read)]]
	 [[Node: loss_8/mul/_2305 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_2408_loss_8/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

Caused by op 'training_5/Adam/mul_3', defined at:
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2698, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2808, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-66-1f3c98a04575>", line 1, in <module>
    model.fit(X_t, y, batch_size=32, epochs=2,validation_split=0.1,sample_weight=y_sample);
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/training.py", line 1646, in fit
    self._make_train_function()
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/engine/training.py", line 970, in _make_train_function
    loss=self.total_loss)
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/legacy/interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/keras/optimizers.py", line 456, in get_updates
    v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/ops/variables.py", line 754, in _run_op
    return getattr(ops.Tensor, operator)(a._AsTensor(), *args)
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py", line 894, in binary_op_wrapper
    return func(x, y, name=name)
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/ops/math_ops.py", line 1117, in _mul_dispatch
    return gen_math_ops._mul(x, y, name=name)
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/ops/gen_math_ops.py", line 2726, in _mul
    "Mul", x=x, y=y, name=name)
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 2956, in create_op
    op_def=op_def)
  File "/home/chenjennhaur/anaconda3/envs/tensorflow/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1470, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[192961,300]
	 [[Node: training_5/Adam/mul_3 = Mul[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:GPU:0"](Adam_10/beta_2/read, training_5/Adam/Variable_16/read)]]
	 [[Node: loss_8/mul/_2305 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_2408_loss_8/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]


In [32]:
# WIP
from sklearn.metrics import roc_auc_score

In [17]:
model.save('lstm_word_attention.model')

And finally, get predictions for the test set and prepare a submission CSV:

In [31]:
# Option 3

y_test = model.predict([X_te], batch_size=32, verbose=1)
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission[list_classes] = y_test
sample_submission.to_csv('lstm_attention_g300_v3_256_ws.csv', index=False)



In [4]:
# sample_submission.to_csv('base_test.csv',index=False)

In [19]:
# test_submission = pd.read_csv('data/sample_submission.csv')
# len(test_submission)

In [None]:
# Baseline Score
# loss: 0.0417 - acc: 0.9840 - val_loss: 0.0451 - val_acc: 0.9829 --> AUC : 0.9787

