In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Flatten, Embedding
from keras.optimizers import RMSprop

Using TensorFlow backend.


In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [4]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [6]:
import spacy
import numpy as np
nlp = spacy.load('en_core_web_lg')

In [7]:
word2vec_embedding_len = len(nlp.vocab['apple'].vector)
print("EMBEDDINGS_LEN=", word2vec_embedding_len )  # 300

EMBEDDINGS_LEN= 300


In [8]:
df = pd.read_csv(os.path.join(os.getcwd(),'sarcasm_headline_dataset.csv'))
df.head()

Unnamed: 0,HEADLINE,IS_SARCASTIC
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [9]:
data = df['HEADLINE'].values.tolist()
labels = df['IS_SARCASTIC'].values.tolist()

tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)

In [10]:
word_index = tokenizer.word_index
max_words_feature_space = len(word_index) + 1
max_seq_len = max([len(headline) for headline in data])

In [11]:
X = pad_sequences(sequences, maxlen=max_seq_len)
y = np.asarray(labels)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20)

In [14]:
my_embedding_matrix = np.zeros((max_words_feature_space, word2vec_embedding_len))
for word, idx in word_index.items():
    try:
        embedding = nlp.vocab[word].vector
        my_embedding_matrix[idx] = embedding
    except:
        pass

In [34]:
model1_filename = os.path.join(os.getcwd(),'models','sarcasm','word2vec_model2_1.h5')

In [21]:
from keras.callbacks import ReduceLROnPlateau,ModelCheckpoint,EarlyStopping

plateau_callback =ReduceLROnPlateau(monitor='val_loss',factor=0.1,patience=2)
modelcheckpoint_callback = ModelCheckpoint(filepath=model1_filename,monitor='val_loss',save_best_only=True)
earlystop_callback = EarlyStopping(monitor='val_loss',patience=3)

In [22]:
model1 = Sequential()
model1.add(Embedding(max_words_feature_space,
                     word2vec_embedding_len,
                     weights=[my_embedding_matrix],
                     input_length=max_seq_len,
                     trainable=False))
model1.add(LSTM(300,return_sequences=False))
model1.add(Dense(units=1, activation='sigmoid'))

model1.compile(loss='binary_crossentropy', optimizer=RMSprop(lr=0.001), metrics=['accuracy'])
print(model1.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 254, 300)          8897100   
_________________________________________________________________
lstm_2 (LSTM)                (None, 300)               721200    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 301       
Total params: 9,618,601
Trainable params: 721,501
Non-trainable params: 8,897,100
_________________________________________________________________
None


In [23]:
history1 = model1.fit(X_train, y_train, 
                      epochs=30, batch_size=256,
                      verbose=1,callbacks=[plateau_callback,modelcheckpoint_callback,earlystop_callback],
                      validation_data=(X_val, y_val)
                     )

Train on 17093 samples, validate on 4274 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30


In [25]:
model1.evaluate(X_test, y_test)



[0.32871628757140825, 0.8732684388092866]

In [26]:
model1_5 = Sequential()
model1_5.add(Embedding(max_words_feature_space,
                     word2vec_embedding_len,
                     weights=[my_embedding_matrix],
                     input_length=max_seq_len,
                     trainable=False))
model1_5.add(LSTM(300,return_sequences=False))
model1_5.add(Dense(units=150, activation='relu'))
model1_5.add(Dense(units=1, activation='sigmoid'))

model1_5.compile(loss='binary_crossentropy', optimizer=RMSprop(lr=0.001), metrics=['accuracy'])
print(model1_5.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 254, 300)          8897100   
_________________________________________________________________
lstm_3 (LSTM)                (None, 300)               721200    
_________________________________________________________________
dense_3 (Dense)              (None, 150)               45150     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 151       
Total params: 9,663,601
Trainable params: 766,501
Non-trainable params: 8,897,100
_________________________________________________________________
None


In [28]:
history1_5 = model1_5.fit(X_train, y_train, 
                      epochs=30, batch_size=256,
                      verbose=1,callbacks=[plateau_callback,modelcheckpoint_callback,earlystop_callback],
                      validation_data=(X_val, y_val)
                     )

Train on 17093 samples, validate on 4274 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30


In [29]:
model1_5.evaluate(X_test, y_test)



[0.368838570523869, 0.8685885434604146]

In [31]:
model2 = Sequential()
model2.add(Embedding(max_words_feature_space,
                     word2vec_embedding_len,
                     weights=[my_embedding_matrix],
                     input_length=max_seq_len,
                     trainable=False))
model2.add(LSTM(300,return_sequences=True))
model2.add(LSTM(300,return_sequences=False))
model2.add(Dense(units=1, activation='sigmoid'))

model2.compile(loss='binary_crossentropy', optimizer=RMSprop(lr=0.001), metrics=['accuracy'])
print(model2.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 254, 300)          8897100   
_________________________________________________________________
lstm_4 (LSTM)                (None, 254, 300)          721200    
_________________________________________________________________
lstm_5 (LSTM)                (None, 300)               721200    
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 301       
Total params: 10,339,801
Trainable params: 1,442,701
Non-trainable params: 8,897,100
_________________________________________________________________
None


In [32]:
history2 = model2.fit(X_train, y_train, 
                      epochs=30, batch_size=512,
                      verbose=1,callbacks=[plateau_callback,modelcheckpoint_callback,earlystop_callback],
                      validation_data=(X_val, y_val)
                     )

Train on 17093 samples, validate on 4274 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30


In [33]:
from keras.layers import Dropout

In [35]:
model2_1 = Sequential()
model2_1.add(Embedding(max_words_feature_space,
                     word2vec_embedding_len,
                     weights=[my_embedding_matrix],
                     input_length=max_seq_len,
                     trainable=False))
model2_1.add(LSTM(300,return_sequences=True))
model2_1.add(Dropout(0.2))
model2_1.add(LSTM(300,return_sequences=False))
model2_1.add(Dense(units=1, activation='sigmoid'))

model2_1.compile(loss='binary_crossentropy', optimizer=RMSprop(lr=0.001), metrics=['accuracy'])
print(model2_1.summary())

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 254, 300)          8897100   
_________________________________________________________________
lstm_6 (LSTM)                (None, 254, 300)          721200    
_________________________________________________________________
dropout_1 (Dropout)          (None, 254, 300)          0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 300)               721200    
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 301       
Total params: 10,339,801
Trainable params: 1,442,701
Non-trainable params: 8,897,100
_________________________________________________________________
None


In [36]:
history2_1 = model2_1.fit(X_train, y_train, 
                      epochs=30, batch_size=512,
                      verbose=1,callbacks=[plateau_callback,modelcheckpoint_callback,earlystop_callback],
                      validation_data=(X_val, y_val)
                     )

Train on 17093 samples, validate on 4274 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30


In [37]:
model2_1.evaluate(X_test, y_test)



[0.35054774643753583, 0.8715836765479613]