In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, Embedding, Dense, LSTM, Flatten, TimeDistributed

In [None]:
# Standard-library access to the process environment.
import os

# Set the TF_FORCE_GPU_ALLOW_GROWTH flag to "true" for this process.
# NOTE(review): TensorFlow's GPU guide describes this flag as enabling
# on-demand GPU memory allocation; confirm it is set before the GPU is
# first initialised, since TensorFlow was already imported in the cell above.
os.environ.update(TF_FORCE_GPU_ALLOW_GROWTH="true")

In [None]:
# `random` here is TensorFlow's random module, not the standard-library one.
from tensorflow import random

# Fix TensorFlow's global seed so repeated runs start from the same state.
tf_seed = 42
random.set_seed(tf_seed)

In [None]:
# Training data, read from a machine-specific absolute path.
# NOTE(review): this path only resolves on the original author's machine.
train_csv_path = 'C:/Users/Donghyeok/Desktop/Work Folder/Reddit Sarcasm Detection/train-balanced-sarcasm.csv'
train = pd.read_csv(train_csv_path)

In [None]:
# Separate test file, read from a machine-specific absolute path.
# NOTE(review): `test` is not referenced again in the cells visible here;
# evaluation further down uses a hold-out split of the training data instead.
test_csv_path = 'C:/Users/Donghyeok/Desktop/Work Folder/Reddit Sarcasm Detection/test-balanced.csv'
test = pd.read_csv(test_csv_path)

In [None]:
# Discard rows whose 'comment' value is missing; the text-cleaning functions
# below call str methods on every comment.
required_columns = ['comment']
train = train.dropna(subset=required_columns)

In [None]:
def remove_punc(comment):
    '''Return `comment` with every character in `string.punctuation` deleted.

    Punctuation is removed outright (not replaced by a space), so words that
    were separated only by punctuation end up joined together.
    '''
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return comment.translate(deletion_table)
# Strip punctuation from every comment and write the result back in place.
comment_no_punc = train['comment'].apply(remove_punc)
train['comment'] = comment_no_punc
train.head()

In [None]:
# load stopwords from NLTK
try:
    stopwords_eng = stopwords.words('english')
except LookupError:
    # The stopwords corpus is not installed on this machine. Fetch it once and
    # retry, so a fresh environment can still run the notebook top to bottom.
    nltk.download('stopwords')
    stopwords_eng = stopwords.words('english')
# view stop words
np.array(stopwords_eng)

In [None]:
def remove_stopwords_and_lowercase(comment):
    '''Lowercase each whitespace-separated word and drop those in `stopwords_eng`.

    Returns the surviving lowercase words joined by single spaces.

    NOTE(review): stopwords are matched verbatim. If punctuation has already
    been stripped from `comment`, stopword entries that contain an apostrophe
    (NLTK's English list is believed to include forms such as "don't") can no
    longer match. Confirm whether that is intended.
    '''
    kept_words = []
    for raw_word in comment.split():
        lowered = raw_word.lower()
        if lowered not in stopwords_eng:
            kept_words.append(lowered)
    return ' '.join(kept_words)
# Lowercase every comment and remove its stopwords, writing the result back in place.
comment_no_stopwords = train['comment'].apply(remove_stopwords_and_lowercase)
train['comment'] = comment_no_stopwords
train.head()

In [None]:
# Read the saved index values from disk and flatten them to a 1-D array.
# They are used below as row labels into `train`.
sample_index_frame = pd.read_csv('sample_index.csv')
sample_df_idx = sample_index_frame.values.ravel()

In [None]:
# Select the sampled rows by index label: cleaned text as input, 'label' as target.
comments = train.loc[sample_df_idx, 'comment']
labels = train.loc[sample_df_idx, 'label']

In [None]:
# Hyper-parameters shared by the tokenizer, padding and the Embedding layer below.
embedding_dim = 32   # passed to Embedding as the output dimension
vocab_size = 8000    # passed to Tokenizer(num_words=...) and to Embedding
length = 100         # every padded sequence has exactly this many positions

# Build the word -> integer mapping from the sampled comments.
# NOTE(review): the tokenizer is fitted on all sampled comments, i.e. before
# the hold-out split made further down, so the held-out rows also contribute
# to the vocabulary. Confirm that is acceptable.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=vocab_size)
tokenizer.fit_on_texts(comments)
word_index = tokenizer.word_index

# Encode each comment as a list of token ids, then pad/truncate at the end
# ('post') so every row has `length` entries.
train_seq = tokenizer.texts_to_sequences(comments)
train_pad = pad_sequences(
    train_seq,
    maxlen=length,
    padding='post',
    truncating='post',
)

In [None]:
# Convert the model inputs and targets to NumPy arrays.
# `train_seq` is the unpadded output of texts_to_sequences: one token-id list
# per comment, and those lists generally differ in length. NumPy no longer
# infers an object array from ragged nested lists (it raises ValueError on
# recent releases), so request dtype=object explicitly.
train_seq = np.array(train_seq, dtype=object)
train_pad = np.array(train_pad)
labels = np.array(labels)

In [None]:
# Holdout validation set: 20% of the padded sequences are kept aside for the
# final evaluation cells; the fixed random_state makes the split repeatable.
holdout_fraction = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    train_pad,
    labels,
    test_size=holdout_fraction,
    random_state=24,
)


In [None]:
# Binary classifier over padded token-id sequences, built as a layer stack:
#   Embedding -> Bi-LSTM (per-timestep outputs) -> per-timestep Dense
#   -> Flatten -> Dense -> single sigmoid unit.
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=length),
    # 100 LSTM units per direction; forward and backward outputs are concatenated.
    Bidirectional(LSTM(100, return_sequences=True, dropout=0.50), merge_mode='concat'),
    # The same 100-unit ReLU layer is applied at every timestep.
    TimeDistributed(Dense(100, activation='relu')),
    Flatten(),
    Dense(100, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the model built above.
model.summary()

In [None]:
# Stop training once val_loss has failed to improve for 10 consecutive epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss; without it the model that is later saved and evaluated would keep
# the weights from the final epoch, i.e. `patience` epochs past the best one.
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=10,
    restore_best_weights=True,
)

In [None]:
# Weights-only checkpoint written to a fixed path during training.
checkpoint_path = "training_4/cp.ckpt"
# Directory part of the checkpoint path (not used in the cells visible here).
checkpoint_dir = os.path.dirname(checkpoint_path)

checkpoint_options = dict(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1,
)
cp_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_options)

In [None]:
# Upper bound on epochs; the EarlyStopping callback is expected to end training sooner.
num_epochs = 1000
# This variable used to be 95 but was never passed to fit(), which hard-coded
# 100. It now holds the value that was actually in effect and is passed through,
# so editing it really changes the batch size.
batch_size = 100

# Train with 20% of X_train used for validation; `es` and `cp_callback`
# handle early stopping and checkpointing.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=num_epochs,
    batch_size=batch_size,
    verbose=1,
    callbacks=[es, cp_callback],
)


In [None]:
def _plot_history_curves(metric):
    '''Plot the training and validation curves of `metric` recorded in `history`.'''
    val_metric = 'val_' + metric
    plt.plot(history.history[metric])
    plt.plot(history.history[val_metric])
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_metric])
    plt.show()


# One figure per metric: loss first, then accuracy.
_plot_history_curves('loss')
_plot_history_curves('accuracy')

In [None]:
# Persist the trained model under a fixed name relative to the working directory.
saved_model_path = 'lstm_sentence_model'
model.save(saved_model_path)

In [None]:
# Training vs. validation curves, one figure per metric (loss, then accuracy).
for curve_name, val_curve_name in (('loss', 'val_loss'), ('accuracy', 'val_accuracy')):
    plt.plot(history.history[curve_name])
    plt.plot(history.history[val_curve_name])
    plt.xlabel("Epochs")
    plt.ylabel(curve_name)
    plt.legend([curve_name, val_curve_name])
    plt.show()

In [None]:
# Run the trained model on the held-out rows; the final layer is a single
# sigmoid unit, so each prediction is one value per row.
predictions = model.predict(x=X_test)

In [None]:
# Turn the sigmoid outputs into boolean class decisions: True when the score
# is strictly above the threshold.
decision_threshold = 0.5
y_pred = predictions > decision_threshold

In [None]:
# Confusion matrix of the hold-out labels against the thresholded predictions.
matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Unpack the 2x2 matrix row by row: first row is (tn, fp), second row is (fn, tp).
(tn, fp), (fn, tp) = matrix

In [None]:
# Label the confusion matrix (rows = actual class, columns = predicted class)
# and draw it as an annotated heatmap with integer cell counts.
row_labels = ["Actual Negative", "Actual Positive"]
column_labels = ["Predicted Negative", "Predicted Positive"]
df_cm = pd.DataFrame(matrix, index=row_labels, columns=column_labels)

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(df_cm, annot=True, fmt='d', cmap="Blues", ax=ax)

In [None]:
# Text report of the per-class metrics on the hold-out split.
report_text = classification_report(y_test, y_pred)
print(report_text)