In [None]:
import tensorflow as tf
import keras
import transformers
from transformers import DistilBertTokenizerFast
import matplotlib.pyplot as plt
from statistics import mean
import os
import csv
import random
import operator
from nltk.corpus import stopwords
import nltk
import matplotlib.pyplot as plt
import re
import warnings
import numpy as np
warnings.filterwarnings('ignore')

# Reading Sarcasm Data

In [None]:
import json

# Kept from the original cell; no visible cell reads these two lists.
comments = []
labels = []

# Decode explicitly as UTF-8 (the encoding JSON text is expected to use) instead of
# relying on the platform's locale default, which differs between systems.
with open(os.path.join(os.getcwd(), 'data/sarcasm.json'), encoding='utf-8') as file:
    data = json.load(file)

# One (label, headline) pair per record; the label comes from 'is_sarcastic'.
comments_labels = [(row['is_sarcastic'], row['headline']) for row in data]

In [None]:
# Show the first five (label, headline) pairs as a quick check of the load.
comments_labels[:5]

# Term Frequency

In [None]:
def term_frequency(comments_labels):
    """Count how often each non-stop-word occurs across all sentences.

    Args:
        comments_labels: iterable of ``(label, sentence)`` pairs; only the
            sentence part is used.

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count.
    """
    nltk.download("stopwords", quiet=True)
    # A set gives constant-time membership checks; the NLTK stop-word list
    # would otherwise be scanned linearly for every token.
    ignored = set(stopwords.words("english"))
    # The literal token "nan" is excluded from the counts as well.
    ignored.add("nan")

    tf_dict = {}
    for _label, sentence in comments_labels:
        # Lower-case, then turn every character that is not an ASCII letter
        # into a space so punctuation and digits act as separators.
        cleaned = re.sub(r"[^a-zA-Z]", " ", sentence.lower())
        # split() without an argument never yields empty strings, so runs of
        # spaces need no extra filtering.
        for word in cleaned.split():
            if word not in ignored:
                tf_dict[word] = tf_dict.get(word, 0) + 1
    return sorted(tf_dict.items(), key=operator.itemgetter(1), reverse=True)

In [None]:
term_frequency = term_frequency(comments_labels)

In [None]:
plt.bar([x for x,y in term_frequency[:10]], [y for x,y in term_frequency[:10]])

# Text Cleaning and Preprocessing

In [None]:
# Compute the 80/20 boundary once so the two slices are contiguous: every
# sample lands in exactly one of them and none is skipped at the seam.
split_index = int((len(comments_labels) + 1) * .80)
train_data = comments_labels[:split_index]
validation_data = comments_labels[split_index:]

In [None]:
def preprocessing(comments_labels):
    """Filter and normalise ``(label, text)`` pairs.

    A pair is kept only when its label is not ``'nan'``/empty (compared as a
    string) and its text has at least five space-separated pieces. Kept text
    is lower-cased and every character outside ``a-z``, ``A-Z``, ``0-9`` is
    replaced by a space.

    Args:
        comments_labels: iterable of ``(label, text)`` pairs.

    Returns:
        A tuple ``(texts, labels)``: the cleaned strings and the matching
        labels converted with ``int``.
    """
    non_alphanumeric = re.compile(r"[^a-zA-Z0-9]")
    texts, labels = [], []
    for label, text in comments_labels:
        # Skip rows whose label is missing.
        if str(label) in ('nan', ''):
            continue
        # Skip very short texts (fewer than five pieces when split on a single space).
        if len(text.split(" ")) < 5:
            continue
        texts.append(non_alphanumeric.sub(" ", text.lower()))
        labels.append(int(label))
    return texts, labels

In [None]:
# Clean both splits with the same routine; each call returns (texts, integer labels).
train_text, train_labels = preprocessing(train_data)
validation_text, validation_labels = preprocessing(validation_data)

# Fine-tuning DistilBERT

In [None]:
# Tokenise both splits with the pretrained DistilBERT tokenizer using the same settings.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
encode_options = dict(truncation=True, padding=True, max_length=50)
train_encodings = tokenizer(train_text, **encode_options)
validation_encodings = tokenizer(validation_text, **encode_options)

In [None]:
# Pair each split's tokenizer output (as a plain dict of features) with its labels.
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((dict(validation_encodings), validation_labels))

In [None]:
from transformers import TFDistilBertForSequenceClassification

# Two output labels: not sarcastic (0) and sarcastic (1).
optimizer = tf.keras.optimizers.RMSprop(learning_rate=5e-5)
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])

# The datasets are batched explicitly, so `batch_size` is not passed to fit():
# Keras documents that it must not be specified for dataset inputs.
# Only the training data is shuffled; evaluation order does not affect the metrics.
model.fit(train_dataset.shuffle(1000).batch(16),
          epochs=3,
          validation_data=val_dataset.batch(16))

In [None]:
# Persist the fine-tuned model under ./model relative to the current working directory.
# NOTE(review): save_pretrained() takes a directory, so despite the ".h5" suffix this
# path names a folder rather than a single weights file — confirm the name is intended.
model.save_pretrained(os.path.join(os.getcwd(),"model/sarcasm_bert_model.h5"))

# Custom Model with Convolutions

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Hyper-parameters for the custom convolutional model.
vocab_size = 1000        # passed as the Keras Tokenizer's num_words and the Embedding input size
embedding_dim = 16       # size of each embedding vector
max_length = 120         # padded/truncated sequence length, also the Embedding input_length
trunc_type='post'        # `truncating` argument of pad_sequences
padding_type='post'      # `padding` argument of pad_sequences
oov_tok = "<OOV>"        # token the Keras Tokenizer uses for out-of-vocabulary words
training_size = 20000    # NOTE(review): not referenced by any visible cell — confirm it is still needed

In [None]:
# Use a distinct name for the Keras tokenizer: rebinding `tokenizer` here would
# replace the DistilBERT tokenizer that the prediction cell calls `.encode()` on,
# and the Keras Tokenizer has no such method.
keras_tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
# The vocabulary is fitted on the training text only.
keras_tokenizer.fit_on_texts(train_text)

word_index = keras_tokenizer.word_index

# Convert each split to integer sequences, then pad/truncate to `max_length`.
training_sequences = keras_tokenizer.texts_to_sequences(train_text)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

testing_sequences = keras_tokenizer.texts_to_sequences(validation_text)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
# Small 1-D convolutional classifier, assembled layer by layer.
model_custom = tf.keras.Sequential()
# Token ids -> embedding vectors.
model_custom.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
# 128 filters over windows of 5 tokens, then the maximum response of each filter.
model_custom.add(tf.keras.layers.Conv1D(128, 5, activation='relu'))
model_custom.add(tf.keras.layers.GlobalMaxPooling1D())
model_custom.add(tf.keras.layers.Dense(24, activation='relu'))
# Single sigmoid unit for the binary sarcastic / not-sarcastic output.
model_custom.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [None]:
# Binary cross-entropy matches the model's single sigmoid output.
model_custom.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model_custom.summary()

num_epochs = 10

# preprocessing() returns the labels as Python lists; convert inputs and labels to NumPy arrays.
training_padded, training_labels = np.array(training_padded), np.array(train_labels)
testing_padded, testing_labels = np.array(testing_padded), np.array(validation_labels)

model_custom.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=1,
)

In [None]:
test_sentence = "With their homes in ashes, residents share harrowing tales of survival after massive wildfires kill 15"
test_sentence_sarcasm = "So Im guessing you didn't get the part or Italy called and said it was hungry?"

# The sarcastic example is classified here; pass `test_sentence` instead to
# try the non-sarcastic one.
# NOTE: `tokenizer` must be the DistilBERT tokenizer at this point, since `.encode()` is called on it.
predict_input = tokenizer.encode(test_sentence_sarcasm, truncation=True, padding=True, return_tensors="tf")

# Element 0 of the prediction result is what the next cells pass to softmax.
tf_output = model.predict(predict_input)[0]

In [None]:
# Display the encoded input tensor produced by the tokenizer.
predict_input

In [None]:
# Turn the model's scores into probabilities for the single input sentence.
tf_prediction = tf.nn.softmax(tf_output, axis=1).numpy()[0]
# Index 1 corresponds to label 1 (sarcastic), index 0 to label 0.
not_sarcastic_prob, sarcastic_prob = tf_prediction
print("Sentence is Sarcasm" if sarcastic_prob > not_sarcastic_prob else "Sentence is not Sarcasm")