<a href="https://colab.research.google.com/github/dipanshuverma98/Sarcasm_Model/blob/main/SarcasmDetectionModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Sarcasm
import json
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense
from tensorflow.keras.callbacks import ModelCheckpoint

data = []
with open("Sarcasm_Headlines_Dataset.json", "r") as file:
    for line in file:
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError as e:
            print(f"Skipping invalid line: {line.strip()} due to error: {e}")

df = pd.DataFrame(data)

print(df.head())

                                        article_link  \
0  https://www.huffingtonpost.com/entry/versace-b...   
1  https://www.huffingtonpost.com/entry/roseanne-...   
2  https://local.theonion.com/mom-starting-to-fea...   
3  https://politics.theonion.com/boehner-just-wan...   
4  https://www.huffingtonpost.com/entry/jk-rowlin...   

                                            headline  is_sarcastic  
0  former versace store clerk sues over secret 'b...             0  
1  the 'roseanne' revival catches up to our thorn...             0  
2  mom starting to fear son's web series closest ...             1  
3  boehner just wants wife to listen, not come up...             1  
4  j.k. rowling wishes snape happy birthday in th...             0  


In [2]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/590.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/590.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m337.9/590.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


In [3]:
import re
import emoji

def preprocess_text(text):
    text = text.lower()

    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = emoji.demojize(text, delimiters=(" ", " "))
    text = re.sub(r"[^a-zA-Z0-9#@' ]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()

    return text


In [4]:
df["text"] = df["headline"].apply(preprocess_text)

texts = df["text"].values
labels = df["is_sarcastic"].values


In [5]:
MAX_VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 50
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post")

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)


In [7]:
# BiLSTM
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
model = Sequential([
    Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=128, input_length=MAX_SEQUENCE_LENGTH),

    Bidirectional(LSTM(32, return_sequences=True, dropout=0.5, recurrent_dropout=0.5)),
    Bidirectional(LSTM(16, dropout=0.5, recurrent_dropout=0.5)),

    Dense(32, activation="relu"),
    Dropout(0.5),
    Dense(1, activation="sigmoid")
])


model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

checkpoint = ModelCheckpoint("best_bilstm_model.h5", save_best_only=True, monitor="val_accuracy", mode="max")


history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=10, batch_size=32,
    callbacks=[checkpoint]
)

Epoch 1/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 192ms/step - accuracy: 0.6646 - loss: 0.5742



[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m153s[0m 204ms/step - accuracy: 0.6648 - loss: 0.5740 - val_accuracy: 0.8534 - val_loss: 0.3342
Epoch 2/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 192ms/step - accuracy: 0.9035 - loss: 0.2511



[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 201ms/step - accuracy: 0.9035 - loss: 0.2511 - val_accuracy: 0.8688 - val_loss: 0.3285
Epoch 3/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 204ms/step - accuracy: 0.9378 - loss: 0.1752 - val_accuracy: 0.8596 - val_loss: 0.3547
Epoch 4/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 198ms/step - accuracy: 0.9579 - loss: 0.1239 - val_accuracy: 0.8602 - val_loss: 0.4109
Epoch 5/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 199ms/step - accuracy: 0.9663 - loss: 0.0952 - val_accuracy: 0.8545 - val_loss: 0.5182
Epoch 6/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 199ms/step - accuracy: 0.9756 - loss: 0.0741 - val_accuracy: 0.8504 - val_loss: 0.5455
Epoch 7/10
[1m668/668[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 211ms/step - accuracy: 0.9814 - loss: 0.0572 - val_accuracy: 0.8495 - val_loss: 0.6514
Epoch 8/10
[1m

In [9]:

best_model = load_model("best_bilstm_model.h5")

loss, accuracy = best_model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

def predict_sarcasm(text):
    seq = tokenizer.texts_to_sequences([preprocess_text(text)])
    padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post")
    pred = best_model.predict(padded)[0][0]
    return "Sarcasm" if pred > 0.5 else "Not Sarcasm"

print(predict_sarcasm("Oh wow, that’s EXACTLY what I wanted!"))




[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 30ms/step - accuracy: 0.8682 - loss: 0.3211
Test Accuracy: 0.8688
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Not Sarcasm
