In [None]:
import json, re, numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
import tensorflow as tf
import keras.utils
if not hasattr(keras.utils, "unpack_x_y_sample_weight"):
    keras.utils.unpack_x_y_sample_weight = tf.keras.utils.unpack_x_y_sample_weight

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import (
    Input, Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout,
    Bidirectional, LSTM, MultiHeadAttention, LayerNormalization,
    GlobalAveragePooling1D
)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import legacy


In [None]:
# STEP 2: Load & Prepare Dataset
# The dataset is read one JSON object per line. The `with` block guarantees
# the file handle is closed once parsing finishes, and blank lines are skipped
# so a stray empty/trailing line cannot make json.loads raise.
with open("/content/Sarcasm_Headlines_Dataset.json", "r", encoding="utf-8") as dataset_file:
    data = [json.loads(line) for line in dataset_file if line.strip()]

# Keep only the text column and the binary label used by every model below.
df = pd.DataFrame(data)[["headline", "is_sarcastic"]]
print(df['headline'])

0        former versace store clerk sues over secret 'b...
1        the 'roseanne' revival catches up to our thorn...
2        mom starting to fear son's web series closest ...
3        boehner just wants wife to listen, not come up...
4        j.k. rowling wishes snape happy birthday in th...
                               ...                        
26704                 american politics in moral free-fall
26705                              america's best 20 hikes
26706                                reparations and obama
26707    israeli ban targeting boycott supporters raise...
26708                    gourmet gifts for the foodie 2014
Name: headline, Length: 26709, dtype: object


In [None]:
def clean_text(text):
    """Return `text` with every character other than ASCII letters and
    apostrophes replaced by a single space, then lower-cased.

    Replacement is one-for-one, so runs of removed characters become runs of
    spaces and the string length is unchanged by the substitution.
    """
    # Substitute first, lower-case second: the character class is defined on
    # the raw input, so the order matters for non-ASCII letters.
    letters_only = re.sub(r"[^a-zA-Z']", " ", text)
    return letters_only.lower()

# Normalise every headline element-wise with clean_text.
df["headline"] = df["headline"].map(clean_text)

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition the same on every run.
headlines, labels = df["headline"], df["is_sarcastic"]
X_train, X_test, y_train, y_test = train_test_split(
    headlines,
    labels,
    test_size=0.2,
    random_state=42,
)
print(df['headline'])

0        former versace store clerk sues over secret 'b...
1        the 'roseanne' revival catches up to our thorn...
2        mom starting to fear son's web series closest ...
3        boehner just wants wife to listen  not come up...
4        j k  rowling wishes snape happy birthday in th...
                               ...                        
26704                 american politics in moral free fall
26705                              america's best    hikes
26706                                reparations and obama
26707    israeli ban targeting boycott supporters raise...
26708                    gourmet gifts for the foodie     
Name: headline, Length: 26709, dtype: object


In [None]:
# STEP 3: TF-IDF + SVM
# Baseline: unigram + bigram TF-IDF features (vocabulary capped at 10000)
# fed to a linear SVM. The vectorizer is fitted on the training split only
# and then reused to transform the test split.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# LinearSVC.fit returns the fitted estimator, so construction and fitting
# can be chained.
svm_model = LinearSVC().fit(X_train_tfidf, y_train)
svm_pred = svm_model.predict(X_test_tfidf)

svm_report = classification_report(y_test, svm_pred)
print("=== TF-IDF + SVM ===")
print(svm_report)


=== TF-IDF + SVM ===
              precision    recall  f1-score   support

           0       0.85      0.86      0.86      2996
           1       0.82      0.81      0.81      2346

    accuracy                           0.84      5342
   macro avg       0.83      0.83      0.83      5342
weighted avg       0.84      0.84      0.84      5342



In [None]:
# STEP 4: Tokenization for Deep Learning Models
# vocab_size caps the Tokenizer vocabulary and is reused as the Embedding
# input dimension by the models below; maxlen is the fixed sequence length
# every headline is padded (at the end) or cut to.
vocab_size = 15000
maxlen = 25

# The word index is learned from the training split only; words not kept in
# the vocabulary map to the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

train_token_ids = tokenizer.texts_to_sequences(X_train)
test_token_ids = tokenizer.texts_to_sequences(X_test)

X_train_seq = pad_sequences(train_token_ids, padding='post', maxlen=maxlen)
X_test_seq = pad_sequences(test_token_ids, padding='post', maxlen=maxlen)


In [None]:
# STEP 5: CNN Model
# Two stacked 1-D convolutions over learned word embeddings, averaged over
# the sequence axis and classified with a small dense head.
#
# The input shape is declared with an explicit Input layer, the same way the
# MHA-BiLSTM model declares it. This replaces Embedding's `input_length`
# argument, which Keras 3 deprecates (it only emits a warning and is ignored).
cnn_model = Sequential([
    Input(shape=(maxlen,)),
    Embedding(vocab_size, 100),
    Conv1D(128, 3, activation='relu'),
    MaxPooling1D(2),
    Conv1D(64, 3, activation='relu'),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # single sigmoid unit: P(is_sarcastic == 1)
])

cnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# validation_split=0.1 holds out part of the training data for per-epoch
# validation metrics; the test split is never seen during fit.
cnn_model.fit(X_train_seq, y_train, epochs=5, batch_size=128, validation_split=0.1, verbose=1)

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels.
cnn_pred = (cnn_model.predict(X_test_seq) > 0.5).astype(int)
print("=== CNN ===")
print(classification_report(y_test, cnn_pred))


Epoch 1/5




[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 51ms/step - accuracy: 0.6310 - loss: 0.6143 - val_accuracy: 0.8517 - val_loss: 0.3407
Epoch 2/5
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 42ms/step - accuracy: 0.9082 - loss: 0.2383 - val_accuracy: 0.8549 - val_loss: 0.3365
Epoch 3/5
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 52ms/step - accuracy: 0.9633 - loss: 0.1116 - val_accuracy: 0.8563 - val_loss: 0.3909
Epoch 4/5
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 65ms/step - accuracy: 0.9863 - loss: 0.0465 - val_accuracy: 0.8470 - val_loss: 0.5185
Epoch 5/5
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 48ms/step - accuracy: 0.9919 - loss: 0.0284 - val_accuracy: 0.8563 - val_loss: 0.6450
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
=== CNN ===
              precision    recall  f1-score   support

           0       0.86      0.86      0.86      2996
  

In [None]:
# STEP 6: MHA-BiLSTM Model
# Embedding -> bidirectional LSTM (full sequence output) -> dropout, followed
# by multi-head self-attention with a residual connection and layer
# normalisation, then average pooling and a dense classification head.
inputs = Input(shape=(maxlen,))
embedded = Embedding(vocab_size, 100)(inputs)
recurrent = Bidirectional(LSTM(100, return_sequences=True))(embedded)
recurrent = Dropout(0.5)(recurrent)

# Self-attention: the BiLSTM output is used as both query and value.
attn_output = MultiHeadAttention(num_heads=4, key_dim=64)(recurrent, recurrent)
# Residual sum of the attention output and its input, then normalise.
normalized = LayerNormalization()(recurrent + attn_output)
pooled = GlobalAveragePooling1D()(normalized)
hidden = Dense(128, activation='relu')(pooled)
hidden = Dropout(0.3)(hidden)
outputs = Dense(1, activation='sigmoid')(hidden)

mha_model = Model(inputs, outputs)
mha_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
mha_model.fit(
    X_train_seq,
    y_train,
    validation_split=0.1,
    batch_size=128,
    epochs=5,
    verbose=1,
)

# Convert sigmoid probabilities to hard 0/1 labels at the 0.5 threshold.
mha_probs = mha_model.predict(X_test_seq)
mha_pred = (mha_probs > 0.5).astype(int)
print("=== MHA-BiLSTM ===")
print(classification_report(y_test, mha_pred))



Epoch 1/5
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 399ms/step - accuracy: 0.6869 - loss: 0.5548 - val_accuracy: 0.8615 - val_loss: 0.3272
Epoch 2/5
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 390ms/step - accuracy: 0.9183 - loss: 0.2221 - val_accuracy: 0.8372 - val_loss: 0.3741
Epoch 3/5
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 439ms/step - accuracy: 0.9534 - loss: 0.1328 - val_accuracy: 0.8554 - val_loss: 0.5282
Epoch 4/5
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 382ms/step - accuracy: 0.9736 - loss: 0.0824 - val_accuracy: 0.8503 - val_loss: 0.5850
Epoch 5/5
[1m151/151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 390ms/step - accuracy: 0.9794 - loss: 0.0647 - val_accuracy: 0.8526 - val_loss: 0.6648
[1m167/167[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 37ms/step
=== MHA-BiLSTM ===
              precision    recall  f1-score   support

           0       0.84      0

In [None]:
# STEP 7: BERT Fine-Tuning
# Load the pretrained tokenizer for the same 'bert-base-uncased' checkpoint
# that the classifier below is initialised from.
bert_tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased"
)

def bert_encode(texts, tokenizer, max_len=40):
    """Tokenize a collection of strings into fixed-length TensorFlow tensors.

    Args:
        texts: iterable of strings; it is materialised into a list before
            being handed to the tokenizer.
        tokenizer: callable applied to the list of texts with the keyword
            options below (here, a Hugging Face tokenizer).
        max_len: length every sequence is padded or truncated to.

    Returns:
        Whatever `tokenizer` returns for the batch, unchanged.
    """
    encode_options = {
        "padding": 'max_length',
        "truncation": True,
        "max_length": max_len,
        "return_tensors": 'tf',
    }
    batch = list(texts)
    return tokenizer(batch, **encode_options)

train_enc = bert_encode(X_train, bert_tokenizer)
test_enc = bert_encode(X_test, bert_tokenizer)

# bert_encode pads every headline to a fixed length, so the attention mask
# must accompany the token ids: without it the model attends to the padding
# positions as if they were real tokens, during both training and inference.
train_inputs = {
    'input_ids': train_enc['input_ids'],
    'attention_mask': train_enc['attention_mask'],
}
test_inputs = {
    'input_ids': test_enc['input_ids'],
    'attention_mask': test_enc['attention_mask'],
}

bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
bert_model.compile(
    optimizer=legacy.Adam(learning_rate=2e-5),
    # The model outputs raw logits for the two classes, hence from_logits=True
    # with integer (sparse) labels.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

bert_model.fit(
    train_inputs, y_train,
    validation_split=0.1,
    epochs=2,
    batch_size=16
)

# Predicted class = index of the larger logit.
bert_preds = tf.argmax(bert_model.predict(test_inputs).logits, axis=1).numpy()
print("=== BERT ===")
print(classification_report(y_test, bert_preds))

In [None]:
# STEP 8: Comparative Analysis
# Score every model's test-set predictions with the same two metrics and
# collect them in one table (one row per model, in the order trained above).
predictions_by_model = {
    "TF-IDF+SVM": svm_pred,
    "CNN": cnn_pred,
    "MHA-BiLSTM": mha_pred,
    "BERT": bert_preds,
}

results = pd.DataFrame({
    "Model": list(predictions_by_model),
    "Accuracy": [accuracy_score(y_test, pred) for pred in predictions_by_model.values()],
    "F1-Score": [f1_score(y_test, pred) for pred in predictions_by_model.values()],
})
print(results)

# Visualization
# Accuracy and F1 are drawn as side-by-side bars for each model. Plotting
# both series at the same x positions would draw the F1 bars on top of the
# accuracy bars and obscure the comparison.
fig, ax = plt.subplots(figsize=(8, 4))
bar_width = 0.38
model_pos = np.arange(len(results))  # one slot per model on the x axis

ax.bar(model_pos - bar_width / 2, results["Accuracy"], bar_width, label='Accuracy', alpha=0.7)
ax.bar(model_pos + bar_width / 2, results["F1-Score"], bar_width, label='F1', alpha=0.7)

# Label each slot with the model name, centred between its two bars.
ax.set_xticks(model_pos)
ax.set_xticklabels(results["Model"])
ax.set_title("Model Performance Comparison on Sarcasm Headlines Dataset")
ax.set_ylabel("Score")
ax.legend()
plt.show()
