In [2]:
# --- Sarcasm Detection Model Training Script ---
# This script loads the dataset, preprocesses the data, trains the LSTM model,
# and then saves the trained model and the tokenizer to files.
# Run this script once to prepare the assets for the Streamlit app.

import json
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split

# --- Step 1: Load the Dataset ---
file_path = 'Sarcasm_Headlines_Dataset.json'
data = []
with open(file_path, 'r') as f:
    for line in f:
        data.append(json.loads(line))
df = pd.DataFrame(data)

# --- Step 2: Data Preprocessing ---
# Hyperparameters
vocab_size = 10000
embedding_dim = 16
max_length = 40
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"

# Prepare sentences and labels
sentences = df['headline'].tolist()
labels = np.array(df['is_sarcastic'].tolist())

# Split data to ensure the tokenizer is only fit on training data
train_sentences, _, _, _ = train_test_split(sentences, labels, test_size=0.2, random_state=42)

# Tokenize the training text
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)

# Convert all sentences to sequences and pad them
sequences = tokenizer.texts_to_sequences(sentences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Split the processed data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels, test_size=0.2, random_state=42
)

# --- Step 3: Build the LSTM Model ---
print("\nBuilding the model...")
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    LSTM(64, return_sequences=True),
    LSTM(32),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# --- Step 4: Train the Model ---
print("\nTraining the model...")
num_epochs = 10
history = model.fit(
    X_train,
    y_train,
    epochs=num_epochs,
    validation_data=(X_test, y_test),
    verbose=2
)

# --- Step 5: Save the Model and Tokenizer ---
print("\nSaving the model and tokenizer...")

# Save the trained model
model.save("sarcasm_model.h5")

# Save the tokenizer
tokenizer_json = tokenizer.to_json()
with open("tokenizer.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))

print("\nTraining complete. 'sarcasm_model.h5' and 'tokenizer.json' have been saved.")



Building the model...



Training the model...
Epoch 1/10
668/668 - 26s - 39ms/step - accuracy: 0.7361 - loss: 0.5028 - val_accuracy: 0.8499 - val_loss: 0.3740
Epoch 2/10
668/668 - 21s - 32ms/step - accuracy: 0.8943 - loss: 0.2764 - val_accuracy: 0.8478 - val_loss: 0.3636
Epoch 3/10
668/668 - 41s - 61ms/step - accuracy: 0.9270 - loss: 0.2018 - val_accuracy: 0.8560 - val_loss: 0.3776
Epoch 4/10
668/668 - 22s - 33ms/step - accuracy: 0.9429 - loss: 0.1629 - val_accuracy: 0.8480 - val_loss: 0.4420
Epoch 5/10
668/668 - 23s - 35ms/step - accuracy: 0.9535 - loss: 0.1359 - val_accuracy: 0.8441 - val_loss: 0.5829
Epoch 6/10
668/668 - 22s - 33ms/step - accuracy: 0.9618 - loss: 0.1168 - val_accuracy: 0.8401 - val_loss: 0.5371
Epoch 7/10
668/668 - 22s - 33ms/step - accuracy: 0.9676 - loss: 0.1027 - val_accuracy: 0.8396 - val_loss: 0.4192
Epoch 8/10
668/668 - 23s - 35ms/step - accuracy: 0.9722 - loss: 0.0900 - val_accuracy: 0.8340 - val_loss: 0.6193
Epoch 9/10
668/668 - 23s - 34ms/step - accuracy: 0.9787 - loss: 0.0743 - 




Saving the model and tokenizer...

Training complete. 'sarcasm_model.h5' and 'tokenizer.json' have been saved.
