<a href="https://colab.research.google.com/github/gitpr4596/Sarcasm-Detection-/blob/main/Sarcasm_DetectionSem_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Logistic Regression**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
import gensim.downloader as api

# Ensure the NLTK tokenizer data is available.
# Recent NLTK releases load the 'punkt_tab' resource inside word_tokenize while
# older ones use 'punkt'; requesting both keeps this cell runnable on a fresh
# kernel whichever NLTK version is installed.
nltk.download('punkt')
nltk.download('punkt_tab')

# Load the dataset
df = pd.read_excel('/content/Sarcasm_Headlines_Dataset.xlsx')
df['headline'] = df['headline'].astype(str)  # Ensure all headlines are strings

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(df['headline'], df['is_sarcastic'], test_size=0.2, random_state=42)

# Search space handed to every GridSearchCV in this cell:
# four regularization strengths, L2 penalty only, liblinear solver.
param_grid = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=['l2'],
    solver=['liblinear'],
)


# ---- TF-IDF Embedding ----
# The vectorizer is fitted on the training headlines only and then applied to
# the test headlines, so no test vocabulary leaks into the features.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Grid search with Logistic Regression for TF-IDF features
# (5-fold cross-validation over param_grid on the training split).
logreg_tfidf = LogisticRegression()
grid_search_tfidf = GridSearchCV(logreg_tfidf, param_grid, cv=5)
grid_search_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = grid_search_tfidf.predict(X_test_tfidf)
print("TF-IDF Best Parameters:", grid_search_tfidf.best_params_)
print("TF-IDF Classification Report:")
print(classification_report(y_test, y_pred_tfidf))

# ---- Word2Vec Embedding ----
# Lower-case and tokenize each headline; these token lists are reused below
# for both the Word2Vec and the GloVe features.
X_train_tokens = X_train.apply(lambda x: word_tokenize(x.lower()))
X_test_tokens = X_test.apply(lambda x: word_tokenize(x.lower()))
# Word2Vec is trained from scratch on the training tokens only (50-dim vectors).
word2vec_model = Word2Vec(sentences=X_train_tokens, vector_size=50, window=5, min_count=1, workers=4)

def average_word2vec(tokens, model):
    """Return the mean Word2Vec vector of the tokens known to ``model``.

    Args:
        tokens: Iterable of token strings for one headline.
        model: Object exposing ``wv`` (membership test and item lookup by
            token) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all in-vocabulary tokens, or a
        zero vector of length ``model.vector_size`` when none are known.
    """
    keyed_vectors = model.wv
    known = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

# One averaged Word2Vec vector per headline, stacked into a 2-D feature array.
X_train_w2v = np.array([average_word2vec(tokens, word2vec_model) for tokens in X_train_tokens])
X_test_w2v = np.array([average_word2vec(tokens, word2vec_model) for tokens in X_test_tokens])

# Grid search with Logistic Regression for Word2Vec features
# (same param_grid and 5-fold CV as the TF-IDF run).
logreg_w2v = LogisticRegression()
grid_search_w2v = GridSearchCV(logreg_w2v, param_grid, cv=5)
grid_search_w2v.fit(X_train_w2v, y_train)
y_pred_w2v = grid_search_w2v.predict(X_test_w2v)
print("Word2Vec Best Parameters:", grid_search_w2v.best_params_)
print("Word2Vec Classification Report:")
print(classification_report(y_test, y_pred_w2v))

# ---- GloVe Embedding ----
# Load the "glove-wiki-gigaword-100" vectors through gensim's downloader API.
glove_model = api.load("glove-wiki-gigaword-100")

def average_glove(tokens, model):
    """Return the mean GloVe vector of the tokens present in ``model``.

    Args:
        tokens: Iterable of token strings for one headline.
        model: Vector store supporting ``token in model``, ``model[token]``
            and a ``vector_size`` attribute.

    Returns:
        The element-wise mean of the matched vectors, or a zero vector of
        length ``model.vector_size`` when no token is found.
    """
    matched = []
    for token in tokens:
        if token in model:
            matched.append(model[token])
    if matched:
        return np.mean(matched, axis=0)
    return np.zeros(model.vector_size)

# One averaged GloVe vector per headline; the token lists are the same ones
# produced for the Word2Vec section.
X_train_glove = np.array([average_glove(tokens, glove_model) for tokens in X_train_tokens])
X_test_glove = np.array([average_glove(tokens, glove_model) for tokens in X_test_tokens])

# Grid search with Logistic Regression for GloVe features
# (same param_grid and 5-fold CV as the other two runs).
logreg_glove = LogisticRegression()
grid_search_glove = GridSearchCV(logreg_glove, param_grid, cv=5)
grid_search_glove.fit(X_train_glove, y_train)
y_pred_glove = grid_search_glove.predict(X_test_glove)
print("GloVe Best Parameters:", grid_search_glove.best_params_)
print("GloVe Classification Report:")
print(classification_report(y_test, y_pred_glove))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


TF-IDF Best Parameters: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
TF-IDF Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.84      0.82      2996
           1       0.79      0.74      0.76      2346

    accuracy                           0.80      5342
   macro avg       0.80      0.79      0.79      5342
weighted avg       0.80      0.80      0.80      5342

Word2Vec Best Parameters: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
Word2Vec Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.77      0.78      2996
           1       0.72      0.73      0.72      2346

    accuracy                           0.76      5342
   macro avg       0.75      0.75      0.75      5342
weighted avg       0.76      0.76      0.76      5342

GloVe Best Parameters: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
GloVe Classification Report:
              precision    recal

**Support Vector Machine**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
import gensim.downloader as api

# Ensure the NLTK tokenizer data is available.
# Recent NLTK releases load 'punkt_tab' inside word_tokenize while older ones
# use 'punkt'; requesting both keeps the cell runnable on a fresh kernel.
nltk.download('punkt')
nltk.download('punkt_tab')

# Load the dataset
df = pd.read_excel('/content/Sarcasm_Headlines_Dataset.xlsx')
df['headline'] = df['headline'].astype(str)  # Ensure all headlines are strings

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(df['headline'], df['is_sarcastic'], test_size=0.2, random_state=42)

# ---- TF-IDF Embedding ----
# Fitted on the training headlines only, then applied to the test headlines.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)  # Limit to 5000 features
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit SVM for TF-IDF features with fixed parameters
# (no hyperparameter search in this cell, unlike the logistic-regression one).
svm_tfidf = SVC(kernel='linear', C=1)  # Use a fixed value for C
svm_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = svm_tfidf.predict(X_test_tfidf)

print("TF-IDF Classification Report:")
print(classification_report(y_test, y_pred_tfidf))

# ---- Word2Vec Embedding ----
# Lower-cased token lists, reused below for both Word2Vec and GloVe features.
X_train_tokens = X_train.apply(lambda x: word_tokenize(x.lower()))
X_test_tokens = X_test.apply(lambda x: word_tokenize(x.lower()))
# Word2Vec is trained from scratch on the training tokens only (100-dim vectors).
word2vec_model = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=1, workers=4)

def average_word2vec(tokens, model):
    """Average the Word2Vec vectors of the in-vocabulary tokens.

    Args:
        tokens: Iterable of token strings for one headline.
        model: Object exposing ``wv`` (membership test and lookup by token)
            and ``vector_size``.

    Returns:
        Mean vector over the known tokens; a zero vector of length
        ``model.vector_size`` if no token is known.
    """
    found = []
    for token in tokens:
        if token in model.wv:
            found.append(model.wv[token])
    if found:
        return np.mean(found, axis=0)
    return np.zeros(model.vector_size)

# One averaged Word2Vec vector per headline, stacked into a 2-D feature array.
X_train_w2v = np.array([average_word2vec(tokens, word2vec_model) for tokens in X_train_tokens])
X_test_w2v = np.array([average_word2vec(tokens, word2vec_model) for tokens in X_test_tokens])

# Fit SVM for Word2Vec features with fixed parameters
svm_w2v = SVC(kernel='linear', C=1)  # Use a fixed value for C
svm_w2v.fit(X_train_w2v, y_train)
y_pred_w2v = svm_w2v.predict(X_test_w2v)

print("Word2Vec Classification Report:")
print(classification_report(y_test, y_pred_w2v))

# ---- GloVe Embedding ----
# Load the "glove-wiki-gigaword-100" vectors through gensim's downloader API.
glove_model = api.load("glove-wiki-gigaword-100")

def average_glove(tokens, model):
    """Average the GloVe vectors of the tokens found in ``model``.

    Args:
        tokens: Iterable of token strings for one headline.
        model: Vector store supporting ``token in model``, ``model[token]``
            and a ``vector_size`` attribute.

    Returns:
        Mean vector over the matched tokens; a zero vector of length
        ``model.vector_size`` if nothing matched.
    """
    hits = [model[token] for token in tokens if token in model]
    if not hits:
        return np.zeros(model.vector_size)
    return np.mean(hits, axis=0)

# One averaged GloVe vector per headline; reuses the token lists built for Word2Vec.
X_train_glove = np.array([average_glove(tokens, glove_model) for tokens in X_train_tokens])
X_test_glove = np.array([average_glove(tokens, glove_model) for tokens in X_test_tokens])

# Fit SVM for GloVe features with fixed parameters
svm_glove = SVC(kernel='linear', C=1)  # Use a fixed value for C
svm_glove.fit(X_train_glove, y_train)
y_pred_glove = svm_glove.predict(X_test_glove)

print("GloVe Classification Report:")
print(classification_report(y_test, y_pred_glove))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


TF-IDF Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.84      0.81      2996
           1       0.77      0.72      0.74      2346

    accuracy                           0.78      5342
   macro avg       0.78      0.78      0.78      5342
weighted avg       0.78      0.78      0.78      5342

Word2Vec Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.73      0.77      2996
           1       0.69      0.77      0.73      2346

    accuracy                           0.75      5342
   macro avg       0.75      0.75      0.75      5342
weighted avg       0.75      0.75      0.75      5342

GloVe Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.82      0.79      2996
           1       0.74      0.68      0.71      2346

    accuracy                           0.75      5342
   macro avg       0.75      0.75    

**Decision Tree**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier  # Import Decision Tree
from sklearn.metrics import classification_report
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
import gensim.downloader as api

# Ensure the NLTK tokenizer data is available.
# Recent NLTK releases load 'punkt_tab' inside word_tokenize while older ones
# use 'punkt'; requesting both keeps the cell runnable on a fresh kernel.
nltk.download('punkt')
nltk.download('punkt_tab')

# Load the dataset
df = pd.read_excel('/content/Sarcasm_Headlines_Dataset.xlsx')
df['headline'] = df['headline'].astype(str)  # Ensure all headlines are strings

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(df['headline'], df['is_sarcastic'], test_size=0.2, random_state=42)

# ---- TF-IDF Embedding ----
# Fitted on the training headlines only, then applied to the test headlines.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)  # Limit to 5000 features
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit Decision Tree for TF-IDF features.
# random_state is pinned (same seed as the train/test split) so the tree, and
# therefore the report below, is reproducible across runs.
dt_tfidf = DecisionTreeClassifier(random_state=42)
dt_tfidf.fit(X_train_tfidf, y_train)  # the sparse TF-IDF matrix is passed as-is
y_pred_tfidf = dt_tfidf.predict(X_test_tfidf)

print("TF-IDF Classification Report:")
print(classification_report(y_test, y_pred_tfidf))

# ---- Word2Vec Embedding ----
# Lower-cased token lists, reused below for both Word2Vec and GloVe features.
X_train_tokens = X_train.apply(lambda x: word_tokenize(x.lower()))
X_test_tokens = X_test.apply(lambda x: word_tokenize(x.lower()))
# Word2Vec is trained from scratch on the training tokens only (100-dim vectors).
word2vec_model = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=1, workers=4)

def average_word2vec(tokens, model):
    """Return the mean Word2Vec vector of the tokens known to ``model``.

    Args:
        tokens: Iterable of token strings for one headline.
        model: Object exposing ``wv`` (membership test and lookup by token)
            and ``vector_size``.

    Returns:
        Element-wise mean of the in-vocabulary token vectors, or a zero vector
        of length ``model.vector_size`` when there are none.
    """
    keyed_vectors = model.wv
    known = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
    if not known:
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

# One averaged Word2Vec vector per headline, stacked into a 2-D feature array.
X_train_w2v = np.array([average_word2vec(tokens, word2vec_model) for tokens in X_train_tokens])
X_test_w2v = np.array([average_word2vec(tokens, word2vec_model) for tokens in X_test_tokens])

# Fit Decision Tree for Word2Vec features.
# random_state is pinned so the tree itself is reproducible for given features.
dt_w2v = DecisionTreeClassifier(random_state=42)
dt_w2v.fit(X_train_w2v, y_train)
y_pred_w2v = dt_w2v.predict(X_test_w2v)

print("Word2Vec Classification Report:")
print(classification_report(y_test, y_pred_w2v))

# ---- GloVe Embedding ----
# Load the "glove-wiki-gigaword-100" vectors through gensim's downloader API.
glove_model = api.load("glove-wiki-gigaword-100")

def average_glove(tokens, model):
    """Return the mean GloVe vector of the tokens present in ``model``.

    Args:
        tokens: Iterable of token strings for one headline.
        model: Vector store supporting ``token in model``, ``model[token]``
            and a ``vector_size`` attribute.

    Returns:
        Element-wise mean of the matched vectors, or a zero vector of length
        ``model.vector_size`` when no token is found.
    """
    matched = []
    for token in tokens:
        if token in model:
            matched.append(model[token])
    if matched:
        return np.mean(matched, axis=0)
    return np.zeros(model.vector_size)

# One averaged GloVe vector per headline; reuses the token lists built for Word2Vec.
X_train_glove = np.array([average_glove(tokens, glove_model) for tokens in X_train_tokens])
X_test_glove = np.array([average_glove(tokens, glove_model) for tokens in X_test_tokens])

# Fit Decision Tree for GloVe features.
# random_state is pinned so the tree, and the report below, is reproducible.
dt_glove = DecisionTreeClassifier(random_state=42)
dt_glove.fit(X_train_glove, y_train)
y_pred_glove = dt_glove.predict(X_test_glove)

print("GloVe Classification Report:")
print(classification_report(y_test, y_pred_glove))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


TF-IDF Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.75      0.75      2996
           1       0.68      0.69      0.69      2346

    accuracy                           0.72      5342
   macro avg       0.72      0.72      0.72      5342
weighted avg       0.72      0.72      0.72      5342

Word2Vec Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.70      0.69      2996
           1       0.61      0.61      0.61      2346

    accuracy                           0.66      5342
   macro avg       0.65      0.65      0.65      5342
weighted avg       0.66      0.66      0.66      5342

GloVe Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.67      0.67      2996
           1       0.57      0.57      0.57      2346

    accuracy                           0.63      5342
   macro avg       0.62      0.62    

**Random Forest**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest
from sklearn.metrics import classification_report
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
import gensim.downloader as api

# Ensure the NLTK tokenizer data is available.
# Recent NLTK releases load 'punkt_tab' inside word_tokenize while older ones
# use 'punkt'; requesting both keeps the cell runnable on a fresh kernel.
nltk.download('punkt')
nltk.download('punkt_tab')

# Load the dataset
df = pd.read_excel('/content/Sarcasm_Headlines_Dataset.xlsx')
df['headline'] = df['headline'].astype(str)  # Ensure all headlines are strings

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(df['headline'], df['is_sarcastic'], test_size=0.2, random_state=42)

# ---- TF-IDF Embedding ----
# Fitted on the training headlines only, then applied to the test headlines.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)  # Limit to 5000 features
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit Random Forest for TF-IDF features
rf_tfidf = RandomForestClassifier(n_estimators=100, random_state=42)  # 100 trees, fixed seed
rf_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = rf_tfidf.predict(X_test_tfidf)

print("TF-IDF Classification Report:")
print(classification_report(y_test, y_pred_tfidf))

# ---- Word2Vec Embedding ----
# Lower-cased token lists, reused below for both Word2Vec and GloVe features.
X_train_tokens = X_train.apply(lambda x: word_tokenize(x.lower()))
X_test_tokens = X_test.apply(lambda x: word_tokenize(x.lower()))
# Word2Vec is trained from scratch on the training tokens only (100-dim vectors).
word2vec_model = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=1, workers=4)

def average_word2vec(tokens, model):
    """Average the Word2Vec vectors of the in-vocabulary tokens.

    Args:
        tokens: Iterable of token strings for one headline.
        model: Object exposing ``wv`` (membership test and lookup by token)
            and ``vector_size``.

    Returns:
        Mean vector over the known tokens; a zero vector of length
        ``model.vector_size`` if no token is known.
    """
    found = []
    for token in tokens:
        if token in model.wv:
            found.append(model.wv[token])
    if found:
        return np.mean(found, axis=0)
    return np.zeros(model.vector_size)

# One averaged Word2Vec vector per headline, stacked into a 2-D feature array.
X_train_w2v = np.array([average_word2vec(tokens, word2vec_model) for tokens in X_train_tokens])
X_test_w2v = np.array([average_word2vec(tokens, word2vec_model) for tokens in X_test_tokens])

# Fit Random Forest for Word2Vec features
rf_w2v = RandomForestClassifier(n_estimators=100, random_state=42)  # 100 trees, fixed seed
rf_w2v.fit(X_train_w2v, y_train)
y_pred_w2v = rf_w2v.predict(X_test_w2v)

print("Word2Vec Classification Report:")
print(classification_report(y_test, y_pred_w2v))

# ---- GloVe Embedding ----
# Load the "glove-wiki-gigaword-100" vectors through gensim's downloader API.
glove_model = api.load("glove-wiki-gigaword-100")

def average_glove(tokens, model):
    """Average the GloVe vectors of the tokens found in ``model``.

    Args:
        tokens: Iterable of token strings for one headline.
        model: Vector store supporting ``token in model``, ``model[token]``
            and a ``vector_size`` attribute.

    Returns:
        Mean vector over the matched tokens; a zero vector of length
        ``model.vector_size`` if nothing matched.
    """
    hits = [model[token] for token in tokens if token in model]
    if not hits:
        return np.zeros(model.vector_size)
    return np.mean(hits, axis=0)

# One averaged GloVe vector per headline; reuses the token lists built for Word2Vec.
X_train_glove = np.array([average_glove(tokens, glove_model) for tokens in X_train_tokens])
X_test_glove = np.array([average_glove(tokens, glove_model) for tokens in X_test_tokens])

# Fit Random Forest for GloVe features
rf_glove = RandomForestClassifier(n_estimators=100, random_state=42)  # 100 trees, fixed seed
rf_glove.fit(X_train_glove, y_train)
y_pred_glove = rf_glove.predict(X_test_glove)

print("GloVe Classification Report:")
print(classification_report(y_test, y_pred_glove))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


TF-IDF Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.82      0.80      2996
           1       0.75      0.71      0.73      2346

    accuracy                           0.77      5342
   macro avg       0.77      0.76      0.77      5342
weighted avg       0.77      0.77      0.77      5342

Word2Vec Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.77      0.77      2996
           1       0.71      0.72      0.71      2346

    accuracy                           0.75      5342
   macro avg       0.74      0.74      0.74      5342
weighted avg       0.75      0.75      0.75      5342

GloVe Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.85      0.79      2996
           1       0.77      0.62      0.69      2346

    accuracy                           0.75      5342
   macro avg       0.76      0.74    

**Gradient Boosting**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from gensim.models import Word2Vec
import gensim.downloader as api

# Download necessary NLTK data.
# 'punkt_tab' is what recent NLTK releases load inside word_tokenize; 'punkt'
# is requested as well so the cell also works with older NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load dataset
data = pd.read_excel("/content/Sarcasm_Headlines_Dataset.xlsx")
# Ensure all headlines are strings: preprocess_text calls .lower() on each one,
# which would fail on any cell that Excel parsed as a number or left empty.
data['headline'] = data['headline'].astype(str)

# Preprocess text data
_STOP_WORDS = None  # English stop-word set, filled on first use by preprocess_text


def preprocess_text(text):
    """Lower-case, tokenize and filter one headline.

    Args:
        text: Headline string.

    Returns:
        List of tokens that are alphanumeric (``str.isalnum``) and are not
        English stop words, in their original order.
    """
    global _STOP_WORDS
    # Build the stop-word set once instead of on every call: this function is
    # applied to every headline, and the set never changes between calls.
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    return [word for word in tokens if word.isalnum() and word not in _STOP_WORDS]

# Tokenize every headline exactly once; the per-split token lists below are
# looked up from this column instead of re-running preprocess_text.
data['tokens'] = data['headline'].apply(preprocess_text)

# Split data into train and test sets
X = data['headline']
y = data['is_sarcastic']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Token lists for the train and test sets. X_train / X_test keep the row labels
# of `data`, so selecting by index yields the same lists that applying
# preprocess_text to each split would produce.
X_train_tokens = data.loc[X_train.index, 'tokens']
X_test_tokens = data.loc[X_test.index, 'tokens']

# TF-IDF method
def tfidf_method(random_state=42):
    """Train and evaluate gradient boosting on TF-IDF features.

    Fits a ``TfidfVectorizer`` on the global ``X_train``, transforms
    ``X_test``, trains a ``GradientBoostingClassifier`` and prints a
    classification report against ``y_test``.

    Args:
        random_state: Seed passed to the classifier so repeated runs give the
            same model. Defaults to 42, the seed used for the data split.
    """
    tfidf = TfidfVectorizer()
    X_train_tfidf = tfidf.fit_transform(X_train)
    X_test_tfidf = tfidf.transform(X_test)

    model = GradientBoostingClassifier(random_state=random_state)
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    print("TF-IDF Classification Report:\n")
    print(classification_report(y_test, y_pred))

# GloVe method
def glove_method(random_state=42):
    """Train and evaluate gradient boosting on averaged GloVe vectors.

    Loads the "glove-wiki-gigaword-100" vectors, represents each headline in
    the global ``X_train_tokens`` / ``X_test_tokens`` by the mean of its known
    token vectors, trains a ``GradientBoostingClassifier`` and prints a
    classification report against ``y_test``.

    Args:
        random_state: Seed passed to the classifier so repeated runs give the
            same model. Defaults to 42, the seed used for the data split.
    """
    # ---- GloVe Embedding ----
    glove_model = api.load("glove-wiki-gigaword-100")

    def average_glove(tokens, model):
        """Mean vector of the tokens found in ``model``; zeros if none are."""
        vectors = [model[word] for word in tokens if word in model]
        return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

    X_train_glove = np.array([average_glove(tokens, glove_model) for tokens in X_train_tokens])
    X_test_glove = np.array([average_glove(tokens, glove_model) for tokens in X_test_tokens])

    model = GradientBoostingClassifier(random_state=random_state)
    model.fit(X_train_glove, y_train)
    y_pred = model.predict(X_test_glove)

    print("GloVe Classification Report:\n")
    print(classification_report(y_test, y_pred))

# Word2Vec method
def word2vec_method(random_state=42):
    """Train and evaluate gradient boosting on averaged Word2Vec vectors.

    Trains a Word2Vec model on the global ``X_train_tokens``, represents each
    headline by the mean of its known token vectors, trains a
    ``GradientBoostingClassifier`` and prints a classification report against
    ``y_test``.

    Args:
        random_state: Seed passed to the classifier. Defaults to 42, the seed
            used for the data split. The Word2Vec training itself is not
            seeded here.
    """
    w2v_model = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=1, workers=4)
    # Take the fallback width from the trained model rather than repeating the
    # literal 100, so the zero vector always matches the real embeddings.
    embedding_dim = w2v_model.vector_size

    def get_embedding(tokens):
        """Mean vector of the in-vocabulary tokens; zeros if there are none."""
        embeddings = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
        return np.mean(embeddings, axis=0) if embeddings else np.zeros(embedding_dim)

    X_train_w2v = np.array([get_embedding(tokens) for tokens in X_train_tokens])
    X_test_w2v = np.array([get_embedding(tokens) for tokens in X_test_tokens])

    model = GradientBoostingClassifier(random_state=random_state)
    model.fit(X_train_w2v, y_train)
    y_pred = model.predict(X_test_w2v)

    print("Word2Vec Classification Report:\n")
    print(classification_report(y_test, y_pred))

# Run the three gradient-boosting experiments in order: TF-IDF, GloVe, Word2Vec.
print("Running Sarcasm Detection with Gradient Boosting:\n")
for run_experiment in (tfidf_method, glove_method, word2vec_method):
    run_experiment()


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Running Sarcasm Detection with Gradient Boosting:

TF-IDF Classification Report:

              precision    recall  f1-score   support

           0       0.84      0.67      0.75      2996
           1       0.67      0.83      0.74      2346

    accuracy                           0.74      5342
   macro avg       0.75      0.75      0.74      5342
weighted avg       0.76      0.74      0.74      5342

GloVe Classification Report:

              precision    recall  f1-score   support

           0       0.73      0.81      0.77      2996
           1       0.72      0.62      0.66      2346

    accuracy                           0.73      5342
   macro avg       0.73      0.71      0.72      5342
weighted avg       0.73      0.73      0.72      5342

Word2Vec Classification Report:

              precision    recall  f1-score   support

           0       0.60      0.90      0.72      2996
           1       0.65      0.25      0.36      2346

    accuracy                         

**RNN**

In [None]:
import numpy as np
import pandas as pd

import gensim.downloader as api
import nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Download necessary NLTK data.
# Recent NLTK releases load 'punkt_tab' inside word_tokenize while older ones
# use 'punkt'; requesting both keeps the cell runnable on a fresh kernel.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load dataset
data = pd.read_excel("/content/Sarcasm_Headlines_Dataset.xlsx")
# Ensure all headlines are strings: preprocess_text calls .lower() on each one,
# which would fail on any cell that Excel parsed as a number or left empty.
data['headline'] = data['headline'].astype(str)

# Preprocess text data
_STOP_WORDS = None  # English stop-word set, filled on first use by preprocess_text


def preprocess_text(text):
    """Lower-case, tokenize and filter one headline.

    Args:
        text: Headline string.

    Returns:
        List of tokens that are alphanumeric (``str.isalnum``) and are not
        English stop words, in their original order.
    """
    global _STOP_WORDS
    # Build the stop-word set once instead of on every call: this function is
    # applied to every headline, and the set never changes between calls.
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    return [word for word in tokens if word.isalnum() and word not in _STOP_WORDS]

# Tokenize every headline exactly once; the per-split token lists below are
# looked up from this column instead of re-running preprocess_text.
data['tokens'] = data['headline'].apply(preprocess_text)

# Split data into train and test sets
X = data['headline']
y = data['is_sarcastic']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Token lists for the train and test sets (used to train Word2Vec). X_train /
# X_test keep the row labels of `data`, so selecting by index yields the same
# lists that applying preprocess_text to each split would produce.
X_train_tokens = data.loc[X_train.index, 'tokens']
X_test_tokens = data.loc[X_test.index, 'tokens']

# Tokenizer for text sequences: vocabulary is learned from the training
# headlines only, then both splits are mapped to integer id sequences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding sequences to a fixed length, filling at the end of each sequence
max_sequence_length = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=max_sequence_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_sequence_length, padding='post')

# Vocabulary size (one more than the number of distinct words seen in training)
vocab_size = len(tokenizer.word_index) + 1

# 1. TF-IDF Method
def tfidf_method():
    """Train and evaluate a dense feed-forward network on TF-IDF features.

    Builds TF-IDF features (at most 5000 terms) from the global ``X_train`` /
    ``X_test``, trains a two-hidden-layer network with dropout, and prints a
    classification report against ``y_test``.

    Training stops early once the validation loss has not improved for three
    epochs and the best weights are restored, so the reported metrics come
    from the best validation epoch rather than from the last of 50 epochs.
    """
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
    X_test_tfidf = vectorizer.transform(X_test).toarray()

    model = Sequential([
        Dense(128, activation='relu', input_shape=(X_train_tfidf.shape[1],)),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # epochs=50 is now only an upper bound; val_loss decides when to stop.
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    model.fit(X_train_tfidf, y_train, epochs=50, batch_size=64, validation_split=0.2,
              callbacks=[early_stopping], verbose=1)

    y_pred = (model.predict(X_test_tfidf) > 0.5).astype("int32")
    print("TF-IDF Classification Report:\n")
    print(classification_report(y_test, y_pred))

# 2. GloVe Embeddings
def create_glove_embedding_matrix():
    """Build an embedding matrix for the Keras tokenizer vocabulary from GloVe.

    Loads the "glove-wiki-gigaword-100" vectors and copies the vector of every
    word in the global ``tokenizer.word_index`` that GloVe knows into the row
    given by the word's tokenizer index. Rows of unknown words stay zero.

    Returns:
        Array of shape ``(vocab_size, glove_model.vector_size)``.
    """
    glove_model = api.load("glove-wiki-gigaword-100")
    # Read the width from the loaded vectors rather than hard-coding it, so the
    # matrix always matches the model that was actually loaded.
    embedding_dim = glove_model.vector_size
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in tokenizer.word_index.items():
        if word in glove_model:
            embedding_matrix[i] = glove_model[word]
    return embedding_matrix

def glove_rnn_model():
    """Build and compile a SimpleRNN classifier on frozen GloVe embeddings.

    The embedding layer is initialised from ``create_glove_embedding_matrix()``
    and is not trainable.

    Returns:
        A compiled Keras ``Sequential`` model taking padded id sequences.
    """
    embedding_matrix = create_glove_embedding_matrix()
    embedding_dim = embedding_matrix.shape[1]

    model = Sequential([
        # mask_zero=True: the inputs are post-padded with 0 up to
        # max_sequence_length, so without a mask the RNN keeps stepping over
        # padding after the last real word and its final state carries little
        # of the headline.
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix],
                  input_length=max_sequence_length, trainable=False, mask_zero=True),
        SpatialDropout1D(0.2),
        SimpleRNN(128, dropout=0.2, return_sequences=False),  # SimpleRNN expects 3D input
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# 3. Word2Vec Embeddings
def create_word2vec_embedding_matrix():
    """Build an embedding matrix for the Keras tokenizer vocabulary from Word2Vec.

    Trains a 100-dimensional Word2Vec model on the global ``X_train_tokens``
    and copies the vector of every word in ``tokenizer.word_index`` that the
    model knows into the row given by the word's tokenizer index. Rows of
    unknown words stay zero.

    Returns:
        Array of shape ``(vocab_size, w2v_model.vector_size)``.
    """
    w2v_model = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=1, workers=4)
    keyed_vectors = w2v_model.wv
    weights = np.zeros((vocab_size, w2v_model.vector_size))
    for token, row in tokenizer.word_index.items():
        if token not in keyed_vectors:
            continue
        weights[row] = keyed_vectors[token]
    return weights

def word2vec_rnn_model():
    """Build and compile a SimpleRNN classifier on trainable Word2Vec embeddings.

    The embedding layer is initialised from
    ``create_word2vec_embedding_matrix()`` and is fine-tuned during training.

    Returns:
        A compiled Keras ``Sequential`` model taking padded id sequences.
    """
    embedding_matrix = create_word2vec_embedding_matrix()
    embedding_dim = embedding_matrix.shape[1]

    model = Sequential([
        # mask_zero=True: the inputs are post-padded with 0 up to
        # max_sequence_length, so without a mask the RNN keeps stepping over
        # padding after the last real word and its final state carries little
        # of the headline.
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix],
                  input_length=max_sequence_length, trainable=True, mask_zero=True),
        SpatialDropout1D(0.2),
        SimpleRNN(128, dropout=0.2, return_sequences=False),  # SimpleRNN expects 3D input
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Train and Evaluate Models
def train_and_evaluate_rnn(method):
    """Train and evaluate the model selected by ``method``.

    Args:
        method: One of ``"TF-IDF"``, ``"GloVe"`` or ``"Word2Vec"``. Any other
            value prints an error message and returns without training.

    For ``"TF-IDF"`` the work is delegated to ``tfidf_method()``. For the two
    embedding methods the matching model is built, its summary shown, and it
    is trained on the global ``X_train_pad`` / ``y_train`` and scored on
    ``X_test_pad`` / ``y_test`` with a printed classification report.
    """
    if method == "TF-IDF":
        tfidf_method()
        return

    if method == "GloVe":
        build_model = glove_rnn_model
    elif method == "Word2Vec":
        build_model = word2vec_rnn_model
    else:
        print("Invalid method. Choose from 'TF-IDF', 'GloVe', or 'Word2Vec'.")
        return

    model = build_model()
    # summary() prints the table itself and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None" line).
    model.summary()
    model.fit(X_train_pad, y_train, epochs=50, batch_size=64, validation_split=0.2, verbose=1)
    y_pred = (model.predict(X_test_pad) > 0.5).astype("int32")
    print(f"{method} Classification Report:\n")
    print(classification_report(y_test, y_pred))

# Run the three experiments in order, announcing each one before it starts.
methods = ["TF-IDF", "GloVe", "Word2Vec"]
for method in methods:
    print("\nRunning {} Method:\n".format(method))
    train_and_evaluate_rnn(method)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Running TF-IDF Method:



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.6739 - loss: 0.5726 - val_accuracy: 0.8334 - val_loss: 0.3772
Epoch 2/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8945 - loss: 0.2680 - val_accuracy: 0.8318 - val_loss: 0.3898
Epoch 3/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9320 - loss: 0.1897 - val_accuracy: 0.8271 - val_loss: 0.4427
Epoch 4/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9567 - loss: 0.1252 - val_accuracy: 0.8219 - val_loss: 0.5328
Epoch 5/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9800 - loss: 0.0688 - val_accuracy: 0.8196 - val_loss: 0.6446
Epoch 6/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9907 - loss: 0.0337 - val_accuracy: 0.8173 - val_loss: 0.7855
Epoch 7/50
[1m268/268[0m 



None
Epoch 1/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 21ms/step - accuracy: 0.5410 - loss: 0.6932 - val_accuracy: 0.5538 - val_loss: 0.6910
Epoch 2/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.5444 - loss: 0.6923 - val_accuracy: 0.5538 - val_loss: 0.6878
Epoch 3/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.5591 - loss: 0.6873 - val_accuracy: 0.5538 - val_loss: 0.6878
Epoch 4/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.5609 - loss: 0.6878 - val_accuracy: 0.5533 - val_loss: 0.6934
Epoch 5/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.5561 - loss: 0.6882 - val_accuracy: 0.5538 - val_loss: 0.6881
Epoch 6/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.5641 - loss: 0.6866 - val_accuracy: 0.5538 - val_loss: 0.6876
Epoch 7/50
[1m26

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


None
Epoch 1/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 25ms/step - accuracy: 0.5407 - loss: 0.6921 - val_accuracy: 0.5538 - val_loss: 0.6866
Epoch 2/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.5379 - loss: 0.6949 - val_accuracy: 0.5648 - val_loss: 0.6817
Epoch 3/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.6032 - loss: 0.6562 - val_accuracy: 0.5952 - val_loss: 0.6623
Epoch 4/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.6879 - loss: 0.5860 - val_accuracy: 0.7239 - val_loss: 0.5666
Epoch 5/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.8013 - loss: 0.4604 - val_accuracy: 0.7309 - val_loss: 0.5833
Epoch 6/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.8391 - loss: 0.3950 - val_accuracy: 0.7291 - val_loss: 0.5956
Epoch 7/50
[1m2

**LSTM**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import gensim.downloader as api
from gensim.models import Word2Vec

# Download necessary NLTK data.
# 'punkt_tab' is what recent NLTK releases load inside word_tokenize; 'punkt'
# is requested as well so the cell also works with older NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load dataset from Excel file
# NOTE(review): this path is relative to the working directory, while the other
# cells read '/content/Sarcasm_Headlines_Dataset.xlsx' — confirm both resolve
# to the same file in the environment this notebook runs in.
data = pd.read_excel("Sarcasm_Headlines_Dataset.xlsx")
# Ensure all headlines are strings: preprocess_text calls .lower() on each one,
# which would fail on any cell that Excel parsed as a number or left empty.
data['headline'] = data['headline'].astype(str)

# Preprocess text data
def preprocess_text(text):
    """Lower-case and tokenize a headline, dropping stop words and non-alphanumeric tokens.

    Args:
        text: Raw headline string.

    Returns:
        list[str]: Tokens of ``word_tokenize(text.lower())`` for which
        ``str.isalnum()`` is true and that are not English stop words.
    """
    # This function is applied to every headline, so build the stop-word set
    # once and reuse it across calls instead of rebuilding it per row.
    stop_words = getattr(preprocess_text, "_stop_words", None)
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
        preprocess_text._stop_words = stop_words
    return [
        word
        for word in word_tokenize(text.lower())
        if word.isalnum() and word not in stop_words
    ]

# Attach a cleaned token list to every headline (consumed by the Word2Vec trainer)
data['tokens'] = data['headline'].apply(preprocess_text)

# Hold out 20% of the headlines for testing (fixed seed)
X, y = data['headline'], data['is_sarcastic']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Learn the word index from the training headlines only, then encode both splits
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

# Bring every encoded headline to the same length, padding at the end
max_sequence_length = 100
X_train_pad, X_test_pad = (
    pad_sequences(encoded, maxlen=max_sequence_length, padding='post')
    for encoded in (X_train_seq, X_test_seq)
)

# One slot more than the number of indexed words
# (presumably index 0 is reserved for padding — verify against the Tokenizer docs)
vocab_size = 1 + len(tokenizer.word_index)

# 1. TF-IDF with LSTM
def tfidf_lstm_model():
    """Train an LSTM on TF-IDF features and print its test classification report.

    Reads the module-level ``X_train``/``X_test`` texts and ``y_train``/``y_test``
    labels. Each document becomes a sequence with a single timestep whose
    feature vector is its TF-IDF row (at most 5000 terms). The model is trained
    for 50 epochs with a 20% validation split. Returns ``None``; the report is
    printed.
    """
    # float32 halves the memory of the dense arrays created by ``toarray``
    # compared with the vectorizer's float64 default.
    vectorizer = TfidfVectorizer(max_features=5000, dtype=np.float32)
    X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
    X_test_tfidf = vectorizer.transform(X_test).toarray()

    # Reshape TF-IDF output to 3D for LSTM (samples, timesteps, features)
    X_train_tfidf = np.expand_dims(X_train_tfidf, axis=1)
    X_test_tfidf = np.expand_dims(X_test_tfidf, axis=1)

    model = Sequential([
        LSTM(128, input_shape=(X_train_tfidf.shape[1], X_train_tfidf.shape[2]), dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train_tfidf, y_train, epochs=50, batch_size=64, validation_split=0.2, verbose=1)

    # Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels
    y_pred = (model.predict(X_test_tfidf) > 0.5).astype("int32")
    print("TF-IDF Classification Report:\n")
    print(classification_report(y_test, y_pred))

# 2. GloVe Embeddings
def create_glove_embedding_matrix():
    """Build a ``(vocab_size, 100)`` matrix of GloVe vectors aligned with the tokenizer.

    Row ``i`` holds the "glove-wiki-gigaword-100" vector of the word whose
    ``tokenizer.word_index`` entry is ``i``. Rows for words that have no GloVe
    vector are left as zeros.
    """
    embedding_dim = 100
    vectors = api.load("glove-wiki-gigaword-100")
    matrix = np.zeros((vocab_size, embedding_dim))
    # Only words known to GloVe overwrite their (initially zero) row
    covered = (
        (row, token)
        for token, row in tokenizer.word_index.items()
        if token in vectors
    )
    for row, token in covered:
        matrix[row] = vectors[token]
    return matrix

def glove_lstm_model():
    """Build and compile an LSTM classifier on frozen GloVe embeddings.

    Returns:
        A compiled Keras ``Sequential`` model: Embedding (initialised from
        ``create_glove_embedding_matrix()``, not trainable) ->
        SpatialDropout1D -> LSTM(128) -> sigmoid Dense(1).
    """
    embedding_matrix = create_glove_embedding_matrix()
    embedding_dim = embedding_matrix.shape[1]

    model = Sequential([
        # mask_zero=True marks index 0 as padding so the LSTM skips those steps.
        # The inputs are padded at the end up to max_sequence_length; without
        # masking the LSTM keeps stepping over that padding after the last real
        # token, which is the likely reason the recorded runs stayed at the
        # majority-class validation accuracy.
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix],
                  input_length=max_sequence_length, trainable=False, mask_zero=True),
        SpatialDropout1D(0.2),
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# 3. Word2Vec Embeddings
def create_word2vec_embedding_matrix():
    """Train Word2Vec on the training headlines and align it with the tokenizer.

    Returns:
        numpy.ndarray of shape ``(vocab_size, 100)``. Row ``i`` holds the
        Word2Vec vector of the word whose ``tokenizer.word_index`` entry is
        ``i``; rows for words without a vector stay zero.
    """
    # Fit on the training rows only, so that no test headline influences the
    # embeddings the classifier is later evaluated with.
    train_sentences = data.loc[X_train.index, 'tokens'].tolist()
    w2v_model = Word2Vec(sentences=train_sentences, vector_size=100, window=5, min_count=1, workers=4)
    embedding_dim = w2v_model.vector_size
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in tokenizer.word_index.items():
        if word in w2v_model.wv:
            embedding_matrix[i] = w2v_model.wv[word]
    return embedding_matrix

def word2vec_lstm_model():
    """Build and compile an LSTM classifier on trainable Word2Vec embeddings.

    Returns:
        A compiled Keras ``Sequential`` model: Embedding (initialised from
        ``create_word2vec_embedding_matrix()``, fine-tuned during training) ->
        SpatialDropout1D -> LSTM(128) -> sigmoid Dense(1).
    """
    embedding_matrix = create_word2vec_embedding_matrix()
    embedding_dim = embedding_matrix.shape[1]

    model = Sequential([
        # mask_zero=True marks index 0 as padding so the LSTM skips those steps.
        # The inputs are padded at the end up to max_sequence_length; without
        # masking the LSTM keeps stepping over that padding after the last real
        # token, which is the likely reason the recorded runs stayed at the
        # majority-class validation accuracy.
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix],
                  input_length=max_sequence_length, trainable=True, mask_zero=True),
        SpatialDropout1D(0.2),
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Train and Evaluate Models
def train_and_evaluate_lstm(method):
    """Train the LSTM variant selected by ``method`` and print its test report.

    Args:
        method: "TF-IDF", "GloVe" or "Word2Vec". Any other value only prints
            a usage message.

    Returns:
        None. Training progress and the classification report are printed.
    """
    if method == "TF-IDF":
        # The TF-IDF pipeline trains and reports on its own.
        tfidf_lstm_model()
        return

    if method not in ("GloVe", "Word2Vec"):
        print("Invalid method. Choose from 'TF-IDF', 'GloVe', or 'Word2Vec'.")
        return

    if method == "GloVe":
        network, heading = glove_lstm_model(), "GloVe Classification Report:\n"
    else:
        network, heading = word2vec_lstm_model(), "Word2Vec Classification Report:\n"

    # Both embedding models share the same training and evaluation recipe.
    network.fit(X_train_pad, y_train, epochs=50, batch_size=64, validation_split=0.2, verbose=1)
    probabilities = network.predict(X_test_pad)
    y_pred = (probabilities > 0.5).astype("int32")
    print(heading)
    print(classification_report(y_test, y_pred))

# Run all methods
# Each iteration prints a banner, then trains the corresponding LSTM variant
# from scratch and prints its classification report.
methods = ["TF-IDF", "GloVe", "Word2Vec"]
for method in methods:
    print(f"\nRunning {method} Method:\n")
    train_and_evaluate_lstm(method)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Running TF-IDF Method:



  super().__init__(**kwargs)


Epoch 1/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 14ms/step - accuracy: 0.6162 - loss: 0.6406 - val_accuracy: 0.8262 - val_loss: 0.4198
Epoch 2/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8426 - loss: 0.3789 - val_accuracy: 0.8367 - val_loss: 0.3731
Epoch 3/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8588 - loss: 0.3186 - val_accuracy: 0.8339 - val_loss: 0.3767
Epoch 4/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8680 - loss: 0.2994 - val_accuracy: 0.8348 - val_loss: 0.3863
Epoch 5/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.8822 - loss: 0.2769 - val_accuracy: 0.8252 - val_loss: 0.3978
Epoch 6/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.8767 - loss: 0.2773 - val_accuracy: 0.8238 - val_loss: 0.4101
Epoch 7/50
[1m268/268[0m



[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 162ms/step - accuracy: 0.5603 - loss: 0.6870 - val_accuracy: 0.5538 - val_loss: 0.6882
Epoch 2/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 176ms/step - accuracy: 0.5625 - loss: 0.6858 - val_accuracy: 0.5538 - val_loss: 0.6876
Epoch 3/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 163ms/step - accuracy: 0.5574 - loss: 0.6869 - val_accuracy: 0.5538 - val_loss: 0.6892
Epoch 4/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 165ms/step - accuracy: 0.5628 - loss: 0.6859 - val_accuracy: 0.5538 - val_loss: 0.6874
Epoch 5/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 159ms/step - accuracy: 0.5582 - loss: 0.6866 - val_accuracy: 0.5538 - val_loss: 0.6895
Epoch 6/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 157ms/step - accuracy: 0.5700 - loss: 0.6833 - val_accuracy: 0.5538 - val_loss: 0.6878
Epoch 7/50
[1m268/26

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1/50




[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 186ms/step - accuracy: 0.5651 - loss: 0.6869 - val_accuracy: 0.5538 - val_loss: 0.6880
Epoch 2/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 186ms/step - accuracy: 0.5655 - loss: 0.6851 - val_accuracy: 0.5538 - val_loss: 0.6875
Epoch 3/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 185ms/step - accuracy: 0.5558 - loss: 0.6873 - val_accuracy: 0.5538 - val_loss: 0.6875
Epoch 4/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 185ms/step - accuracy: 0.5643 - loss: 0.6856 - val_accuracy: 0.5538 - val_loss: 0.6877
Epoch 5/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 181ms/step - accuracy: 0.5587 - loss: 0.6865 - val_accuracy: 0.5538 - val_loss: 0.6891
Epoch 6/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 188ms/step - accuracy: 0.5620 - loss: 0.6858 - val_accuracy: 0.5538 - val_loss: 0.6880
Epoch 7/50
[1m268/26

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**GRU**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import gensim.downloader as api
from gensim.models import Word2Vec

# Download necessary NLTK data.
# word_tokenize looks for 'punkt_tab' on recent NLTK releases and for 'punkt'
# on older ones; fetching both keeps this cell runnable on a fresh kernel
# whichever version is installed. 'stopwords' is used by preprocess_text.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Load dataset from Excel file
data = pd.read_excel("Sarcasm_Headlines_Dataset.xlsx")

# Preprocess text data
def preprocess_text(text):
    """Lower-case and tokenize a headline, dropping stop words and non-alphanumeric tokens.

    Args:
        text: Raw headline string.

    Returns:
        list[str]: Tokens of ``word_tokenize(text.lower())`` for which
        ``str.isalnum()`` is true and that are not English stop words.
    """
    # This function is applied to every headline, so build the stop-word set
    # once and reuse it across calls instead of rebuilding it per row.
    stop_words = getattr(preprocess_text, "_stop_words", None)
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
        preprocess_text._stop_words = stop_words
    return [
        word
        for word in word_tokenize(text.lower())
        if word.isalnum() and word not in stop_words
    ]

# Attach a cleaned token list to every headline (consumed by the Word2Vec trainer)
data['tokens'] = data['headline'].apply(preprocess_text)

# Hold out 20% of the headlines for testing (fixed seed)
X, y = data['headline'], data['is_sarcastic']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Learn the word index from the training headlines only, then encode both splits
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)

# Bring every encoded headline to the same length, padding at the end
max_sequence_length = 100
X_train_pad, X_test_pad = (
    pad_sequences(encoded, maxlen=max_sequence_length, padding='post')
    for encoded in (X_train_seq, X_test_seq)
)

# One slot more than the number of indexed words
# (presumably index 0 is reserved for padding — verify against the Tokenizer docs)
vocab_size = 1 + len(tokenizer.word_index)

# 1. TF-IDF with GRU
def tfidf_gru_model():
    """Train a GRU on TF-IDF features and print its test classification report.

    Reads the module-level ``X_train``/``X_test`` texts and ``y_train``/``y_test``
    labels. Each document becomes a sequence with a single timestep whose
    feature vector is its TF-IDF row (at most 5000 terms). The model is trained
    for 50 epochs with a 20% validation split. Returns ``None``; the report is
    printed.
    """
    # float32 halves the memory of the dense arrays created by ``toarray``
    # compared with the vectorizer's float64 default.
    vectorizer = TfidfVectorizer(max_features=5000, dtype=np.float32)
    X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
    X_test_tfidf = vectorizer.transform(X_test).toarray()

    # Reshape TF-IDF output to 3D for GRU (samples, timesteps, features)
    X_train_tfidf = np.expand_dims(X_train_tfidf, axis=1)
    X_test_tfidf = np.expand_dims(X_test_tfidf, axis=1)

    model = Sequential([
        GRU(128, input_shape=(X_train_tfidf.shape[1], X_train_tfidf.shape[2]), dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_train_tfidf, y_train, epochs=50, batch_size=64, validation_split=0.2, verbose=1)

    # Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels
    y_pred = (model.predict(X_test_tfidf) > 0.5).astype("int32")
    print("TF-IDF Classification Report:\n")
    print(classification_report(y_test, y_pred))

# 2. GloVe Embeddings with GRU
def create_glove_embedding_matrix():
    """Build a ``(vocab_size, 100)`` matrix of GloVe vectors aligned with the tokenizer.

    Row ``i`` holds the "glove-wiki-gigaword-100" vector of the word whose
    ``tokenizer.word_index`` entry is ``i``. Rows for words that have no GloVe
    vector are left as zeros.
    """
    embedding_dim = 100
    # api.load fetches the vectors if they are not available locally
    vectors = api.load("glove-wiki-gigaword-100")
    matrix = np.zeros((vocab_size, embedding_dim))
    # Only words known to GloVe overwrite their (initially zero) row
    covered = (
        (row, token)
        for token, row in tokenizer.word_index.items()
        if token in vectors
    )
    for row, token in covered:
        matrix[row] = vectors[token]
    return matrix

def glove_gru_model():
    """Build and compile a GRU classifier on frozen GloVe embeddings.

    Returns:
        A compiled Keras ``Sequential`` model: Embedding (initialised from
        ``create_glove_embedding_matrix()``, not trainable) ->
        SpatialDropout1D -> GRU(128) -> sigmoid Dense(1).
    """
    embedding_matrix = create_glove_embedding_matrix()
    embedding_dim = embedding_matrix.shape[1]

    model = Sequential([
        # mask_zero=True marks index 0 as padding so the GRU skips those steps.
        # The inputs are padded at the end up to max_sequence_length; without
        # masking the GRU keeps stepping over that padding after the last real
        # token, which is the likely reason the recorded runs stayed at the
        # majority-class validation accuracy.
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix],
                  input_length=max_sequence_length, trainable=False, mask_zero=True),
        SpatialDropout1D(0.2),
        GRU(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# 3. Word2Vec Embeddings with GRU
def create_word2vec_embedding_matrix():
    """Train Word2Vec on the training headlines and align it with the tokenizer.

    Returns:
        numpy.ndarray of shape ``(vocab_size, 100)``. Row ``i`` holds the
        Word2Vec vector of the word whose ``tokenizer.word_index`` entry is
        ``i``; rows for words without a vector stay zero.
    """
    # Fit on the training rows only, so that no test headline influences the
    # embeddings the classifier is later evaluated with.
    train_sentences = data.loc[X_train.index, 'tokens'].tolist()
    w2v_model = Word2Vec(sentences=train_sentences, vector_size=100, window=5, min_count=1, workers=4)
    embedding_dim = w2v_model.vector_size
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in tokenizer.word_index.items():
        if word in w2v_model.wv:
            embedding_matrix[i] = w2v_model.wv[word]
    return embedding_matrix

def word2vec_gru_model():
    """Build and compile a GRU classifier on trainable Word2Vec embeddings.

    Returns:
        A compiled Keras ``Sequential`` model: Embedding (initialised from
        ``create_word2vec_embedding_matrix()``, fine-tuned during training) ->
        SpatialDropout1D -> GRU(128) -> sigmoid Dense(1).
    """
    embedding_matrix = create_word2vec_embedding_matrix()
    embedding_dim = embedding_matrix.shape[1]

    model = Sequential([
        # mask_zero=True marks index 0 as padding so the GRU skips those steps.
        # The inputs are padded at the end up to max_sequence_length; without
        # masking the GRU keeps stepping over that padding after the last real
        # token, which is the likely reason the recorded runs stayed at the
        # majority-class validation accuracy.
        Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix],
                  input_length=max_sequence_length, trainable=True, mask_zero=True),
        SpatialDropout1D(0.2),
        GRU(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Train and Evaluate Models
def train_and_evaluate_gru(method):
    """Train the GRU variant selected by ``method`` and print its test report.

    Args:
        method: "TF-IDF", "GloVe" or "Word2Vec". Any other value only prints
            a usage message.

    Returns:
        None. Training progress and the classification report are printed.
    """
    if method == "TF-IDF":
        # The TF-IDF pipeline trains and reports on its own.
        tfidf_gru_model()
        return

    if method not in ("GloVe", "Word2Vec"):
        print("Invalid method. Choose from 'TF-IDF', 'GloVe', or 'Word2Vec'.")
        return

    if method == "GloVe":
        network, heading = glove_gru_model(), "GloVe Classification Report:\n"
    else:
        network, heading = word2vec_gru_model(), "Word2Vec Classification Report:\n"

    # Both embedding models share the same training and evaluation recipe.
    network.fit(X_train_pad, y_train, epochs=50, batch_size=64, validation_split=0.2, verbose=1)
    probabilities = network.predict(X_test_pad)
    y_pred = (probabilities > 0.5).astype("int32")
    print(heading)
    print(classification_report(y_test, y_pred))

# Run all methods
# Each iteration prints a banner, then trains the corresponding GRU variant
# from scratch and prints its classification report.
methods = ["TF-IDF", "GloVe", "Word2Vec"]
for method in methods:
    print(f"\nRunning {method} Method with GRU:\n")
    train_and_evaluate_gru(method)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Running TF-IDF Method with GRU:



  super().__init__(**kwargs)


Epoch 1/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 47ms/step - accuracy: 0.6325 - loss: 0.6203 - val_accuracy: 0.8350 - val_loss: 0.3917
Epoch 2/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 47ms/step - accuracy: 0.8517 - loss: 0.3516 - val_accuracy: 0.8311 - val_loss: 0.3706
Epoch 3/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 45ms/step - accuracy: 0.8667 - loss: 0.3125 - val_accuracy: 0.8248 - val_loss: 0.3857
Epoch 4/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 44ms/step - accuracy: 0.8738 - loss: 0.2880 - val_accuracy: 0.8252 - val_loss: 0.3922
Epoch 5/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 44ms/step - accuracy: 0.8740 - loss: 0.2942 - val_accuracy: 0.8234 - val_loss: 0.4039
Epoch 6/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 47ms/step - accuracy: 0.8864 - loss: 0.2726 - val_accuracy: 0.8250 - val_loss: 0.4113
Epoch 7/50
[1m2



[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 250ms/step - accuracy: 0.5654 - loss: 0.6873 - val_accuracy: 0.5538 - val_loss: 0.6874
Epoch 2/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 252ms/step - accuracy: 0.5654 - loss: 0.6850 - val_accuracy: 0.5538 - val_loss: 0.6874
Epoch 3/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 241ms/step - accuracy: 0.5605 - loss: 0.6861 - val_accuracy: 0.5538 - val_loss: 0.6874
Epoch 4/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 242ms/step - accuracy: 0.5593 - loss: 0.6863 - val_accuracy: 0.5538 - val_loss: 0.6875
Epoch 5/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 243ms/step - accuracy: 0.5569 - loss: 0.6868 - val_accuracy: 0.5538 - val_loss: 0.6874
Epoch 6/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 247ms/step - accuracy: 0.5620 - loss: 0.6859 - val_accuracy: 0.5538 - val_loss: 0.6873
Epoch 7/50
[1m268/26

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1/50




[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 293ms/step - accuracy: 0.5637 - loss: 0.6869 - val_accuracy: 0.5538 - val_loss: 0.6875
Epoch 2/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 304ms/step - accuracy: 0.5628 - loss: 0.6857 - val_accuracy: 0.5538 - val_loss: 0.6873
Epoch 3/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 297ms/step - accuracy: 0.5591 - loss: 0.6864 - val_accuracy: 0.5538 - val_loss: 0.6874
Epoch 4/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 294ms/step - accuracy: 0.5642 - loss: 0.6852 - val_accuracy: 0.5538 - val_loss: 0.6875
Epoch 5/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 297ms/step - accuracy: 0.5580 - loss: 0.6868 - val_accuracy: 0.5538 - val_loss: 0.6873
Epoch 6/50
[1m268/268[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 302ms/step - accuracy: 0.5642 - loss: 0.6854 - val_accuracy: 0.5538 - val_loss: 0.6874
Epoch 7/50
[1m268/26

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Transformer**

In [None]:
pip install pandas transformers datasets torch scikit-learn


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset

# Read the headline/label table from the Excel workbook
file_path = "/content/Sarcasm_Headlines_Dataset.xlsx"  # Replace with your file path
df = pd.read_excel(file_path)

# Strip stray whitespace from the header names, then check the required columns exist
df.columns = df.columns.str.strip()
assert all(column in df.columns for column in ('headline', 'is_sarcastic')), "Columns 'headline' and 'is_sarcastic' are required"

# Hold out 20% of the rows for evaluation (fixed seed)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    *(df[column].tolist() for column in ('headline', 'is_sarcastic')),
    random_state=42,
    test_size=0.2,
)

# Tokenizer belonging to the distilbert-base-uncased checkpoint
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize data
def preprocess_data(texts, labels):
    """Tokenize ``texts`` and bundle the encodings with ``labels``.

    Args:
        texts: List of headline strings.
        labels: List of labels, one per text.

    Returns:
        dict with keys ``'input_ids'`` and ``'attention_mask'`` (taken from the
        tokenizer output, truncated to at most 128 tokens and padded) and
        ``'labels'`` (the labels passed in, unchanged).
    """
    tokenized = tokenizer(texts, truncation=True, padding=True, max_length=128)
    features = {field: tokenized[field] for field in ('input_ids', 'attention_mask')}
    features['labels'] = labels
    return features

# Tokenize each split and attach its labels
train_encodings, val_encodings = (
    preprocess_data(texts, labels)
    for texts, labels in ((train_texts, train_labels), (val_texts, val_labels))
)

# Wrap the encoded splits as Hugging Face datasets
train_dataset, val_dataset = map(Dataset.from_dict, (train_encodings, val_encodings))

# Pre-trained DistilBERT with a two-label classification head
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Define compute_metrics function
def compute_metrics(eval_pred):
    """Turn the Trainer's ``(logits, labels)`` pair into evaluation scores.

    Args:
        eval_pred: Two-element sequence ``(logits, labels)``; the predicted
            class is the argmax of ``logits`` along axis 1.

    Returns:
        dict with ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'``;
        the last three are computed with ``average='binary'``.
    """
    logits, labels = eval_pred
    predictions = torch.tensor(logits).argmax(dim=1)
    scores = {'accuracy': accuracy_score(labels, predictions)}
    # The fourth element (support) of the sklearn result is not reported
    prf = precision_recall_fscore_support(labels, predictions, average='binary')
    scores.update(zip(('precision', 'recall', 'f1'), prf))
    return scores

# Training arguments
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` — confirm which keyword the installed version accepts.
training_args = TrainingArguments(
    output_dir="./results",          # Output directory
    evaluation_strategy="epoch",    # Evaluate after each epoch
    per_device_train_batch_size=16, # Batch size for training
    per_device_eval_batch_size=16,  # Batch size for evaluation
    num_train_epochs=3,             # Total number of training epochs
    learning_rate=5e-5,             # Learning rate
    weight_decay=0.01,              # Weight decay
    logging_dir="./logs",           # Log directory
    logging_steps=100,              # Log every 100 steps
    save_strategy="epoch",          # Save a checkpoint after each epoch (same cadence as evaluation)
    load_best_model_at_end=True     # Reload the best checkpoint when training ends
)

# Trainer instance
# NOTE(review): the `tokenizer=` keyword may be deprecated in newer
# transformers releases — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # Add compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model on val_dataset (the eval_dataset given to the Trainer)
metrics = trainer.evaluate()
print(metrics)

# Save the model and tokenizer side by side; `model` is the same object that
# was handed to the Trainer above.
model.save_pretrained("./sarcasm_model")
tokenizer.save_pretrained("./sarcasm_model")

print("Model training completed and saved!")
