<a href="https://colab.research.google.com/github/codegitfirst/ML-workshop/blob/main/Sarcasm_Detection_Using_Classical_ML_Techniques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'news-headlines-dataset-for-sarcasm-detection:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F30764%2F533474%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240729%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240729T080417Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D5557b96d4a66a0609dfa761c9385e58a38de2b52efad5180ef3986094908415c178163f5f1165944e0558706e4b8b81470b2a51769ac00938100a05371a9f931dae450c2e8604053236630c33750862f15c4618732fe2b113ad0a5d260628157a0e0dfc091c0beb2704e6990cb980be57bf61a177453052738ae0ea81ee2f1542b3634aeeb3bccf1111653774a428c8e51d8df91cc973f89f345cc5fbebcdac629afb0784f2f9c6c537aa58de78c24e4a4aa8931bf59de8f0787149766de41f78bdd38bce5d2bcc3f3ef189b79e218049427fa1d60a50b9966b000d4d67f21493222c372b0474c7aad24c65135bd2ba3b382bbe2abeb19c15ff396e1b57f36c8'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
# Wipe any previous input directory, then (re)create the input and working
# directories; exist_ok makes re-running this cell harmless.
shutil.rmtree(KAGGLE_INPUT_PATH, ignore_errors=True)
for kaggle_dir in (KAGGLE_INPUT_PATH, KAGGLE_WORKING_PATH):
    os.makedirs(kaggle_dir, mode=0o777, exist_ok=True)

# Expose the Kaggle directories as ../input and ../working. A link that is
# already present (e.g. from an earlier run of this cell) is left untouched.
for link_target, link_name in (
    (KAGGLE_INPUT_PATH, 'input'),
    (KAGGLE_WORKING_PATH, 'working'),
):
    try:
        os.symlink(link_target, os.path.join("..", link_name), target_is_directory=True)
    except FileExistsError:
        pass

# Download every configured data source and unpack it below KAGGLE_INPUT_PATH.
# A source that fails to download or extract is reported and skipped so the
# remaining sources are still processed.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry has the form "<directory>:<url-encoded download URL>".
    # Split on the first colon only, so the directory/URL boundary stays
    # unambiguous even if the URL part contains a literal colon.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header may be missing; in that case the
            # progress bar stays empty instead of crashing on int(None).
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / total_bytes) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch re-opens the temp file by path, so buffered
            # bytes must reach the file before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here
                # would replace the module and break later tar sources.
                with tarfile.open(tfile.name) as archive:
                    archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it must be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


# Sarcasm Detection Using Classical ML Techniques

Data: https://www.kaggle.com/datasets/rmisra/news-headlines-dataset-for-sarcasm-detection

## Reading and exploring the data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_json('/kaggle/input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset_v2.json',lines=True)
df.head()

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.isnull().sum()

In [None]:
# Count the number of duplicate headlines in the 'headline' column
df.headline.duplicated().sum()

In [None]:
# Drop every row whose headline already appeared earlier in the DataFrame,
# keeping the first occurrence of each headline
duplicate_rows = df.index[df.headline.duplicated()]
df = df.drop(index=duplicate_rows)

In [None]:
# Bar chart showing how many headlines carry each is_sarcastic label
value_counts = df['is_sarcastic'].value_counts()
plt.figure(figsize=(8, 6))
ax = sns.barplot(x=value_counts.index, y=value_counts.values, palette='viridis')
ax.set_title('Counts of Sarcastic and Non-Sarcastic Headlines')
ax.set_xlabel('Label (0 = Non-Sarcastic, 1 = Sarcastic)')
ax.set_ylabel('Count')
# Replace the numeric tick labels with readable class names
ax.set_xticks([0, 1])
ax.set_xticklabels(['Non-Sarcastic', 'Sarcastic'])
plt.show()

## Data Preprocessing

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from nltk.tokenize import word_tokenize

# Download the NLTK resources needed by data_preprocessing:
#   - 'stopwords': the English stop-word list
#   - 'punkt' / 'punkt_tab': tokenizer models used by word_tokenize
#     (NOTE(review): newer NLTK releases appear to load 'punkt_tab' instead
#     of 'punkt'; both are fetched so either version works - confirm against
#     the installed NLTK version)
#   - 'wordnet': the corpus WordNetLemmatizer reads; without it the first
#     lemmatize() call raises LookupError
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

# Get the set of English stopwords (a set gives O(1) membership tests)
stop_words = set(stopwords.words('english'))

# Initialize the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

def data_preprocessing(text):
    """Normalise one piece of text for vectorisation.

    The text is lower-cased, every character that is not an ASCII letter is
    replaced by a space, and the result is tokenised. Tokens found in
    ``stop_words`` are discarded; the remaining tokens are lemmatised.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving lemmatised tokens joined by single spaces.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text.lower())

    kept_tokens = []
    for token in word_tokenize(letters_only):
        # Stop words are filtered on their surface form, before lemmatising
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))

    return ' '.join(kept_tokens)


## Training and Evaluation

In [None]:
from sklearn.model_selection import train_test_split

# Features are the headline strings; the target is the is_sarcastic label
X = df['headline']
y = df['is_sarcastic']

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd

# Function to vectorize the training and testing data using TF-IDF
def vectorize_data(X_train, X_test):
    """Convert training and testing texts into TF-IDF feature matrices.

    A ``TfidfVectorizer`` with default settings is fitted on the training
    texts only and then applied to the testing texts, so the test data does
    not influence what the vectorizer learns.

    Args:
        X_train: Texts used to fit the vectorizer.
        X_test: Texts transformed with the already-fitted vectorizer.

    Returns:
        A ``(train_matrix, test_matrix)`` tuple of TF-IDF features.
    """
    tfidf = TfidfVectorizer()
    train_matrix = tfidf.fit_transform(X_train)
    test_matrix = tfidf.transform(X_test)
    return train_matrix, test_matrix

# Function to train a model and evaluate its performance
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` and score its predictions on the test set.

    Args:
        model: An estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Testing features.
        y_test: Testing labels the predictions are compared against.

    Returns:
        A ``(accuracy, confusion_matrix)`` tuple computed on the test set.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return (
        accuracy_score(y_test, predictions),
        confusion_matrix(y_test, predictions),
    )

# Main function to vectorize data, train models, and display results
def main(X_train, y_train, X_test, y_test):
    """Vectorize the texts, train three classifiers and report their accuracy.

    Logistic Regression, a linear-kernel SVM and Multinomial Naive Bayes are
    trained, in that order, on the same TF-IDF features. A table of test
    accuracies is printed.

    Args:
        X_train: Training texts.
        y_train: Training labels.
        X_test: Testing texts.
        y_test: Testing labels.

    Returns:
        A dict mapping each model's display name to its confusion matrix.
    """
    train_features, test_features = vectorize_data(X_train, X_test)

    # The order here fixes both the row order of the printed table and the
    # key order of the returned dictionary.
    candidates = (
        ('Logistic Regression', LogisticRegression()),
        ('Support Vector Machine', SVC(kernel='linear')),
        ('Multinomial Naive Bayes', MultinomialNB()),
    )

    results = []
    confusion_matrices = {}
    for display_name, estimator in candidates:
        accuracy, matrix = train_and_evaluate_model(
            estimator, train_features, y_train, test_features, y_test
        )
        results.append([display_name, accuracy])
        confusion_matrices[display_name] = matrix

    # Show the accuracies as a table
    results_df = pd.DataFrame(results, columns=['Model', 'Accuracy'])
    print(results_df)

    return confusion_matrices

confusion_matrices = main(X_train, y_train, X_test, y_test)  # Call the main function and store confusion matrices

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Function to plot confusion matrices for each model
def plot_confusion_matrices(confusion_matrices):
    """Draw one annotated heatmap per model in a two-column grid.

    The grid is never smaller than 2x2, so up to four models are laid out
    on a 10x10 inch figure. For more than four models, extra rows are added
    (5 inches each) so that every model gets a subplot instead of being
    silently dropped. Unused subplots are hidden.

    Args:
        confusion_matrices: Mapping of model name to its confusion matrix
            (integer counts, as the heatmap cells are formatted with 'd').
    """
    model_names = list(confusion_matrices)

    n_cols = 2
    # Ceiling division without importing math; at least two rows keeps the
    # layout for four or fewer models the same as a fixed 2x2 grid.
    n_rows = max(2, (len(model_names) + n_cols - 1) // n_cols)
    _, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows))
    flat_axes = axes.ravel()

    # Plot the confusion matrix of each model as a heatmap
    for ax, model_name in zip(flat_axes, model_names):
        sns.heatmap(confusion_matrices[model_name], annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_title(f'Confusion Matrix for {model_name}')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')

    # Hide the subplots that have no model assigned
    for ax in flat_axes[len(model_names):]:
        ax.set_visible(False)

    plt.tight_layout()  # Adjust the layout to prevent overlap
    plt.show()  # Display the plot

plot_confusion_matrices(confusion_matrices)  # Call the function to plot confusion matrices
