In [13]:
import pandas as pd
import re

# For clarification: cleaned_combined_comments are labeled by our team

# Load the datasets (note: both datasets may contain offensive language since this is public data)
jokes_df = pd.read_csv('../datasets/jokes.csv')
comments_df = pd.read_csv('../datasets/university_comments.csv')

# Define a function to identify one-sentence jokes for normalization
def normalize_sentence(text):
    # Count the number of sentence-ending punctuation marks
    sentences = re.split(r'[.!?]', text)  

    sentences = [s for s in sentences if s.strip() != '']
    return len(sentences) == 1

# Filter jokes to keep only one-sentence jokes
one_sentence_jokes = jokes_df[jokes_df['text'].apply(normalize_sentence)]
one_sentence_jokes['label'] = 1

# Select 2,500 one-sentence jokes
jokes_sample = one_sentence_jokes.sample(n=5000, random_state=42)
jokes_sample = jokes_sample[['text', 'label']]

# Rename 'text' to 'comment' to match the structure with university comments
jokes_sample.rename(columns={'text': 'comment'}, inplace=True)

# Extract 2,500 random entries from university_comments and label as non-sarcastic (0)
comments_sample = comments_df.sample(n=5000, random_state=42)
comments_sample['label'] = 0
comments_sample = comments_sample[['comment', 'label']]

# Combine both datasets
combined_data = pd.concat([jokes_sample, comments_sample], ignore_index=True)

# Add a column for numbering (0 to 4999)
combined_data.reset_index(inplace=True)
combined_data.rename(columns={'index': 'number'}, inplace=True)

# Shuffle the combined data
df = combined_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Save dataset
df.to_csv('../datasets/combined_comment.csv', index=False)

# Verify the first few rows of the new dataset
print(df.head())


  comments_df = pd.read_csv('../datasets/university_comments.csv')


   number                                            comment  label
0    6252           [id82234233|Фарида], верно подмечено 😌👍🏻      0
1    4684  Обычно бухгалтера занимаются сВеркой и сВодко...      1
2    1731  Из школьного сочинения:Суворов был настоящим м...      1
3    4742  Василиса - Прекрасная после ночи с Иванушкой ...      1
4    4521  Русский человек даже обыкновенный гвоздь заб...      1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_sentence_jokes['label'] = 1


In [39]:
df = pd.read_csv('../datasets/cleaned_combined_comment.csv')

import re # optional

# Function to clean the text
def preprocess_text(text):
    # Remove special characters, numbers, and lowercase the text
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    return text

# Ensure all entries in the 'comment' column are strings and fill NaN values with empty strings
df['comment'] = df['comment'].fillna('').astype(str)

# Apply the preprocessing function to the 'comment' column
df['comment'] = df['comment'].apply(preprocess_text)

# Display the cleaned data
print(df.head())


   number                                            comment  label
0    6252                  id82234233фарида верно подмечено       0
1    4684  обычно бухгалтера занимаются сверкои и сводкои...      1
2    1731  из школьного сочинениясуворов был настоящим му...      1
3    4742  василиса  прекрасная после ночи с иванушкои  д...      1
4    4521  русскии человек даже обыкновенныи гвоздь забив...      1


In [41]:
from sklearn.model_selection import train_test_split

# Display the column names to verify the label column
print(df.columns)

# Separate features (X) and labels (y)
X = df['comment']  # text data
y = df['label']  # Our labels (0 or 1)

# Split into training and test sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Show the size of the training and test sets
print(f'Training set size: {len(X_train)}')
print(f'Test set size: {len(X_test)}')

Index(['number', 'comment', 'label'], dtype='object')
Training set size: 5816
Test set size: 2493


In [42]:
from sklearn.feature_extraction.text import CountVectorizer

# Convert the text data to numerical features (Bag of Words)
vectorizer = CountVectorizer()
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Train the model
model = LogisticRegression()
model.fit(X_train_features, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_features)

# Evaluate the model's performance
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.74      0.82      0.78      1013
           1       0.87      0.81      0.84      1480

    accuracy                           0.81      2493
   macro avg       0.81      0.81      0.81      2493
weighted avg       0.82      0.81      0.81      2493



In [44]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

# Initialize VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Function to determine sentiment (positive, neutral, or negative)
def get_sentiment(comment):
    score = sia.polarity_scores(comment)
    if score['compound'] >= 0.05:
        return 'positive'
    elif score['compound'] <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Apply the sentiment analysis to the 'comment' column
df['sentiment'] = df['comment'].apply(get_sentiment)

# Show a sample of the data
print(df[['comment', 'label', 'sentiment']].head())

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/edoszhan/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


                                             comment  label sentiment
0                  id82234233фарида верно подмечено       0   neutral
1  обычно бухгалтера занимаются сверкои и сводкои...      1   neutral
2  из школьного сочинениясуворов был настоящим му...      1   neutral
3  василиса  прекрасная после ночи с иванушкои  д...      1   neutral
4  русскии человек даже обыкновенныи гвоздь забив...      1   neutral


In [45]:
# Cross-tabulation of sarcasm vs. sentiment
sarcasm_sentiment_stats = pd.crosstab(df['label'], df['sentiment'], rownames=['Sarcasm'], colnames=['Sentiment'])

# Show statistics
print(sarcasm_sentiment_stats)

# Compute percentage breakdown
sarcasm_sentiment_percent = sarcasm_sentiment_stats.div(sarcasm_sentiment_stats.sum(1), axis=0) * 100

# Show percentage breakdown
print("Percentage breakdown of sarcastic comments by sentiment:")
print(sarcasm_sentiment_percent)


Sentiment  negative  neutral  positive
Sarcasm                               
0                 5     3296         8
1                16     4972        12
Percentage breakdown of sarcastic comments by sentiment:
Sentiment  negative    neutral  positive
Sarcasm                                 
0          0.151103  99.607132  0.241765
1          0.320000  99.440000  0.240000
