# Irony Detection in Reddit Comments

See full paper [here](https://aclanthology.org/P15-1100.pdf).

For single cell execution, please see the very bottom of this notebook.


In [22]:
!pip install spacy
!pip install sklearn
!pip install spacy-transformers
!pip install spacytextblob

from collections import Counter
import csv
import pandas as pd
import random
from functools import reduce
from tqdm import tqdm
import numpy as np
import pickle
import os

from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import KFold
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

import spacy
import spacy_transformers
from spacytextblob.spacytextblob import SpacyTextBlob

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Preparing the Data

Load and preprocess data.

Note there are separate functions for the baseline and fully featurized model.



In [4]:
def load_data(filepath):
  """Read the CSV at `filepath` with pandas and return the resulting DataFrame."""
  return pd.read_csv(filepath)

def thread_search(df, comments, idx):
  """Collect the text of every ancestor of the row at `idx`.

  The row's `parent_id` is looked up; when it is null the row is a thread
  root and `comments` is returned as-is. Otherwise every row whose
  `comment_id` equals that parent id contributes its `comment` text, and the
  search continues upward from each of those rows in turn. Consequently, if
  several rows share the parent's `comment_id`, the ancestors above them are
  appended once per such row.

  Args:
    df: DataFrame with `comment_id`, `parent_id` and `comment` columns.
    comments: list of comment texts gathered so far (not modified in place).
    idx: index label of the row to start ascending from.

  Returns:
    A new list: `comments` followed by the ancestor texts in visiting order.
  """
  parent_id = df.at[idx, 'parent_id']

  # A null parent_id marks the top of the thread.
  if pd.isnull(parent_id):
    return comments

  parent_texts = df.loc[df['comment_id'] == parent_id, 'comment']
  for row_label, text in parent_texts.items():
    # Append this parent row's text, then keep climbing from that row.
    comments = thread_search(df, comments + [text], row_label)

  return comments

def preprocess_data(df):
  """Build the interaction-feature text for every labeled row of `df`.

  For each row whose `label` is not null:
    1. The polarity of the row's own `comment` is computed with the
       spacytextblob pipeline component; it maps to the tag '<pos>' when
       strictly positive and '<neg>' otherwise (zero counts as '<neg>').
    2. The subreddit maps to '<cns>' when it equals 'Conservative' and to
       '<lib>' for any other value.
    3. The comment, its ancestors (via `thread_search`) and the thread title
       are joined into one text, and every token tagged 'NNP' that is not a
       stop word is emitted as ' <sentiment><subreddit>TOKEN'.
  Rows that yield no such token are dropped.

  Args:
    df: DataFrame with `label`, `comment`, `subreddit`, `thread_title`,
      `comment_id` and `parent_id` columns.

  Returns:
    (preprocessed, labels, polarities): three parallel lists holding the
    encoded strings, the row labels and the raw polarity scores.
  """
  nlp = spacy.load('en_core_web_sm')
  nlp.add_pipe('spacytextblob')

  preprocessed, labels, polarities = [], [], []

  for index, row in df.iterrows():
    # Only labeled rows take part in the experiment.
    if pd.isnull(row['label']):
      continue

    # Sentiment is measured on the comment alone, not on the whole thread.
    polarity = nlp(row['comment'])._.blob.polarity
    sentiment_tag = '<pos>' if polarity > 0 else '<neg>'
    subreddit_tag = '<cns>' if row['subreddit'] == 'Conservative' else '<lib>'
    prefix = sentiment_tag + subreddit_tag

    # Widen the context to the whole thread: comment, ancestors, then title.
    thread_texts = thread_search(df, [row['comment']], index) + [row['thread_title']]
    thread_doc = nlp(" ".join(thread_texts))

    # One space-separated '<sentiment><subreddit>TOKEN' item per proper noun.
    encoded = ''.join(
      ' ' + prefix + token.text
      for token in thread_doc
      if token.tag_ == 'NNP' and not token.is_stop
    )

    # Threads without any proper noun contribute nothing.
    if encoded == '':
      continue

    labels.append(row['label'])
    polarities.append(polarity)
    preprocessed.append(encoded)

  return preprocessed, labels, polarities


def baseline_preprocess_data(df):
  """Tokenize every labeled comment for the bag-of-words baseline.

  Each row of `df` with a non-null `label` is parsed with spaCy's
  'en_core_web_sm' pipeline; whitespace tokens and stop words are dropped and
  the remaining token texts are kept in order.

  Args:
    df: DataFrame with `label` and `comment` columns.

  Returns:
    (preprocessed, labels): `preprocessed` is a list of token-text lists, one
    per labeled row, and `labels` is the parallel list of row labels.
  """
  nlp = spacy.load('en_core_web_sm')
  preprocessed, labels = [], []

  for _, row in df.iterrows():
    # Unlabeled rows are not part of the experiment.
    if pd.isnull(row['label']):
      continue

    # Tokenize right away so parsed documents are not all kept in memory.
    doc = nlp(row['comment'])
    preprocessed.append(
      [token.text for token in doc if not token.is_space and not token.is_stop]
    )
    labels.append(row['label'])

  return preprocessed, labels

In [5]:
# Load the dataset; the path is relative, so the CSV must sit in the working directory.
df = load_data('subreddit_irony_data.csv')
df

Unnamed: 0,comment_id,comment,subreddit,parent_id,label,thread_title
0,3,We are truly following the patterns of how the...,Conservative,,-1.0,Apparently capitalism doesn't work...
1,3,\n\nBut then because they don't see what else ...,Conservative,,-1.0,Apparently capitalism doesn't work...
2,5,Absolutely.,progressive,4809.0,-1.0,The health-insurance marketplaces at the cente...
3,5,I think we'd be hard pressed to find a websit...,progressive,4809.0,-1.0,The health-insurance marketplaces at the cente...
4,5,6 million or 5 million visits in 18 hours.,progressive,4809.0,-1.0,The health-insurance marketplaces at the cente...
...,...,...,...,...,...,...
4425,3548,"""\n\nI was all set to make jokes about this id...",progressive,,-1.0,GOP Congressional Candidate Told Gay Citizens ...
4426,3548,Just..,progressive,,-1.0,GOP Congressional Candidate Told Gay Citizens ...
4427,3548,how willfully blind do you have to be to beli...,progressive,,-1.0,GOP Congressional Candidate Told Gay Citizens ...
4428,3548,To want it to be that?!,progressive,,-1.0,GOP Congressional Candidate Told Gay Citizens ...


In [8]:
# Baseline preprocessing (disabled in this cell; main() at the bottom of the notebook runs it)
#baseline_X, baseline_y = baseline_preprocess_data(df)

# Full model preprocessing: encoded interaction strings, labels, and per-comment polarity scores
full_X, full_y, polarities = preprocess_data(df)

## Featurization
For the baseline model, comment text is transformed into unigram and bigram counts using DictVectorizer.

For the fully featurized model, a binary vector of all interaction features seen in the training data is built with CountVectorizer, and the polarity score of the comment text is appended as an extra feature.

In [25]:
from sklearn import feature_extraction
def full_feature_fn(X, polarities=None, vectorizer=None, isTest=False):
  """Featurize (nnp+ x sentiment x subreddit) + sentiment.

  Each document in `X` is a whitespace-separated string of interaction
  tokens. The tokens become binary indicator columns and the document's
  polarity score is appended as one extra trailing column.

  Args:
    X: iterable of preprocessed strings.
    polarities: one polarity score per document in `X`. Required, despite the
      `None` default that keeps the signature parallel to
      `baseline_feature_fn`.
    vectorizer: an already fitted vectorizer; required when `isTest` is not
      False and ignored otherwise.
    isTest: when exactly False, a new binary CountVectorizer is fitted on
      `X`; for any other value `X` is transformed with `vectorizer`.

  Returns:
    (X_array, vectorizer): a dense array with one row per document and
    (vocabulary size + 1) columns, and the vectorizer that produced it.

  Raises:
    ValueError: if `polarities` is None, or if `isTest` is not False and no
      vectorizer was supplied.
  """
  # Fail early with a clear message instead of an opaque TypeError/AttributeError later.
  if polarities is None:
    raise ValueError("polarities is required: pass one polarity score per document in X")

  # Create (nnp x sentiment x subreddit) interaction feature
  if isTest is False:
    vectorizer = CountVectorizer(binary=True, tokenizer=str.split)
    feat_X = vectorizer.fit_transform(X)
  else:
    if vectorizer is None:
      raise ValueError("a fitted vectorizer is required when isTest is True")
    feat_X = vectorizer.transform(X)

  # Densify so the polarity column can be appended with plain NumPy.
  X_array = feat_X.toarray()
  polarities = np.array(polarities)
  polarity_column = polarities.reshape(len(polarities), 1)
  X_array = np.concatenate((X_array, polarity_column), axis=1)

  return X_array, vectorizer
  

def baseline_feature_fn(X, polarities=None, vectorizer=None, isTest=False):
  """Turn tokenized comments into unigram + bigram count features.

  Args:
    X: iterable of token lists, one list per comment.
    polarities: unused; present so the signature matches `full_feature_fn`.
    vectorizer: fitted vectorizer to reuse when `isTest` is not False.
    isTest: when exactly False, a new DictVectorizer is fitted on the counts;
      for any other value the counts are transformed with `vectorizer`.

  Returns:
    (features, vectorizer): the vectorized counts and the vectorizer used.
  """
  count_dicts = []
  for tokens in X:
    # Adjacent token pairs, joined by a single space, form the bigrams.
    bigrams = [' '.join(pair) for pair in zip(tokens, tokens[1:])]
    # Count unigrams and bigrams together for this comment.
    count_dicts.append(Counter(tokens + bigrams))

  if isTest is not False:
    return vectorizer.transform(count_dicts), vectorizer

  vectorizer = DictVectorizer()
  return vectorizer.fit_transform(count_dicts), vectorizer

In [26]:
# Baseline featurization (disabled in this cell; main() at the bottom of the notebook exercises it)
#feat_X, dv = baseline_feature_fn(baseline_X, None, None, False)

# Fully featurized model featurization: fit a new vectorizer on the whole preprocessed set
feat_X, cv = full_feature_fn(full_X, polarities, None, False)

In [27]:
# Peek at the first five vocabulary entries of the vectorizer fitted in the previous cell.
print("Sample output of feature names:")
cv.get_feature_names_out()[0:5]

Sample output of feature names:


array(['<neg><cns>-', '<neg><cns>-evan', '<neg><cns>-liberal',
       '<neg><cns>.', '<neg><cns>/r'], dtype=object)

## Evaluate the Model
Run SGDClassifier using log loss and an L2 regularization penalty.

We use an 80-20 train-test split, similar to the paper, and repeat training and evaluation on 50 independent random splits.

In [16]:
def train_classifier(X_train, y_train, max_iter=5):
  """Fit a logistic-regression SGD classifier, tuning `alpha` by grid search.

  Args:
    X_train: training feature matrix.
    y_train: training labels.
    max_iter: accepted for interface compatibility but NOT forwarded to the
      estimator, which therefore runs with scikit-learn's own default.
      NOTE(review): wiring this through would change the results this
      notebook reports, so it is left as-is; decide deliberately.

  Returns:
    The fitted GridSearchCV object; its `predict` uses the best estimator.
  """
  # Grid search for alpha
  params = {'alpha': [1e-3, 1e-2, 1e-1]}

  # "log_loss" is the current name of the logistic loss; the former spelling
  # "log" was deprecated and later removed (requires scikit-learn >= 1.1).
  # NOTE: hinge loss could also be used here
  clf = SGDClassifier(loss="log_loss", penalty="l2", class_weight="balanced", n_jobs=-1)

  # Candidate alphas are compared by cross-validated F1 score.
  clf = GridSearchCV(clf, params, scoring="f1")

  return clf.fit(X_train, y_train)

def evaluate_model(X, y, clf):
  """Score `clf` on (X, y) and return (precision, recall).

  Predictions come from `clf.predict(X)`; both metrics are computed with
  scikit-learn's default settings against the true labels `y`.
  """
  predicted = clf.predict(X)
  precision = precision_score(y, predicted)
  recall = recall_score(y, predicted)

  return precision, recall

def summarize_metric(metric):
  """Find mean, median, 25th, and 75th percentiles of a collection of numbers.

  Args:
    metric: iterable of numeric values (e.g. per-split precision scores).
      It is not modified.

  Returns:
    Tuple (mean, median, 25th percentile, 75th percentile).
  """
  # sorted() works on a copy, so the caller's list keeps its original order
  # (e.g. per-iteration scores stay aligned with their iterations).
  ordered = sorted(metric)

  return np.mean(ordered), np.median(ordered), np.percentile(ordered, 25), np.percentile(ordered, 75)


In [31]:
def run_model(X, y, polarities=None, feature_fn=None, is_baseline=False,
              n_iter=50, test_size=0.2, random_state=None):
  """Repeatedly split, featurize, train and score; collect precision/recall.

  On every iteration the data is split at random, features are fitted on the
  training part only and applied to the test part, a classifier is trained
  with `train_classifier`, and precision/recall on the test part are stored.

  Args:
    X: preprocessed samples (token lists for the baseline, encoded strings
      for the full model).
    y: labels parallel to `X`.
    polarities: per-sample polarity scores; required unless `is_baseline`.
    feature_fn: featurizer called as `feature_fn(X_train, ...)` for fitting
      and with `vectorizer=..., isTest=True` for the test part. Required.
    is_baseline: when True, polarities are not passed to `feature_fn`.
    n_iter: number of random splits to evaluate (default 50).
    test_size: fraction of samples held out per split (default 0.2).
    random_state: None (default) draws splits from NumPy's global random
      state; an int seed or a `np.random.RandomState` makes the sequence of
      splits reproducible. Only the splits are seeded: the classifier built
      by `train_classifier` is not, so scores may still vary between runs.

  Returns:
    (precs, recalls): two lists with one value per iteration, in run order.

  Raises:
    ValueError: if `feature_fn` is None, or if `polarities` is None while
      `is_baseline` is False.
  """
  if feature_fn is None:
    raise ValueError("feature_fn is required (e.g. baseline_feature_fn or full_feature_fn)")
  if not is_baseline and polarities is None:
    raise ValueError("polarities is required unless is_baseline is True")

  precs = []
  recalls = []

  # Row positions are split alongside X and y so polarities can be sliced to match.
  indices = np.arange(len(X))
  if polarities is not None:
    polarities = np.array(polarities)

  # One generator shared by all iterations: each split differs, yet the whole
  # sequence is repeatable for a given seed.
  if random_state is None or isinstance(random_state, np.random.RandomState):
    rng = random_state
  else:
    rng = np.random.RandomState(random_state)

  for _ in tqdm(range(n_iter)):
    X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
      X, y, indices, test_size=test_size, random_state=rng)

    if not is_baseline:
      # Fully featurized model
      polarities_train = polarities[indices_train]
      polarities_test = polarities[indices_test]
      X_train_mat, vectorizer = feature_fn(X_train, polarities=polarities_train)
      X_test_mat, _ = feature_fn(X_test, polarities=polarities_test, vectorizer=vectorizer, isTest=True)
    else:
      # Baseline model
      X_train_mat, vectorizer = feature_fn(X_train)
      X_test_mat, _ = feature_fn(X_test, vectorizer=vectorizer, isTest=True)

    clf = train_classifier(X_train_mat, y_train)
    prec, recall = evaluate_model(X_test_mat, y_test, clf)
    precs.append(prec)
    recalls.append(recall)

  return precs, recalls

In [32]:
# Run baseline model
#precs, recalls = run_model(baseline_X, baseline_y, feature_fn=baseline_feature_fn, is_baseline=True)

# Run fully featurized model
#precs, recalls = run_model(full_X, full_y, polarities=polarities, feature_fn=full_feature_fn)

In [33]:
#print("(mean, median, 25th percentile, 75th percentile)")
#print(summarize_metric(recalls))
#print(summarize_metric(precs))

## One Cell Execution

In [34]:
def main():
  """Run the whole experiment end to end and print summary statistics.

  Loads the CSV, preprocesses it for both models, evaluates each model with
  `run_model`, and prints mean / median / 25th / 75th percentile of precision
  and recall, formatted to two decimals.
  """
  # Load dataset
  df = load_data('subreddit_irony_data.csv')

  # Preprocess
  baseline_X, baseline_y = baseline_preprocess_data(df)
  full_X, full_y, polarities = preprocess_data(df)

  # Run baseline and fully featurized models
  print("######## RUNNING BASELINE MODEL ########")
  baseline_precs, baseline_recalls = run_model(baseline_X, baseline_y, feature_fn=baseline_feature_fn, is_baseline=True)
  
  print("######## RUNNING FULLY FEATURIZED MODEL ########")
  full_precs, full_recalls = run_model(full_X, full_y, polarities=polarities, feature_fn=full_feature_fn)

  # Generate summary metrics
  # Each summarize_metric call yields (mean, median, 25th percentile, 75th percentile).
  # `bow_*` names hold the bag-of-words baseline, `np_*` the NP sentiment context model.
  bow_mean_precision, bow_median_precision, bow_25th_perc_precision, bow_75th_perc_precision = summarize_metric(baseline_precs)
  bow_mean_recall, bow_median_recall, bow_25th_perc_recall, bow_75th_perc_recall = summarize_metric(baseline_recalls)

  np_mean_precision, np_median_precision, np_25th_perc_precision, np_75th_perc_precision = summarize_metric(full_precs)
  np_mean_recall, np_median_recall, np_25th_perc_recall, np_75th_perc_recall = summarize_metric(full_recalls)
  
  #do not edit below this line

  def fformat(f):
    return "%.2f" % f

  print("Bag of Words Baseline")
  print("Precision")
  print(fformat(bow_mean_precision), fformat(bow_median_precision), fformat(bow_25th_perc_precision), fformat(bow_75th_perc_precision))
  print("Recall")
  print(fformat(bow_mean_recall), fformat(bow_median_recall), fformat(bow_25th_perc_recall), fformat(bow_75th_perc_recall))

  print("NP Sentiment Context Model")
  print("Precision")
  print(fformat(np_mean_precision), fformat(np_median_precision), fformat(np_25th_perc_precision), fformat(np_75th_perc_precision))
  print("Recall")
  print(fformat(np_mean_recall), fformat(np_median_recall), fformat(np_25th_perc_recall), fformat(np_75th_perc_recall))


In [35]:
# Run the full pipeline: load, preprocess, evaluate both models, print the summary.
main()

######## RUNNING BASELINE MODEL ########


100%|██████████| 50/50 [00:25<00:00,  1.99it/s]


######## RUNNING FULLY FEATURIZED MODEL ########


100%|██████████| 50/50 [04:48<00:00,  5.76s/it]

Bag of Words Baseline
Precision
0.11 0.10 0.09 0.12
Recall
0.29 0.30 0.25 0.33
NP Sentiment Context Model
Precision
0.19 0.18 0.15 0.21
Recall
0.41 0.43 0.35 0.45



