## The majority of the code in this notebook is work produced by Khodak et al. (2017) and can be found at the following links: https://github.com/NLPrinceton/SARC and https://github.com/NLPrinceton/text_embedding

## Code from these repositories is referenced below.



## Import libraries and data

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
import json
from sklearn.model_selection import train_test_split

import argparse
import nltk
from sklearn.linear_model import LogisticRegressionCV as LogitCV
from sklearn.preprocessing import normalize
from sklearn.metrics import f1_score

from collections import Counter
from itertools import chain
from itertools import groupby
from operator import itemgetter
#from string import punctuation
from unicodedata import category
import nltk
import numpy as np
from scipy import sparse as sp

In [0]:
# Colab-only helper: mounts the user's Google Drive into the runtime's filesystem.
from google.colab import drive
# Interactive step: the mount prompts for an authorization code when run.
drive.mount('/content/gdrive')


### SARC Directory Paths ###
# Folder (on the mounted Drive) that holds the training/test CSV files read below.
# The trailing slash matters: file names are appended by plain string concatenation.
SARC_POL = '/content/gdrive/My Drive/SARC pol/project_data/'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
############################################ Khodak et al. 2017

def tokenize(documents):
  '''splits every document into its list of tokens
  Args:
    documents: iterable of strings
  Returns:
    list with one list of token strings per document, in input order
  '''

  tokenized = []
  for text in documents:
    # split_on_punctuation is a generator; materialize it per document
    tokenized.append(list(split_on_punctuation(text)))
  return tokenized

###############################################################

In [0]:
############################################ Khodak et al. 2017

# First letters of the Unicode general categories treated as punctuation:
# M = marks, P = punctuation, S = symbols (replaces string.punctuation).
PUNCTUATION = {'M', 'P', 'S'}
# Integer dtype used for unweighted bag-of-features counts in docs2bofs.
UINT = np.uint16
from unicodedata import category

def split_on_punctuation(document):
  '''tokenizes a string on whitespace and on punctuation boundaries
  Args:
    document: string
  Returns:
    generator of strings; each whitespace-separated token is cut wherever a
    punctuation character meets a non-punctuation character
  '''

  def is_punctuation(char):
    # classify by the major class of the character's Unicode category
    return category(char)[0] in PUNCTUATION

  for word in document.split():
    # consecutive characters of the same class form one output token
    for _, run in groupby(word, key=is_punctuation):
      yield ''.join(run)
        
###############################################################

In [0]:
############################################ Khodak et al. 2017

def feature_counts(documents):
  '''counts how often each feature occurs across all documents
  Args:
    documents: iterable of lists of hashable features
  Returns:
    Counter (dict subclass) mapping each feature to its total number of occurrences
  '''

  # flatten the documents into one stream of features and tally it
  all_features = chain.from_iterable(documents)
  return Counter(all_features)


def feature_vocab(documents, min_count=1, sorted_features=sorted):
  '''builds a feature -> index vocabulary from featurized documents
  Args:
    documents: iterable of lists of hashable features
    min_count: a feature is kept only if it occurs at least this many times overall
    sorted_features: callable that takes the kept features and returns them in the desired order
  Returns:
    {feature: index} dict, indices following the order produced by sorted_features
  '''

  counts = feature_counts(documents)
  # lazily filter out features that are too rare
  frequent = (feat for feat, count in counts.items() if count >= min_count)

  vocab = {}
  for index, feat in enumerate(sorted_features(frequent)):
    vocab[feat] = index
  return vocab


def docs2bofs(documents, vocabulary=None, weights=None, default=1.0, format='csr', **kwargs):
  '''constructs sparse BoF representations from featurized documents
  Args:
    documents: sized, re-iterable collection (e.g. list) of lists of hashable features; len() is taken of it
    vocabulary: dict mapping features to indices (nonnegative ints) or a list of features; if None will compute automatically from documents
    weights: dict mapping features to weights (floats) or a list/np.ndarray of weights; if None will compute unweighted BoFs
    default: default feature weight if feature not in weights; only used when weights is a dict
    format: sparse matrix format of the returned matrix
    kwargs: passed to feature_vocab; ignored if vocabulary is not None
  Returns:
    sparse BoF matrix in the requested format of size (len(documents), len(vocabulary));
    features that are not in the vocabulary are dropped. Unweighted matrices use dtype UINT.
  '''

  if vocabulary is None:
    vocabulary = feature_vocab(documents, **kwargs)
  elif isinstance(vocabulary, list):
    vocabulary = {feat: i for i, feat in enumerate(vocabulary)}

  m = len(documents)
  V = len(vocabulary)

  # count (document row, vocabulary column) pairs; out-of-vocabulary features map to column -1
  pair_counts = Counter((i, vocabulary.get(feat, -1)) for i, doc in enumerate(documents) for feat in doc)
  entries = [(row, col, count) for (row, col), count in pair_counts.items() if col != -1]

  def count_matrix(**coo_kwargs):
    # zip(*entries) cannot be unpacked into three names when there are no
    # in-vocabulary features at all, so build an all-zero matrix in that case
    if not entries:
      return sp.coo_matrix((m, V), **coo_kwargs)
    rows, cols, values = zip(*entries)
    return sp.coo_matrix((values, (rows, cols)), shape=(m, V), **coo_kwargs)

  if weights is None:
    return count_matrix(dtype=UINT).asformat(format)
  bofs = count_matrix().tocsr()

  if isinstance(weights, dict):
    # start from the default so every column has a defined weight
    diag = np.full(V, default, dtype=float)
    for feat, i in vocabulary.items():
      diag[i] = weights.get(feat, default)
  else:
    assert len(weights) == V, "if weights passed as a list/np.ndarray, length must be same as vocabulary size"
    diag = np.asarray(weights)

  # scale every column by its feature weight
  return bofs.dot(sp.diags(diag, 0)).asformat(format)

        
###############################################################

## Run the classifier

In [0]:
filename = ['project_training_100.csv','project_training_50.csv','project_training_25.csv','project_training_12.csv','project_training_6.csv']

# Random seeds the classifier is refit with for every training split.
# NOTE(review): the recorded output shows identical metrics for every seed, so
# random_state may have no effect with this solver configuration -- confirm.
seeds = [0, 24, 729, 857, 403]

# n-gram settings: use all k-grams for k = 1..n, keep features seen at least min_count times
n = 1
min_count = 1


def _split_pairs(items):
  '''splits an interleaved list into its even- and odd-positioned elements
  Args:
    items: list in which consecutive rows alternate between the two responses
  Returns:
    {0: elements at even positions, 1: elements at odd positions}, the form used by the code from Khodak et al.
  '''

  return {0: items[0::2], 1: items[1::2]}


def _ngram_features(tokenized_docs, max_n):
  '''turns tokenized documents into lists of k-gram tuples for k = 1..max_n
  Args:
    tokenized_docs: list of lists of tokens
    max_n: largest n-gram order to include
  Returns:
    list (one entry per document) of lists of n-gram tuples, lower orders first
  '''

  return [[gram for k in range(1, max_n+1) for gram in nltk.ngrams(doc, k)] for doc in tokenized_docs]


# Build a dict to contain all the wrong predictions for each split
wrongPredDict = dict()

# Let's set up lists to append into so we can plot our results
trainAccList = []
testAccList = []
balScoreList = []
f1ScoreList = []

#Load in the test set
validdf = pd.read_csv(SARC_POL+'balanced_test.csv', index_col = 0)

# The test set is the same for every training split, so prepare it once.
# Only use responses for this method. Ignore ancestors.
test_resp = validdf['response'].values.tolist()
test_docs = _split_pairs(test_resp)
test_labels = _split_pairs(validdf['label'].values.tolist())

# The balanced score compares row i with row i+nAncestors, so both halves must line up.
assert len(test_labels[0]) == len(test_labels[1]), "test set must contain an even number of rows (two responses per ancestor)"

test_all_docs_tok = tokenize(test_docs[0] + test_docs[1])
test_all_labels = np.array(test_labels[0] + test_labels[1])
test_ngrams = _ngram_features(test_all_docs_tok, n)
nAncestors = len(test_labels[0])

# True labels in the same row order as test_all_vecs (all first responses, then all second responses)
y_true = test_labels[0] + test_labels[1]

for fname in filename:

  #Load in the training set
  traindf = pd.read_csv(SARC_POL+fname, index_col = 0)

  # Only use responses for this method. Ignore ancestors.
  train_resp = traindf['response'].values.tolist()
  train_docs = _split_pairs(train_resp)
  train_labels = _split_pairs(traindf['label'].values.tolist())

  ############################################ Khodak et al. 2017

  # Train a classifier on all responses in training data. We will later use this
  # classifier to determine for every sequence which of the 2 responses is more sarcastic.
  train_all_docs_tok = tokenize(train_docs[0] + train_docs[1])
  train_all_labels = np.array(train_labels[0] + train_labels[1])

  train_ngrams = _ngram_features(train_all_docs_tok, n)
  # The vocabulary comes from the training split only; unseen test features are dropped by docs2bofs.
  vocabulary = feature_vocab(train_ngrams, min_count=min_count)
  train_all_vecs = docs2bofs(train_ngrams, vocabulary)
  test_all_vecs = docs2bofs(test_ngrams, vocabulary)

  print(fname, "\n----------------------------------")

  wrongPred = []

  for seed in seeds:

    # Evaluate this classifier on all responses.
    clf = LogitCV(Cs=[10**i for i in range(-2, 3)], fit_intercept=False, cv=2, dual=np.less(*train_all_vecs.shape), solver='liblinear', n_jobs=-1, random_state=seed)
    clf.fit(train_all_vecs, train_all_labels)
    trainAcc = clf.score(train_all_vecs, train_all_labels)
    testAcc = clf.score(test_all_vecs, test_all_labels)
    print('Train acc: ', trainAcc)
    print('Test acc: ', testAcc)

  ###############################################################

    # Probability of the sarcastic class (label 1) for every test response,
    # computed once per fit instead of once per row.
    sarcasmProbs = clf.predict_proba(test_all_vecs)[:, 1]

    # Balanced Test Score Calculation
    countCorrect = 0
    for i in range(nAncestors):

      # For each ancestor grab the two responses
      scoreResponse0 = sarcasmProbs[i]
      scoreResponse1 = sarcasmProbs[i+nAncestors]

      # Calculate which response is more likely to be sarcastic and check whether it is the case
      if scoreResponse0 > scoreResponse1 and test_labels[0][i] == 1:
        countCorrect += 1

      elif scoreResponse1 > scoreResponse0 and test_labels[1][i] == 1:
        countCorrect += 1

      # Collect the responses that we have misclassified (only for the last seed)
      if seed == seeds[-1]:

        if scoreResponse0 > scoreResponse1 and test_labels[0][i] == 0:
          wrongPred.append(test_all_docs_tok[i])

        elif scoreResponse1 > scoreResponse0 and test_labels[1][i] == 0:
          wrongPred.append(test_all_docs_tok[nAncestors+i])

    balScore = countCorrect/nAncestors
    print("Balanced Test Score:", balScore)

    # F1 Prediction
    # For each response, predict label 1 when its sarcasm probability exceeds 0.5, and compare with its true label
    y_pred = [1 if score > 0.5 else 0 for score in sarcasmProbs]

    f1Score = f1_score(y_true, y_pred, pos_label=1, average='binary', sample_weight=None)
    print("F1 Score:", f1Score, "\n")

    # Append all of our results

    trainAccList.append(trainAcc)
    testAccList.append(testAcc)
    balScoreList.append(balScore)
    f1ScoreList.append(f1Score)

  wrongPredDict[fname] = wrongPred

  print("---------------------------------- \n\n")
  

project_training_100.csv 
----------------------------------
Train acc:  0.7784891165172856
Test acc:  0.694950088079859
Balanced Test Score: 0.7469172049324722
F1 Score: 0.6854374810778081 

Train acc:  0.7784891165172856
Test acc:  0.694950088079859
Balanced Test Score: 0.7469172049324722
F1 Score: 0.6854374810778081 

Train acc:  0.7784891165172856
Test acc:  0.694950088079859
Balanced Test Score: 0.7469172049324722
F1 Score: 0.6854374810778081 

Train acc:  0.7784891165172856
Test acc:  0.694950088079859
Balanced Test Score: 0.7469172049324722
F1 Score: 0.6854374810778081 

Train acc:  0.7784891165172856
Test acc:  0.694950088079859
Balanced Test Score: 0.7469172049324722
F1 Score: 0.6854374810778081 

---------------------------------- 


project_training_50.csv 
----------------------------------
Train acc:  0.7943651664837176
Test acc:  0.6723429242513213
Balanced Test Score: 0.731650029359953
F1 Score: 0.6582976117575016 

Train acc:  0.7943651664837176
Test acc:  0.67234292425

## Finished