## The majority of the code in this notebook is work produced by Khodak et al. (2017) and can be found at the following links: https://github.com/NLPrinceton/SARC and https://github.com/NLPrinceton/text_embedding

## Code from these repositories has been referenced below.

## Import libraries and data

In [1]:
!pip install allennlp

Collecting allennlp
[?25l  Downloading https://files.pythonhosted.org/packages/64/32/d6d0a93a23763f366df2dbd4e007e45ce4d2ad97e6315506db9da8af7731/allennlp-0.8.2-py3-none-any.whl (5.6MB)
[K    100% |████████████████████████████████| 5.6MB 3.1MB/s 
[?25hCollecting unidecode (from allennlp)
[?25l  Downloading https://files.pythonhosted.org/packages/31/39/53096f9217b057cb049fe872b7fc7ce799a1a89b76cf917d9639e7a558b5/Unidecode-1.0.23-py2.py3-none-any.whl (237kB)
[K    100% |████████████████████████████████| 245kB 25.0MB/s 
[?25hCollecting sqlparse==0.2.4 (from allennlp)
  Downloading https://files.pythonhosted.org/packages/65/85/20bdd72f4537cf2c4d5d005368d502b2f464ede22982e724a82c86268eda/sqlparse-0.2.4-py2.py3-none-any.whl
Collecting matplotlib==2.2.3 (from allennlp)
[?25l  Downloading https://files.pythonhosted.org/packages/9e/59/f235ab21bbe7b7c6570c4abf17ffb893071f4fa3b9cf557b09b60359ad9a/matplotlib-2.2.3-cp36-cp36m-manylinux1_x86_64.whl (12.6MB)
[K    100% |██████████████████████

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
import json
from sklearn.model_selection import train_test_split
import pickle

import argparse
import nltk
from sklearn.linear_model import LogisticRegressionCV as LogitCV
from sklearn.preprocessing import normalize
from sklearn.metrics import f1_score

from collections import Counter
from itertools import chain
from itertools import groupby
from operator import itemgetter
#from string import punctuation
from unicodedata import category
import nltk
import numpy as np
from scipy import sparse as sp

from allennlp.commands.elmo import ElmoEmbedder

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [3]:
# Mount Google Drive inside the Colab VM; this triggers the interactive
# authorisation prompt shown in the cell output.
from google.colab import drive
drive.mount('/content/gdrive')

# Root folder (on the mounted drive) that the CSV files read below are loaded from.
# The trailing slash matters: file names are appended with plain string concatenation.
SARC_POL = '/content/gdrive/My Drive/SARC pol/project_data/'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
############################################ Khodak et al. 2017

def tokenize(documents):
  '''splits every document into a list of tokens
  Args:
    documents: iterable of strings
  Returns:
    list with one list of string tokens per document, in input order
  '''

  tokenized = []
  for text in documents:
    # split_on_punctuation is a generator; materialise it per document
    tokenized.append(list(split_on_punctuation(text)))
  return tokenized

###############################################################

In [0]:
############################################ Khodak et al. 2017

#PUNCTUATION = set(punctuation)
# First letters of the Unicode general categories that count as "punctuation"
# for tokenisation purposes: M (marks), P (punctuation), S (symbols).
PUNCTUATION = set('MPS')
UINT = np.uint16
from unicodedata import category

def split_on_punctuation(document):
  '''tokenizes a string on whitespace and on punctuation boundaries
  Args:
    document: string
  Returns:
    generator of strings; every whitespace-separated token is cut wherever a
    punctuation character (see PUNCTUATION) meets a non-punctuation character
  '''

  for word in document.split():
    # One flag per character: does its Unicode category start with M, P or S?
    is_punct = [category(ch)[0] in PUNCTUATION for ch in word]
    start = 0
    for pos in range(1, len(word)):
      # A change of flag between neighbours closes the current chunk.
      if is_punct[pos] != is_punct[pos - 1]:
        yield word[start:pos]
        start = pos
    # str.split() never produces empty tokens, so the tail is always non-empty.
    yield word[start:]

###############################################################

In [0]:
############################################ Khodak et al. 2017

def feature_counts(documents):
  '''tallies how often each feature occurs across all documents
  Args:
    documents: iterable of lists of hashable features
  Returns:
    Counter (dict subclass) mapping each feature to its total count
  '''

  tally = Counter()
  for doc in documents:
    for feat in doc:
      tally[feat] += 1
  return tally


def feature_vocab(documents, min_count=1, sorted_features=sorted):
  '''builds a feature -> index mapping from featurized documents
  Args:
    documents: iterable of lists of hashable features
    min_count: features occurring fewer times than this are left out
    sorted_features: callable that orders the kept features; indices follow that order
  Returns:
    {feature: index} dict
  '''

  counts = feature_counts(documents)
  # Lazily filter the counted features; the ordering callable consumes the generator.
  frequent = (feat for feat, n in counts.items() if n >= min_count)
  vocab = {}
  for position, feat in enumerate(sorted_features(frequent)):
    vocab[feat] = position
  return vocab


def docs2bofs(documents, vocabulary=None, weights=None, default=1.0, format='csr', **kwargs):
  '''constructs sparse BoF representations from featurized documents
  Args:
    documents: iterable of lists of hashable features
    vocabulary: dict mapping features to indices (nonnegative ints) or a list of features; if None will compute automatically from documents
    weights: dict mapping features to weights (floats) or a list/np.ndarray of weights; if None will compute unweighted BoFs
    default: default feature weight if not feature in weights; ignored if weights is None
    format: sparse matrix format of the returned matrix
    kwargs: passed to feature_vocab; ignored if not vocabulary is None
  Returns:
    sparse BoF matrix in the requested format of size (len(documents), len(vocabulary));
    features missing from the vocabulary are ignored
  '''

  # documents is iterated (possibly twice) and then measured, so materialise it once;
  # this also makes one-shot iterables such as generators work.
  documents = list(documents)

  if vocabulary is None:
    vocabulary = feature_vocab(documents, **kwargs)
  elif type(vocabulary) == list:
    vocabulary = {feat: i for i, feat in enumerate(vocabulary)}

  # Count (row, column) pairs; out-of-vocabulary features are mapped to column -1 and dropped.
  pair_counts = Counter((i, vocabulary.get(feat, -1)) for i, doc in enumerate(documents) for feat in doc)
  entries = [(row, col, count) for (row, col), count in pair_counts.items() if col != -1]
  if entries:
    rows, cols, values = zip(*entries)
  else:
    # zip(*[]) cannot be unpacked into three names; an all-zero matrix is the right answer here.
    rows, cols, values = [], [], []

  m = len(documents)
  V = len(vocabulary)
  if weights is None:
    return sp.coo_matrix((values, (rows, cols)), shape=(m, V), dtype=UINT).asformat(format)
  bofs = sp.coo_matrix((values, (rows, cols)), shape=(m, V)).tocsr()

  if type(weights) == dict:
    # Start from the default so that indices not covered by the vocabulary are never uninitialised.
    diag = np.full(V, default, dtype=float)
    for feat, i in vocabulary.items():
      diag[i] = weights.get(feat, default)
  else:
    assert len(weights) == V, "if weights passed as a list/np.ndarray, length must be same as vocabulary size"
    if type(weights) == list:
      diag = np.array(weights)
    else:
      diag = weights
  # Right-multiplying by a diagonal matrix scales column j by the weight of feature j.
  return bofs.dot(sp.diags(diag, 0)).asformat(format)

###############################################################

In [0]:
############################################ Khodak et al. 2017
import numpy as np
from numpy.linalg import norm
from scipy.linalg import svd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import normalize


FLOAT = np.float32
# NOTE: filepath for Common Crawl GloVe embeddings goes here
CCGLOVE = "/content/gdrive/My Drive/SARC pol/embeddings/glove/amazon_glove1600.txt"


# NOTE: Some files have 2d or 2d+2 numbers on each line, with the last d of them being meaningless; avoid loading them by setting dimension=d
def load(vectorfile, vocabulary=None, dimension=None):
  '''generates word embeddings from file
  Args:
    vectorfile: word embedding text file or HDF5 file with keys 'words' and 'vectors'
    vocabulary: dict/set of strings, or int specifying number of words to load; if None loads all words from file
    dimension: number of dimensions to load (text files only; HDF5 vectors are yielded as stored)
  Returns:
    (word, vector) generator; stops after len(vocabulary) matches (or after `vocabulary` words when it is an int)
  '''

  # h5py is not imported anywhere at the top of this notebook, so referencing it directly
  # raised NameError (which `except OSError` does not catch) and the text-file path was
  # never reached. Treat it as optional: without it only text files can be read.
  try:
    import h5py
  except ImportError:
    h5py = None

  # Normalise `vocabulary` into a membership filter plus a maximum number of words V.
  if vocabulary is None:
    V = float('inf')
  elif type(vocabulary) == int:
    V = vocabulary
    vocabulary = None
  else:
    V = len(vocabulary)

  hdf5 = None
  if h5py is not None:
    try:
      hdf5 = h5py.File(vectorfile, 'r')
    except OSError:
      # Not an HDF5 file (or unreadable as one): fall through to the text reader,
      # which will raise on its own if the path cannot be opened at all.
      hdf5 = None

  n = 0
  if hdf5 is not None:
    # Copy both datasets into memory, then release the file handle immediately so it
    # is not leaked when the consumer abandons the generator early.
    with hdf5:
      words, vectors = np.array(hdf5['words']), np.array(hdf5['vectors'])
    for word, vector in zip(words, vectors):
      if n >= V:
        break
      if vocabulary is None or word in vocabulary:
        yield word, vector
        n += 1
    return

  dimension = -1 if dimension is None else dimension  # count=-1 makes np.fromstring read every number

  with open(vectorfile, 'r') as f:
    for line in f:
      # Checked before reading so that a limit of 0 yields nothing at all.
      if n >= V:
        break
      word, sep, rest = line.partition(' ')
      if not sep:
        # Blank line or a line without any vector part: nothing to parse.
        continue
      if vocabulary is None or word in vocabulary:
        yield word, np.fromstring(rest, dtype=FLOAT, count=dimension, sep=' ')
        n += 1


def text2hdf5(textfile, hdf5file, **kwargs):
  '''converts word embeddings file from text to HDF5 format
  Args:
      textfile: word embeddings file in format "word float ... float\n"
      hdf5file: output file ; will have keys 'words' and 'vectors'
      kwargs: passed to load
  Returns:
      None
  Raises:
      ValueError: if no embeddings could be loaded from textfile
  '''

  # h5py is not imported at the top of this notebook; import it where it is needed.
  import h5py

  pairs = list(load(textfile, **kwargs))
  if not pairs:
    raise ValueError("no embeddings loaded from %r; nothing to write" % (textfile,))
  words, vectors = zip(*pairs)

  # Mode is given explicitly: the default mode of h5py.File changed between h5py
  # releases, and 'a' (read/write, create if missing) is needed to create datasets.
  # The context manager closes the file even if writing fails part-way.
  with h5py.File(hdf5file, 'a') as f:
    f.create_dataset('words', (len(words),), dtype=h5py.special_dtype(vlen=str))
    for i, word in enumerate(words):
        f['words'][i] = word
    f.create_dataset('vectors', data=np.vstack(vectors))


def vocab2mat(vocabulary=None, random=None, vectorfile=CCGLOVE, dimension=None, unit=True):
  '''constructs matrix of word vectors
  Args:
    vocabulary: dict mapping strings to indices, or iterable of strings, or int specifying vocab size; if None loads all words in vectorfile
    random: type ('Gaussian' or 'Rademacher') of random vectors to use; if None uses pretrained vectors; if tuple (low, high) uses uniform distribution over [low, high)
    vectorfile: word embedding text file; ignored if not random is None
    dimension: embedding dimension; required for random vectors, optional for pretrained ones
    unit: normalize embeddings (applied to pretrained and Gaussian vectors; uniform and Rademacher matrices are returned as generated)
  Returns:
    numpy matrix of size (len(vocabulary), dimension)
  Raises:
    ValueError: if dimension is None and no vocabulary word is found in vectorfile
    NotImplementedError: if random is a string other than 'gaussian'/'rademacher' (case-insensitive)
  '''

  assert random is None or not vocabulary is None, "needs vocabulary size information for random vectors"
  assert random is None or not dimension is None, "needs dimension information for random vectors"

  if random is None:

    if type(vocabulary) == set:
      vocabulary = sorted(vocabulary)
    if type(vocabulary) == list:
      vocabulary = {word: i for i, word in enumerate(vocabulary)}
    if type(vocabulary) == dict:
      # Rows of words missing from vectorfile stay all-zero.
      # With dimension=None the width is taken from the first loaded vector
      # (np.zeros cannot be sized with None).
      matrix = None if dimension is None else np.zeros((len(vocabulary), dimension), dtype=FLOAT)
      for word, vector in load(vectorfile, vocabulary, dimension):
        if matrix is None:
          matrix = np.zeros((len(vocabulary), len(vector)), dtype=FLOAT)
        matrix[vocabulary[word]] = vector
      if matrix is None:
        raise ValueError("cannot infer embedding dimension: no vocabulary word found in vectorfile; pass dimension explicitly")
    else:
      # np.vstack needs a real sequence; passing a generator is rejected by current NumPy.
      matrix = np.vstack([vector for _, vector in load(vectorfile, vocabulary, dimension)])
  
  else:

    if not type(vocabulary) == int:
      vocabulary = len(vocabulary)
    if type(random) == tuple:
      return np.random.uniform(*random, size=(vocabulary, dimension)).astype(FLOAT)
    elif random.lower() == 'gaussian':
      matrix = np.random.normal(scale=1.0/np.sqrt(dimension), size=(vocabulary, dimension)).astype(FLOAT)
    elif random.lower() == 'rademacher':
      # Entries are +-1/sqrt(dimension).
      return (2.0*np.random.randint(2, size=(vocabulary, dimension)).astype(FLOAT)-1.0)/np.sqrt(dimension)
    else:
      raise(NotImplementedError)

  if unit:
    return normalize(matrix)
  return matrix


def vocab2vecs(vocabulary=None, random=None, vectorfile=CCGLOVE, dimension=None, unit=True):
  '''builds a word -> vector dictionary
  Args:
    vocabulary: iterable of strings, or int giving the number of words to load; None loads every word in vectorfile
    random: kind of random vectors to generate (forwarded to vocab2mat); None means pretrained vectors are read from vectorfile
    vectorfile: word embedding file; only read when random is None
    dimension: embedding dimension
    unit: if True each pretrained vector is divided by its norm (for random vectors the flag is forwarded to vocab2mat)
  Returns:
    {word: vector} dict; words not in vectorfile are not included
  '''

  wants_random = random is not None
  has_words = not (vocabulary is None or type(vocabulary) == int)
  assert not wants_random or has_words, "needs word information for random vectors"

  if wants_random:
    # Pair each word with the corresponding row of a freshly generated matrix.
    rows = vocab2mat(vocabulary, random=random, dimension=dimension, unit=unit)
    return dict(zip(vocabulary, rows))

  pairs = load(vectorfile, vocabulary, dimension)
  if not unit:
    return dict(pairs)
  return {word: vector/norm(vector) for word, vector in pairs}


def docs2vecs(documents, f2v=None, weights=None, default=1.0, avg=False, **kwargs):
  '''computes document embeddings from documents
  Args:
    documents: iterable of lists of hashable features
    f2v: dict mapping features to vectors; if None will compute this using vocab2vecs
    weights: dict mapping features to weights; unweighted if None
    default: default weight to assign if feature not in weights; ignored if weights is None
    avg: divide embeddings by the document length
    kwargs: passed to vocab2vecs; ignored if not f2v is None
  Returns:
    matrix of size (len(documents), dimension); features without a vector contribute zeros
  '''

  # documents may be traversed twice (vocabulary collection, then embedding).
  documents = list(documents)

  if f2v is None:
    # Collect the distinct features over all documents. The inner loop must run over
    # the single `document`; iterating `documents` again tried to hash whole lists.
    f2v = vocab2vecs({word for document in documents for word in document}, **kwargs)
    dimension = kwargs.get('dimension', 300)
  else:
    dimensions = {v.shape for v in f2v.values()}
    assert len(dimensions) == 1, "all feature vectors must have same dimension"
    dimension = dimensions.pop()  # a shape tuple, which np.zeros accepts directly
  if not weights is None:
    f2v = {feat: weights.get(feat, default)*vec for feat, vec in f2v.items()}
    
  z = np.zeros(dimension, dtype=FLOAT)
  rows = []
  for document in documents:
    # sum() starts from the zero vector, so empty documents embed to zeros.
    total = sum((f2v.get(feat, z) for feat in document), z)
    rows.append(total / max(1.0, len(document)) if avg else total)

  if not rows:
    # np.vstack raises on an empty list; return an empty matrix of the right width instead.
    return np.zeros((0,) + z.shape, dtype=FLOAT)
  # np.vstack needs a real sequence; passing a generator is rejected by current NumPy.
  return np.vstack(rows)


class OrthogonalProcrustes:
  '''sklearn-style estimator for the Orthogonal Procrustes problem
  '''

  def __init__(self, fit_intercept=False):
    '''stores the configuration
    Args:
      fit_intercept: if True, both inputs are mean-centred before the rotation is fitted
    Returns:
      None
    '''

    self.fit_intercept = fit_intercept

  def fit(self, X, Y):
    '''finds orthogonal matrix M minimizing |XM^T-Y|
    Args:
      X: numpy array of shape (n, d)
      Y: numpy array of shape (n, d)
    Returns:
      self, with attributes coef_ (numpy array of shape (d, d)) and
      intercept_ (numpy array of shape (d,); all zeros unless fit_intercept is set)
    '''

    centred = self.fit_intercept
    if centred:
      mean_x = np.mean(X, axis=0)
      mean_y = np.mean(Y, axis=0)
      X = X - mean_x
      Y = Y - mean_y

    # The product of the singular-vector factors of Y^T X is the fitted matrix.
    left, _singular, right_t = svd(np.dot(Y.T, X))
    rotation = np.dot(left, right_t)
    self.coef_ = rotation

    if centred:
      # Offset that maps the rotated mean of X onto the mean of Y.
      self.intercept_ = mean_y - rotation.dot(mean_x)
    else:
      self.intercept_ = np.zeros(rotation.shape[0], dtype=rotation.dtype)
    return self


def align_vocab(func):
  '''wrapper to align vocab to allow word-to-vector dict inputs to functions taking two word-vector matrices as inputs
  Args:
    func: function whose first two arguments are numpy matrices with matching rows
  Returns:
    function accepting either two numpy arrays (passed through unchanged) or two
    {word: vector} dicts, which are reduced to their shared words (in sorted order)
    and stacked into matrices before func is called
  '''

  from functools import wraps

  @wraps(func)  # keep the wrapped function's name and docstring
  def wrapper(X, Y, *args, **kwargs):
    assert type(X) == type(Y), "first two arguments must be the same type"
    if type(X) == dict:
      vocab = sorted(set(X.keys()).intersection(Y.keys()))
      if not vocab:
        raise ValueError("the two word-vector dicts share no words")
      # np.vstack needs a real sequence; passing a generator is rejected by current NumPy.
      X = np.vstack([X[w] for w in vocab])
      Y = np.vstack([Y[w] for w in vocab])
    else:
      assert type(X) == np.ndarray, "first two arguments must be 'dict' or 'numpy.ndarray'"
    # Extra positional arguments are forwarded too, so decorated functions can still
    # be called positionally with their full signature.
    return func(X, Y, *args, **kwargs)

  return wrapper


@align_vocab
def best_transform(source, target, orthogonal=True, fit_intercept=False):
  '''fits the least-squares map from one set of word embeddings onto another
  Args:
    source: numpy array of size (len(vocabulary), dimension) or dict mapping words to vectors; must be same type as target
    target: numpy array of size (len(vocabulary), dimension) or dict mapping words to vectors; must be same type as source
    orthogonal: if True the map is fitted with OrthogonalProcrustes, otherwise with LinearRegression
    fit_intercept: forwarded to the chosen estimator
  Returns:
    (coef, intercept) tuple of numpy arrays, both cast to target's dtype
  '''

  if not orthogonal:
    model = LinearRegression(fit_intercept=fit_intercept).fit(source, target)
    if not fit_intercept:
      # Replace the estimator's intercept with an explicit zero vector so the
      # .astype call below always operates on an array.
      model.intercept_ = np.zeros(target.shape[1])
  else:
    model = OrthogonalProcrustes(fit_intercept=fit_intercept).fit(source, target)

  out_dtype = target.dtype
  return model.coef_.astype(out_dtype), model.intercept_.astype(out_dtype)


@align_vocab
def average_cosine_similarity(X, Y):
  '''computes the average cosine similarity between two sets of word embeddings
  Args:
    X: numpy array of size (len(vocabulary), dimension) or dict mapping words to vectors; must be same type as Y
    Y: numpy array of size (len(vocabulary), dimension) or dict mapping words to vectors; must be same type as X
  Returns:
    mean over rows of the dot product between the normalize()d rows of X and Y
  '''

  unit_x = normalize(X)
  unit_y = normalize(Y)
  # Element-wise product summed along each row gives one similarity per word.
  per_word = (unit_x * unit_y).sum(1)
  return np.mean(per_word)

###############################################################

In [0]:
def get_elmo_embeddings(elmo, tokens):
    '''embeds tokenized documents and mean-pools each one into a single vector
    Args:
      elmo: object exposing embed_sentences(tokens), which yields one array per document
      tokens: list of token lists, one per document
    Returns:
      numpy array with one pooled vector per document
    '''

    pooled = [
        # Inner average collapses axis 0 of the raw output (the ELMo layers, per the
        # original comment); the outer average then collapses the word axis.
        np.average(np.average(layers, axis=0), axis=0)
        for layers in elmo.embed_sentences(tokens)
    ]
    return np.array(pooled)

In [0]:
# NOTE(review): this only binds a Python variable called CUDA_VISIBLE_DEVICES; it does
# not set the CUDA_VISIBLE_DEVICES environment variable (that would need os.environ or
# the %env magic), and nothing else in the visible notebook reads this name.
# Presumably it was meant to pick a GPU - confirm the intent before changing it.
CUDA_VISIBLE_DEVICES = 1

In [10]:
# Build the ELMo embedder once and reuse it for every embedding call below; the progress
# bars in the cell output show files being downloaded on construction.
# NOTE(review): cuda_device=0 presumably selects the first GPU - confirm against the allennlp docs.
elmo = ElmoEmbedder(cuda_device=0)

100%|██████████| 336/336 [00:00<00:00, 31821.67B/s]
100%|██████████| 374434792/374434792 [00:14<00:00, 26421775.04B/s]


## Let's go

## Embed the test set once

In [0]:
# Load the test split; the first CSV column is used as the index.
# The 'response' and 'label' columns are read from this frame in the next cell.
testdf = pd.read_csv(SARC_POL+'balanced_test.csv', index_col = 0)

In [0]:
test_resp = testdf['response'].values.tolist()
test_labels = testdf['label'].values.tolist()

# Test resp
# Rows alternate between the two responses of a pair: even positions go to the
# first list, odd positions to the second.
first_resp = test_resp[0::2]
second_resp = test_resp[1::2]

test_docs = {0: first_resp, 1: second_resp}

In [0]:
# Test labels
# Split from the DataFrame column rather than from `test_labels` itself: this cell
# rebinds `test_labels` to a dict, so reading the old value would make a second run of
# the cell iterate over the dict keys and silently produce wrong labels.
flat_test_labels = testdf['label'].values.tolist()

# Even positions belong to the first response of each pair, odd positions to the second.
first_label = flat_test_labels[0::2]
second_label = flat_test_labels[1::2]

test_labels = {0: first_label, 1: second_label}

In [0]:
# Flatten the test pairs into one sequence: all first responses, then all second
# responses. The labels are concatenated in the same order so rows stay aligned.
test_all_docs_tok = tokenize(test_docs[0] + test_docs[1])
test_all_labels = np.array(test_labels[0] + test_labels[1])
# The embedding call is disabled; the vectors are loaded from a pickle further down.
#test_all_vecs = get_elmo_embeddings(elmo, test_all_docs_tok)

In [0]:
#with open("/content/gdrive/My Drive/SARC pol/elmo/testembeddings.pickle", 'wb') as handle:
#  pickle.dump(test_all_vecs, handle)

In [0]:
# Load the cached test-set embeddings instead of recomputing them.
# NOTE(security): pickle.load can execute arbitrary code; only load files you created yourself.
# NOTE(review): the rows are assumed to match test_all_docs_tok / test_all_labels in order - confirm
# the cache was produced from the same balanced_test.csv.
with open("/content/gdrive/My Drive/SARC pol/elmo/testembeddings.pickle", 'rb') as handle:
  test_all_vecs = pickle.load(handle)

### 100% project training set

In [17]:
filename =  ['project_training_100.csv']

# One result dict per seed; the DataFrame is built once after the loop
# (DataFrame.append was removed in pandas 2.0 and grew the frame row by row).
result_rows = []
RESULT_COLUMNS = ['Acc (bal, bal)', 'Acc (bal, reg)', 'F1 (bal, reg)']

#Load in the test set
validdf = pd.read_csv(SARC_POL+'balanced_test.csv', index_col = 0)

# Test-side quantities depend on neither the training file nor the seed.
nAncestors = len(test_labels[0])
first_test_labels = np.asarray(test_labels[0])
second_test_labels = np.asarray(test_labels[1])
y_true = test_labels[0] + test_labels[1]

for fname in filename:
  
  #Load in the training set
  traindf = pd.read_csv(SARC_POL+fname, index_col = 0)
  
  # Only use responses for this method. Ignore ancestors.
  train_resp = traindf['response'].values.tolist()
  train_labels = traindf['label'].values.tolist()

  # Rows alternate between the two responses of a pair (even = first, odd = second).
  first_resp, second_resp = train_resp[0::2], train_resp[1::2]
  train_docs = {0: first_resp, 1: second_resp}

  first_label, second_label = train_labels[0::2], train_labels[1::2]
  train_labels = {0: first_label, 1: second_label}

  # Train a classifier on all responses in training data. We will later use this
  # classifier to determine for every sequence which of the 2 responses is more sarcastic.
  train_all_docs_tok = tokenize(train_docs[0] + train_docs[1])
  train_all_labels = np.array(train_labels[0] + train_labels[1])

  print(fname, "\n----------------------------------")

  # The embeddings do not depend on the classifier seed, so compute them once per
  # training file instead of once per seed (this is by far the most expensive step).
  # NOTE(review): ELMo is reportedly stateful, so repeated calls may differ very slightly;
  # sharing one set of features across seeds isolates the effect of random_state.
  train_all_vecs = get_elmo_embeddings(elmo, train_all_docs_tok)

  for seed in [45, 89, 123, 9, 54]:

    ############################################ Khodak et al. 2017

    # Evaluate this classifier on all responses.
    clf = LogitCV(Cs=[10**i for i in range(-2, 3)], fit_intercept=False, cv=2, dual=np.less(*train_all_vecs.shape), solver='liblinear', n_jobs=-1, random_state=seed) 
    clf.fit(train_all_vecs, train_all_labels)
    test_acc = clf.score(test_all_vecs, test_all_labels)

    ############################################

    # Probability of the positive class for every test response, in a single call.
    sarcasm_proba = clf.predict_proba(test_all_vecs)[:, 1]

    # Balanced Test Score Calculation: within each pair, the response with the strictly
    # higher score is the prediction; a pair counts as correct when that response is
    # labelled 1. Ties count as incorrect.
    scores_first = sarcasm_proba[:nAncestors]
    scores_second = sarcasm_proba[nAncestors:2*nAncestors]
    pair_correct = ((scores_first > scores_second) & (first_test_labels == 1)) | \
                   ((scores_second > scores_first) & (second_test_labels == 1))
    bal_test_score = int(pair_correct.sum())/nAncestors

    # F1 Prediction: threshold every response independently at 0.5.
    y_pred = (sarcasm_proba[:len(y_true)] > 0.5).astype(int).tolist()
    f1Score = f1_score(y_true, y_pred, pos_label=1, average='binary', sample_weight=None)

    #Store the results
    result_rows.append({'Acc (bal, bal)': bal_test_score, 'Acc (bal, reg)':test_acc , 'F1 (bal, reg)': f1Score})

record_100 = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)
print(record_100)

with open("/content/gdrive/My Drive/SARC pol/elmo/test100.pickle", 'wb') as handle:
  pickle.dump(record_100, handle)
  

project_training_100.csv 
----------------------------------
   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.745156        0.682913       0.678188
1        0.745156        0.682913       0.678188
2        0.745156        0.682913       0.678188
3        0.745156        0.682913       0.678188
4        0.745156        0.682913       0.678188


### 50% project training set

In [0]:
filename =  ['project_training_50.csv']

# One result dict per seed; the DataFrame is built once after the loop
# (DataFrame.append was removed in pandas 2.0 and grew the frame row by row).
result_rows = []
RESULT_COLUMNS = ['Acc (bal, bal)', 'Acc (bal, reg)', 'F1 (bal, reg)']

# Test-side quantities depend on neither the training file nor the seed.
nAncestors = len(test_labels[0])
first_test_labels = np.asarray(test_labels[0])
second_test_labels = np.asarray(test_labels[1])
y_true = test_labels[0] + test_labels[1]

for fname in filename:
  
  #Load in the training set
  traindf = pd.read_csv(SARC_POL+fname, index_col = 0)
  
  # Only use responses for this method. Ignore ancestors.
  train_resp = traindf['response'].values.tolist()
  train_labels = traindf['label'].values.tolist()

  # Rows alternate between the two responses of a pair (even = first, odd = second).
  first_resp, second_resp = train_resp[0::2], train_resp[1::2]
  train_docs = {0: first_resp, 1: second_resp}

  first_label, second_label = train_labels[0::2], train_labels[1::2]
  train_labels = {0: first_label, 1: second_label}

  ############################################ Khodak et al. 2017

  # Train a classifier on all responses in training data. We will later use this
  # classifier to determine for every sequence which of the 2 responses is more sarcastic.
  train_all_docs_tok = tokenize(train_docs[0] + train_docs[1])
  train_all_labels = np.array(train_labels[0] + train_labels[1])

  print(fname, "\n----------------------------------")

  # The embeddings do not depend on the classifier seed, so compute them once per
  # training file instead of once per seed (this is by far the most expensive step).
  # NOTE(review): ELMo is reportedly stateful, so repeated calls may differ very slightly;
  # sharing one set of features across seeds isolates the effect of random_state.
  train_all_vecs = get_elmo_embeddings(elmo, train_all_docs_tok)

  for seed in [1, 2, 3, 4, 5]:

    # Still written once per seed so every file name the original cell produced exists.
    with open(f"/content/gdrive/My Drive/SARC pol/elmo/50embeddings_{seed}.pickle", 'wb') as handle:
      pickle.dump(train_all_vecs, handle)

    # Evaluate this classifier on all responses.
    clf = LogitCV(Cs=[10**i for i in range(-2, 3)], fit_intercept=False, cv=2, dual=np.less(*train_all_vecs.shape), solver='liblinear', n_jobs=-1, random_state=seed) 
    clf.fit(train_all_vecs, train_all_labels)
    test_acc = clf.score(test_all_vecs, test_all_labels)

    ############################################

    # Probability of the positive class for every test response, in a single call.
    sarcasm_proba = clf.predict_proba(test_all_vecs)[:, 1]

    # Balanced Test Score Calculation: within each pair, the response with the strictly
    # higher score is the prediction; a pair counts as correct when that response is
    # labelled 1. Ties count as incorrect.
    scores_first = sarcasm_proba[:nAncestors]
    scores_second = sarcasm_proba[nAncestors:2*nAncestors]
    pair_correct = ((scores_first > scores_second) & (first_test_labels == 1)) | \
                   ((scores_second > scores_first) & (second_test_labels == 1))
    bal_test_score = int(pair_correct.sum())/nAncestors

    # F1 Prediction: threshold every response independently at 0.5.
    y_pred = (sarcasm_proba[:len(y_true)] > 0.5).astype(int).tolist()
    f1Score = f1_score(y_true, y_pred, pos_label=1, average='binary', sample_weight=None)

    #Store the results
    result_rows.append({'Acc (bal, bal)': bal_test_score, 'Acc (bal, reg)':test_acc , 'F1 (bal, reg)': f1Score})

record_50 = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)
print(record_50)

with open("/content/gdrive/My Drive/SARC pol/elmo/test50.pickle", 'wb') as handle:
  pickle.dump(record_50, handle)
  

project_training_50.csv 
----------------------------------
   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.729888         0.66559       0.664901
1        0.729888         0.66559       0.664901
2        0.729888         0.66559       0.664901
3        0.729888         0.66559       0.664901
4        0.729888         0.66559       0.664901


### 25% project training set

In [0]:
filename =  ['project_training_25.csv']

# One result dict per seed; the DataFrame is built once after the loop
# (DataFrame.append was removed in pandas 2.0 and grew the frame row by row).
result_rows = []
RESULT_COLUMNS = ['Acc (bal, bal)', 'Acc (bal, reg)', 'F1 (bal, reg)']

# Test-side quantities depend on neither the training file nor the seed.
nAncestors = len(test_labels[0])
first_test_labels = np.asarray(test_labels[0])
second_test_labels = np.asarray(test_labels[1])
y_true = test_labels[0] + test_labels[1]

for fname in filename:
  
  #Load in the training set
  traindf = pd.read_csv(SARC_POL+fname, index_col = 0)
  
  # Only use responses for this method. Ignore ancestors.
  train_resp = traindf['response'].values.tolist()
  train_labels = traindf['label'].values.tolist()

  # Rows alternate between the two responses of a pair (even = first, odd = second).
  first_resp, second_resp = train_resp[0::2], train_resp[1::2]
  train_docs = {0: first_resp, 1: second_resp}

  first_label, second_label = train_labels[0::2], train_labels[1::2]
  train_labels = {0: first_label, 1: second_label}

  ############################################ Khodak et al. 2017

  # Train a classifier on all responses in training data. We will later use this
  # classifier to determine for every sequence which of the 2 responses is more sarcastic.
  train_all_docs_tok = tokenize(train_docs[0] + train_docs[1])
  train_all_labels = np.array(train_labels[0] + train_labels[1])

  print(fname, "\n----------------------------------")

  # The embeddings do not depend on the classifier seed, so compute them once per
  # training file instead of once per seed (this is by far the most expensive step).
  # NOTE(review): ELMo is reportedly stateful, so repeated calls may differ very slightly;
  # sharing one set of features across seeds isolates the effect of random_state.
  train_all_vecs = get_elmo_embeddings(elmo, train_all_docs_tok)

  for seed in [1, 2, 3, 4, 5]:

    # Still written once per seed so every file name the original cell produced exists.
    with open(f"/content/gdrive/My Drive/SARC pol/elmo/25embeddings_{seed}.pickle", 'wb') as handle:
      pickle.dump(train_all_vecs, handle)

    # Evaluate this classifier on all responses.
    clf = LogitCV(Cs=[10**i for i in range(-2, 3)], fit_intercept=False, cv=2, dual=np.less(*train_all_vecs.shape), solver='liblinear', n_jobs=-1, random_state=seed) 
    clf.fit(train_all_vecs, train_all_labels)
    test_acc = clf.score(test_all_vecs, test_all_labels)

    ############################################

    # Probability of the positive class for every test response, in a single call.
    sarcasm_proba = clf.predict_proba(test_all_vecs)[:, 1]

    # Balanced Test Score Calculation: within each pair, the response with the strictly
    # higher score is the prediction; a pair counts as correct when that response is
    # labelled 1. Ties count as incorrect.
    scores_first = sarcasm_proba[:nAncestors]
    scores_second = sarcasm_proba[nAncestors:2*nAncestors]
    pair_correct = ((scores_first > scores_second) & (first_test_labels == 1)) | \
                   ((scores_second > scores_first) & (second_test_labels == 1))
    bal_test_score = int(pair_correct.sum())/nAncestors

    # F1 Prediction: threshold every response independently at 0.5.
    y_pred = (sarcasm_proba[:len(y_true)] > 0.5).astype(int).tolist()
    f1Score = f1_score(y_true, y_pred, pos_label=1, average='binary', sample_weight=None)

    #Store the results
    result_rows.append({'Acc (bal, bal)': bal_test_score, 'Acc (bal, reg)':test_acc , 'F1 (bal, reg)': f1Score})

record_25 = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)
print(record_25)

with open("/content/gdrive/My Drive/SARC pol/elmo/test25.pickle", 'wb') as handle:
  pickle.dump(record_25, handle)
  

project_training_25.csv 
----------------------------------
   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.713447        0.647093       0.643957
1        0.713447        0.647093       0.643957
2        0.713447        0.647093       0.643957
3        0.713447        0.647093       0.643957
4        0.713447        0.647093       0.643957


### 12.5% project training set

In [0]:
filename =  ['project_training_12.csv']

# One result dict per seed; the DataFrame is built once after the loop
# (DataFrame.append was removed in pandas 2.0 and grew the frame row by row).
result_rows = []
RESULT_COLUMNS = ['Acc (bal, bal)', 'Acc (bal, reg)', 'F1 (bal, reg)']

# Test-side quantities depend on neither the training file nor the seed.
nAncestors = len(test_labels[0])
first_test_labels = np.asarray(test_labels[0])
second_test_labels = np.asarray(test_labels[1])
y_true = test_labels[0] + test_labels[1]

for fname in filename:
  
  #Load in the training set
  traindf = pd.read_csv(SARC_POL+fname, index_col = 0)
  
  # Only use responses for this method. Ignore ancestors.
  train_resp = traindf['response'].values.tolist()
  train_labels = traindf['label'].values.tolist()

  # Rows alternate between the two responses of a pair (even = first, odd = second).
  first_resp, second_resp = train_resp[0::2], train_resp[1::2]
  train_docs = {0: first_resp, 1: second_resp}

  first_label, second_label = train_labels[0::2], train_labels[1::2]
  train_labels = {0: first_label, 1: second_label}

  ############################################ Khodak et al. 2017

  # Train a classifier on all responses in training data. We will later use this
  # classifier to determine for every sequence which of the 2 responses is more sarcastic.
  train_all_docs_tok = tokenize(train_docs[0] + train_docs[1])
  train_all_labels = np.array(train_labels[0] + train_labels[1])

  print(fname, "\n----------------------------------")

  # The embeddings do not depend on the classifier seed, so compute them once per
  # training file instead of once per seed (this is by far the most expensive step).
  # NOTE(review): ELMo is reportedly stateful, so repeated calls may differ very slightly;
  # sharing one set of features across seeds isolates the effect of random_state.
  train_all_vecs = get_elmo_embeddings(elmo, train_all_docs_tok)

  for seed in [1, 2, 3, 4, 5]:

    # Still written once per seed so every file name the original cell produced exists.
    with open(f"/content/gdrive/My Drive/SARC pol/elmo/12embeddings_{seed}.pickle", 'wb') as handle:
      pickle.dump(train_all_vecs, handle)

    # Evaluate this classifier on all responses.
    clf = LogitCV(Cs=[10**i for i in range(-2, 3)], fit_intercept=False, cv=2, dual=np.less(*train_all_vecs.shape), solver='liblinear', n_jobs=-1, random_state=seed) 
    clf.fit(train_all_vecs, train_all_labels)
    test_acc = clf.score(test_all_vecs, test_all_labels)

    ############################################

    # Probability of the positive class for every test response, in a single call.
    sarcasm_proba = clf.predict_proba(test_all_vecs)[:, 1]

    # Balanced Test Score Calculation: within each pair, the response with the strictly
    # higher score is the prediction; a pair counts as correct when that response is
    # labelled 1. Ties count as incorrect.
    scores_first = sarcasm_proba[:nAncestors]
    scores_second = sarcasm_proba[nAncestors:2*nAncestors]
    pair_correct = ((scores_first > scores_second) & (first_test_labels == 1)) | \
                   ((scores_second > scores_first) & (second_test_labels == 1))
    bal_test_score = int(pair_correct.sum())/nAncestors

    # F1 Prediction: threshold every response independently at 0.5.
    y_pred = (sarcasm_proba[:len(y_true)] > 0.5).astype(int).tolist()
    f1Score = f1_score(y_true, y_pred, pos_label=1, average='binary', sample_weight=None)

    #Store the results
    result_rows.append({'Acc (bal, bal)': bal_test_score, 'Acc (bal, reg)':test_acc , 'F1 (bal, reg)': f1Score})

record_12 = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)
print(record_12)

with open("/content/gdrive/My Drive/SARC pol/elmo/test12.pickle", 'wb') as handle:
  pickle.dump(record_12, handle)
  

project_training_12.csv 
----------------------------------
   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.694069         0.64357       0.636962
1        0.694069         0.64357       0.636962
2        0.694069         0.64357       0.636962
3        0.694069         0.64357       0.636962
4        0.694069         0.64357       0.636962


### 6.25% project training set

In [0]:
# Train an ELMo + logistic-regression sarcasm classifier on the 6.25% training
# subset and evaluate it on the test set for five classifier seeds.
#
# Uses names defined in earlier cells: SARC_POL, tokenize, elmo,
# get_elmo_embeddings, LogitCV, f1_score, test_all_vecs, test_all_labels, test_labels.
filename = ['project_training_6.csv']

# One metrics dict per (file, seed) run. The DataFrame is built once after the
# loop: DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
record_6_rows = []

for fname in filename:

  # Load in the training set
  traindf = pd.read_csv(SARC_POL+fname, index_col = 0)

  # Only use responses for this method. Ignore ancestors.
  train_resp = traindf['response'].values.tolist()
  train_label_list = traindf['label'].values.tolist()

  # Even-indexed rows are treated as the first response and odd-indexed rows as
  # the second response; labels are split the same way so they stay aligned.
  train_docs = {0: train_resp[0::2], 1: train_resp[1::2]}
  train_labels = {0: train_label_list[0::2], 1: train_label_list[1::2]}

  ############################################ Khodak et al. 2017

  # Train a classifier on all responses in training data. We will later use this
  # classifier to determine for every sequence which of the 2 responses is more sarcastic.
  train_all_docs_tok = tokenize(train_docs[0] + train_docs[1])

  train_all_labels = np.array(train_labels[0] + train_labels[1])

  print(fname, "\n----------------------------------")

  # Test-set quantities that do not depend on the seed. The test vectors are
  # indexed as: first responses in rows [0, nAncestors), second responses in
  # rows [nAncestors, 2 * nAncestors).
  nAncestors = len(test_labels[0])
  test_first_labels = np.asarray(test_labels[0])
  test_second_labels = np.asarray(test_labels[1])
  y_true = list(test_labels[0]) + list(test_labels[1])

  for seed in [1, 2, 3, 4, 5]:

    # NOTE(review): the embedding call does not take the seed, so it looks
    # loop-invariant. It is left inside the loop because each seed's embeddings
    # are written to their own file and it has not been confirmed that
    # get_elmo_embeddings returns identical values on repeated calls.
    train_all_vecs = get_elmo_embeddings(elmo, train_all_docs_tok)

    with open(f"/content/gdrive/My Drive/SARC pol/elmo/6embeddings_{seed}.pickle", 'wb') as handle:
      pickle.dump(train_all_vecs, handle)

    # Fit the classifier; `dual` is enabled only when there are fewer rows than columns.
    clf = LogitCV(
        Cs=[10**i for i in range(-2, 3)],
        fit_intercept=False,
        cv=2,
        dual=np.less(*train_all_vecs.shape),
        solver='liblinear',
        n_jobs=-1,
        random_state=seed,
    )
    clf.fit(train_all_vecs, train_all_labels)

    # Plain accuracy over all test responses.
    test_acc = clf.score(test_all_vecs, test_all_labels)

  ############################################

    # Score every test response in one batched call instead of one
    # predict_proba call per row; column 1 is the probability of label 1.
    sarcasm_prob = clf.predict_proba(test_all_vecs)[:, 1]
    first_prob = sarcasm_prob[:nAncestors]
    second_prob = sarcasm_prob[nAncestors:2 * nAncestors]

    # Balanced Test Score Calculation: a pair counts as correct when the
    # response with the strictly higher score is the one labelled 1
    # (ties count as incorrect).
    pair_correct = (
        ((first_prob > second_prob) & (test_first_labels == 1))
        | ((second_prob > first_prob) & (test_second_labels == 1))
    )
    bal_test_score = int(pair_correct.sum()) / nAncestors

    # F1 Prediction: threshold each response's probability at 0.5.
    y_pred = (sarcasm_prob[:len(y_true)] > 0.5).astype(int).tolist()
    f1Score = f1_score(y_true, y_pred, pos_label=1, average='binary', sample_weight=None)

    # Store the results
    record_6_rows.append({'Acc (bal, bal)': bal_test_score, 'Acc (bal, reg)': test_acc, 'F1 (bal, reg)': f1Score})

record_6 = pd.DataFrame(record_6_rows, columns=['Acc (bal, bal)', 'Acc (bal, reg)', 'F1 (bal, reg)'])
print(record_6)

with open("/content/gdrive/My Drive/SARC pol/elmo/test6.pickle", 'wb') as handle:
  pickle.dump(record_6, handle)
  

project_training_6.csv 
----------------------------------
   Acc (bal, bal)  Acc (bal, reg)  F1 (bal, reg)
0        0.675279        0.622137       0.622912
1        0.675279        0.623312       0.624524
2        0.675279        0.623312       0.624524
3        0.675279        0.623312       0.624524
4        0.675279        0.623312       0.624524


## One data frame for ELMo results

In [0]:
def _read_pickle(path):
  """Return the object unpickled from the file at `path`."""
  with open(path, 'rb') as handle:
    return pickle.load(handle)

# Reload the saved per-run metric tables, one pickle per training-set size.
record_6 = _read_pickle("/content/gdrive/My Drive/SARC pol/elmo/test6.pickle")
record_12 = _read_pickle("/content/gdrive/My Drive/SARC pol/elmo/test12.pickle")
record_25 = _read_pickle("/content/gdrive/My Drive/SARC pol/elmo/test25.pickle")
record_50 = _read_pickle("/content/gdrive/My Drive/SARC pol/elmo/test50.pickle")
record_100 = _read_pickle("/content/gdrive/My Drive/SARC pol/elmo/test100.pickle")

In [0]:
# Empty placeholder frame for the aggregated ELMo results table.
Elmo_results = pd.DataFrame()

In [0]:
# Build the summary table in one step: one row per training-set size, holding
# the column-wise mean of that size's per-run metrics.
# Rebuilding from scratch (rather than appending to the existing frame) keeps
# this cell re-runnable: a second run no longer stacks five more rows and then
# fails on the index assignment. It also avoids DataFrame.append, which was
# removed in pandas 2.0.
_size_labels = ['100%','50%','25%','12.5%','6.25%']
_size_records = [record_100, record_50, record_25, record_12, record_6]

# Each mean is a Series indexed by metric name; concatenating them as columns
# keyed by size label and transposing gives sizes as rows, metrics as columns.
Elmo_results = pd.concat(
    [rec.mean(axis=0) for rec in _size_records],
    axis=1,
    keys=_size_labels,
).T

In [0]:
# Persist the aggregated ELMo results table as a pickle.
with open("/content/gdrive/My Drive/SARC pol/elmo/elmo_logreg_results.pickle", mode='wb') as results_out:
  pickle.dump(Elmo_results, results_out)

In [0]:
# Reload the aggregated ELMo results table from its pickle.
with open("/content/gdrive/My Drive/SARC pol/elmo/elmo_logreg_results.pickle", mode='rb') as results_in:
  Elmo_results = pickle.load(results_in)

In [24]:
# Display the aggregated results table (rows: training-set size, columns: metrics).
Elmo_results

Unnamed: 0,"Acc (bal, bal)","Acc (bal, reg)","F1 (bal, reg)"
100%,0.745156,0.682913,0.678188
50%,0.729888,0.66559,0.664901
25%,0.713447,0.647093,0.643957
12.5%,0.694069,0.64357,0.636962
6.25%,0.675279,0.623077,0.624202
