# Irony Prediction

This notebook covers the main scripts I wrote. Other files used during this process are commented and available in the repo, but are not included here for the sake of readability.

## Imports

My first experiments use bert-as-service (the `bert_serving` package), so make sure a server is running in the background before running the following cells.

In [None]:
## from benchmark script

from nltk.tokenize import TweetTokenizer
from sklearn.datasets import dump_svmlight_file
from sklearn import metrics
import numpy as np
import logging
import codecs

## Additional packages used by Elliott Gruzin

from bert_serving.client import BertClient
from torch.utils.data import TensorDataset
from dataset import Dataset
from feedforward_network import LSTMNetwork
from feedforward_network import Feedforward
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pickle
from transformers import AutoTokenizer ## from hugging face
from nltk.sentiment.vader import SentimentIntensityAnalyzer

## First, define method for parsing the dataset

This follows the parsing code used in the benchmark system.

In [None]:
def parse_dataset(fp):
    '''
    Loads the dataset .txt file with label-tweet on each line and parses the dataset.

    Every data line is tab-separated: the second field is the integer label and the
    third field is the tweet text. A line starting with "tweet index" (in any casing)
    is treated as a header and skipped, and so are blank lines.

    :param fp: filepath of dataset
    :return:
        corpus: list of tweet strings of each tweet.
        y: list of labels
    :raises IndexError: if a data line has fewer than three tab-separated fields.
    :raises ValueError: if the label field is not an integer.
    '''
    y = []
    corpus = []
    # Decode explicitly as UTF-8: tweets routinely contain emoji and other non-ASCII
    # characters, and the platform default encoding is not UTF-8 everywhere.
    with open(fp, 'rt', encoding='utf-8') as data_in:
        for line in data_in:
            line = line.rstrip()  # remove trailing whitespace (including the newline)
            if not line or line.lower().startswith("tweet index"):
                # nothing to parse: blank line, or the metadata header
                continue
            fields = line.split("\t")  # split once and reuse for both label and tweet
            y.append(int(fields[1]))
            corpus.append(fields[2])

    return corpus, y

## Next define a process for creating embeddings and sentiment scores for each sentence

In [None]:
def featurize(corpus):
    '''
    Computes the two per-tweet feature sets: sentiment scores and a sentence embedding.
    :param corpus: A list of strings each string representing document.
    :return:
        X: the result of BertClient.encode on the re-tokenized tweets.
        sentiments: list holding, per tweet, a numpy array of the values returned by
            SentimentIntensityAnalyzer.polarity_scores.
    '''

    # Part 1: sentiment scores, computed on the raw (not re-tokenized) tweets

    vader = SentimentIntensityAnalyzer()
    sentiments = [
        np.array(list(vader.polarity_scores(tweet).values()))
        for tweet in corpus
    ]

    # Part 2: sentence embeddings, computed on tweets that were tokenized with the
    # Twitter-aware tokenizer (case folded, handles stripped) and re-joined with spaces

    tweet_tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
    retokenized = []
    for tweet in corpus:
        retokenized.append(' '.join(tweet_tokenizer.tokenize(tweet)))

    # Requires a running bert-as-service server to answer the encode request
    client = BertClient()
    X = client.encode(retokenized)
    return X, sentiments

## Now actually compute them and store them

The way the data is stored here is crucial for data loading later in the process. To see why, note how the ids are chosen below and how they are later referenced in the dataset.py script.

In [None]:
# Create and store BERT Embeddings

# STEP 1: Make dictionaries to identify samples, and give label
#   partition_dict: split name -> list of sample ids belonging to that split
#   label_dict:     sample id  -> irony label

partition_dict = {'train':[],'test':[]}
label_dict = {}

# STEP 2: Compute and store

# The loop variables are named `split` / `sample_id` rather than `set` / `id` so the
# builtins of the same name stay usable in every later cell of the notebook.
for split in ['train','test']:

    dataset = "./{}_no_hashtag.txt".format(split)
    corpus, y = parse_dataset(dataset)
    print('Extracting BERT embeddings for each tweet...')

    X, sentiments = featurize(corpus)
    print('Embeddings extracted. Storing embeddings...')

    for i, label in enumerate(y):

        # Give each sentence an id (split name + running index), and link with irony label

        sample_id = split+str(i)
        partition_dict[split].append(sample_id)
        label_dict[sample_id] = label

        # Concatenate the sentence embedding with the sentiment scores into one vector

        sentence_and_sentiment = np.append(X[i],sentiments[i])

        # Save sentence and sentiment as a tensor, one file per sample id
        # NOTE(review): the directory data/dehashtagged/ must already exist, and the
        # feedforward training cell reads from data/twitter_tokenizing_and_sentiment/
        # instead -- confirm which directory is intended.

        embedding = torch.from_numpy(sentence_and_sentiment)
        torch.save(embedding, 'data/dehashtagged/{}.pt'.format(sample_id))

    print('Embeddings stored for the {} dataset.\n'.format(split))

print('Writing dictionaries...')

# Context managers guarantee the files are flushed and closed even if pickling fails
with open('data/dehashtagged/partition_dict.pkl','wb') as p_dic:
    pickle.dump(partition_dict, p_dic)
with open('data/dehashtagged/label_dict.pkl','wb') as l_dic:
    pickle.dump(label_dict, l_dic)

## Now that we have sentence embeddings stored, we can use them to train our models

### First use Cuda, and set some hyperparameters

In [None]:
# Device selection: first CUDA GPU when one is available, otherwise the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0") if use_cuda else torch.device("cpu")
torch.backends.cudnn.benchmark = True

# Hyperparameters for the feedforward experiment
batch_size, layer_number, layer_width = 32, 3, 25
lr8 = 0.001        # learning rate handed to the optimizer
max_epochs = 100   # upper bound on the number of training epochs

# Keyword arguments for the DataLoaders: training is shuffled and batched,
# testing is read one sample at a time in a fixed order
train_params = dict(batch_size=batch_size, shuffle=True, num_workers=1)
test_params = dict(batch_size=1, shuffle=False, num_workers=1)

## Now make the training generators to be able to perform training in batches

In [None]:
# Load the split -> ids partition and the id -> label mapping written by the storage step.
# Context managers close the files even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open('data/twitter_tokenizing_and_sentiment/partition_dict.pkl','rb') as jar:
    partition = pickle.load(jar)
with open('data/twitter_tokenizing_and_sentiment/label_dict.pkl','rb') as jar2:
    labels = pickle.load(jar2)

# Generators: one Dataset + DataLoader pair per split

training_set = Dataset(partition['train'], labels,'twitter_tokenizing_and_sentiment')
training_generator = torch.utils.data.DataLoader(training_set, **train_params)

test_set = Dataset(partition['test'], labels,'twitter_tokenizing_and_sentiment')
test_generator = torch.utils.data.DataLoader(test_set, **test_params)


## I define my Feedforward network to take sentence embeddings and output vectors of size 1. 

I also define the test loss function here.

In [None]:
# Input width 772 -- presumably a 768-dim BERT sentence vector plus the 4 VADER
# polarity scores appended during featurization; TODO confirm against the stored tensors.
ffn = Feedforward(772, layer_number, layer_width)
# Move the model to the device chosen earlier (GPU when available, CPU otherwise)
# instead of unconditionally calling .cuda(), which fails on CPU-only machines.
ffn.to(device)
# criterion = nn.BCELoss()
criterion = nn.L1Loss()
optimizer = optim.SGD(ffn.parameters(), lr=float(lr8), momentum=0.9)

def test_loss():
    '''
    Calculates loss from model on test set.

    Puts the model in eval mode and sums the criterion value over every batch yielded
    by test_generator. The caller is responsible for switching back to train mode.

    :return: total loss over the test set, as a Python float.
    '''
    ffn.eval()
    total = 0.0
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for x, y in test_generator:
            x = x.to(device).float()
            y = y.to(device)
            pred_output = ffn(x)
            # labels arrive as a 1-D batch; add a trailing axis before comparing
            loss = criterion(pred_output.float(), y.unsqueeze(1).float())
            total += loss.item()

    return total


## Below, we perform model training.

In [None]:
num_epochs = 0          # number of fully completed epochs (training + bookkeeping)
test_loss_list = []     # sliding window holding the most recent test losses

# Training begins

for epoch in range(max_epochs):

    ffn.train()  # test_loss() leaves the model in eval mode, so re-enable train mode

    for x, y in training_generator:

        # Transfer to the selected device

        x, y = x.to(device).float(), y.to(device)
        y = y.unsqueeze(1)

        # Model computations; gradients are cleared first so nothing stale
        # (e.g. from an interrupted earlier run of this cell) leaks into the step

        optimizer.zero_grad()
        pred_labels = ffn(x)
        loss = criterion(pred_labels.float(), y.float())
        loss.backward()
        optimizer.step()

    # Reported only once the epoch's batches have actually been processed
    print('epoch {} done'.format(num_epochs))

    # Get test loss

    t_loss = test_loss()
    print('test loss: ', t_loss)
    test_loss_list.append(t_loss)


    # Early stop if current loss on the test set > loss from 10 epochs ago -- i.e. the model starts to overfit to the training data
    # Once more than 11 losses are recorded the oldest is dropped, so test_loss_list[0]
    # is always the loss from 10 epochs before the current one.
    # NOTE(review): the stopping signal comes from the test set, which is also the data
    # the final F1 is computed on; a held-out validation split would avoid that leak.

    if num_epochs > 10:
        test_loss_list.pop(0)
        if t_loss > test_loss_list[0]:
            break

    num_epochs += 1

## We can now evaluate the model by pushing the test set through the model

In [None]:
pred_output_list=[]   # raw model output per test sample
labels=[]             # gold label per test sample (rebinds the name of the label dictionary)

# push test set through one more time

ffn.eval()
# Inference only: no autograd bookkeeping needed
with torch.no_grad():
    for x, y in test_generator:
        # use the device selected earlier rather than a hard-coded 'cuda:0'
        x = x.to(device).float()
        pred_output = ffn(x)
        # the test loader yields one sample per batch, so [0][0] is that sample's output
        pred_output_list.append(pred_output.cpu().numpy()[0][0])
        # labels are only collected, never fed to the model, so no device transfer is needed
        labels.append(y.cpu().numpy()[0])

# generate predictions given models outputs, then compute f1
# outputs below 0.5 are labelled 0, everything else 1

pred_labels = [0 if output < 0.5 else 1 for output in pred_output_list]
print('\n\n\n ################# RESULTS ##############\n\n')
print(metrics.f1_score(labels, pred_labels, pos_label=1))


# But this is just one method... What about lexical embeddings in an LSTM?

## We need to define some new processes -- first, let's extract GloVe embeddings. The embeddings I use are the 100-dimensional embeddings trained on Twitter data, downloaded from: http://nlp.stanford.edu/data/wordvecs/glove.twitter.27B.zip

In [None]:
def make_embedding_dict(glove_path='glove.twitter.27B.100d.txt'):
    '''
    Builds a word -> embedding lookup from a GloVe text file.

    Each line of the file is expected to hold a token followed by its vector components,
    separated by single spaces. The file is read once. GloVe ships no unknown-word
    vector, so an 'UNK' entry is added holding the mean of all vectors in the file
    (approach taken from https://stackoverflow.com/questions/49239941/what-is-unk-in-the-pretrained-glove-vector-files-e-g-glove-6b-50d-txt/53717345#53717345).
    If the file itself contains a token 'UNK', the file's vector wins.

    :param glove_path: path of the GloVe text file to read.
    :return: word2embed: dict mapping each token (and 'UNK') to a float32 numpy vector.
    :raises ValueError: if the file contains no embeddings, or its rows differ in length.
    '''
    words = []
    rows = []

    # GloVe files contain non-ASCII tokens, so decode explicitly as UTF-8
    with open(glove_path, 'r', encoding='utf-8') as glove:
        for line in glove:
            # split on single spaces only, so a token is never mistaken for a separator
            fields = line.rstrip().split(' ')
            if len(fields) < 2:
                continue  # blank line or a token without any vector: nothing to store
            words.append(fields[0])
            rows.append(np.asarray(fields[1:], dtype=np.float32))

    if not rows:
        raise ValueError('no embeddings found in {}'.format(glove_path))

    vecs = np.stack(rows)  # shape (number of tokens, embedding size)

    # make UNK vector: the average of every embedding in the file
    word2embed = {'UNK': vecs.mean(axis=0)}

    # Now link every word with its embedding (each value is one row of the matrix)
    word2embed.update(zip(words, vecs))

    return word2embed


## We redefine how we 'featurize' the corpus

Instead of producing a single sentence embedding, each sentence is now stored as a sequence of lexical (word-level) embeddings.

In [None]:
def featurize(corpus, embed_dict):
    '''
    Tokenizes each document and maps every token to its embedding.

    Tokens missing from embed_dict are replaced by the 'UNK' vector. A document that
    yields no tokens at all (e.g. one consisting only of stripped handles) is
    represented by a single 'UNK' vector so that every sequence has at least one step.

    :param corpus: A list of strings each string representing document.
    :param embed_dict: dict mapping tokens to embedding vectors; must contain 'UNK'.
    :return: X: List of sentences, which are lists of embedded words.
    :raises KeyError: if embed_dict has no 'UNK' entry.
    '''
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True).tokenize
    unk = embed_dict['UNK']
    X = []
    for document in corpus:
        # Look up the tokens themselves. Iterating over the space-joined string instead
        # would walk it character by character and embed single characters and spaces.
        sentence = [embed_dict.get(token, unk) for token in tokenizer(document)]
        if not sentence:
            sentence = [unk]
        X.append(np.asarray(sentence))
    return X

## Now compute and store as before

In [None]:
print('Extracting GloVe embedding dictionary...')

word2embed = make_embedding_dict()

print('Using embedding dictionary\n\n')

# STEP 1: Make dictionaries to identify samples, and give label
#   partition_dict: split name -> list of sample ids belonging to that split
#   label_dict:     sample id  -> irony label

partition_dict = {'train':[],'test':[]}
label_dict = {}

# STEP 2: Compute and store

# The loop variables are named `split` / `sample_id` rather than `set` / `id` so the
# builtins of the same name stay usable in every later cell of the notebook.
for split in ['train','test']:

    dataset = "./{}_no_hashtag.txt".format(split)
    corpus, y = parse_dataset(dataset)
    print('Extracting embeddings for each tweet...')
    X = featurize(corpus, word2embed)
    print('Embeddings extracted. Storing embeddings...')
    for i, label in enumerate(y):
        # id = split name + running index; the same id names the tensor file on disk
        sample_id = split+str(i)
        partition_dict[split].append(sample_id)
        label_dict[sample_id] = label
        # NOTE(review): the directory data/glove_lstm/ must already exist
        embedding = torch.from_numpy(X[i])
        torch.save(embedding, 'data/glove_lstm/{}.pt'.format(sample_id))
    print('Embeddings stored for the {} dataset.\n'.format(split))

print('Writing dictionaries...')

# Context managers guarantee the files are flushed and closed even if pickling fails
with open('data/glove_lstm/partition_dict.pkl','wb') as p_dic:
    pickle.dump(partition_dict, p_dic)
with open('data/glove_lstm/label_dict.pkl','wb') as l_dic:
    pickle.dump(label_dict, l_dic)


## Now that we have the data (stored in a different directory), we can train an LSTM on it.

## I reset some parameters below, and generate the appropriate dataloaders

In [None]:
# Choosing parameters

batch_size = 1       # one tweet per batch: sequences differ in length and are not padded
layer_number = 3
layer_width = 50
lr8 = 0.001          # learning rate handed to the optimizer

# Setting parameters
train_params = {'batch_size': batch_size,
          'shuffle': True,
          'num_workers': 1}

test_params = {'batch_size': 1,
            'shuffle': False,
            'num_workers': 1}

max_epochs = 100

# Load the split -> ids partition and the id -> label mapping written by the storage step.
# Context managers close the files even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open('data/glove_lstm/partition_dict.pkl','rb') as jar:
    partition = pickle.load(jar)
with open('data/glove_lstm/label_dict.pkl','rb') as jar2:
    labels = pickle.load(jar2)

# Generators: one Dataset + DataLoader pair per split

training_set = Dataset(partition['train'], labels,'glove_lstm')
training_generator = torch.utils.data.DataLoader(training_set, **train_params)

test_set = Dataset(partition['test'], labels,'glove_lstm')
test_generator = torch.utils.data.DataLoader(test_set, **test_params)

## Define the LSTM and new test loss function

In [None]:
# Models

# Input size 100 matches the 100-dimensional GloVe vectors used for featurization
lstm = LSTMNetwork(100, layer_number, layer_width)
# Move the model to the device chosen earlier (GPU when available, CPU otherwise)
# instead of unconditionally calling .cuda(), which fails on CPU-only machines.
lstm.to(device)
# criterion = nn.BCELoss()
criterion = nn.L1Loss()
optimizer = optim.SGD(lstm.parameters(), lr=float(lr8), momentum=0.9)

def test_loss():
    '''
    Calculates loss from model on test set.

    Puts the model in eval mode and sums the criterion value over every sample yielded
    by test_generator. The caller is responsible for switching back to train mode.

    :return: total loss over the test set, as a Python float.
    '''
    lstm.eval()
    total = 0.0
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for x, y in test_generator:
            # transpose such that sequence length comes first, batch second
            x = torch.transpose(x,0,1)
            x = x.to(device).float()
            y = y.to(device)
            # drop the batch axis (size 1 here) and keep the output of the last time step
            # assumes the network returns (sequence, batch, output) -- TODO confirm
            pred_output = lstm(x).squeeze(1)
            pred_output = pred_output[-1,:]
            # compare against the label in the same shape the training loop uses;
            # an extra unsqueeze would only trigger a shape-mismatch broadcast
            loss = criterion(pred_output.float(), y.float())
            total += loss.item()

    return total

## Now train the new model

Some things are not ideal here. I did not have time to implement batching, which would have required padding the sequences, so each batch holds a single tweet. As a result, training takes much longer than for the feedforward system.

In [None]:
num_epochs = 0          # number of fully completed epochs (training + bookkeeping)
test_loss_list = []     # sliding window holding the most recent test losses

# Training begins

for epoch in range(max_epochs):
    lstm.train()  # test_loss() leaves the model in eval mode, so re-enable train mode
    num_batches = 0  # batches processed in the current epoch
    for x, y in training_generator:

        # Clear gradients once per step; the optimizer holds every parameter of the
        # model, so a separate lstm.zero_grad() would be redundant
        optimizer.zero_grad()
        # transpose such that sequence length comes first, batch second
        x = torch.transpose(x,0,1)
        num_batches += 1
        # Transfer to the selected device
        x, y = x.to(device).float(), y.to(device)
        # Model computations: drop the batch axis (size 1 here) and train on the
        # output of the last time step only
        pred_labels = lstm(x).squeeze(1)
        pred_labels = pred_labels[-1,:]
        loss = criterion(pred_labels.float(), y.float())
        loss.backward()
        optimizer.step()

    # Reported only once the epoch's batches have actually been processed
    print('epoch {} done'.format(num_epochs))

    # compute test loss
    t_loss = test_loss()
    print('test loss: ', t_loss)
    test_loss_list.append(t_loss)


    # Early stop just as before: stop when the current test loss exceeds the loss from
    # 10 epochs ago. Once more than 11 losses are recorded the oldest is dropped, so
    # test_loss_list[0] is always the loss from 10 epochs before the current one.
    # NOTE(review): the stopping signal comes from the test set, which is also the data
    # the final F1 is computed on; a held-out validation split would avoid that leak.
    if num_epochs > 10:
        test_loss_list.pop(0)
        if t_loss > test_loss_list[0]:
            break

    num_epochs += 1

## Just as before, we can evaluate by pushing the test set through and computing an F1

In [None]:
pred_output_list=[]   # raw model output per test sample
labels=[]             # gold label per test sample (rebinds the name of the label dictionary)

# NOTE(review): this replaces the model trained in the previous cell with a checkpoint
# read from 'lstm.p'; no cell visible here writes that file -- confirm it exists and is
# the model you mean to evaluate. torch.load unpickles the file, so only load
# checkpoints you created yourself.
# map_location places the checkpoint on the selected device, so a model saved on a GPU
# can also be evaluated on a CPU-only machine.
lstm = torch.load('lstm.p', map_location=device)

# push test set through one more time

lstm.eval()
# Inference only: no autograd bookkeeping needed
with torch.no_grad():
    for x, y in test_generator:
        # sequence length first, batch second; use the selected device, not a fixed 'cuda:0'
        x = x.to(device).float().transpose(0,1)
        # keep only the output of the last time step
        pred_output = lstm(x)[-1,:,:]

        # the test loader yields one sample per batch, so [0][0] is that sample's output
        pred_output_list.append(pred_output.cpu().numpy()[0][0])
        # labels are only collected, never fed to the model, so no device transfer is needed
        labels.append(y.cpu().numpy()[0])

# generate predictions given models outputs, then compute f1
# outputs below 0.5 are labelled 0, everything else 1

pred_labels = [0 if output < 0.5 else 1 for output in pred_output_list]
print('\n\n\n ################# RESULTS ##############\n\n')
print(metrics.f1_score(labels, pred_labels, pos_label=1))