##Code for replicating gao et al research on VUA Classifcation Model

In [1]:
# mount drive
from google.colab import drive
ROOT = '/content/drive'
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# add repo directory to path
import os
import sys
from os.path import join 
repo_dir = '/content/drive/MyDrive/Repos/metaphor-detection'
if repo_dir not in sys.path:
    sys.path.append(repo_dir)
print(sys.path)

['', '/content', '/env/python', '/usr/lib/python37.zip', '/usr/lib/python3.7', '/usr/lib/python3.7/lib-dynload', '/usr/local/lib/python3.7/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.7/dist-packages/IPython/extensions', '/root/.ipython', '/content/drive/MyDrive/Repos/metaphor-detection']


In [3]:
import random
random.seed(0)

In [None]:
# pip install requirements (takes a while)
!cd drive/MyDrive/Repos/metaphor-detection/; pip install -r gao-g-requirements.txt
!pip install --upgrade google-cloud-storage

In [5]:
from core.gao_files.classification.util import get_num_lines, get_vocab, embed_sequence, get_word2idx_idx2word, get_embedding_matrix
from core.gao_files.classification.util  import TextDatasetWithGloveElmoSuffix as TextDataset
from core.gao_files.classification.util  import evaluate, write_predictions
from core.gao_files.classification.model import RNNSequenceClassifier

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader

import csv
import h5py
# import matplotlib
# matplotlib.use('Agg')  # to avoid the error: _tkinter.TclError: no display name and no $DISPLAY environment variable
# matplotlib.use('tkagg') # to display the graph on remote server
import matplotlib.pyplot as plt
import pandas as pd

In [6]:
print("PyTorch version:")
print(torch.__version__)
print("GPU Detected:")
print(torch.cuda.is_available())
using_GPU = torch.cuda.is_available()

PyTorch version:
1.10.0+cu111
GPU Detected:
True


In [7]:
# directories
# to download glove and elmo vectors see: notebooks/Download_large_data.ipynb
data_dir = repo_dir + '/resources/metaphor-in-context/data/'
glove_dir = repo_dir + '/resources/glove/'
elmo_dir = repo_dir + '/resources/elmo/'

### Gao code

In [8]:
"""
1. Data pre-processing
"""
'''
1.1 VUA
get raw dataset as a list:
  Each element is a triple:
    a sentence: string
    a index: int: idx of the focus verb
    a label: int 1 or 0
'''
raw_train_vua = []
with open(data_dir + 'VUA/VUA_formatted_train.csv', encoding='latin-1') as f:
    lines = csv.reader(f)
    next(lines)
    for line in lines:
        raw_train_vua.append([line[3], int(line[4]), int(line[5])])

raw_val_vua = []
with open(data_dir+ 'VUA/VUA_formatted_val.csv', encoding='latin-1') as f:
    lines = csv.reader(f)
    next(lines)
    for line in lines:
        raw_val_vua.append([line[3], int(line[4]), int(line[5])])

raw_test_vua = []
with open(data_dir+ 'VUA/VUA_formatted_test.csv', encoding='latin-1') as f:
    lines = csv.reader(f)
    next(lines)
    for line in lines:
        raw_test_vua.append([line[3], int(line[4]), int(line[5])])
print('VUA dataset division: ', '\ntrain:', len(raw_train_vua), '\nval:',len(raw_val_vua), '\ntest:',len(raw_test_vua))

VUA dataset division:  
train: 15516 
val: 1724 
test: 5873


In [9]:
"""
2. Data preparation
"""
'''
2. 1
get vocabulary and glove embeddings in raw dataset 
'''
# vocab is a set of words
vocab = get_vocab(raw_train_vua + raw_val_vua + raw_test_vua)
# two dictionaries. <PAD>: 0, <UNK>: 1
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_embeddings a nn.Embeddings
glove_embeddings = get_embedding_matrix(glove_dir + 'glove.840B.300d.txt', word2idx, idx2word, normalization=False)
# elmo_embeddings
elmos_train_vua = h5py.File(elmo_dir + 'VUA_train.hdf5', 'r')
elmos_val_vua = h5py.File(elmo_dir + 'VUA_val.hdf5', 'r')
# suffix_embeddings: number of suffix tag is 2, and the suffix embedding dimension is 50
suffix_embeddings = nn.Embedding(2, 50)

vocab size:  18695


100%|██████████| 2196017/2196017 [00:51<00:00, 43029.95it/s]


Number of pre-trained word vectors loaded:  17941
Embeddings mean:  -0.0001772342948243022
Embeddings stdev:  0.37537267804145813


In [10]:
'''
2. 2
embed the datasets
'''
embedded_train_vua = [[embed_sequence(example[0], example[1], word2idx,
                                      glove_embeddings, elmos_train_vua, suffix_embeddings), example[2]]
                      for example in raw_train_vua]
embedded_val_vua = [[embed_sequence(example[0], example[1], word2idx,
                                    glove_embeddings, elmos_val_vua, suffix_embeddings), example[2]]
                    for example in raw_val_vua]

In [11]:
'''
2. 3
set up Dataloader for batching
'''
# Separate the input (embedded_sequence) and labels in the indexed train sets.
train_dataset_vua = TextDataset([example[0] for example in embedded_train_vua],
                                [example[1] for example in embedded_train_vua])
val_dataset_vua = TextDataset([example[0] for example in embedded_val_vua],
                              [example[1] for example in embedded_val_vua])

# Data-related hyperparameters
batch_size = 64
# Set up a DataLoader for the training, validation, and test dataset
train_dataloader_vua = DataLoader(dataset=train_dataset_vua, batch_size=batch_size, shuffle=True,
                                  collate_fn=TextDataset.collate_fn)
val_dataloader_vua = DataLoader(dataset=val_dataset_vua, batch_size=batch_size,
                                collate_fn=TextDataset.collate_fn)

In [12]:
"""
3. Model training
"""
'''
3. 1 
set up model, loss criterion, optimizer
'''
# Instantiate the model
# embedding_dim = glove + elmo + suffix indicator
# dropout1: dropout on input to RNN
# dropout2: dropout in RNN; would be used if num_layers=1
# dropout3: dropout on hidden state of RNN to linear layer
rnn_clf = RNNSequenceClassifier(num_classes=2, embedding_dim=300 + 1024 + 50, hidden_size=300, num_layers=1, bidir=True,
                                dropout1=0.3, dropout2=0.2, dropout3=0.2)
# Move the model to the GPU if available
if using_GPU:
    rnn_clf = rnn_clf.cuda()
# Set up criterion for calculating loss
nll_criterion = nn.NLLLoss()
# Set up an optimizer for updating the parameters of the rnn_clf
rnn_clf_optimizer = optim.SGD(rnn_clf.parameters(), lr=0.01,momentum=0.9)
# Number of epochs (passes through the dataset) to train the model for.
num_epochs = 20

  "num_layers={}".format(dropout, num_layers))


In [None]:
'''
3. 2
train model
'''
training_loss = []
val_loss = []

training_f1 = []
val_f1 = []

training_accuracy = []
val_accuracy = []

training_precision = []
val_precision = []

training_recall = []
val_recall = []

training_fus_f1 = []
val_fus_f1 = []

# A counter for the number of gradient updates
num_iter = 0
for epoch in range(num_epochs):
    print("Starting epoch {}".format(epoch + 1))
    for (example_text, example_lengths, labels) in train_dataloader_vua:
        example_text = Variable(example_text)
        example_lengths = Variable(example_lengths)
        labels = Variable(labels)
        if using_GPU:
            example_text = example_text.cuda()
            example_lengths = example_lengths.cuda()
            labels = labels.cuda()
        # predicted shape: (batch_size, 2)
        predicted = rnn_clf(example_text, example_lengths)
        batch_loss = nll_criterion(predicted, labels)
        rnn_clf_optimizer.zero_grad()
        batch_loss.backward()
        rnn_clf_optimizer.step()
        num_iter += 1
        # Calculate validation and training set loss and accuracy every 200 gradient updates
        if num_iter % 200 == 0:
            avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1 = evaluate(val_dataloader_vua, rnn_clf,
                                                                                   nll_criterion, using_GPU, print_verbose=False)
            val_loss.append(avg_eval_loss)
            val_f1.append(f1)
            val_accuracy.append(eval_accuracy)
            val_precision.append(precision)
            val_recall.append(recall)
            val_fus_f1.append(fus_f1)

            # print metrics less often
            print(
                  "Iteration {}. Validation Loss {}. Validation Accuracy {}. Validation Precision {}. Validation Recall {}. Validation F1 {}. Validation class-wise F1 {}.".format(
                      num_iter, avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1))
            # filename = '../models/LSTMSuffixElmoAtt_???_all_iter_' + str(num_iter) + '.pt'
            # torch.save(rnn_clf, filename)
            # avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1 = evaluate(train_dataloader_vua, rnn_clf,
            #                                                                        nll_criterion, using_GPU)
            # training_loss.append(avg_eval_loss)
            # training_f1.append(f1)
            # training_accuracy.append(eval_accuracy)
            # training_precision.append(precision)
            # training_recall.append(recall)
            # training_fus_f1.append(fus_f1)
            
#             print(
#                 "Iteration {}. Training Loss {}. Training Accuracy {}. Training Precision {}. Training Recall {}. Training F1 {}. Training class-wise F1 {}.".format(
#                     num_iter, avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1))
print("Training done!")

In [14]:
import numpy as np
print('Precision on vua = ', np.mean(np.array(val_precision)))
print('Recall on vua = ', np.mean(np.array(val_recall)))
print('F1 on vua = ', np.mean(np.array(val_f1)))

Precision on vua =  nan
Recall on vua =  50.269396551724135
F1 on vua =  nan


In [15]:
"""
4. test the model
the following code is for test data of VUA
"""
'''
VUA
'''
elmos_test_vua = h5py.File(elmo_dir + 'VUA_test.hdf5', 'r')
embedded_test_vua = [[embed_sequence(example[0], example[1], word2idx,
                                     glove_embeddings, elmos_test_vua, suffix_embeddings), example[2]]
                     for example in raw_test_vua]
test_dataset_vua = TextDataset([example[0] for example in embedded_test_vua],
                               [example[1] for example in embedded_test_vua])
test_dataloader_vua = DataLoader(dataset=test_dataset_vua, batch_size=batch_size,
                                 collate_fn=TextDataset.collate_fn)
avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1 = evaluate(test_dataloader_vua, rnn_clf,
                                                                       nll_criterion, using_GPU)
print("Test Accuracy {}. Test Precision {}. Test Recall {}. Test F1 {}. Test class-wise F1 {}.".format(
    eval_accuracy, precision, recall, f1, fus_f1))

  eval_text = Variable(eval_text, volatile=True)
  eval_lengths = Variable(eval_lengths, volatile=True)
  eval_labels = Variable(eval_labels, volatile=True)


[[3705.  663.]
 [ 407. 1098.]]
Test Accuracy 81.78103637695312. Test Precision 72.95681063122923. Test Recall 62.35093696763203. Test F1 67.23821187997551. Test class-wise F1 77.31014367583681.


In [16]:
import pandas as pd
gao_scores = [53.4, 65.6, 58.9, 69.1, 53.4]
our_scores = [precision,
  recall,
  f1,
  eval_accuracy.item(),
  fus_f1]
our_scores = [round(score,1) for score in our_scores]
all_scores = [gao_scores, our_scores]
all_scores_df = pd.DataFrame(all_scores, columns= ['P', 'R', 'F1', 'Acc', 'MaF1'], index=['Gao et al', 'US'])
print("vua classification model: classification task\n")
all_scores_df

vua classification model: classification task



Unnamed: 0,P,R,F1,Acc,MaF1
Gao et al,53.4,65.6,58.9,69.1,53.4
US,73.0,62.4,67.2,81.8,77.3


In [17]:
def write_predictions(raw_dataset, evaluation_dataloader, model, using_GPU, rawdata_filename):
    """
    Evaluate the model on the given evaluation_dataloader

    :param raw_dataset
    :param evaluation_dataloader:
    :param model:
    :param using_GPU: a boolean
    :return: a list of
    """
    # Set model to eval mode, which turns off dropout.
    model.eval()

    predictions = []
    for (example_text, example_lengths, labels) in evaluation_dataloader:
        eval_text = Variable(example_text, volatile=True)
        eval_lengths = Variable(example_lengths, volatile=True)
        eval_labels = Variable(labels, volatile=True)
        if using_GPU:
            eval_text = eval_text.cuda()
            eval_lengths = eval_lengths.cuda()
            eval_labels = eval_labels.cuda()

        # predicted shape: (batch_size, seq_len, 2)
        predicted = model(eval_text, eval_lengths)
        # get 0 or 1 predictions
        # predicted_labels: (batch_size, seq_len)
        # print("predicted", predicted.data )
        _, predicted_labels = torch.max(predicted.data, 1)
        predictions.extend(predicted_labels)

    # Set the model back to train mode, which activates dropout again.
    model.train()
    assert (len(predictions) == len(raw_dataset))

    # read original data
    data = []
    with open(rawdata_filename, encoding='latin-1') as f:
        lines = csv.reader(f)
        for line in lines:
            data.append(line)

    # append predictions to the original data
    data[0].append('prediction')
    for i in range(len(predictions)):
        data[i + 1].append(predictions[i])
    return data

In [18]:
cls_test_pred = write_predictions(raw_test_vua, test_dataloader_vua, rnn_clf , using_GPU, data_dir + 'VUA/VUA_formatted_test.csv')

  app.launch_new_instance()


In [19]:
def get_performance_VUAverb_test(data_path, seq_test_pred):
    """
    Similar treatment as get_performance_VUAverb_val
    Read the VUA-verb test data, and the VUA-sequence test data.
    Extract the predictions for VUA-verb test data from the VUA-sequence test data.
    Prints the performance of LSTM sequence model on VUA-verb test set based on genre
    Prints the performance of LSTM sequence model on VUA-verb test set regardless of genre

    :return: the averaged performance across genre and performance on verb test
    regardless of genre
    """
    # get the VUA-ver test set
    ID_verbidx_label = []  # ID tuple, verb_idx, label 1 or 0
    with open(data_path + 'VUA/VUA_formatted_test.csv', encoding='latin-1') as f:
        lines = csv.reader(f)
        next(lines)
        for line in lines:
          ID_verbidx_label.append([(line[0], line[1]), int(line[4]), int(line[5])])

    # get genre
    ID2genre = {}
    with open(data_path + 'VUAsequence/VUA_seq_formatted_test.csv', encoding='latin-1') as f:
      lines = csv.reader(f)
      next(lines)
      for line in lines:
        ID2genre[(line[0], line[1])] = line[6]

    # get the prediction from LSTM sequence model
    ID2sen_labelseq = {}  # ID tuple --> [genre, label_sequence]
    for line in seq_test_pred[1:]:
      ID2sen_labelseq[(line[0], line[1])] = line[6]
    # with open('/predictions/vua_seq_test_predictions_LSTMsequence_vua.csv', encoding='latin-1') as f:
    #     # txt_id	sen_ix	sentence	label_seq	pos_seq	labeled_sentence	genre   predictions
    #     lines = csv.reader(f)
    #     next(lines)
    #     for line in lines:
    #         ID2sen_labelseq[(line[0], line[1])] = [line[6], ast.literal_eval(line[7])]
    # compute confusion_matrix
    predictions = []
    genres = ['news', 'fiction', 'academic', 'conversation']
    confusion_matrix = np.zeros((4, 2, 2))
    for ID, verbidx, label in ID_verbidx_label:
        pred = ID2sen_labelseq[ID]
        # pred = ID2sen_labelseq[ID][1][verbidx]
        predictions.append(pred)
        genre = ID2genre[ID]
        genre_idx = genres.index(genre)
        confusion_matrix[genre_idx][pred][label] += 1
    assert (np.sum(confusion_matrix) == len(ID_verbidx_label))

    print('Tagging model performance on test-verb: genre')
    avg_performance = []
    for i in range(len(genres)):
        precision = 100 * confusion_matrix[i, 1, 1] / np.sum(confusion_matrix[i, 1])
        recall = 100 * confusion_matrix[i, 1, 1] / np.sum(confusion_matrix[i, :, 1])
        f1 = 2 * precision * recall / (precision + recall)
        accuracy = 100 * (confusion_matrix[i, 1, 1] + confusion_matrix[i, 0, 0]) / np.sum(confusion_matrix[i])
        print(genres[i], 'Precision, Recall, F1, Accuracy: ', precision, recall, f1, accuracy)
        avg_performance.append([precision, recall, f1, accuracy])
    avg_performance = np.array(avg_performance)
    macro_avg_performance = avg_performance.mean(0)

    print('Tagging model performance on test-verb: regardless of genre')
    confusion_matrix = confusion_matrix.sum(axis=0)
    precision = 100 * confusion_matrix[1, 1] / np.sum(confusion_matrix[1])
    recall = 100 * confusion_matrix[1, 1] / np.sum(confusion_matrix[:, 1])
    f1 = 2 * precision * recall / (precision + recall)
    accuracy = 100 * (confusion_matrix[1, 1] + confusion_matrix[0, 0]) / np.sum(confusion_matrix)
    overall_performance = np.array([precision, recall, f1, accuracy])
    print('Precision, Recall, F1, Accuracy: ', precision, recall, f1, accuracy)

    return macro_avg_performance, overall_performance

In [20]:
macro_avg_performance, overall_verb_performance = get_performance_VUAverb_test(data_dir, cls_test_pred)

Tagging model performance on test-verb: genre
news Precision, Recall, F1, Accuracy:  61.53846153846154 57.24508050089445 59.31417979610751 64.25081433224756
fiction Precision, Recall, F1, Accuracy:  41.904761904761905 48.35164835164835 44.89795918367347 76.60649819494584
academic Precision, Recall, F1, Accuracy:  58.208955223880594 55.01567398119122 56.56728444802579 57.188244638602065
conversation Precision, Recall, F1, Accuracy:  34.751773049645394 33.67697594501718 34.20593368237347 81.15942028985508
Tagging model performance on test-verb: regardless of genre
Precision, Recall, F1, Accuracy:  52.383720930232556 51.1641113003975 51.766733697213446 71.41154435552528


In [21]:
macro_F1 = macro_avg_performance[2]
classification_performance = np.append(overall_verb_performance, macro_F1)

In [22]:
gao_scores = [53.4, 65.6, 58.9, 69.1, 53.4]
our_scores = classification_performance
our_scores = [round(score,1) for score in our_scores]
all_scores = [gao_scores, our_scores]
all_scores_df = pd.DataFrame(all_scores, columns= ['P', 'R', 'F1', 'Acc', 'MaF1'], index=['Gao et al', 'US'])
print("vua classification model: classification task\n")
all_scores_df

vua classification model: classification task



Unnamed: 0,P,R,F1,Acc,MaF1
Gao et al,53.4,65.6,58.9,69.1,53.4
US,52.4,51.2,51.8,71.4,48.7
