## Code for replicating gao et al research on VUA Sequence Model

In [2]:
# mount drive
from google.colab import drive
ROOT = '/content/drive'
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# add repo directory to path
import os
import sys
from os.path import join 
repo_dir = '/content/drive/MyDrive/Repos/metaphor-detection'
if repo_dir not in sys.path:
    sys.path.append(repo_dir)
print(sys.path)

['', '/content', '/env/python', '/usr/lib/python37.zip', '/usr/lib/python3.7', '/usr/lib/python3.7/lib-dynload', '/usr/local/lib/python3.7/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.7/dist-packages/IPython/extensions', '/root/.ipython', '/content/drive/MyDrive/Repos/metaphor-detection']


In [2]:
# directories
# to download glove and elmo vectors see: notebooks/Download_large_data.ipynb
data_dir = repo_dir + '/resources/metaphor-in-context/data/'
glove_dir = repo_dir + '/resources/glove/'
elmo_dir = repo_dir + '/resources/elmo/'

In [4]:
%ls

[0m[01;34mdrive[0m/  [01;34msample_data[0m/


Gao code

In [5]:
# pip install requirements (takes a while)
!cd drive/MyDrive/Repos/metaphor-detection/; pip install -r gao-g-requirements.txt
!pip install --upgrade google-cloud-storage



In [61]:
from core.gao_files.sequence.util import get_num_lines, get_pos2idx_idx2pos, index_sequence, get_vocab, embed_indexed_sequence, \
    get_word2idx_idx2word, get_embedding_matrix, write_predictions, get_performance_VUAverb_val, \
    get_performance_VUAverb_test, get_performance_VUA_test
from core.gao_files.sequence.util import TextDatasetWithGloveElmoSuffix as TextDataset
from core.gao_files.sequence.util import evaluate
from core.gao_files.sequence.model import RNNSequenceModel

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader

import csv
import h5py
import numpy as np
import matplotlib.pyplot as plt
import math
import random
import ast

In [9]:
print("PyTorch version:")
print(torch.__version__)
print("GPU Detected:")
print(torch.cuda.is_available())
using_GPU = torch.cuda.is_available()

PyTorch version:
1.10.0+cu111
GPU Detected:
True


In [32]:
"""
1. Data pre-processing
"""
'''
1.1 VUA
get raw dataset as a list:
  Each element is a triple:
    a sentence: string
    a list of labels: 
    a list of pos: 
'''
pos_set = set()
raw_train_vua = []
with open(data_dir + 'VUAsequence/VUA_seq_formatted_train.csv', encoding='latin-1') as f:
    lines = csv.reader(f)
    next(lines)
    for line in lines:
        pos_seq = ast.literal_eval(line[4])
        label_seq = ast.literal_eval(line[3])
        assert (len(pos_seq) == len(label_seq))
        assert (len(line[2].split()) == len(pos_seq))
        raw_train_vua.append([line[2], label_seq, pos_seq])
        pos_set.update(pos_seq)

raw_val_vua = []
with open(data_dir + 'VUAsequence/VUA_seq_formatted_val.csv', encoding='latin-1') as f:
    lines = csv.reader(f)
    next(lines)
    for line in lines:
        pos_seq = ast.literal_eval(line[4])
        label_seq = ast.literal_eval(line[3])
        assert (len(pos_seq) == len(label_seq))
        assert (len(line[2].split()) == len(pos_seq))
        raw_val_vua.append([line[2], label_seq, pos_seq])
        pos_set.update(pos_seq)

# embed the pos tags
pos2idx, idx2pos = get_pos2idx_idx2pos(pos_set)

for i in range(len(raw_train_vua)):
    raw_train_vua[i][2] = index_sequence(pos2idx, raw_train_vua[i][2])
for i in range(len(raw_val_vua)):
    raw_val_vua[i][2] = index_sequence(pos2idx, raw_val_vua[i][2])
print('size of training set, validation set: ', len(raw_train_vua), len(raw_val_vua))

size of training set, validation set:  6323 1550


In [57]:
raw_train_vua[:4]

[["Ca n't fail to be entertaining .",
  [0, 0, 0, 0, 0, 0, 0],
  [5, 6, 5, 12, 5, 13, 8]],
 ['How much was he going to tell her ?',
  [0, 0, 0, 0, 0, 0, 0, 0, 0],
  [6, 13, 5, 0, 5, 12, 5, 0, 8]],
 ['Up until that news hit the Committee , Don had won the day with his UK Vehicle Division proposals .',
  [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
  [11, 11, 7, 14, 5, 7, 10, 8, 10, 5, 5, 7, 14, 11, 13, 10, 10, 10, 14, 8]],
 ["Could go on to the rugby and go with them could n't he ?",
  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  [5, 5, 12, 11, 7, 14, 9, 5, 11, 0, 5, 6, 0, 8]]]

In [63]:
"""
2. Data preparation
"""
'''
2. 1
get vocabulary and glove embeddings in raw dataset 
'''
# vocab is a set of words
vocab = get_vocab(raw_train_vua)
# two dictionaries. <PAD>: 0, <UNK>: 1
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_embeddings a nn.Embeddings
glove_embeddings = get_embedding_matrix(glove_dir + 'glove.840B.300d.txt',word2idx, idx2word, normalization=False)
# elmo_embeddings
elmos_train_vua = h5py.File(elmo_dir + 'VUA_train.hdf5', 'r')
elmos_val_vua = h5py.File(elmo_dir + 'VUA_val.hdf5', 'r')
# no suffix embeddings for sequence labeling
suffix_embeddings = None

vocab size:  13843


100%|██████████| 2196017/2196017 [00:54<00:00, 40279.73it/s]


Number of pre-trained word vectors loaded:  13404
Embeddings mean:  0.0005707233212888241
Embeddings stdev:  0.3729434907436371


In [74]:
print(len(vocab))
glove_embeddings.weight.shape
# 300d embeddings for the 13843 words in the vocab

13843


torch.Size([13845, 300])

In [68]:
'''
2. 2
embed the datasets
'''
# raw_train_vua: sentence, label_seq, pos_seq
# embedded_train_vua: embedded_sentence, pos, labels
embedded_train_vua = [[embed_indexed_sequence(example[0], example[2], word2idx,
                                      glove_embeddings, elmos_train_vua, suffix_embeddings),
                       example[2], example[1]]
                      for example in raw_train_vua]
embedded_val_vua = [[embed_indexed_sequence(example[0], example[2], word2idx,
                                    glove_embeddings, elmos_val_vua, suffix_embeddings),
                     example[2], example[1]]
                    for example in raw_val_vua]

In [92]:
# embedded_train_vua is a list of lists -- one list per sentence
# each sentence list contains 
#     an array of embeddings (seq_length x embedding_dim)
#     list of pos tag ids
#     list of labels
print(len(embedded_train_vua))
print(len(embedded_train_vua[0]))
print(embedded_train_vua[0][0].shape)
print(embedded_train_vua[0][1])
print(embedded_train_vua[0][2])

6323
3
(7, 1324)
[5, 6, 5, 12, 5, 13, 8]
[0, 0, 0, 0, 0, 0, 0]


In [93]:
'''
2. 3
set up Dataloader for batching
'''
# Separate the input (embedded_sequence) and labels in the indexed train sets.
# embedded_train_vua: embedded_sentence, pos, labels
train_dataset_vua = TextDataset([example[0] for example in embedded_train_vua],
                                [example[1] for example in embedded_train_vua],
                                [example[2] for example in embedded_train_vua])
val_dataset_vua = TextDataset([example[0] for example in embedded_val_vua],
                              [example[1] for example in embedded_val_vua],
                              [example[2] for example in embedded_val_vua])

# Data-related hyperparameters
batch_size = 64
# Set up a DataLoader for the training, validation, and test dataset
train_dataloader_vua = DataLoader(dataset=train_dataset_vua, batch_size=batch_size, shuffle=True,
                              collate_fn=TextDataset.collate_fn)
val_dataloader_vua = DataLoader(dataset=val_dataset_vua, batch_size=batch_size,
                            collate_fn=TextDataset.collate_fn)

<core.gao_files.sequence.util.TextDatasetWithGloveElmoSuffix at 0x7f9bb683e610>

In [98]:
"""
3. Model training
"""
'''
3. 1 
set up model, loss criterion, optimizer
'''
# Instantiate the model
# embedding_dim = glove + elmo + suffix indicator
# dropout1: dropout on input to RNN
# dropout2: dropout in RNN; would be used if num_layers!=1
# dropout3: dropout on hidden state of RNN to linear layer
RNNseq_model = RNNSequenceModel(num_classes=2, embedding_dim=300 + 1024, hidden_size=300, num_layers=1, bidir=True,
                                dropout1=0.5, dropout2=0, dropout3=0.1)
# Move the model to the GPU if available
if using_GPU:
    RNNseq_model = RNNseq_model.cuda()
# Set up criterion for calculating loss
loss_criterion = nn.NLLLoss()
# Set up an optimizer for updating the parameters of the rnn_clf
rnn_optimizer = optim.Adam(RNNseq_model.parameters(), lr=0.005)
# Number of epochs (passes through the dataset) to train the model for.
num_epochs = 10

In [99]:
'''
3. 2
train model
'''
train_loss = []
val_loss = []
performance_matrix = None
val_f1s = []
train_f1s = []
# A counter for the number of gradient updates
num_iter = 0
comparable = []
for epoch in range(num_epochs):
    print("Starting epoch {}".format(epoch + 1))
    for (__, example_text, example_lengths, labels) in train_dataloader_vua:
        example_text = Variable(example_text)
        example_lengths = Variable(example_lengths)
        labels = Variable(labels)
        if using_GPU:
            example_text = example_text.cuda()
            example_lengths = example_lengths.cuda()
            labels = labels.cuda()
        # predicted shape: (batch_size, seq_len, 2)
        predicted = RNNseq_model(example_text, example_lengths)
        batch_loss = loss_criterion(predicted.view(-1, 2), labels.view(-1))
        rnn_optimizer.zero_grad()
        batch_loss.backward()
        rnn_optimizer.step()
        num_iter += 1
        # Calculate validation and training set loss and accuracy every 200 gradient updates
        if num_iter % 200 == 0:
            avg_eval_loss, performance_matrix = evaluate(idx2pos, val_dataloader_vua, RNNseq_model,
                                                         loss_criterion, using_GPU)
            val_loss.append(avg_eval_loss)
            val_f1s.append(performance_matrix[:, 2])
            print("Iteration {}. Validation Loss {}.".format(num_iter, avg_eval_loss))
#             avg_eval_loss, performance_matrix = evaluate(idx2pos, train_dataloader_vua, RNNseq_model,
#                                                          loss_criterion, using_GPU)
#             train_loss.append(avg_eval_loss)
#             train_f1s.append(performance_matrix[:, 2])
#             print("Iteration {}. Training Loss {}.".format(num_iter, avg_eval_loss))

Starting epoch 1
Starting epoch 2
Starting epoch 3


  eval_text = Variable(eval_text, volatile=True)
  eval_lengths = Variable(eval_lengths, volatile=True)
  eval_labels = Variable(eval_labels, volatile=True)


------------------------------
total_eval_loss.shape torch.Size([])
PRFA performance for  PRON nan 0.0 nan 99.68609865470852
PRFA performance for  INTJ nan 0.0 nan 98.74213836477988
PRFA performance for  NUM nan 0.0 nan 99.77272727272727
PRFA performance for  SYM nan nan nan 100.0
PRFA performance for  X nan nan nan 100.0
PRFA performance for  VERB 72.55278310940498 56.417910447761194 63.47607052896725 87.83557046979865
PRFA performance for  ADV 69.04761904761905 37.66233766233766 48.73949579831933 94.52669358456707
PRFA performance for  DET 86.09022556390977 91.6 88.75968992248062 98.3620446201638
PRFA performance for  PUNCT 100.0 60.0 75.0 99.94811932555123
PRFA performance for  CCONJ nan nan nan 100.0
PRFA performance for  PROPN 100.0 4.545454545454546 8.695652173913043 98.87940234791888
PRFA performance for  ADP 83.56481481481481 80.88125466766243 82.2011385199241 89.70588235294117
PRFA performance for  PART 57.89473684210526 56.70103092783505 57.291666666666664 92.78803869832893
P

  precision = 100 * grid[1, 1] / np.sum(grid[1])
  recall = 100 * grid[1, 1] / np.sum(grid[:, 1])


Starting epoch 4
Starting epoch 5
------------------------------
total_eval_loss.shape torch.Size([])
PRFA performance for  PRON 0.0 0.0 nan 99.64125560538116
PRFA performance for  INTJ nan 0.0 nan 98.74213836477988
PRFA performance for  NUM nan 0.0 nan 99.77272727272727
PRFA performance for  SYM nan nan nan 100.0
PRFA performance for  X nan nan nan 100.0
PRFA performance for  VERB 70.1619778346121 61.417910447761194 65.49940310385992 87.87751677852349
PRFA performance for  ADV 71.27659574468085 43.506493506493506 54.032258064516135 94.88559892328398
PRFA performance for  DET 84.46969696969697 89.2 86.77042801556419 98.07963852019203
PRFA performance for  PUNCT 100.0 60.0 75.0 99.94811932555123
PRFA performance for  CCONJ nan nan nan 100.0
PRFA performance for  PROPN 30.0 13.636363636363637 18.75 98.61259338313768
PRFA performance for  ADP 80.98995695839311 84.31665421956684 82.61983168679107 89.57418788410887
PRFA performance for  PART 65.34653465346534 68.04123711340206 66.6666666666

  f1 = 2 * precision * recall / (precision + recall)


Starting epoch 6
Starting epoch 7
------------------------------
total_eval_loss.shape torch.Size([])
PRFA performance for  PRON 0.0 0.0 nan 99.64125560538116
PRFA performance for  INTJ nan 0.0 nan 98.74213836477988
PRFA performance for  NUM nan 0.0 nan 99.77272727272727
PRFA performance for  SYM nan nan nan 100.0
PRFA performance for  X nan nan nan 100.0
PRFA performance for  VERB 70.77175697865353 64.32835820895522 67.39640344018764 88.33892617449665
PRFA performance for  ADV 72.94117647058823 40.25974025974026 51.88284518828452 94.84073575594437
PRFA performance for  DET 85.44776119402985 91.6 88.41698841698842 98.30556340016945
PRFA performance for  PUNCT 80.0 80.0 80.0 99.94811932555123
PRFA performance for  CCONJ 0.0 nan nan 99.92695398100804
PRFA performance for  PROPN 55.55555555555556 22.727272727272727 32.258064516129025 98.87940234791888
PRFA performance for  ADP 83.54525056095737 83.42046303211352 83.48281016442452 90.29850746268657
PRFA performance for  PART 67.12328767123

In [100]:
val_loss

[tensor(0.1815, device='cuda:0'),
 tensor(0.1117, device='cuda:0'),
 tensor(0.0905, device='cuda:0'),
 tensor(0.0858, device='cuda:0')]

In [101]:
"""
for additional training
"""
rnn_optimizer = optim.Adam(RNNseq_model.parameters(), lr=0.0001)
for epoch in range(10):
    print("Starting epoch {}".format(epoch + 1))
    for (__, example_text, example_lengths, labels) in train_dataloader_vua:
        example_text = Variable(example_text)
        example_lengths = Variable(example_lengths)
        labels = Variable(labels)
        if using_GPU:
            example_text = example_text.cuda()
            example_lengths = example_lengths.cuda()
            labels = labels.cuda()
        # predicted shape: (batch_size, seq_len, 2)
        predicted = RNNseq_model(example_text, example_lengths)
        batch_loss = loss_criterion(predicted.view(-1, 2), labels.view(-1))
        rnn_optimizer.zero_grad()
        batch_loss.backward()
        rnn_optimizer.step()
        num_iter += 1
        # Calculate validation and training set loss and accuracy every 200 gradient updates
        if num_iter % 200 == 0:
            avg_eval_loss, performance_matrix = evaluate(idx2pos, val_dataloader_vua, RNNseq_model,
                                                         loss_criterion, using_GPU)
            val_loss.append(avg_eval_loss)
            val_f1s.append(performance_matrix[:, 2])
            print("Iteration {}. Validation Loss {}.".format(num_iter, avg_eval_loss))

#             avg_eval_loss, performance_matrix = evaluate(idx2pos, train_dataloader_vua, RNNseq_model,
#                                                          loss_criterion, using_GPU)
#             train_loss.append(avg_eval_loss)
#             train_f1s.append(performance_matrix[:, 2])
#             print("Iteration {}. Training Loss {}.".format(num_iter, avg_eval_loss))
#             comparable.append(get_performance())

print("Training done!")

Starting epoch 1


  eval_text = Variable(eval_text, volatile=True)
  eval_lengths = Variable(eval_lengths, volatile=True)
  eval_labels = Variable(eval_labels, volatile=True)


------------------------------
total_eval_loss.shape torch.Size([])
PRFA performance for  PRON 0.0 0.0 nan 99.64125560538116
PRFA performance for  INTJ 100.0 50.0 66.66666666666667 99.37106918238993
PRFA performance for  NUM nan 0.0 nan 99.77272727272727
PRFA performance for  SYM nan nan nan 100.0
PRFA performance for  X nan nan nan 100.0
PRFA performance for  VERB 69.14556962025317 65.22388059701493 67.12749615975423 88.03131991051454
PRFA performance for  ADV 61.53846153846154 57.142857142857146 59.25925925925926 94.57155675190668
PRFA performance for  DET 87.109375 89.2 88.14229249011858 98.30556340016945
PRFA performance for  PUNCT 100.0 80.0 88.88888888888889 99.97405966277562
PRFA performance for  CCONJ nan nan nan 100.0
PRFA performance for  PROPN 53.84615384615385 31.818181818181817 39.99999999999999 98.87940234791888
PRFA performance for  ADP 81.70731707317073 85.06348020911128 83.35162824734724 90.01316944688323
PRFA performance for  PART 65.76576576576576 75.25773195876289 7

  f1 = 2 * precision * recall / (precision + recall)
  precision = 100 * grid[1, 1] / np.sum(grid[1])
  recall = 100 * grid[1, 1] / np.sum(grid[:, 1])


Starting epoch 2
Starting epoch 3
------------------------------
total_eval_loss.shape torch.Size([])
PRFA performance for  PRON 0.0 0.0 nan 99.64125560538116
PRFA performance for  INTJ 100.0 50.0 66.66666666666667 99.37106918238993
PRFA performance for  NUM nan 0.0 nan 99.77272727272727
PRFA performance for  SYM nan nan nan 100.0
PRFA performance for  X nan nan nan 100.0
PRFA performance for  VERB 67.93154761904762 68.13432835820896 68.03278688524591 88.00335570469798
PRFA performance for  ADV 62.4113475177305 57.142857142857146 59.66101694915254 94.66128308658591
PRFA performance for  DET 87.02290076335878 91.2 89.0625 98.41852584015815
PRFA performance for  PUNCT 100.0 80.0 88.88888888888889 99.97405966277562
PRFA performance for  CCONJ nan nan nan 100.0
PRFA performance for  PROPN 50.0 31.818181818181817 38.888888888888886 98.82604055496265
PRFA performance for  ADP 82.01595358955765 84.46601941747574 83.22295805739515 89.99122036874451
PRFA performance for  PART 65.17857142857143 

In [103]:
"""
test on genres by POS tags
"""
print("**********************************************************")
print("Evalutation on test set: ")

raw_test_vua = []
with open(data_dir + 'VUAsequence/VUA_seq_formatted_test.csv', encoding='latin-1') as f:
    lines = csv.reader(f)
    next(lines)
    for line in lines:
        # txt_id	sen_ix	sentence	label_seq	pos_seq	labeled_sentence	genre
        pos_seq = ast.literal_eval(line[4])
        label_seq = ast.literal_eval(line[3])
        assert(len(pos_seq) == len(label_seq))
        assert(len(line[2].split()) == len(pos_seq))
        raw_test_vua.append([line[2], label_seq, pos_seq])
print('number of examples(sentences) for test_set ', len(raw_test_vua))

for i in range(len(raw_test_vua)):
    raw_test_vua[i][2] = index_sequence(pos2idx, raw_test_vua[i][2])

elmos_test_vua = h5py.File(elmo_dir + 'VUA_test.hdf5', 'r')
# raw_train_vua: sentence, label_seq, pos_seq
# embedded_train_vua: embedded_sentence, pos, labels
embedded_test_vua = [[embed_indexed_sequence(example[0], example[2], word2idx,
                                      glove_embeddings, elmos_test_vua, suffix_embeddings),
                       example[2], example[1]]
                      for example in raw_test_vua]

# Separate the input (embedded_sequence) and labels in the indexed train sets.
# embedded_train_vua: embedded_sentence, pos, labels
test_dataset_vua = TextDataset([example[0] for example in embedded_test_vua],
                              [example[1] for example in embedded_test_vua],
                              [example[2] for example in embedded_test_vua])

# Set up a DataLoader for the test dataset
test_dataloader_vua = DataLoader(dataset=test_dataset_vua, batch_size=batch_size,
                              collate_fn=TextDataset.collate_fn)

print("Tagging model performance on VUA test set by POS tags: regardless of genres")
avg_eval_loss, performance_matrix = evaluate(idx2pos, test_dataloader_vua, RNNseq_model, loss_criterion, using_GPU)

**********************************************************
Evalutation on test set: 
number of examples(sentences) for test_set  2694
Tagging model performance on VUA test set by POS tags: regardless of genres


  eval_text = Variable(eval_text, volatile=True)
  eval_lengths = Variable(eval_lengths, volatile=True)
  eval_labels = Variable(eval_labels, volatile=True)


------------------------------
total_eval_loss.shape torch.Size([])
PRFA performance for  PRON 0.0 0.0 nan 99.82300884955752
PRFA performance for  INTJ nan 0.0 nan 99.49748743718592
PRFA performance for  NUM nan nan nan 100.0
PRFA performance for  SYM nan nan nan 100.0
PRFA performance for  X nan nan nan 100.0
PRFA performance for  VERB 71.1572700296736 63.54001059883413 67.13325867861143 88.10777957860616
PRFA performance for  ADV 74.12935323383084 61.0655737704918 66.96629213483146 95.66755083996463
PRFA performance for  DET 90.0 94.5107398568019 92.20023282887077 98.3729966002914
PRFA performance for  PUNCT 0.0 0.0 nan 99.9213217938631
PRFA performance for  CCONJ nan nan nan 100.0
PRFA performance for  PROPN 58.333333333333336 22.580645161290324 32.55813953488372 98.47769028871392
PRFA performance for  ADP 88.4476534296029 88.02395209580838 88.23529411764706 92.60377358490567
PRFA performance for  PART 55.483870967741936 57.718120805369125 56.578947368421055 90.97744360902256
PRFA p

  f1 = 2 * precision * recall / (precision + recall)
  precision = 100 * grid[1, 1] / np.sum(grid[1])
  recall = 100 * grid[1, 1] / np.sum(grid[:, 1])


In [106]:
performance_matrix

array([[  0.        ,   0.        ,          nan,  99.82300885],
       [         nan,   0.        ,          nan,  99.49748744],
       [         nan,          nan,          nan, 100.        ],
       [         nan,          nan,          nan, 100.        ],
       [         nan,          nan,          nan, 100.        ],
       [ 71.15727003,  63.5400106 ,  67.13325868,  88.10777958],
       [ 74.12935323,  61.06557377,  66.96629213,  95.66755084],
       [ 90.        ,  94.51073986,  92.20023283,  98.3729966 ],
       [  0.        ,   0.        ,          nan,  99.92132179],
       [         nan,          nan,          nan, 100.        ],
       [ 58.33333333,  22.58064516,  32.55813953,  98.47769029],
       [ 88.44765343,  88.0239521 ,  88.23529412,  92.60377358],
       [ 55.48387097,  57.71812081,  56.57894737,  90.97744361],
       [ 65.45454545,  53.23475046,  58.71559633,  89.78562421],
       [ 69.79472141,  55.30596437,  61.71132239,  89.68327899]])

In [None]:
# Evaluate the model on the test data
!git status
