In [1]:
# Mount Google Drive inside the Colab VM so the repository stored there is reachable.
from google.colab import drive

ROOT = '/content/drive'  # mount point used for Google Drive
drive.mount(ROOT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
import sys
from os.path import join 
# Absolute location of the project checkout on the mounted Drive.
repo_dir = join('/content/drive/MyDrive', 'metaphor-detection')

In [3]:
## Resource directories, all rooted at repo_dir.
## Each value keeps a trailing '/', so file names can be appended directly.
data_dir = f'{repo_dir}/resources/metaphor-in-context/data/'   # datasets
glove_dir = f'{repo_dir}/resources/glove/'                      # GloVe vectors
elmo_dir = f'{repo_dir}/resources/elmo/'                        # ELMo vector files

In [4]:
## installing the requirements
%cd 'drive/MyDrive/metaphor-detection/' 
#!pip install allennlp
#!pip install -r gao-g-requirements.txt
#!pip install --upgrade google-cloud-storage

/content/drive/MyDrive/metaphor-detection


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader

from core.gao_files.sequence.model import RNNSequenceModel
import time
import matplotlib
from core.gao_files.sequence.util import *
from core.gao_files.sequence.util import TextDatasetWithGloveElmoSuffix as TextDataset
from core.data.gao_data import *
import h5py
import math
import numpy as np
import random

## Data Preparation

In [6]:
### Read MOH-X Data
# Anchor the data directory at repo_dir rather than the current working
# directory, so loading does not depend on the earlier %cd having been run.
data_dir = os.path.join(repo_dir, "resources", "metaphor-in-context", "data")

In [7]:
# `csv` is not imported anywhere else in this notebook (presumably it leaked in
# through one of the wildcard imports), so import it explicitly here.
import csv


def read_mohx_examples(csv_path):
    """Parse the MOH-X CSV into per-token sequence-labelling examples.

    The file is expected to have one header row followed by rows with the
    columns: arg1, arg2, verb, sentence, verb_idx, label.

    Args:
        csv_path: path of the CSV file to read.

    Returns:
        A list of ``[sentence, label_seq, pos_seq]`` entries, one per row:
        ``sentence`` is the stripped sentence string; ``label_seq`` has one
        int per whitespace-separated token, all 0 except the focus verb,
        which carries the row's label; ``pos_seq`` marks the focus verb
        with 1 and every other token with 0.

    Raises:
        IndexError: if a row has fewer than six columns or its verb_idx lies
            outside the sentence.
        ValueError: if verb_idx or label is not an integer.
    """
    examples = []
    # newline='' is the csv module's documented way to open its input files.
    with open(csv_path, newline='') as f:
        rows = csv.reader(f)
        next(rows, None)  # skip the header row; tolerate an empty file
        for row in rows:
            sentence = row[3]
            label_seq = [0] * len(sentence.split())
            pos_seq = [0] * len(label_seq)
            verb_idx = int(row[4])
            label_seq[verb_idx] = int(row[5])
            pos_seq[verb_idx] = 1   # idx2pos = {0: 'words that are not focus verbs', 1: 'focus verb'}
            examples.append([sentence.strip(), label_seq, pos_seq])
    return examples


raw_mohx = read_mohx_examples(os.path.join(data_dir, 'MOH-X', 'MOH-X_formatted_svo_cleaned.csv'))


In [8]:
## Sample data format: each example is [sentence, label_seq, pos_seq],
## with one label / pos entry per whitespace-separated token of the sentence.
raw_mohx[0]

['He absorbed the knowledge or beliefs of his tribe .',
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0]]

In [9]:
# vocab is a set of words
vocab = get_vocab(raw_mohx)
# two dictionaries. <PAD>: 0, <UNK>: 1
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_embeddings: an nn.Embedding built from the pre-trained GloVe vectors
glove_embeddings = get_embedding_matrix(glove_dir + 'glove840B300d.txt',
                                        word2idx, idx2word, normalization=False)
# elmo_embeddings
# set elmos_mohx=None to exclude elmo vectors. Also need to change the embedding_dim in later model initialization
elmos_mohx = h5py.File(elmo_dir + 'MOH-X_cleaned.hdf5', 'r')

# Shuffle a copy instead of raw_mohx itself. Shuffling in place made this cell
# non-idempotent: every re-run reshuffled the already shuffled list and so
# produced a different example order (and different folds). Seeding the global
# RNG and shuffling a same-length list yields the same permutation as before.
random.seed(0)
shuffled_mohx = list(raw_mohx)
random.shuffle(shuffled_mohx)

# Each embedded example is [embedded sentence, pos_seq, label_seq].
# The second argument of embed_indexed_sequence is the pos sequence, which we
# don't need; the trailing None presumably switches off an optional extra
# embedding -- confirm against the helper's definition.
embedded_mohx = [[embed_indexed_sequence(sentence, pos_seq, word2idx,
                                         glove_embeddings, elmos_mohx, None),
                  pos_seq, label_seq]
                 for sentence, label_seq, pos_seq in shuffled_mohx]


vocab size:  1732


100%|██████████| 2196017/2196017 [00:48<00:00, 45172.65it/s]


Number of pre-trained word vectors loaded:  1730
Embeddings mean:  -0.0001946780103025958
Embeddings stdev:  0.3732253909111023


In [10]:
# Split the [embedded sentence, pos_seq, label_seq] triples into three
# parallel lists, one per field position (0, 1, 2).
sentences, poss, labels = (
    [example[field] for example in embedded_mohx] for field in range(3)
)

## K-Fold Training

In [11]:
def _best_eval_index(f1_scores):
    """Return the position of the highest non-NaN score in ``f1_scores``.

    Ties resolve to the earliest position. Returns None when the list is
    empty or contains only NaN values.
    """
    best_idx = None
    for idx, score in enumerate(f1_scores):
        if math.isnan(score):
            continue
        if best_idx is None or score > f1_scores[best_idx]:
            best_idx = idx
    return best_idx


# Use the GPU only when one is actually present instead of assuming it.
using_GPU = torch.cuda.is_available()
# Seed PyTorch so that weight initialisation and batch shuffling repeat across runs.
torch.manual_seed(0)

num_folds = 10
# Derive the fold size from the data rather than a hard-coded example count.
# Integer division reproduces the original split (647 // 10 == 64); remainder
# examples at the end of the list are not part of any fold.
fold_size = len(sentences) // num_folds
if fold_size == 0:
    raise ValueError("need at least {} examples for {}-fold cross-validation, got {}"
                     .format(num_folds, num_folds, len(sentences)))

ten_folds = []
for i in range(num_folds):
    start, stop = i * fold_size, (i + 1) * fold_size
    ten_folds.append((sentences[start:stop], poss[start:stop], labels[start:stop]))

idx2pos = {0: 'words that are not focus verbs', 1: 'focus verb'}

# Best validation metrics of each fold (one entry per fold).
optimal_f1s = []
optimal_ps = []
optimal_rs = []
optimal_accs = []
predictions_all = []  # never filled in this cell; kept so the name stays defined
for i in range(num_folds):
    # 2.3 set up DataLoaders for batching: fold i is held out for validation,
    # the remaining folds are concatenated into the training set.
    training_sentences = []
    training_labels = []
    training_poss = []
    for j in range(num_folds):
        if j != i:
            training_sentences.extend(ten_folds[j][0])
            training_poss.extend(ten_folds[j][1])
            training_labels.extend(ten_folds[j][2])
    training_dataset_mohx = TextDataset(training_sentences, training_poss, training_labels)
    val_dataset_mohx = TextDataset(ten_folds[i][0], ten_folds[i][1], ten_folds[i][2])

    # Data-related hyperparameters
    batch_size = 10
    # Set up a DataLoader for the training and validation dataset
    train_dataloader_mohx = DataLoader(dataset=training_dataset_mohx, batch_size=batch_size, shuffle=True,
                                       collate_fn=TextDataset.collate_fn)
    val_dataloader_mohx = DataLoader(dataset=val_dataset_mohx, batch_size=batch_size, shuffle=False,
                                     collate_fn=TextDataset.collate_fn)

    # 3. Model training
    # 3.1 set up model, loss criterion, optimizer
    # A fresh model is instantiated for every fold.
    # embedding_dim = glove (300) + elmo (1024)
    # dropout1: dropout on input to RNN
    # dropout2: dropout in RNN. NOTE(review): PyTorch recurrent layers apply
    #           their dropout only between stacked layers (num_layers > 1), so
    #           with num_layers=1 this is presumably inert -- confirm in
    #           RNNSequenceModel.
    # dropout3: dropout on hidden state of RNN to linear layer
    RNNseq_model = RNNSequenceModel(num_classes=2, embedding_dim=300+1024, hidden_size=300,
                                    num_layers=1, bidir=True,
                                    dropout1=0.5, dropout2=0, dropout3=0)
    # Move the model to the GPU if available
    if using_GPU:
        RNNseq_model = RNNseq_model.cuda()
    # Set up criterion for calculating loss
    loss_criterion = nn.NLLLoss()
    # Set up an optimizer for updating the parameters of the model
    rnn_optimizer = optim.Adam(RNNseq_model.parameters(), lr=0.001)
    # Number of epochs (passes through the dataset) to train the model for.
    num_epochs = 10
    # Run a validation pass after this many gradient updates.
    eval_every = 50

    # 3.2 train model
    val_loss = []
    val_f1 = []
    val_p = []
    val_r = []
    val_acc = []
    # A counter for the number of gradient updates
    num_iter = 0
    for epoch in range(num_epochs):
        print("Starting epoch {}".format(epoch + 1))
        # The batch labels get their own name: reusing `labels` here would
        # overwrite the dataset-level `labels` list and break a re-run of
        # this cell. Variable() wrappers are dropped; the `volatile` warnings
        # in the run output show a PyTorch version where tensors are used
        # directly.
        for (__, example_text, example_lengths, batch_labels) in train_dataloader_mohx:
            if using_GPU:
                example_text = example_text.cuda()
                example_lengths = example_lengths.cuda()
                batch_labels = batch_labels.cuda()
            # predicted shape: (batch_size, seq_len, 2)
            predicted = RNNseq_model(example_text, example_lengths)
            batch_loss = loss_criterion(predicted.view(-1, 2), batch_labels.view(-1))
            rnn_optimizer.zero_grad()
            batch_loss.backward()
            rnn_optimizer.step()
            num_iter += 1
            # Calculate validation loss and metrics every `eval_every` gradient updates
            if num_iter % eval_every == 0:
                avg_eval_loss, performance_matrix = evaluate(idx2pos, val_dataloader_mohx, RNNseq_model,
                                                             loss_criterion, using_GPU)
                val_loss.append(avg_eval_loss)
                # Row 1 is the 'focus verb' class; columns are P, R, F1, Acc.
                val_p.append(performance_matrix[1][0])
                val_r.append(performance_matrix[1][1])
                val_f1.append(performance_matrix[1][2])
                val_acc.append(performance_matrix[1][3])

    # Store the metrics of the evaluation with the best F1 for this fold.
    # NaN F1 values (undefined precision/recall) are skipped wherever they
    # occur; the previous max()-based check only noticed a NaN in first place.
    print('val_f1: ', val_f1)
    best_idx = _best_eval_index(val_f1)
    if best_idx is None:
        print('warning: fold {} produced no usable validation F1'.format(i))
        optimal_f1s.append(float('nan'))
        optimal_ps.append(float('nan'))
        optimal_rs.append(float('nan'))
        optimal_accs.append(float('nan'))
    else:
        optimal_f1s.append(val_f1[best_idx])
        optimal_ps.append(val_p[best_idx])
        optimal_rs.append(val_r[best_idx])
        optimal_accs.append(val_acc[best_idx])


# Print out the performance: per-fold best F1, then the averages over all folds.
print('F1 on MOH-X by 10-fold = ', optimal_f1s)
print('Precision on MOH-X = ', np.mean(np.array(optimal_ps)))
print('Recall on MOH-X = ', np.mean(np.array(optimal_rs)))
print('F1 on MOH-X = ', np.mean(np.array(optimal_f1s)))
print('Accuracy on MOH-X = ', np.mean(np.array(optimal_accs)))

Starting epoch 1


  eval_text = Variable(eval_text, volatile=True)
  eval_lengths = Variable(eval_lengths, volatile=True)
  eval_labels = Variable(eval_labels, volatile=True)
  precision = 100 * grid[1, 1] / np.sum(grid[1])
  recall = 100 * grid[1, 1] / np.sum(grid[:, 1])


------------------------------
total_eval_loss.shape torch.Size([])
PRFA performance for  words that are not focus verbs nan nan nan 100.0
PRFA performance for  focus verb 66.66666666666667 56.0 60.869565217391305 71.875
Starting epoch 2
------------------------------
total_eval_loss.shape torch.Size([])
PRFA performance for  words that are not focus verbs nan nan nan 100.0
PRFA performance for  focus verb 68.0 68.0 68.0 75.0
Starting epoch 3
------------------------------
total_eval_loss.shape torch.Size([])
PRFA performance for  words that are not focus verbs nan nan nan 100.0
PRFA performance for  focus verb 76.19047619047619 64.0 69.56521739130434 78.125
Starting epoch 4
------------------------------
total_eval_loss.shape torch.Size([])
PRFA performance for  words that are not focus verbs nan nan nan 100.0
PRFA performance for  focus verb 51.282051282051285 80.0 62.50000000000001 62.5
Starting epoch 5
------------------------------
total_eval_loss.shape torch.Size([])
PRFA perform

In [12]:
import pandas as pd
# Reference row of the comparison table, in the column order P, R, F1, Acc.
gao_scores = [79.1, 73.5, 75.6, 77.2]
# Our row: mean of each metric over the folds, rounded to one decimal place.
our_scores = [
    round(np.mean(np.array(per_fold_values)), 1)
    for per_fold_values in (optimal_ps, optimal_rs, optimal_f1s, optimal_accs)
]
all_scores = [gao_scores, our_scores]
all_scores_df = pd.DataFrame(
    data=all_scores,
    index=['Gao et al', 'US'],
    columns=['P', 'R', 'F1', 'Acc'],
)
print("Moh-X seq model: classification task\n")
all_scores_df

Moh-X seq model: classification task



Unnamed: 0,P,R,F1,Acc
Gao et al,79.1,73.5,75.6,77.2
US,74.8,77.2,75.5,75.9
