In [None]:
# Attach Google Drive to the Colab runtime at ROOT so files stored on Drive
# become visible on the local filesystem.
from google.colab import drive

ROOT = '/content/drive'
drive.mount(ROOT)

Mounted at /content/drive


In [None]:
import os
import sys
from os.path import join 
# Absolute location of the project checkout on the mounted Drive.
repo_dir = join('/content/drive', 'MyDrive', 'metaphor-detection')

In [None]:
## Resource directories, all rooted at the repository checkout.
## Each path keeps its trailing slash because later cells append file names
## to it by plain string concatenation.
data_dir = f'{repo_dir}/resources/metaphor-in-context/data/'
glove_dir = f'{repo_dir}/resources/glove/'
elmo_dir = f'{repo_dir}/resources/elmo/'


In [None]:
# installing the requirements
%cd 'drive/MyDrive/metaphor-detection/' 
#!pip install allennlp
#!pip install -r gao-g-requirements.txt
#!pip install --upgrade google-cloud-storage

[Errno 2] No such file or directory: 'drive/MyDrive/metaphor-detection/'
/content/drive/MyDrive/metaphor-detection


In [None]:
import math
import time

import h5py
import matplotlib
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader

from core.gao_files.classification.model import RNNSequenceClassifier
from core.gao_files.classification.util import *
from core.data.gao_data import *

## Data Preparation

In [None]:
### Load the MOH-X dataset
# NOTE: this rebinds `data_dir` (set earlier to an absolute path) to a path
# relative to the current working directory.
data_dir = join("resources", "metaphor-in-context", "data")
data_container = ExperimentData(data_dir)
data_container.read_moh_x_data(to_pandas=False)
# Keep only the cleaned, SVO-formatted rows for the rest of the notebook.
moh_x_data = data_container.moh_x_formatted_svo_cleaned


MOH-X formatted svo nrow: 647
MOH-X formatted svo cleaned nrow: 647


In [None]:
### Build the vocabulary and load the matching GloVe vectors
vocab = get_vocab(moh_x_data)
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_dir already ends with a slash, so the file name is appended directly.
glove_path = f"{glove_dir}glove840B300d.txt"
glove_embeddings = get_embedding_matrix(
    glove_path, word2idx, idx2word, normalization=False
)

vocab size:  453


100%|██████████| 2196017/2196017 [00:47<00:00, 46028.71it/s]


Number of pre-trained word vectors loaded:  453
Embeddings mean:  -0.0009290720336139202
Embeddings stdev:  0.38682886958122253


In [None]:
# Additional embedding sources handed to embed_sequence in the next cell.
NUM_SUFFIX_TAG = 2
# Randomly initialised lookup table: one 50-dimensional vector per suffix tag.
suffix_embeddings = nn.Embedding(num_embeddings=NUM_SUFFIX_TAG, embedding_dim=50)
# Read-only HDF5 handle (presumably pre-computed ELMo vectors for MOH-X).
# It is left open on purpose: the next cell reads from it.
elmo_embeddings = h5py.File(f"{elmo_dir}MOH-X_cleaned.hdf5", mode='r')

In [None]:
## Embed every example, then split into parallel sentence / label lists
# Each row contributes [embedded sentence, integer label]. The sentence text is
# taken from column 3, an integer from column 4 (presumably the target-word
# position -- confirm against embed_sequence) and the label from the last column.
embedded_data = [
    [
        embed_sequence(row[3].strip(), int(row[4]), word2idx,
                       glove_embeddings, elmo_embeddings, suffix_embeddings),
        int(row[-1]),
    ]
    for row in moh_x_data
]
sentences, labels = [], []
for embedded_sentence, label in embedded_data:
    sentences.append(embedded_sentence)
    labels.append(label)

## K-Fold Training

In [None]:
# Number of cross-validation folds and the nominal number of examples per fold.
NUMBER_FOLD = 10
num_examples = len(moh_x_data)
print(f"Data Length is {num_examples}")
fold_size = round(num_examples / NUMBER_FOLD)
print(f"Each fold size is {fold_size}")

Data Length is 647
Each fold size is 65


In [None]:
# Split the data into NUMBER_FOLD contiguous (sentences, labels) folds.
# Every fold but the last holds exactly `fold_size` examples. The last fold
# runs to the end of the data, so no trailing examples are dropped when
# `fold_size` was rounded down (with a rounded-up `fold_size` the result is the
# same as slicing past the end, i.e. a shorter final fold).
folds = []
for i in range(NUMBER_FOLD):
    start = i * fold_size
    end = (i + 1) * fold_size if i < NUMBER_FOLD - 1 else len(sentences)
    folds.append((sentences[start:end], labels[start:end]))


In [None]:
def _best_checkpoint_index(f1_history):
    """Return the index of the highest non-NaN F1 score in ``f1_history``.

    NaN entries are ignored wherever they occur. On ties the earliest index
    wins. Returns ``None`` when the list is empty or contains only NaN values.
    """
    candidates = [k for k, score in enumerate(f1_history) if not math.isnan(score)]
    if not candidates:
        return None
    return max(candidates, key=lambda k: f1_history[k])


# K-fold cross-validation: for each fold, train a fresh classifier on the
# remaining folds, evaluate on the held-out fold every EVAL_EVERY gradient
# updates, and keep the scores of the evaluation with the best F1.
optimal_f1s = []
accuracies = []
precisions = []
recalls = []
BATCH_SIZE = 10
# Use CUDA only when a device is actually present, so the cell also runs on a
# CPU-only runtime instead of failing in .cuda().
using_GPU = torch.cuda.is_available()
NUM_EPOCHS = 10
# Number of gradient updates between two validation passes.
# NOTE(review): with ~585 training examples, BATCH_SIZE 10 and NUM_EPOCHS 10
# there are only ~590 updates per fold, i.e. two validation passes.
EVAL_EVERY = 200
for i in range(NUMBER_FOLD):
    ### DATA BATCHING
    # Fold i is held out for validation; all other folds form the training set.
    training_sentences = []
    training_labels = []
    for j in range(NUMBER_FOLD):
        if j != i:
            training_sentences.extend(folds[j][0])
            training_labels.extend(folds[j][1])
    training_dataset_mohX = TextDatasetWithGloveElmoSuffix(training_sentences,
                                                           training_labels)
    val_dataset_mohX = TextDatasetWithGloveElmoSuffix(folds[i][0],
                                                      folds[i][1])

    # Data loaders for the training and validation split of this fold.
    train_dataloader_mohX = DataLoader(
        dataset=training_dataset_mohX,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=TextDatasetWithGloveElmoSuffix.collate_fn)
    val_dataloader_mohX = DataLoader(
        dataset=val_dataset_mohX,
        batch_size=BATCH_SIZE,
        shuffle=True,
        collate_fn=TextDatasetWithGloveElmoSuffix.collate_fn)

    # A new model per fold so nothing learned on one split carries over.
    # Input width = 300 (GloVe) + 1024 (presumably ELMo) + 50 (suffix embedding).
    rnn_clf = RNNSequenceClassifier(num_classes=2,
                                    embedding_dim=300+1024+50,
                                    hidden_size=300, num_layers=1,
                                    bidir=True,
                                    dropout1=0.2, dropout2=0, dropout3=0.2)
    # NLLLoss expects log-probabilities; presumably the classifier ends in a
    # log-softmax -- confirm in RNNSequenceClassifier.
    nll_criterion = nn.NLLLoss()
    if using_GPU:
        rnn_clf = rnn_clf.cuda()
        nll_criterion = nll_criterion.cuda()

    rnn_clf_optimizer = optim.SGD(rnn_clf.parameters(), lr=0.02, momentum=0.9)
    #### TRAIN ####
    # Histories of the validation metrics, one entry per validation pass.
    training_loss = []
    val_loss = []
    val_p = []
    val_r = []
    val_acc = []
    training_f1 = []
    val_f1 = []
    num_iter = 0
    for epoch in range(NUM_EPOCHS):
        print("-----Starting epoch {}------".format(epoch + 1))
        # The batch labels get their own name so the module-level `labels`
        # list (all dataset labels) is not overwritten by a tensor.
        for (example_text, example_lengths, batch_labels) in train_dataloader_mohX:
            example_text = Variable(example_text)
            example_lengths = Variable(example_lengths)
            batch_labels = Variable(batch_labels)
            if using_GPU:
                example_text = example_text.cuda()
                example_lengths = example_lengths.cuda()
                batch_labels = batch_labels.cuda()

            # predicted shape: (batch_size, 2)
            predicted = rnn_clf(example_text, example_lengths)
            batch_loss = nll_criterion(predicted, batch_labels)
            rnn_clf_optimizer.zero_grad()
            batch_loss.backward()
            rnn_clf_optimizer.step()
            num_iter += 1
            # Evaluate on the held-out fold every EVAL_EVERY gradient updates.
            if num_iter % EVAL_EVERY == 0:
                avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1 = evaluate(
                    val_dataloader_mohX, rnn_clf, nll_criterion, using_GPU)
                val_loss.append(avg_eval_loss)
                val_f1.append(f1)
                val_p.append(precision)
                val_r.append(recall)
                val_acc.append(eval_accuracy.item())
                print(
                  """Iteration {}. Validation Loss {}. Validation Accuracy {}. 
                    Validation Precision {}. Validation Recall {}. 
                    Validation F1 {}. Validation class-wise F1 {}.""".format(
                    num_iter, avg_eval_loss, eval_accuracy, precision, recall, f1, fus_f1))

    print("Training done for fold {}".format(i))
    # Pick the validation pass with the best F1, ignoring NaN scores.
    idx = _best_checkpoint_index(val_f1)
    if idx is None:
        # No validation pass produced a usable F1 (none ran, or all were NaN).
        # Report it explicitly instead of hiding the failure; the fold is left
        # out of the averages.
        print("No usable validation F1 for fold {}; it is left out of the averages.".format(i))
        print(val_f1)
        continue
    optimal_f1s.append(val_f1[idx])
    precisions.append(val_p[idx])
    recalls.append(val_r[idx])
    accuracies.append(val_acc[idx])
                

# Cross-validation summary: the best F1 of every fold, then the mean of each
# metric over the folds.
print('F1 on MOH-X by 10-fold = ', optimal_f1s)
for metric_name, metric_values in (('F1', optimal_f1s),
                                   ('precisions', precisions),
                                   ('recalls', recalls),
                                   ('accuracies', accuracies)):
    print(metric_name + ' on MOH-X = ', np.mean(np.array(metric_values)))


In [None]:
# Two rows for the comparison table, both ordered as
# precision, recall, F1, accuracy.
# Reference scores (shown under the "GAO" label in the table below).
gao_results = [75.3, 84.3, 79.1, 78.5]
# Our fold-averaged scores, rounded to two decimals.
our_results = [
    round(np.mean(np.array(fold_scores)), 2)
    for fold_scores in (precisions, recalls, optimal_f1s, accuracies)
]

In [None]:
# Comparison table: one row per system, one column per metric.
results = pd.DataFrame.from_dict(
    {"GAO": gao_results, "US": our_results},
    orient="index",
    columns=['P', "R", "F1", "Acc"],
)

In [None]:
# Display the comparison table (rows "GAO" and "US", columns P / R / F1 / Acc).
results

Unnamed: 0,P,R,F1,Acc
GAO,75.3,84.3,79.1,78.5
US,75.34,75.41,74.8,75.15
