## Code for replicating gao et al research on VUA Sequence Model

In [5]:
# mount drive
from google.colab import drive
ROOT = '/content/drive'
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# add repo directory to path
import os
import sys
from os.path import join 
repo_dir = '/content/drive/MyDrive/Repos/metaphor-detection'
if repo_dir not in sys.path:
    sys.path.append(repo_dir)
print(sys.path)

['', '/content', '/env/python', '/usr/lib/python37.zip', '/usr/lib/python3.7', '/usr/lib/python3.7/lib-dynload', '/usr/local/lib/python3.7/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.7/dist-packages/IPython/extensions', '/root/.ipython', '/content/drive/MyDrive/Repos/metaphor-detection']


In [2]:
# directories
# to download glove and elmo vectors see: notebooks/Download_large_data.ipynb
data_dir = repo_dir + '/resources/metaphor-in-context/data/'
glove_dir = repo_dir + '/resources/glove/'
elmo_dir = repo_dir + '/resources/elmo/'

In [3]:
# cd to working directory here if neccesary
%cd $repo_dir
%ls

/content/drive/MyDrive/Repos/metaphor-detection
[0m[01;34mcore[0m/  gao-g-requirements.txt  install.sh  [01;34mnotebooks[0m/  README.md  [01;34mresources[0m/


Gao code

In [4]:
# pip install requirements (takes a while)
!cd drive/MyDrive/Repos/metaphor-detection/; pip install -r gao-g-requirements.txt
!pip install --upgrade google-cloud-storage

/bin/bash: line 0: cd: drive/MyDrive/Repos/metaphor-detection/: No such file or directory


In [5]:
%load_ext autoreload
%autoreload 2

In [83]:
from core.gao_files.sequence.util import get_num_lines, get_pos2idx_idx2pos, index_sequence, get_vocab, embed_indexed_sequence, \
    get_word2idx_idx2word, get_embedding_matrix, write_predictions, get_performance_VUAverb_val, \
    get_performance_VUAverb_test, get_performance_VUA_test, get_VUA_POS_summary
from core.gao_files.sequence.util import TextDatasetWithGloveElmoSuffix as TextDataset
from core.gao_files.sequence.util import evaluate
from core.gao_files.sequence.model import RNNSequenceModel

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import DataLoader

import csv
import h5py
import numpy as np
import matplotlib.pyplot as plt
import math
import random
import ast
import pandas as pd

In [7]:
print("PyTorch version:")
print(torch.__version__)
print("GPU Detected:")
print(torch.cuda.is_available())
using_GPU = torch.cuda.is_available()

PyTorch version:
1.10.0+cu111
GPU Detected:
True


In [8]:
"""
1. Data pre-processing
"""
'''
1.1 VUA
get raw dataset as a list:
  Each element is a triple:
    a sentence: string
    a list of labels: 
    a list of pos: 
'''
pos_set = set()
raw_train_vua = []
with open(data_dir + 'VUAsequence/VUA_seq_formatted_train.csv', encoding='latin-1') as f:
    lines = csv.reader(f)
    next(lines)
    for line in lines:
        pos_seq = ast.literal_eval(line[4])
        label_seq = ast.literal_eval(line[3])
        assert (len(pos_seq) == len(label_seq))
        assert (len(line[2].split()) == len(pos_seq))
        raw_train_vua.append([line[2], label_seq, pos_seq])
        pos_set.update(pos_seq)

raw_val_vua = []
with open(data_dir + 'VUAsequence/VUA_seq_formatted_val.csv', encoding='latin-1') as f:
    lines = csv.reader(f)
    next(lines)
    for line in lines:
        pos_seq = ast.literal_eval(line[4])
        label_seq = ast.literal_eval(line[3])
        assert (len(pos_seq) == len(label_seq))
        assert (len(line[2].split()) == len(pos_seq))
        raw_val_vua.append([line[2], label_seq, pos_seq])
        pos_set.update(pos_seq)

# embed the pos tags
pos2idx, idx2pos = get_pos2idx_idx2pos(pos_set)

for i in range(len(raw_train_vua)):
    raw_train_vua[i][2] = index_sequence(pos2idx, raw_train_vua[i][2])
for i in range(len(raw_val_vua)):
    raw_val_vua[i][2] = index_sequence(pos2idx, raw_val_vua[i][2])
print('size of training set, validation set: ', len(raw_train_vua), len(raw_val_vua))

size of training set, validation set:  6323 1550


In [9]:
raw_train_vua[:2]

[["Ca n't fail to be entertaining .",
  [0, 0, 0, 0, 0, 0, 0],
  [1, 2, 1, 8, 1, 9, 12]],
 ['How much was he going to tell her ?',
  [0, 0, 0, 0, 0, 0, 0, 0, 0],
  [2, 9, 1, 10, 1, 8, 1, 10, 12]]]

In [10]:
"""
2. Data preparation
"""
'''
2. 1
get vocabulary and glove embeddings in raw dataset 
'''
# vocab is a set of words
vocab = get_vocab(raw_train_vua)
# two dictionaries. <PAD>: 0, <UNK>: 1
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_embeddings a nn.Embeddings
glove_embeddings = get_embedding_matrix(glove_dir + 'glove.840B.300d.txt',word2idx, idx2word, normalization=False)
# elmo_embeddings
elmos_train_vua = h5py.File(elmo_dir + 'VUA_train.hdf5', 'r')
elmos_val_vua = h5py.File(elmo_dir + 'VUA_val.hdf5', 'r')
# no suffix embeddings for sequence labeling
suffix_embeddings = None

vocab size:  13843


100%|██████████| 2196017/2196017 [01:04<00:00, 33944.96it/s]


Number of pre-trained word vectors loaded:  13404
Embeddings mean:  0.0005707233212888241
Embeddings stdev:  0.3729434907436371


In [11]:
print(len(vocab))
glove_embeddings.weight.shape
# 300d embeddings for the 13843 words in the vocab

13843


torch.Size([13845, 300])

In [12]:
'''
2. 2
embed the datasets
'''
# raw_train_vua: sentence, label_seq, pos_seq
# embedded_train_vua: embedded_sentence, pos, labels
embedded_train_vua = [[embed_indexed_sequence(example[0], example[2], word2idx,
                                      glove_embeddings, elmos_train_vua, suffix_embeddings),
                       example[2], example[1]]
                      for example in raw_train_vua]
embedded_val_vua = [[embed_indexed_sequence(example[0], example[2], word2idx,
                                    glove_embeddings, elmos_val_vua, suffix_embeddings),
                     example[2], example[1]]
                    for example in raw_val_vua]

In [13]:
# embedded_train_vua is a list of lists -- one list per sentence
# each sentence list contains 
#     an array of embeddings (seq_length x embedding_dim)
#     list of pos tag ids
#     list of labels
print(len(embedded_train_vua))
print(len(embedded_train_vua[0]))
print(embedded_train_vua[0][0].shape)
print(embedded_train_vua[0][1])
print(embedded_train_vua[0][2])

6323
3
(7, 1324)
[1, 2, 1, 8, 1, 9, 12]
[0, 0, 0, 0, 0, 0, 0]


In [14]:
'''
2. 3
set up Dataloader for batching
'''
# Separate the input (embedded_sequence) and labels in the indexed train sets.
# embedded_train_vua: embedded_sentence, pos, labels
train_dataset_vua = TextDataset([example[0] for example in embedded_train_vua],
                                [example[1] for example in embedded_train_vua],
                                [example[2] for example in embedded_train_vua])
val_dataset_vua = TextDataset([example[0] for example in embedded_val_vua],
                              [example[1] for example in embedded_val_vua],
                              [example[2] for example in embedded_val_vua])

# Data-related hyperparameters
batch_size = 64
# Set up a DataLoader for the training, validation, and test dataset
train_dataloader_vua = DataLoader(dataset=train_dataset_vua, batch_size=batch_size, shuffle=True,
                              collate_fn=TextDataset.collate_fn)
val_dataloader_vua = DataLoader(dataset=val_dataset_vua, batch_size=batch_size,
                            collate_fn=TextDataset.collate_fn)

In [15]:
"""
3. Model training
"""
'''
3. 1 
set up model, loss criterion, optimizer
'''
# Instantiate the model
# embedding_dim = glove + elmo + suffix indicator
# dropout1: dropout on input to RNN
# dropout2: dropout in RNN; would be used if num_layers!=1
# dropout3: dropout on hidden state of RNN to linear layer
RNNseq_model = RNNSequenceModel(num_classes=2, embedding_dim=300 + 1024, hidden_size=300, num_layers=1, bidir=True,
                                dropout1=0.5, dropout2=0, dropout3=0.1)
# Move the model to the GPU if available
if using_GPU:
    RNNseq_model = RNNseq_model.cuda()
# Set up criterion for calculating loss
loss_criterion = nn.NLLLoss()
# Set up an optimizer for updating the parameters of the rnn_clf
rnn_optimizer = optim.Adam(RNNseq_model.parameters(), lr=0.005)
# Number of epochs (passes through the dataset) to train the model for.
num_epochs = 10

In [16]:
'''
3. 2
train model
'''
train_loss = []
val_loss = []
performance_matrix = None
val_f1s = []
train_f1s = []
# A counter for the number of gradient updates
num_iter = 0
comparable = []
for epoch in range(num_epochs):
    print("Starting epoch {}".format(epoch + 1))
    for (__, example_text, example_lengths, labels) in train_dataloader_vua:
        example_text = Variable(example_text)
        example_lengths = Variable(example_lengths)
        labels = Variable(labels)
        if using_GPU:
            example_text = example_text.cuda()
            example_lengths = example_lengths.cuda()
            labels = labels.cuda()
        # predicted shape: (batch_size, seq_len, 2)
        predicted = RNNseq_model(example_text, example_lengths)
        batch_loss = loss_criterion(predicted.view(-1, 2), labels.view(-1))
        rnn_optimizer.zero_grad()
        batch_loss.backward()
        rnn_optimizer.step()
        num_iter += 1
        # Calculate validation and training set loss and accuracy every 200 gradient updates
        if num_iter % 200 == 0:
            avg_eval_loss, performance_matrix = evaluate(idx2pos, val_dataloader_vua, RNNseq_model,
                                                         loss_criterion, using_GPU)
            val_loss.append(avg_eval_loss)
            val_f1s.append(performance_matrix[:, 2])
            print("Iteration {}. Validation Loss {}.".format(num_iter, avg_eval_loss))
#             avg_eval_loss, performance_matrix = evaluate(idx2pos, train_dataloader_vua, RNNseq_model,
#                                                          loss_criterion, using_GPU)
#             train_loss.append(avg_eval_loss)
#             train_f1s.append(performance_matrix[:, 2])
#             print("Iteration {}. Training Loss {}.".format(num_iter, avg_eval_loss))

Starting epoch 1
Starting epoch 2
Starting epoch 3


  eval_text = Variable(eval_text, volatile=True)
  eval_lengths = Variable(eval_lengths, volatile=True)
  eval_labels = Variable(eval_labels, volatile=True)


------------------------------
total_eval_loss.shape torch.Size([])
PRFA performance for  CCONJ 0.0 nan nan 99.92695398100804
PRFA performance for  VERB 69.46107784431138 60.59701492537314 64.72698286169789 87.6258389261745
PRFA performance for  ADV 70.45454545454545 40.25974025974026 51.239669421487605 94.70614625392552
PRFA performance for  X nan nan nan 100.0
PRFA performance for  DET 86.51685393258427 92.4 89.36170212765958 98.44676645015532
PRFA performance for  NOUN 74.3006993006993 45.8963282937365 56.742323097463284 90.55255868202362
PRFA performance for  SYM nan nan nan 100.0
PRFA performance for  ADP 82.50728862973762 84.54070201643017 83.51161932866101 90.18876207199298
PRFA performance for  PART 64.58333333333333 63.91752577319588 64.24870466321244 93.93139841688654
PRFA performance for  ADJ 68.62745098039215 40.69767441860465 51.0948905109489 91.65888577653284
PRFA performance for  PRON nan 0.0 nan 99.68609865470852
PRFA performance for  NUM nan 0.0 nan 99.77272727272727
P

  recall = 100 * grid[1, 1] / np.sum(grid[:, 1])
  precision = 100 * grid[1, 1] / np.sum(grid[1])


Starting epoch 4
Starting epoch 5
------------------------------
total_eval_loss.shape torch.Size([])
PRFA performance for  CCONJ nan nan nan 100.0
PRFA performance for  VERB 66.57263751763047 70.44776119402985 68.45540246555474 87.83557046979865
PRFA performance for  ADV 63.865546218487395 49.35064935064935 55.67765567765567 94.57155675190668
PRFA performance for  X nan nan nan 100.0
PRFA performance for  DET 85.60885608856088 92.8 89.0595009596929 98.39028523016097
PRFA performance for  NOUN 73.2 59.28725701943844 65.51312649164677 91.57311561452107
PRFA performance for  SYM nan nan nan 100.0
PRFA performance for  ADP 83.7696335078534 83.64451082897685 83.70702541106128 90.43020193151888
PRFA performance for  PART 62.365591397849464 59.79381443298969 61.05263157894737 93.49164467897977
PRFA performance for  ADJ 61.8421052631579 54.651162790697676 58.0246913580247 91.53439153439153
PRFA performance for  PRON nan 0.0 nan 99.68609865470852
PRFA performance for  NUM nan 0.0 nan 99.772727

  f1 = 2 * precision * recall / (precision + recall)


Starting epoch 10


In [17]:
"""
for additional training
"""
rnn_optimizer = optim.Adam(RNNseq_model.parameters(), lr=0.0001)
for epoch in range(10):
    print("Starting epoch {}".format(epoch + 1))
    for (__, example_text, example_lengths, labels) in train_dataloader_vua:
        example_text = Variable(example_text)
        example_lengths = Variable(example_lengths)
        labels = Variable(labels)
        if using_GPU:
            example_text = example_text.cuda()
            example_lengths = example_lengths.cuda()
            labels = labels.cuda()
        # predicted shape: (batch_size, seq_len, 2)
        predicted = RNNseq_model(example_text, example_lengths)
        batch_loss = loss_criterion(predicted.view(-1, 2), labels.view(-1))
        rnn_optimizer.zero_grad()
        batch_loss.backward()
        rnn_optimizer.step()
        num_iter += 1
        # Calculate validation and training set loss and accuracy every 200 gradient updates
        if num_iter % 200 == 0:
            avg_eval_loss, performance_matrix = evaluate(idx2pos, val_dataloader_vua, RNNseq_model,
                                                         loss_criterion, using_GPU)
            val_loss.append(avg_eval_loss)
            val_f1s.append(performance_matrix[:, 2])
            print("Iteration {}. Validation Loss {}.".format(num_iter, avg_eval_loss))

#             avg_eval_loss, performance_matrix = evaluate(idx2pos, train_dataloader_vua, RNNseq_model,
#                                                          loss_criterion, using_GPU)
#             train_loss.append(avg_eval_loss)
#             train_f1s.append(performance_matrix[:, 2])
#             print("Iteration {}. Training Loss {}.".format(num_iter, avg_eval_loss))
#             comparable.append(get_performance())

print("Training done!")

Starting epoch 1


  eval_text = Variable(eval_text, volatile=True)
  eval_lengths = Variable(eval_lengths, volatile=True)
  eval_labels = Variable(eval_labels, volatile=True)


------------------------------
total_eval_loss.shape torch.Size([])
PRFA performance for  CCONJ nan nan nan 100.0
PRFA performance for  VERB 69.07294832826747 67.83582089552239 68.44879518072288 88.28299776286353
PRFA performance for  ADV 65.83333333333333 51.298701298701296 57.66423357664234 94.79587258860475
PRFA performance for  X nan nan nan 100.0
PRFA performance for  DET 88.46153846153847 92.0 90.19607843137256 98.5879695001412
PRFA performance for  NOUN 71.58808933002481 62.31101511879049 66.62817551963047 91.57311561452107
PRFA performance for  SYM nan nan nan 100.0
PRFA performance for  ADP 82.49818445896878 84.83943241224794 83.65243004418264 90.25460930640914
PRFA performance for  PART 63.73626373626374 59.79381443298969 61.702127659574465 93.66754617414249
PRFA performance for  ADJ 60.12861736334405 54.36046511627907 57.099236641221374 91.2542794895736
PRFA performance for  PRON nan 0.0 nan 99.68609865470852
PRFA performance for  NUM nan 0.0 nan 99.77272727272727
PRFA perfo

  precision = 100 * grid[1, 1] / np.sum(grid[1])
  recall = 100 * grid[1, 1] / np.sum(grid[:, 1])


Starting epoch 2
Starting epoch 3
------------------------------
total_eval_loss.shape torch.Size([])
PRFA performance for  CCONJ nan nan nan 100.0
PRFA performance for  VERB 67.6076217360621 71.49253731343283 69.49582879941966 88.2410514541387
PRFA performance for  ADV 63.3587786259542 53.896103896103895 58.24561403508772 94.66128308658591
PRFA performance for  X nan nan nan 100.0
PRFA performance for  DET 88.16793893129771 92.4 90.23437500000001 98.5879695001412
PRFA performance for  NOUN 71.72582619339045 63.2829373650108 67.24039013195639 91.67517130777081
PRFA performance for  SYM nan nan nan 100.0
PRFA performance for  ADP 82.37410071942446 85.51157580283794 83.91352143642361 90.36435469710273
PRFA performance for  PART 61.904761904761905 67.01030927835052 64.35643564356437 93.66754617414249
PRFA performance for  ADJ 60.0 54.94186046511628 57.359635811836114 91.2542794895736
PRFA performance for  PRON nan 0.0 nan 99.68609865470852
PRFA performance for  NUM nan 0.0 nan 99.77272727

In [18]:
"""
test on genres by POS tags
"""
print("**********************************************************")
print("Evalutation on test set: ")

raw_test_vua = []
with open(data_dir + 'VUAsequence/VUA_seq_formatted_test.csv', encoding='latin-1') as f:
    lines = csv.reader(f)
    next(lines)
    for line in lines:
        # txt_id	sen_ix	sentence	label_seq	pos_seq	labeled_sentence	genre
        pos_seq = ast.literal_eval(line[4])
        label_seq = ast.literal_eval(line[3])
        assert(len(pos_seq) == len(label_seq))
        assert(len(line[2].split()) == len(pos_seq))
        raw_test_vua.append([line[2], label_seq, pos_seq])
print('number of examples(sentences) for test_set ', len(raw_test_vua))

for i in range(len(raw_test_vua)):
    raw_test_vua[i][2] = index_sequence(pos2idx, raw_test_vua[i][2])

elmos_test_vua = h5py.File(elmo_dir + 'VUA_test.hdf5', 'r')
# raw_train_vua: sentence, label_seq, pos_seq
# embedded_train_vua: embedded_sentence, pos, labels
embedded_test_vua = [[embed_indexed_sequence(example[0], example[2], word2idx,
                                      glove_embeddings, elmos_test_vua, suffix_embeddings),
                       example[2], example[1]]
                      for example in raw_test_vua]

# Separate the input (embedded_sequence) and labels in the indexed train sets.
# embedded_train_vua: embedded_sentence, pos, labels
test_dataset_vua = TextDataset([example[0] for example in embedded_test_vua],
                              [example[1] for example in embedded_test_vua],
                              [example[2] for example in embedded_test_vua])

# Set up a DataLoader for the test dataset
test_dataloader_vua = DataLoader(dataset=test_dataset_vua, batch_size=batch_size,
                              collate_fn=TextDataset.collate_fn)

print("Tagging model performance on VUA test set by POS tags: regardless of genres")
avg_eval_loss, pos_performance_matrix = evaluate(idx2pos, test_dataloader_vua, RNNseq_model, loss_criterion, using_GPU)

**********************************************************
Evalutation on test set: 
number of examples(sentences) for test_set  2694
Tagging model performance on VUA test set by POS tags: regardless of genres


  eval_text = Variable(eval_text, volatile=True)
  eval_lengths = Variable(eval_lengths, volatile=True)
  eval_labels = Variable(eval_labels, volatile=True)


------------------------------
total_eval_loss.shape torch.Size([])
PRFA performance for  CCONJ 0.0 nan nan 99.93674889310563
PRFA performance for  VERB 70.36632039365773 68.20349761526232 69.26803013993542 88.4319286871961
PRFA performance for  ADV 73.11827956989248 55.73770491803279 63.25581395348837 95.34335396404362
PRFA performance for  X nan nan nan 100.0
PRFA performance for  DET 89.36651583710407 94.27207637231504 91.75377468060394 98.27586206896552
PRFA performance for  NOUN 71.38523761375126 54.6862896979086 61.929824561403514 89.89287377736376
PRFA performance for  SYM nan nan nan 100.0
PRFA performance for  ADP 88.67924528301887 90.05988023952096 89.36423054070113 93.24528301886792
PRFA performance for  PART 56.55172413793103 55.033557046979865 55.78231292517007 91.11414900888585
PRFA performance for  ADJ 66.97674418604652 53.23475046210721 59.320288362512876 90.03783102143758
PRFA performance for  PRON nan 0.0 nan 99.84829329962074
PRFA performance for  NUM nan nan nan 100

  recall = 100 * grid[1, 1] / np.sum(grid[:, 1])
  precision = 100 * grid[1, 1] / np.sum(grid[1])


In [101]:
# compute POS summary stats over the VUA train data
pos_cnts, pos_m_rate = get_VUA_POS_summary(raw_train_vua, idx2pos)
POS_selects = ["VERB", "NOUN", "ADP", "ADJ", "PART"]
pos_cnts_list = [pos_cnts[POS] for POS in POS_selects]
pos_meta_rate_list = [pos_m_rate[POS] for POS in POS_selects]

In [102]:
print("Breakdown of performance on the VUA sequence labeling test set by POS tags: ")
pos_table = pd.DataFrame(pos_performance_matrix[[8,4,12,13,14],:],
                         columns = ["Pr", "Re","F1","Acc"],
                         index=["VERB","NOUN","ADP","ADJ","PART"])
pos_table['Count'] = pos_cnts_list
pos_table['% metaphor'] = pos_meta_rate_list
pos_table

Breakdown of performance on the VUA sequence labeling test set by POS tags: 


Unnamed: 0,Pr,Re,F1,Acc,Count,% metaphor
VERB,56.551724,55.033557,55.782313,91.114149,20917,0.180953
NOUN,89.366516,94.272076,91.753775,98.275862,20514,0.136151
ADP,50.0,33.333333,40.0,99.940991,13310,0.280391
ADJ,,0.0,,99.497487,9673,0.115269
PART,57.142857,12.903226,21.052632,98.425197,2966,0.100809


In [31]:
seq_test_pred = write_predictions(raw_test_vua, test_dataloader_vua, RNNseq_model, using_GPU, data_dir + 'VUAsequence/VUA_seq_formatted_test.csv')

  eval_text = Variable(eval_text, volatile=True)
  eval_lengths = Variable(eval_lengths, volatile=True)
  eval_labels = Variable(eval_labels, volatile=True)


In [32]:
seq_test_pred[0:2]

[['txt_id',
  'sen_ix',
  'sentence',
  'label_seq',
  'pos_seq',
  'labeled_sentence',
  'genre',
  'prediction'],
 ['a3m-fragment02',
  '45',
  'Design : Crossed lines over the toytown tram : City transport could soon be back on the right track , says Jonathan Glancey',
  '[0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0]',
  "['NOUN', 'PUNCT', 'ADJ', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT', 'NOUN', 'NOUN', 'VERB', 'ADV', 'VERB', 'ADV', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT', 'VERB', 'PROPN', 'PROPN']",
  'Design : M_Crossed M_lines M_over the toytown tram : City transport could soon be M_back M_on the right M_track , says Jonathan Glancey',
  'news',
  [tensor(0, device='cuda:0'),
   tensor(0, device='cuda:0'),
   tensor(1, device='cuda:0'),
   tensor(1, device='cuda:0'),
   tensor(0, device='cuda:0'),
   tensor(0, device='cuda:0'),
   tensor(0, device='cuda:0'),
   tensor(0, device='cuda:0'),
   tensor(0, device='cuda:0'),
   tensor(0, device='cuda:0'),
   te

In [33]:
get_performance_VUAverb_test(data_dir,seq_test_pred)

Tagging model performance on test-verb: genre
news Precision, Recall, F1, Accuracy:  73.51351351351352 72.98747763864043 73.24955116696589 75.7328990228013
fiction Precision, Recall, F1, Accuracy:  56.56565656565657 61.53846153846154 58.94736842105264 83.10469314079423
academic Precision, Recall, F1, Accuracy:  73.42143906020559 78.36990595611286 75.81501137225172 74.66243050039714
conversation Precision, Recall, F1, Accuracy:  56.22641509433962 51.202749140893474 53.59712230215828 87.10644677661169
Tagging model performance on test-verb: regardless of genre
Precision, Recall, F1, Accuracy:  68.13125695216908 69.56274843838727 68.8395616746277 81.116975991827


array([64.93175606, 66.02464857, 65.40226332, 80.15161736])

In [35]:
#macro-averaged F1 score across four genres
maF1 = (73.2495 + 58.9473 + 75.8150 + 53.5971) / 4
maF1

65.402225

In [34]:
get_performance_VUA_test(data_dir,seq_test_pred)

Tagging model performance on test-sequence: genre
news Precision, Recall, F1, Accuracy:  77.24056603773585 69.97863247863248 73.4304932735426 92.3076923076923
fiction Precision, Recall, F1, Accuracy:  66.57088122605364 68.54043392504931 67.54130223517978 93.9084442823272
academic Precision, Recall, F1, Accuracy:  79.60582690659811 78.99659863945578 79.30004268032437 92.87550495776716
conversation Precision, Recall, F1, Accuracy:  65.53446553446554 65.27363184079601 65.40378863409771 94.77015825169555
Tagging model performance on test-sequence: regardless of genre
Precision, Recall, F1, Accuracy:  74.38683127572017 72.38507127983341 73.37230069816528 93.46287992027902


array([72.23793493, 70.69732422, 71.41890671, 93.46544995])

In [20]:
gao_scores_verb = [68.2, 71.3, 69.7, 81.4, 66.4]
our_scores_verb = [68.0, 68.3, 68.2, 80.9, 65.4]
our_scores_verb = [round(score,1) for score in our_scores_verb]
all_scores_verb = [gao_scores_verb, our_scores_verb]
all_scores_verb_df = pd.DataFrame(all_scores_verb, columns= ['P', 'R', 'F1', 'Acc', 'MaF1'], index=['Gao et al', 'US'])
print("VUA seq model: classification task\n")
all_scores_verb_df

VUA seq model: classification task



Unnamed: 0,P,R,F1,Acc,MaF1
Gao et al,68.2,71.3,69.7,81.4,66.4
US,68.0,68.3,68.2,80.9,65.4


In [4]:
!ls

sample_data
