# Recreate the data summary tables from the Gao et al. paper
### (Run in Colab)

In [1]:
# Mount Google Drive in the Colab runtime.
from google.colab import drive

# Mount point for Drive; defined once and reused so the path is not duplicated.
ROOT = '/content/drive'
drive.mount(ROOT)

In [4]:
# Make the repository importable: put its root on sys.path exactly once,
# so re-running this cell does not add duplicate entries.
import os
import sys
from os.path import join

repo_dir = '/content/drive/MyDrive/Repos/metaphor-detection'
if sys.path.count(repo_dir) == 0:
    sys.path.append(repo_dir)
print(sys.path)

['', '/content', '/env/python', '/usr/lib/python37.zip', '/usr/lib/python3.7', '/usr/lib/python3.7/lib-dynload', '/usr/local/lib/python3.7/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.7/dist-packages/IPython/extensions', '/root/.ipython', '/', '/content/drive/MyDrive/Repos/metaphor-detection']


In [5]:
# pandas is imported explicitly because later cells use `pd`; without this it is
# only available as a side effect of the wildcard import below.
import pandas as pd
import tabulate

from core.data.gao_data import *
from core.data.util import get_classification_data_summary, get_sequencing_data_summary

In [9]:
# Gao et al. data directory: <repo_dir>/resources/metaphor-in-context/data
data_subdirs = ("resources", "metaphor-in-context", "data")
data_dir = os.path.join(repo_dir, *data_subdirs)
print(f"Data directory: {data_dir}")

Data directory: /content/drive/MyDrive/Repos/metaphor-detection/resources/metaphor-in-context/data


In [10]:
# Create the data container, pointed at the Gao et al. data directory.
# NOTE(review): ExperimentData presumably comes from the wildcard import of
# core.data.gao_data — confirm.
data_container = ExperimentData(data_dir)

In [11]:
# Read every dataset into the container; each dataset read prints a
# "<dataset> nrow: <count>" line (see this cell's output).
# NOTE(review): to_pandas=True presumably keeps the data as pandas DataFrames —
# confirm in ExperimentData.read_all_data.
data_container.read_all_data(to_pandas=True)

MOH formatted nrow: 1603
MOH train nrow: 1283
MOH test nrow: 160
MOH val nrow: 160
MOH-X formatted svo nrow: 647
MOH-X formatted svo cleaned nrow: 647
TroFi formatted_all3737 nrow: 3737
TroFi-X formatted svo nrow: 1444
VUA formatted nrow: 23113
VUA train nrow: 15516
VUA train augmented nrow: 116622
VUA train no val nrow: 12541
VUA test nrow: 5873
VUA val nrow: 1724
VUA_seq train nrow: 6323
VUA_seq test nrow: 2694
VUA_seq val nrow: 1550


## Classification
### Datasets to summarize

In [12]:
# Datasets to include in the classification summary; the summary rows in the
# output below appear in this same order.
# NOTE(review): the MOH_X ... VUA keys are presumably dataset-name constants
# provided by the wildcard import of core.data.gao_data — confirm.
classification_data = dict([
    (MOH_X, data_container.moh_x_formatted_svo_cleaned),
    (MOH, data_container.moh_formatted),
    (TROFI_X, data_container.trofi_x_formatted_svo),
    (TROFI, data_container.trofi_formatted_all),
    (VUA, data_container.vua_formatted),
])

In [13]:
# Summarise the classification datasets. The result is used as a table whose
# first row holds the column headers and whose remaining rows hold one dataset each.
classification_data_summary = get_classification_data_summary(classification_data)

# Last expression of the cell, so the DataFrame is rendered as the cell output.
# `pd` is pandas, imported explicitly in the notebook's import cell.
pd.DataFrame(
    data=classification_data_summary[1:],
    columns=classification_data_summary[0],
)

Unnamed: 0,Unnamed: 1,n,Perc Metaphor,Uniq Verb,Avg Sentence Len
0,MOH-X,647,0.486862,214,8.0
1,MOH,1603,0.248908,436,7.426076
2,TroFi-X,1444,0.416205,857,29.346953
3,TroFi,3737,0.435376,50,28.351351
4,VUA,23113,0.283563,2298,26.113226


In [14]:
# Emit the classification summary as a LaTeX tabular; the first row of the
# summary supplies the column headers.
print('\nTabulate Latex:')
classification_latex = tabulate.tabulate(
    classification_data_summary,
    headers="firstrow",
    tablefmt="latex",
)
print(classification_latex)


Tabulate Latex:
\begin{tabular}{lrrrr}
\hline
         &     n &   Perc Metaphor &   Uniq Verb &   Avg Sentence Len \\
\hline
 MOH-X   &   647 &        0.486862 &         214 &            8       \\
 MOH     &  1603 &        0.248908 &         436 &            7.42608 \\
 TroFi-X &  1444 &        0.416205 &         857 &           29.347   \\
 TroFi   &  3737 &        0.435376 &          50 &           28.3514  \\
 VUA     & 23113 &        0.283563 &        2298 &           26.1132  \\
\hline
\end{tabular}


## Sequence Labeling
### Datasets to summarize

In [15]:
# VUA sequence-labeling splits to summarise, keyed by the split label shown in
# the summary table; the summary rows in the output below appear in this same order.
sequencing_data = dict([
    ("TRAIN", data_container.vua_seq_formatted_train),
    ("DEV", data_container.vua_seq_formatted_val),
    ("TEST", data_container.vua_seq_formatted_test),
])

In [16]:
# Summarise the sequence-labeling splits. The call prints one "vocab size" line
# per split (see this cell's output). The result is used as a table whose first
# row holds the column headers and whose remaining rows hold one split each.
sequencing_data_summary = get_sequencing_data_summary(sequencing_data)

# Last expression of the cell, so the DataFrame is rendered as the cell output.
# `pd` is pandas, imported explicitly in the notebook's import cell.
pd.DataFrame(
    data=sequencing_data_summary[1:],
    columns=sequencing_data_summary[0],
)

vocab size:  13843
vocab size:  7458
vocab size:  7200


Unnamed: 0,Unnamed: 1,Uniq Tokens,n Tokens,Unique Sentences,Perc Metaphor
0,TRAIN,13843,116622,6323,0.111909
1,DEV,7458,38628,1550,0.116159
2,TEST,7200,50175,2694,0.124425


In [17]:
# Emit the sequence-labeling summary as a LaTeX tabular; the first row of the
# summary supplies the column headers.
print('\nTabulate Latex:')
sequencing_latex = tabulate.tabulate(
    sequencing_data_summary,
    headers="firstrow",
    tablefmt="latex",
)
print(sequencing_latex)


Tabulate Latex:
\begin{tabular}{lrrrr}
\hline
       &   Uniq Tokens &   n Tokens &   Unique Sentences &   Perc Metaphor \\
\hline
 TRAIN &         13843 &     116622 &               6323 &        0.111909 \\
 DEV   &          7458 &      38628 &               1550 &        0.116159 \\
 TEST  &          7200 &      50175 &               2694 &        0.124425 \\
\hline
\end{tabular}
