# Recreate the data summary tables from the Gao et al. paper

In [1]:
import os
import sys
# Put the directory above the current working directory on the import path,
# presumably so that the `core` package imported in the next cell resolves.
# The membership check keeps re-runs of this cell from adding duplicates.
parent_dr = os.path.dirname(os.getcwd())
if parent_dr not in sys.path:
    sys.path.append(parent_dr)

In [2]:
import tabulate
from core.data.gao_data import *
from core.data.util import get_classification_data_summary

In [3]:
# Directory holding the Gao et al. data files, given relative to the
# notebook's working directory (one level up, then into resources/).
data_dir = os.path.join(
    "..",
    "resources",
    "metaphor-in-context",
    "data",
)
print(f"Data directory: {data_dir}")

Data directory: ../resources/metaphor-in-context/data


In [4]:
# Create the data container for the data directory defined above.
# NOTE(review): ExperimentData is not imported by name; presumably it comes
# from the star import of core.data.gao_data — confirm.
data_container = ExperimentData(data_dir)

In [5]:
# Load every dataset into the container; the call prints one "nrow" line per
# file it reads (see the cell output).
# NOTE(review): to_pandas=True presumably stores each dataset as a pandas
# DataFrame (later cells call .head() on them) — confirm in core.data.gao_data.
data_container.read_all_data(to_pandas=True)

MOH formatted nrow: 1603
MOH train nrow: 1283
MOH test nrow: 160
MOH val nrow: 160
MOH-X formatted svo nrow: 647
MOH-X formatted svo cleaned nrow: 647
TroFi formatted_all3737 nrow: 3737
TroFi-X formatted svo nrow: 1444
VUA formatted nrow: 23113
VUA train nrow: 15516
VUA train augmented nrow: 116622
VUA train no val nrow: 12541
VUA test nrow: 5873
VUA val nrow: 1724
VUA_seq train nrow: 6323
VUA_seq test nrow: 2694
VUA_seq val nrow: 1550


## Classification
### Datasets to summarize

In [6]:
# Datasets to summarize, keyed by their dataset-name constants.
# The insertion order here matches the row order of the summary table below.
classification_data = {
    MOH_X:   data_container.moh_x_formatted_svo_cleaned,
    MOH:     data_container.moh_formatted,
    TROFI_X: data_container.trofi_x_formatted_svo,
    TROFI:   data_container.trofi_formatted_all,
    VUA:     data_container.vua_formatted,
}

In [7]:
# Build the summary; its first row holds the column headers and the remaining
# rows hold one dataset each, so the two parts are split for the DataFrame.
# NOTE(review): `pd` is not imported by name in this notebook; presumably it
# arrives through the star import from core.data.gao_data — confirm.
classification_data_summary = get_classification_data_summary(classification_data)
pd.DataFrame(
    data=classification_data_summary[1:],
    columns=classification_data_summary[0],
)

Unnamed: 0,Unnamed: 1,n,Perc Metaphor,Uniq Verb,Avg Sentence Len
0,MOH-X,647,0.486862,214,8.0
1,MOH,1603,0.248908,436,7.426076
2,TroFi-X,1444,0.416205,857,29.346953
3,TroFi,3737,0.435376,50,28.351351
4,VUA,23113,0.283563,2298,26.113226


In [8]:
# Render the same summary as a LaTeX tabular for pasting into a document;
# "firstrow" tells tabulate to use the summary's first row as the header.
print('\nTabulate Latex:')
print(
    tabulate.tabulate(
        classification_data_summary,
        tablefmt="latex",
        headers="firstrow",
    )
)


Tabulate Latex:
\begin{tabular}{lrrrr}
\hline
         &     n &   Perc Metaphor &   Uniq Verb &   Avg Sentence Len \\
\hline
 MOH-X   &   647 &        0.486862 &         214 &            8       \\
 MOH     &  1603 &        0.248908 &         436 &            7.42608 \\
 TroFi-X &  1444 &        0.416205 &         857 &           29.347   \\
 TroFi   &  3737 &        0.435376 &          50 &           28.3514  \\
 VUA     & 23113 &        0.283563 &        2298 &           26.1132  \\
\hline
\end{tabular}


## Sequence Labeling
### Datasets to summarize

In [9]:
# VUA sequence-labeling splits, keyed by the split name used in the summary.
# "DEV" is the validation split and "TEST" is the held-out test split; the
# loader reports 1550 rows for val and 2694 rows for test, so the two must not
# be swapped or the per-split figures end up under the wrong label.
sequencing_data = {
    "TRAIN": data_container.vua_seq_formatted_train,
    "DEV": data_container.vua_seq_formatted_val,
    "TEST": data_container.vua_seq_formatted_test,
}

In [10]:
# Preview the first rows of each split under a banner naming the split.
# A plain loop is used because print() returns None: collecting the calls in a
# list comprehension and printing that list would add a stray
# "[None, None, None]" line after the previews.
for split_name, split_frame in sequencing_data.items():
    print("\n", "#"*10, f"{split_name}", "#"*10, "\n", split_frame.head())


 ########## TRAIN ########## 
            txt_id sen_ix                                           sentence  \
0  a1k-fragment02     41                   Ca n't fail to be entertaining .   
1  cdb-fragment04    907                How much was he going to tell her ?   
2  ac2-fragment06   1520  Up until that news hit the Committee , Don had...   
3  kbc-fragment13   5871  Could go on to the rugby and go with them coul...   
4  ahb-fragment51    734  Finally , we went to the office and they gave ...   

                                           label_seq  \
0                              [0, 0, 0, 0, 0, 0, 0]   
1                        [0, 0, 0, 0, 0, 0, 0, 0, 0]   
2  [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...   
3         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]   
4  [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...   

                                             pos_seq  \
0  ['VERB', 'ADV', 'VERB', 'PART', 'VERB', 'ADJ',...   
1  ['ADV', 'ADJ', 'VERB', 'PRON', 'VERB', 'P

In [11]:
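# TODO(review): the sequence-labeling summary is still disabled. `get_data_summary`
# is not imported by name in this notebook — confirm which helper should be used.
# sequencing_data_summary = get_data_summary(sequencing_data)
# pd.DataFrame(sequencing_data_summary[1:],
#              columns=sequencing_data_summary[0])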