# Demo of using the ExperimentData class to get datasets

In [1]:
import os
import sys
# Make the parent of the current working directory importable (added once only).
# NOTE(review): presumably this is what lets the `core` package import in the
# next cell — confirm against the repository layout.
parent_dr = os.path.dirname(os.getcwd())
if parent_dr not in sys.path:
    sys.path.append(parent_dr)

In [2]:
from core.data.gao_data import *

In [3]:
# Gao et al. data directory, expressed relative to the current working directory.
data_dir = os.path.join(
    "..",
    "resources",
    "metaphor-in-context",
    "data",
)
print(f"Data directory: {data_dir}")

Data directory: ../resources/metaphor-in-context/data


In [4]:
# Build the container for the data directory defined above. The read_* calls in
# the following cells are made on this object, and the loaded tables are then
# read back from its attributes (e.g. data_container.moh_formatted).
# NOTE(review): ExperimentData is presumably provided by the star import from
# core.data.gao_data — confirm.
data_container = ExperimentData(data_dir)

## MOH Dataset

In [5]:
# Load the MOH data. As the saved output shows, the call prints the row counts of
# the formatted table and of the train/test/val sets.
# NOTE(review): to_pandas=True presumably makes the loaded tables pandas
# DataFrames (later cells call .head()/.nunique() on them) — confirm in
# core.data.gao_data.
data_container.read_moh_data(to_pandas=True)

MOH formatted nrow: 1603
MOH train nrow: 1283
MOH test nrow: 160
MOH val nrow: 160


In [6]:
# Preview the first rows of the formatted MOH table
# (columns in the saved output: verb, sentence, verb_idx, label).
data_container.moh_formatted.head()

Unnamed: 0,verb,sentence,verb_idx,label
0,absorb,He absorbed the knowledge or beliefs of his tr...,1,1
1,absorb,He absorbed the costs for the accident .,1,1
2,absorb,The sales tax is absorbed into the state incom...,4,1
3,absorb,The immigrants were quickly absorbed into soci...,4,1
4,absorb,Her interest in butterflies absorbs her comple...,4,1


In [7]:
# Number of distinct values per column. In the saved run there are 1602 distinct
# sentences against 1603 rows, so at least one sentence appears more than once.
data_container.moh_formatted.nunique()

verb         436
sentence    1602
verb_idx      15
label          2
dtype: int64

In [8]:
# Mean of the label column. nunique() above reports two distinct labels; assuming
# a 0/1 encoding, this is the share of rows labelled 1.
# NOTE(review): presumably 1 marks a metaphorical usage — confirm against the dataset docs.
data_container.moh_formatted['label'].mean()

0.24890829694323144

## MOH-X Dataset

In [9]:
# Load the MOH-X data. The call prints the row counts of the "formatted svo" and
# "formatted svo cleaned" tables (both 647 in the saved run).
# NOTE(review): to_pandas=True presumably yields pandas DataFrames — confirm.
data_container.read_moh_x_data(to_pandas=True)

MOH-X formatted svo nrow: 647
MOH-X formatted svo cleaned nrow: 647


In [10]:
# Preview the cleaned MOH-X table; compared with MOH it also carries arg1 and
# arg2 columns (arg2 is blank in the previewed rows).
data_container.moh_x_formatted_svo_cleaned.head()

Unnamed: 0,arg1,arg2,verb,sentence,verb_idx,label
0,knowledge,,absorb,He absorbed the knowledge or beliefs of his t...,1,1
1,cost,,absorb,He absorbed the costs for the accident .,1,1
2,tax,,absorb,The sales tax is absorbed into the state inco...,4,1
3,immigrant,,absorb,The immigrants were quickly absorbed into soc...,4,1
4,interest,,absorb,Her interest in butterflies absorbs her compl...,4,1


In [11]:
# Number of distinct values per column. nunique() ignores missing values by
# default, so the 0 reported for arg2 in the saved run means that column holds
# no non-missing values at all.
data_container.moh_x_formatted_svo_cleaned.nunique()

arg1        453
arg2          0
verb        214
sentence    641
verb_idx     13
label         2
dtype: int64

In [12]:
# Mean of the label column; assuming a 0/1 encoding (two distinct labels above),
# this is the share of rows labelled 1.
data_container.moh_x_formatted_svo_cleaned['label'].mean()

0.4868624420401855

## TroFi Dataset

In [13]:
# Load the TroFi data. The call prints the row count of the loaded table
# (3737 in the saved run).
# NOTE(review): to_pandas=True presumably yields a pandas DataFrame — confirm.
data_container.read_trofi_data(to_pandas=True)

TroFi formatted_all3737 nrow: 3737


In [14]:
# Preview the first rows of the TroFi table
# (columns in the saved output: verb, sentence, verb_idx, label).
data_container.trofi_formatted_all.head()

Unnamed: 0,verb,sentence,verb_idx,label
0,absorb,An Energy Department spokesman says the sulfur...,22,0
1,absorb,The yellow beta carotene pigment absorbs blue ...,5,0
2,absorb,"This time , the ground absorbed the shock wave...",5,0
3,absorb,'' Vitamins could be passed right out of the b...,12,0
4,absorb,"As Eliot wrote : '' In a warm haze , the sultr...",14,0


In [15]:
# Number of distinct values per column. In the saved run there are 3603 distinct
# sentences against 3737 rows, so some sentences appear in more than one row.
data_container.trofi_formatted_all.nunique()

verb          50
sentence    3603
verb_idx      63
label          2
dtype: int64

In [16]:
# Mean of the label column; assuming a 0/1 encoding (two distinct labels above),
# this is the share of rows labelled 1.
data_container.trofi_formatted_all['label'].mean()

0.4353759700294354

## TroFi-X Dataset

In [17]:
# Load the TroFi-X data. The call prints the row count of the "formatted svo"
# table (1444 in the saved run).
# NOTE(review): to_pandas=True presumably yields a pandas DataFrame — confirm.
data_container.read_trofi_x_data(to_pandas=True)

TroFi-X formatted svo nrow: 1444


In [18]:
# Preview the first rows of the TroFi-X table. Unlike the tables above it has a
# verb_stem column rather than verb_idx, alongside arg1/arg2/verb/sentence/label.
data_container.trofi_x_formatted_svo.head()

Unnamed: 0,arg1,arg2,verb,sentence,verb_stem,label
0,mileage,struck,blow,Triple mileage has struck another blow to the ...,strike,1
1,terrorist,attack,target,U.S. officials said evidence suggests that a J...,attack,0
2,forces,stepped,use,"Some police forces , for example , have steppe...",step,0
3,day,pour,stream,"Every day his troops gather under the green , ...",pour,0
4,manufacturers,rolling,products,He says manufacturers are increasingly rolling...,roll,1


In [19]:
# Number of distinct values per column. In the saved run there are 1233 distinct
# sentences against 1444 rows, so some sentences appear in more than one row.
data_container.trofi_x_formatted_svo.nunique()

arg1          679
arg2          152
verb          857
sentence     1233
verb_stem      48
label           2
dtype: int64

In [20]:
# Mean of the label column; assuming a 0/1 encoding (two distinct labels above),
# this is the share of rows labelled 1.
data_container.trofi_x_formatted_svo['label'].mean()

0.4162049861495845

## VUA Dataset

In [21]:
# Load the VUA data. The call prints row counts for the formatted table and for
# the train, train augmented, train no val, test and val sets.
# NOTE(review): to_pandas=True presumably yields pandas DataFrames — confirm.
data_container.read_vua_data(to_pandas=True)

VUA formatted nrow: 23113
VUA train nrow: 15516
VUA train augmented nrow: 116622
VUA train no val nrow: 12541
VUA test nrow: 5873
VUA val nrow: 1724


In [22]:
# Preview the first rows of the formatted VUA table. Each row carries text_idx
# and sentence_idx identifiers in addition to verb, sentence, verb_idx and label;
# the preview shows several rows sharing one sentence with different verbs.
data_container.vua_formatted.head()

Unnamed: 0,text_idx,sentence_idx,verb,sentence,verb_idx,label
0,a1h-fragment06,117,reflect,He was President of FISA ( Federation Internat...,31,1
1,a1h-fragment06,118,encounter,Most athletes first encountered him as a voice...,3,0
2,a1h-fragment06,118,bellow,Most athletes first encountered him as a voice...,9,1
3,a1h-fragment06,118,break,Most athletes first encountered him as a voice...,17,1
4,a1h-fragment06,118,design,Most athletes first encountered him as a voice...,21,1


## VUA Seq Dataset

In [23]:
# Load the VUA sequence data. The call prints the row counts of the train, test
# and val sets.
# NOTE(review): to_pandas=True presumably yields pandas DataFrames — confirm.
data_container.read_vua_seq_data(to_pandas=True)

VUA_seq train nrow: 6323
VUA_seq test nrow: 2694
VUA_seq val nrow: 1550


In [24]:
# Preview the first rows of the VUA sequence training table. Each row holds a
# sentence with a label_seq and a pos_seq list, plus a genre; in the previewed
# rows, labeled_sentence prefixes the tokens whose label is 1 with "M_".
data_container.vua_seq_formatted_train.head()

Unnamed: 0,txt_id,sen_ix,sentence,label_seq,pos_seq,labeled_sentence,genre
0,a1k-fragment02,41,Ca n't fail to be entertaining .,"[0, 0, 0, 0, 0, 0, 0]","['VERB', 'ADV', 'VERB', 'PART', 'VERB', 'ADJ',...",Ca n't fail to be entertaining .,news
1,cdb-fragment04,907,How much was he going to tell her ?,"[0, 0, 0, 0, 0, 0, 0, 0, 0]","['ADV', 'ADJ', 'VERB', 'PRON', 'VERB', 'PART',...",How much was he going to tell her ?,fiction
2,ac2-fragment06,1520,"Up until that news hit the Committee , Don had...","[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","['ADP', 'ADP', 'DET', 'NOUN', 'VERB', 'DET', '...","Up until M_that news M_hit the Committee , Don...",fiction
3,kbc-fragment13,5871,Could go on to the rugby and go with them coul...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","['VERB', 'VERB', 'PART', 'ADP', 'DET', 'NOUN',...",Could go on to the rugby and go with them coul...,conversation
4,ahb-fragment51,734,"Finally , we went to the office and they gave ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...","['ADV', 'PUNCT', 'PRON', 'VERB', 'ADP', 'DET',...","Finally , we went to the office and M_they M_g...",news
