# Demo of using the `ExperimentData` class to load datasets

In [1]:
import os
import sys
# Put the parent of the current working directory on the import path
# (presumably so the `core` package imported in the next cell resolves).
parent_dr = os.path.dirname(os.getcwd())
if sys.path.count(parent_dr) == 0:
    sys.path.append(parent_dr)

In [2]:
# Wildcard import from the project's gao_data module; this is what brings
# `ExperimentData` (used below) into the notebook namespace.
# NOTE(review): a wildcard import hides which names the notebook depends on —
# consider listing the needed names explicitly.
from core.data.gao_data import *

In [3]:
# Import pandas explicitly rather than relying on the wildcard import from
# core.data.gao_data to leak `pd` into the notebook namespace.
import pandas as pd

# Never truncate cell contents, so full sentences stay visible in the
# DataFrame previews below.
pd.set_option('display.max_colwidth', None)

In [4]:
# Directory with the Gao et al. "metaphor-in-context" data, relative to the
# notebook's working directory.
_data_dir_parts = (os.pardir, "resources", "metaphor-in-context", "data")
data_dir = os.path.join(*_data_dir_parts)
print(f"Data directory: {data_dir}")

Data directory: ../resources/metaphor-in-context/data


In [5]:
# Build the container for the data directory above; every dataset in the
# sections below is read through this one object.
data_container = ExperimentData(data_dir)

## MOH Dataset

In [6]:
# Read the MOH data; the call prints row counts for the formatted, train,
# test and val tables.
# NOTE(review): to_pandas=True presumably stores the tables as pandas
# DataFrames (the cells below call .head()/.nunique() on them) — confirm.
data_container.read_moh_data(to_pandas=True)

MOH formatted nrow: 1603
MOH train nrow: 1283
MOH test nrow: 160
MOH val nrow: 160


In [7]:
# Preview the first rows of the full MOH table
# (columns: verb, sentence, verb_idx, label).
data_container.moh_formatted.head()

Unnamed: 0,verb,sentence,verb_idx,label
0,absorb,He absorbed the knowledge or beliefs of his tribe .,1,1
1,absorb,He absorbed the costs for the accident .,1,1
2,absorb,The sales tax is absorbed into the state income tax .,4,1
3,absorb,The immigrants were quickly absorbed into society .,4,1
4,absorb,Her interest in butterflies absorbs her completely .,4,1


In [8]:
# Number of distinct values per column, e.g. how many different verbs and
# sentences the MOH table contains.
data_container.moh_formatted.nunique()

verb         436
sentence    1602
verb_idx      15
label          2
dtype: int64

In [9]:
# Mean of the label column. Assuming the two label values are 0 and 1
# (TODO confirm), this is the share of rows labelled 1.
data_container.moh_formatted['label'].mean()

0.24890829694323144

## MOH-X Dataset

In [10]:
# Read the MOH-X data; the call prints row counts for the "formatted svo"
# and "formatted svo cleaned" tables.
data_container.read_moh_x_data(to_pandas=True)

MOH-X formatted svo nrow: 647
MOH-X formatted svo cleaned nrow: 647


In [11]:
# Preview the cleaned MOH-X table; besides verb/sentence/verb_idx/label it
# carries arg1 and arg2 columns.
data_container.moh_x_formatted_svo_cleaned.head()

Unnamed: 0,arg1,arg2,verb,sentence,verb_idx,label
0,knowledge,,absorb,He absorbed the knowledge or beliefs of his tribe .,1,1
1,cost,,absorb,He absorbed the costs for the accident .,1,1
2,tax,,absorb,The sales tax is absorbed into the state income tax .,4,1
3,immigrant,,absorb,The immigrants were quickly absorbed into society .,4,1
4,interest,,absorb,Her interest in butterflies absorbs her completely .,4,1


In [12]:
# Number of distinct values per column. A count of 0 for a column (as shown
# for arg2 in the saved output) means it holds no non-missing values.
data_container.moh_x_formatted_svo_cleaned.nunique()

arg1        453
arg2          0
verb        214
sentence    641
verb_idx     13
label         2
dtype: int64

In [13]:
# Mean of the label column. Assuming the two label values are 0 and 1
# (TODO confirm), this is the share of rows labelled 1.
data_container.moh_x_formatted_svo_cleaned['label'].mean()

0.4868624420401855

## TroFi Dataset

In [14]:
# Read the TroFi data; the call prints the row count of the loaded table.
data_container.read_trofi_data(to_pandas=True)

TroFi formatted_all3737 nrow: 3737


In [15]:
# Preview the first rows of the TroFi table
# (columns: verb, sentence, verb_idx, label).
data_container.trofi_formatted_all.head()

Unnamed: 0,verb,sentence,verb_idx,label
0,absorb,"An Energy Department spokesman says the sulfur dioxide might be simultaneously recoverable through the use of powdered limestone , which tends to absorb the sulfur",22,0
1,absorb,The yellow beta carotene pigment absorbs blue -LRB- not yellow -RRB- laser light,5,0
2,absorb,"This time , the ground absorbed the shock waves enough to transfer her images to the metal in bas - relief",5,0
3,absorb,"'' Vitamins could be passed right out of the body without being absorbed , '' he says",12,0
4,absorb,"As Eliot wrote : '' In a warm haze , the sultry light is absorbed , not refracted , by grey stone . . . . '' and flowers do indeed '' sleep in empty silence . '",14,0


In [16]:
# Number of distinct values per column of the TroFi table.
data_container.trofi_formatted_all.nunique()

verb          50
sentence    3603
verb_idx      63
label          2
dtype: int64

In [17]:
# Mean of the label column. Assuming the two label values are 0 and 1
# (TODO confirm), this is the share of rows labelled 1.
data_container.trofi_formatted_all['label'].mean()

0.4353759700294354

## TroFi-X Dataset

In [18]:
# Read the TroFi-X data; the call prints the row count of the
# "formatted svo" table.
data_container.read_trofi_x_data(to_pandas=True)

TroFi-X formatted svo nrow: 1444


In [19]:
# Preview the TroFi-X table. Unlike the tables above it has arg1/arg2 and a
# verb_stem column, and no verb_idx column.
data_container.trofi_x_formatted_svo.head()

Unnamed: 0,arg1,arg2,verb,sentence,verb_stem,label
0,mileage,struck,blow,Triple mileage has struck another blow to the coupon market .,strike,1
1,terrorist,attack,target,U.S. officials said evidence suggests that a Japanese terrorist arrested last month in New Jersey with homemade bombs was on his way to attack a target in New York City .,attack,0
2,forces,stepped,use,"Some police forces , for example , have stepped up use of a 20-year-old system called Vascar -LRB- Vehicle and Speed Computed and Recorded -RRB- .",step,0
3,day,pour,stream,"Every day his troops gather under the green , red and black banner of black nationalism and pour out a stream of racial abuse .",pour,0
4,manufacturers,rolling,products,"He says manufacturers are increasingly rolling out questionable products that are n't fully researched , justified and tested .",roll,1


In [20]:
# Number of distinct values per column of the TroFi-X table.
data_container.trofi_x_formatted_svo.nunique()

arg1          679
arg2          152
verb          857
sentence     1233
verb_stem      48
label           2
dtype: int64

In [21]:
# Mean of the label column; the preview shows labels 0 and 1, so this is the
# share of rows labelled 1.
data_container.trofi_x_formatted_svo['label'].mean()

0.4162049861495845

## VUA Dataset

In [22]:
# Read the VUA data; the call prints row counts for the formatted, train,
# train augmented, train no val, test and val tables.
data_container.read_vua_data(to_pandas=True)

VUA formatted nrow: 23113
VUA train nrow: 15516
VUA train augmented nrow: 116622
VUA train no val nrow: 12541
VUA test nrow: 5873
VUA val nrow: 1724


In [23]:
# Preview the full VUA table; each row has text_idx and sentence_idx columns
# in addition to verb, sentence, verb_idx and label.
data_container.vua_formatted.head()

Unnamed: 0,text_idx,sentence_idx,verb,sentence,verb_idx,label
0,a1h-fragment06,117,reflect,"He was President of FISA ( Federation Internationale des Societes d'Aviron ) , the world governing body , for 30 years ; and most of this body 's attitudes and practices reflect the force of his strict , but genial personality .",31,1
1,a1h-fragment06,118,encounter,"Most athletes first encountered him as a voice , bellowing in multi-lingual fury at officials who had broken rules or arrangements designed to make racing safer or more fair .",3,0
2,a1h-fragment06,118,bellow,"Most athletes first encountered him as a voice , bellowing in multi-lingual fury at officials who had broken rules or arrangements designed to make racing safer or more fair .",9,1
3,a1h-fragment06,118,break,"Most athletes first encountered him as a voice , bellowing in multi-lingual fury at officials who had broken rules or arrangements designed to make racing safer or more fair .",17,1
4,a1h-fragment06,118,design,"Most athletes first encountered him as a voice , bellowing in multi-lingual fury at officials who had broken rules or arrangements designed to make racing safer or more fair .",21,1


## VUA Seq Dataset

In [24]:
# Read the VUA sequence data; the call prints row counts for the train,
# test and val tables.
data_container.read_vua_seq_data(to_pandas=True)

VUA_seq train nrow: 6323
VUA_seq test nrow: 2694
VUA_seq val nrow: 1550


In [26]:
# Preview the validation split of the VUA sequence data.
# NOTE(review): in the rows shown, label_seq and pos_seq appear to hold one
# entry per token, and labeled_sentence prefixes tokens whose label is 1
# with "M_" — confirm against the data loader.
data_container.vua_seq_formatted_val.head()

Unnamed: 0,txt_id,sen_ix,sentence,label_seq,pos_seq,labeled_sentence,genre
0,acj-fragment01,148,"Four alternative approaches have been described , and many others could be listed .","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","['NUM', 'ADJ', 'NOUN', 'VERB', 'VERB', 'VERB', 'PUNCT', 'CCONJ', 'ADJ', 'NOUN', 'VERB', 'VERB', 'VERB', 'PUNCT']","Four alternative M_approaches have been described , and many others could be listed .",academic
1,ab9-fragment03,908,"I wanted to say , you see , that I know you thought Frannie should n't have gone , and that it 's ruined your holiday plans , and , on behalf of us all , I 'm sorry .","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","['PRON', 'VERB', 'PART', 'VERB', 'PUNCT', 'PRON', 'VERB', 'PUNCT', 'ADP', 'PRON', 'VERB', 'PRON', 'VERB', 'PROPN', 'VERB', 'ADV', 'VERB', 'VERB', 'PUNCT', 'CCONJ', 'ADP', 'PRON', 'VERB', 'VERB', 'ADJ', 'NOUN', 'NOUN', 'PUNCT', 'CCONJ', 'PUNCT', 'ADP', 'NOUN', 'ADP', 'PRON', 'DET', 'PUNCT', 'PRON', 'VERB', 'ADJ', 'PUNCT']","I wanted to say , you M_see , that I know you thought Frannie should n't have gone , and that it 's ruined your holiday M_plans , and , on behalf of us all , I 'm sorry .",fiction
2,kbw-fragment42,14929,The one with you chop the chop and then there will be squares,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","['DET', 'NOUN', 'ADP', 'PRON', 'VERB', 'DET', 'NOUN', 'CCONJ', 'ADV', 'ADV', 'VERB', 'VERB', 'NOUN']",The M_one M_with you chop the chop and then there will be squares,conversation
3,b1g-fragment02,772,"Given that most GIS are rather dumb systems , requiring intelligent , very computer literate users , managing natural and technological hazards means that this knowledge base must be built into the system so that it can be utilized quickly by untrained users after a disaster has taken place or in an emergency .","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","['VERB', 'ADP', 'ADJ', 'PROPN', 'VERB', 'ADV', 'ADJ', 'NOUN', 'PUNCT', 'VERB', 'ADJ', 'PUNCT', 'ADV', 'NOUN', 'NOUN', 'NOUN', 'PUNCT', 'VERB', 'ADJ', 'CCONJ', 'ADJ', 'NOUN', 'VERB', 'ADP', 'DET', 'NOUN', 'NOUN', 'VERB', 'VERB', 'VERB', 'ADP', 'DET', 'NOUN', 'ADP', 'ADP', 'PRON', 'VERB', 'VERB', 'VERB', 'ADV', 'ADP', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', 'VERB', 'VERB', 'NOUN', 'CCONJ', 'ADP', 'DET', 'NOUN', 'PUNCT']","Given that most GIS are rather dumb systems , requiring intelligent , very computer literate users , managing natural and technological hazards means that this knowledge base must be built into the system so that it can be utilized quickly by untrained users after a disaster has taken place or in an emergency .",academic
4,a1n-fragment18,350,"Lacking a goal that might have altered its chemistry , it remained a matter of dull physics .","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0]","['VERB', 'DET', 'NOUN', 'ADJ', 'VERB', 'VERB', 'VERB', 'ADJ', 'NOUN', 'PUNCT', 'PRON', 'VERB', 'DET', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'PUNCT']","Lacking a goal that might have altered its M_chemistry , it remained a matter of M_dull M_physics .",news
