In [1]:
import pandas as pd

In [2]:
# Mask the word at the given index
def mask_word(sentence, index):
    words = sentence.split()  # Split the sentence into words
    if 0 <= index < len(words):  
        words[index] = '[MASK]'  # Replace the word at the specified index
    return ' '.join(words)  

In [None]:
df = pd.read_parquet("data/0000.parquet")

print(df.head())

# Filter for rows where label == 1 and apply masking
df_filtered = df[df['label'] == 1].copy()  # Copy rows where label == 1
df_filtered['masked_sentence'] = df_filtered.apply(
    lambda row: mask_word(row['sentence'], row['v_index']),
    axis=1
)

df_filtered['data_set'] = "trofi"

trofi_df = df_filtered[['sentence', 'data_set', 'masked_sentence']]
trofi_df.head()

   index  label                                           sentence   pos  \
0    373      0  Care Enterprises Inc., a financially troubled ...  VERB   
1    374      1  The GAO was also to examine if the law has cau...  VERB   
2    375      1   " Without voodoo, we would drown in our misery.'  VERB   
3    376      0  Twenty- eight thousand buildings were leveled ...  VERB   
4    377      0  Twelve- year- old Peter Reaves of Pittsburgh t...  VERB   

   v_index  
0       33  
1        5  
2        5  
3       28  
4       13  


Unnamed: 0,sentence,data_set,masked_sentence
1,The GAO was also to examine if the law has cau...,trofi,The GAO was also to [MASK] if the law has caus...
2,""" Without voodoo, we would drown in our misery.'",trofi,""" Without voodoo, we would [MASK] in our misery.'"
5,"Mrs. Bush, who disdains what she calls fake pu...",trofi,"Mrs. Bush, who disdains what she calls fake pu..."
7,""" You lose elections if you touch these things.'",trofi,""" You lose elections if you [MASK] these things.'"
8,Industry watchers respect Genentech for pumpin...,trofi,Industry watchers respect Genentech for [MASK]...


In [None]:
df = pd.read_csv("data/MOH-X_formatted_svo.csv")
print(df.head())

# Filter for rows where label == 1 and apply masking
df_filtered = df[df['label'] == 1].copy()  # Copy rows where label == 1
df_filtered['masked_sentence'] = df_filtered.apply(
    lambda row: mask_word(row['sentence'], row['verb_idx']),
    axis=1
)

df_filtered['data_set'] = "moh_x"

moh_x_df = df_filtered[['sentence', 'data_set', 'masked_sentence']]
moh_x_df.head()

        arg1  arg2    verb                                           sentence  \
0  knowledge   NaN  absorb  He absorbed the knowledge or beliefs of his tr...   
1       cost   NaN  absorb           He absorbed the costs for the accident .   
2        tax   NaN  absorb  The sales tax is absorbed into the state incom...   
3  immigrant   NaN  absorb  The immigrants were quickly absorbed into soci...   
4   interest   NaN  absorb  Her interest in butterflies absorbs her comple...   

   verb_idx  label  
0         1      1  
1         1      1  
2         4      1  
3         4      1  
4         4      1  


Unnamed: 0,sentence,data_set,masked_sentence
0,He absorbed the knowledge or beliefs of his tr...,moh_x,He [MASK] the knowledge or beliefs of his tribe .
1,He absorbed the costs for the accident .,moh_x,He [MASK] the costs for the accident .
2,The sales tax is absorbed into the state incom...,moh_x,The sales tax is [MASK] into the state income ...
3,The immigrants were quickly absorbed into soci...,moh_x,The immigrants were quickly [MASK] into society .
4,Her interest in butterflies absorbs her comple...,moh_x,Her interest in butterflies [MASK] her complet...


In [None]:
all_files = ["data/test.tsv","data/train.tsv"]
df_list = [pd.read_csv(filename, sep='\t') for filename in all_files]

df = pd.concat(df_list, ignore_index=True)
print(df.head())

# Filter for rows where label == 1 and apply masking
df_filtered = df[(df['label'] == 1) & (df['POS'] == 'VERB')].copy()
df_filtered['masked_sentence'] = df_filtered.apply(
    lambda row: mask_word(row['sentence'], row['w_index']),
    axis=1
)

df_filtered['data_set'] = "vua"

vua_df = df_filtered[['sentence', 'data_set', 'masked_sentence']]
vua_df.head()

               index  label  \
0  a3m-fragment02 45      0   
1  a3m-fragment02 45      1   
2  a3m-fragment02 45      1   
3  a3m-fragment02 45      0   
4  a3m-fragment02 45      0   

                                            sentence    POS FGPOS  w_index  
0  Design : Crossed lines over the toytown tram :...   NOUN    NN        0  
1  Design : Crossed lines over the toytown tram :...   VERB   VBN        2  
2  Design : Crossed lines over the toytown tram :...   NOUN   NNS        3  
3  Design : Crossed lines over the toytown tram :...  PROPN   NNP        6  
4  Design : Crossed lines over the toytown tram :...  PROPN   NNP        7  


Unnamed: 0,sentence,data_set,masked_sentence
1,Design : Crossed lines over the toytown tram :...,vua,Design : [MASK] lines over the toytown tram : ...
35,They also exert a fascination very much of the...,vua,They also [MASK] a fascination very much of th...
38,They also exert a fascination very much of the...,vua,They also exert a fascination very much of the...
70,"Eventually they will be replaced , but more th...",vua,"Eventually they will be replaced , but more th..."
91,"Destined to run in Manchester , they will reli...",vua,"[MASK] to run in Manchester , they will reliev..."


In [6]:
combined_df = pd.concat([vua_df, trofi_df, moh_x_df], ignore_index=True)
#combined_df.to_csv("data/combined_dataframe.csv", index=False)


In [8]:
df = combined_df.copy()

In [None]:
def extract_masked_word(row):
    original_words = row['sentence'].split()
    masked_words = row['masked_sentence'].replace('[MASK]', '').split()
    # Find the word in `original_words` that's not in `masked_words`
    return next((word for word in original_words if word not in masked_words), None)

df['masked_word'] = df.apply(extract_masked_word, axis=1)

# Count unique masked words grouped by `data_set`
unique_masked_counts = df.groupby('data_set')['masked_word'].nunique()

print(df[['masked_word', 'data_set']])
print(unique_masked_counts)
unique_masked_word_counts = df.groupby(['data_set', 'masked_word']).size().reset_index(name='count')
print(unique_masked_word_counts)

      masked_word data_set
0         Crossed      vua
1           exert      vua
2         slither      vua
3             run      vua
4        Destined      vua
...           ...      ...
22807          of    moh_x
22808          in    moh_x
22809        None    moh_x
22810         the    moh_x
22811           a    moh_x

[22812 rows x 2 columns]
data_set
moh_x     217
trofi     233
vua      1873
Name: masked_word, dtype: int64
     data_set masked_word  count
0       moh_x        $100      1
1       moh_x      Africa      1
2       moh_x      Bosnia      1
3       moh_x           a      2
4       moh_x       about      1
...       ...         ...    ...
2318      vua       woven      2
2319      vua    wreathed      1
2320      vua     written      1
2321      vua       wrung      2
2322      vua    yielding      3

[2323 rows x 3 columns]


In [17]:
top_10_masked_words = unique_masked_word_counts.sort_values(
    ['data_set', 'count'], ascending=[True, False]
).groupby('data_set').head(10)
top_10_masked_words

Unnamed: 0,data_set,masked_word,count
196,moh_x,the,18
212,moh_x,with,11
118,moh_x,his,7
200,moh_x,to,6
116,moh_x,her,5
5,moh_x,absorbed,4
123,moh_x,in,4
129,moh_x,into,4
154,moh_x,on,4
57,moh_x,clawed,3


In [18]:
def extract_replaced_word(row):
    original_words = row['sentence'].split()
    masked_words = row['masked_sentence'].split()
    for orig, masked in zip(original_words, masked_words):
        if masked == '[MASK]':
            return orig
    return None

df['replaced_word'] = df.apply(extract_replaced_word, axis=1)

top_10_per_dataset = (
    df.groupby('data_set')['replaced_word']
    .value_counts()
    .groupby(level=0, group_keys=False)  
    .nlargest(10) 
    .reset_index(name='count')  
)

top_10_per_dataset['normalized_count'] = (
    top_10_per_dataset.groupby('data_set')['count']
    .transform(lambda x: x / x.sum())
)


In [19]:
top_10_per_dataset[top_10_per_dataset["data_set"] == "moh_x"].to_csv('top_10_in_dataset_moh_x.csv', index=False)

In [20]:
top_10_per_dataset[top_10_per_dataset["data_set"] == "vua"].reset_index().drop(columns='index').to_csv('top_10_in_dataset_vua.csv', index=False)

In [21]:
top_10_per_dataset[top_10_per_dataset["data_set"] == "trofi"].reset_index().drop(columns='index').to_csv('top_10_in_dataset_trofi.csv', index=False)