In [2]:
import pandas as pd

In [3]:
# Define a function to mask the word at the given index
def mask_word(sentence: str, index: int, mask_token: str = '[MASK]') -> str:
    """Replace the whitespace-delimited word at ``index`` with ``mask_token``.

    The sentence is tokenised with ``str.split()`` (any run of whitespace acts
    as a separator) and re-joined with single spaces, so the returned string
    has normalised whitespace even when nothing is masked.

    Parameters
    ----------
    sentence : str
        Text whose word should be masked.
    index : int
        Zero-based position of the word to replace. A negative value, or one
        at or beyond the number of words, leaves every word untouched.
    mask_token : str, optional
        Replacement text for the selected word. Defaults to ``'[MASK]'``.

    Returns
    -------
    str
        The words joined by single spaces, with the word at ``index`` replaced
        by ``mask_token`` when ``index`` is in range.
    """
    words = sentence.split()
    # Explicit lower bound: Python would otherwise accept negative indices and
    # mask a word counted from the end of the sentence.
    if 0 <= index < len(words):
        words[index] = mask_token
    return ' '.join(words)

In [4]:
# Read the parquet file and preview its first rows.
df = pd.read_parquet("data/0000.parquet")
print(df.head())

# Work on an independent copy of the rows whose label equals 1, so the
# loaded frame itself is never modified.
df_filtered = df.loc[df['label'].eq(1)].copy()


def _mask_trofi_row(row):
    """Mask the word at position ``v_index`` in this row's sentence."""
    return mask_word(row['sentence'], row['v_index'])


# Row-wise masking first, then tag every row with its source data set
# (this order fixes the column order of df_filtered).
df_filtered['masked_sentence'] = df_filtered.apply(_mask_trofi_row, axis=1)
df_filtered['data_set'] = "trofi"

# Keep only the three columns that are concatenated and saved later on.
trofi_df = df_filtered.loc[:, ['sentence', 'data_set', 'masked_sentence']]
trofi_df.head()

   index  label                                           sentence   pos  \
0    373      0  Care Enterprises Inc., a financially troubled ...  VERB   
1    374      1  The GAO was also to examine if the law has cau...  VERB   
2    375      1   " Without voodoo, we would drown in our misery.'  VERB   
3    376      0  Twenty- eight thousand buildings were leveled ...  VERB   
4    377      0  Twelve- year- old Peter Reaves of Pittsburgh t...  VERB   

   v_index  
0       33  
1        5  
2        5  
3       28  
4       13  


Unnamed: 0,sentence,data_set,masked_sentence
1,The GAO was also to examine if the law has cau...,trofi,The GAO was also to [MASK] if the law has caus...
2,""" Without voodoo, we would drown in our misery.'",trofi,""" Without voodoo, we would [MASK] in our misery.'"
5,"Mrs. Bush, who disdains what she calls fake pu...",trofi,"Mrs. Bush, who disdains what she calls fake pu..."
7,""" You lose elections if you touch these things.'",trofi,""" You lose elections if you [MASK] these things.'"
8,Industry watchers respect Genentech for pumpin...,trofi,Industry watchers respect Genentech for [MASK]...


In [5]:
# Read the CSV file and preview its first rows.
df = pd.read_csv("data/MOH-X_formatted_svo.csv")
print(df.head())

# Work on an independent copy of the rows whose label equals 1, so the
# loaded frame itself is never modified.
df_filtered = df.loc[df['label'].eq(1)].copy()


def _mask_moh_x_row(row):
    """Mask the word at position ``verb_idx`` in this row's sentence."""
    return mask_word(row['sentence'], row['verb_idx'])


# Row-wise masking first, then tag every row with its source data set
# (this order fixes the column order of df_filtered).
df_filtered['masked_sentence'] = df_filtered.apply(_mask_moh_x_row, axis=1)
df_filtered['data_set'] = "moh_x"

# Keep only the three columns that are concatenated and saved later on.
moh_x_df = df_filtered.loc[:, ['sentence', 'data_set', 'masked_sentence']]
moh_x_df.head()

        arg1  arg2    verb                                           sentence  \
0  knowledge   NaN  absorb  He absorbed the knowledge or beliefs of his tr...   
1       cost   NaN  absorb           He absorbed the costs for the accident .   
2        tax   NaN  absorb  The sales tax is absorbed into the state incom...   
3  immigrant   NaN  absorb  The immigrants were quickly absorbed into soci...   
4   interest   NaN  absorb  Her interest in butterflies absorbs her comple...   

   verb_idx  label  
0         1      1  
1         1      1  
2         4      1  
3         4      1  
4         4      1  


Unnamed: 0,sentence,data_set,masked_sentence
0,He absorbed the knowledge or beliefs of his tr...,moh_x,He [MASK] the knowledge or beliefs of his tribe .
1,He absorbed the costs for the accident .,moh_x,He [MASK] the costs for the accident .
2,The sales tax is absorbed into the state incom...,moh_x,The sales tax is [MASK] into the state income ...
3,The immigrants were quickly absorbed into soci...,moh_x,The immigrants were quickly [MASK] into society .
4,Her interest in butterflies absorbs her comple...,moh_x,Her interest in butterflies [MASK] her complet...


In [6]:
all_files = ["data/test.tsv","data/train.tsv"]

# Load every tab-separated file into its own DataFrame, in list order.
df_list = []
for tsv_path in all_files:
    df_list.append(pd.read_csv(tsv_path, sep='\t'))

# Stack the per-file frames into one frame with a fresh 0..n-1 index.
df = pd.concat(df_list, ignore_index=True)
print(df.head())

# Work on an independent copy of the rows that have label 1 AND POS 'VERB'.
df_filtered = df.loc[df['label'].eq(1) & df['POS'].eq('VERB')].copy()


def _mask_vua_row(row):
    """Mask the word at position ``w_index`` in this row's sentence."""
    return mask_word(row['sentence'], row['w_index'])


# Row-wise masking first, then tag every row with its source data set
# (this order fixes the column order of df_filtered).
df_filtered['masked_sentence'] = df_filtered.apply(_mask_vua_row, axis=1)
df_filtered['data_set'] = "vua"

# Keep only the three columns that are concatenated and saved later on.
vua_df = df_filtered.loc[:, ['sentence', 'data_set', 'masked_sentence']]
vua_df.head()

               index  label  \
0  a3m-fragment02 45      0   
1  a3m-fragment02 45      1   
2  a3m-fragment02 45      1   
3  a3m-fragment02 45      0   
4  a3m-fragment02 45      0   

                                            sentence    POS FGPOS  w_index  
0  Design : Crossed lines over the toytown tram :...   NOUN    NN        0  
1  Design : Crossed lines over the toytown tram :...   VERB   VBN        2  
2  Design : Crossed lines over the toytown tram :...   NOUN   NNS        3  
3  Design : Crossed lines over the toytown tram :...  PROPN   NNP        6  
4  Design : Crossed lines over the toytown tram :...  PROPN   NNP        7  


Unnamed: 0,sentence,data_set,masked_sentence
1,Design : Crossed lines over the toytown tram :...,vua,Design : [MASK] lines over the toytown tram : ...
35,They also exert a fascination very much of the...,vua,They also [MASK] a fascination very much of th...
38,They also exert a fascination very much of the...,vua,They also exert a fascination very much of the...
70,"Eventually they will be replaced , but more th...",vua,"Eventually they will be replaced , but more th..."
91,"Destined to run in Manchester , they will reli...",vua,"[MASK] to run in Manchester , they will reliev..."


In [7]:
# Stack the three per-data-set frames (vua, trofi, moh_x, in that order) under
# a fresh 0..n-1 index, then write the result to CSV without the index column.
combined_df = pd.concat(
    objs=[vua_df, trofi_df, moh_x_df],
    ignore_index=True,
)
combined_df.to_csv("data/combined_dataframe.csv", index=False)
