### Create evaluation sets for AtypicalAnimacy (with masks)

Obtain previous and next sentences from dataframes and mask sentences

In [2]:
import os
import pickle
import pandas as pd

def concat_pkl(directory_path):
    pkl_files = [f for f in os.listdir(directory_path) if f.endswith('.pkl')]
    
    df_list = []

    for file in pkl_files:
        file_path = os.path.join(directory_path, file)
        try:
            df = pd.read_pickle(file_path)
            df['source_file'] = file  # Optional: track origin
            df_list.append(df)
        except Exception as e:
            print(f"Error reading {file}: {e}")

    if df_list:
        combined_df = pd.concat(df_list, ignore_index=True)
        return combined_df
    else:
        print("No pkl files read successfully.")
        return pd.DataFrame()

path = '../data/dataframes'
all_sentences_df = concat_pkl(path)
print(all_sentences_df.columns.tolist())

['SentenceID', 'currentSentence', 'prevSentence', 'nextSentence', 'Abstract', 'source_file']


In [8]:
import re
import csv

def get_context_for_AA(filename):

    with open(f"../data/AA_evaluation_sentences/{filename}.csv","w") as outfile:
        
        writer = csv.writer(outfile)
        new_header = ['id','prevSentence','currentSentence','maskedSentence','nextSentence','AI phrase','suggested mask','AI entity',
                      'anthropomorphic component','targetExpression','animated','context3wmasked','context3w']
        writer.writerow(new_header)
        infile = open(f"../data/evaluation_sentences_csv/{filename}.csv","r")
        header = infile.readline()
        reader = csv.reader(infile)
        
        for row in reader:
            sentence_id = row[0] 
            sentence = row[1]
            sentence = sentence.strip()
            sentence = re.sub(r'\s+', ' ', sentence) # normalize whitespaces that may have occcurred in preprocessing
            sentence_id = '_'.join(sentence_id.split('_')[2:5]) # remove class and dataset prefix added during preprocessing 
            # retrieve previous and next sentences from dataframe all sentences dataframe
            sentence_info = all_sentences_df[all_sentences_df['SentenceID'] == sentence_id]
            if not sentence_info.empty:
                current_sentence = sentence_info.iloc[0]['currentSentence']
                prev_sentence = sentence_info.iloc[0]['prevSentence']
                next_sentence = sentence_info.iloc[0]['nextSentence']
            else:
                prev_sentence = 'NONE'
                next_sentence = 'NONE'
            # create masked sentence
            AI_phrase = row[2].strip()
            AI_phrase = re.sub(r'\s+', ' ', AI_phrase) # normalize whitespaces that may have occcurred in preprocessing
            mask = row[3].strip()
            mask = re.sub(r'\s+', ' ', mask) # normalize whitespaces that may have occcurred in preprocessing
            mask_position = [match.start() for match in re.finditer(rf'\b{re.escape(mask)}\b', sentence, flags=re.IGNORECASE)]
            if len(mask_position) == 1:
                position = mask_position[0]
                masked_sentence = create_masked_sentence(sentence,position,mask)
            elif len(mask_position) > 1:
                AI_phrase_position = [match.start() for match in re.finditer(rf'\b{re.escape(AI_phrase)}\b', sentence,flags=re.IGNORECASE)]
                mask_in_phrase_position = [match.start() for match in re.finditer(rf'\b{re.escape(mask)}\b', AI_phrase)]
                if len(AI_phrase_position) == 1 and len(mask_in_phrase_position) == 1:
                    position = mask_in_phrase_position[0] + AI_phrase_position[0]
                    masked_sentence = create_masked_sentence(sentence,position,mask)
                else:
                    for i,position in enumerate(mask_position):
                        new_sentence_id = sentence_id + '_' + str(i)
                        masked_sentence = create_masked_sentence(sentence,position,mask)
            else: # mask not found in sentence at all - should not happen
                mask_position = [match.start() for match in re.finditer(re.escape(mask), sentence, flags=re.IGNORECASE)]
                print(filename)
                print(f"error in sentence {sentence_id}: the mask was not found in the sentence")
                print(AI_phrase,mask)
                print(sentence)
                print()
            # create context3w and context3wmasked

def create_masked_sentence(sentence,position,mask):

    masked_sentence = []
    len_mask = len(mask)
    masked_sentence.append(sentence[:position])
    masked_sentence.append("<mask>")
    position_after_mask = position+len_mask
    masked_sentence.append(sentence[position_after_mask:])
    masked_sentence = ''.join(masked_sentence)

    return masked_sentence

files = ["adjective_phrases_inconclusive",
         "adjective_phrases_negative",
         "adjective_phrases_positive",
         "comparisons_inconclusive",
         "noun_phrases_positive",
         "possessives_positive",
         "verb_objects_inconclusive",
         "verb_objects_negative",
         "verb_objects_positive",
         "verb_subjects_inconclusive",
         "verb_subjects_negative",
         "verb_subjects_positive"
        ]

for file in files:
    get_context_for_AA(file)

adjective_phrases_inconclusive
error in sentence 7_46407_0: the mask was not found in the sentence
Language models (LMs) Language models (LMs)
Language models (LMs) are vulnerable to exploitation for adversarial misuse.

adjective_phrases_positive
error in sentence 2311.04177_1947144_0: the mask was not found in the sentence
Large Language Models (LLMs) Large Language Models (LLMs)
Large Language Models (LLMs) are smart but forgetful.

adjective_phrases_positive
error in sentence 2308.03688_1891580_0: the mask was not found in the sentence
Large Language Models (LLMs) Large Language Models (LLMs)
Large Language Models (LLMs) are becoming increasingly smart and autonomous, targeting real-world pragmatic missions beyond traditional NLP tasks.

adjective_phrases_positive
error in sentence 1812.08960_1066534_3: the mask was not found in the sentence
A smart autonomous system (SAS) system (SAS)
A smart autonomous system (SAS) combines analytics and autonomy to understand, learn, decide and 