# Obtain masked sentences and prev+next context

This notebook contains code that obtains masked sentences, either using anthroscore masks for experiment 1 (for AtypicalAnimacy evaluation, since the anthroscore masks are generated during the anthroscore evaluation), or by creating masked sentences based on our own masking strategy for experiment 2. Chapter 3 of the thesis describes and justifies the strategy according to which elements were kept as part of the context, and which elements were masked alongside the AI entity. 

The AtypicalAnimacy evaluation relies, in addition to the sentence itself, on a masked version of the sentence plus its surrounding context, i.e. the previous and next sentences. These are also obtained in this notebook.

In [1]:
# imports

import os
import pickle
import csv
import pandas as pd

#### Get a single dataframe containing each sentence, its unique ID, and its previous and next sentences (an empty string where one does not exist).

In [2]:
def concat_pkl(directory_path):
    """
    this function takes a path to a directory containing .pkl files of dataframes,
    and returns a single concatenated dataframe.
    The dataframes contain sentences and their unique ID, 
    as well as the previous and next sentences in the abstract from which the sentence is taken.

    Files that cannot be read are reported on stdout and skipped. If no file
    could be read at all, an empty dataframe is returned.

    :param directory_path: path to a directory containing .pkl files
    :type directory_path: string
    :return: pd.DataFrame() with a fresh 0..n-1 index
    """ 
    # os.listdir() returns entries in arbitrary order; sorting makes the row order
    # of the combined dataframe reproducible across machines and runs.
    pkl_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.pkl'))

    df_list = []

    for file in pkl_files:
        file_path = os.path.join(directory_path, file)
        try:
            # NOTE: unpickling can execute arbitrary code - only load trusted files.
            df_list.append(pd.read_pickle(file_path))
        except Exception as e:
            # best effort: report the unreadable file and continue with the rest
            print(f"Error reading {file}: {e}")

    if not df_list:
        print("No pkl files read successfully.")
        return pd.DataFrame()

    return pd.concat(df_list, ignore_index=True)

# Directory (relative to this notebook) that holds the .pkl dataframes.
path = '../data/dataframes'
# Combined dataframe of all sentences; create_evaluation_set() reads it as a global
# to look up the previous/next sentence of each evaluation sentence.
all_sentences_df = concat_pkl(path)
# Sanity check: show which columns the combined dataframe provides.
print(all_sentences_df.columns.tolist())

['SentenceID', 'currentSentence', 'prevSentence', 'nextSentence', 'Abstract']


#### Functions for obtaining masked sentences for both experiments.

In [3]:
import re

def normalized(string):
    """
    Trim leading/trailing whitespace and collapse every internal run of
    whitespace (spaces, tabs, newlines) into a single space.

    :param string: text to normalize
    :type string: string
    :return: normalized text (string)
    """
    trimmed = string.strip()
    collapsed = re.sub(r'\s+', ' ', trimmed)
    return collapsed

def convert_annotation(score):
    """
    Convert an annotation label to its numerical class, returned as a string:
    negative - '0', positive - '1', inconclusive - '2'.

    Labels that are not recognised are reported on stdout and returned unchanged.

    :param score: annotation label (e.g. 'p1', 'n2', 'inc')
    :type score: string
    :return: '0', '1' or '2' for known labels, otherwise the input itself
    """
    positive_labels = ('p', 'p1', 'p2', 'p3')
    # NOTE(review): a bare 'n' is not accepted although a bare 'p' is - confirm this is intended.
    negative_labels = ('n1', 'n2', 'n3')

    if score in positive_labels:
        return '1'
    if score in negative_labels:
        return '0'
    if score == 'inc':
        return '2'

    print("score is malformed")
    return score

def mask_sentence(sentence,position,masked_str,mask_token):
    """
    Replace the span of the sentence that starts at a given character index
    and is as long as the phrase to be masked with the mask token.

    Only the length of masked_str is used; the text found at that position
    is not compared against it.

    :param sentence: sentence from the evaluation set
    :type sentence: string
    :param position: character index in the sentence where the masked span starts
    :type position: integer
    :param masked_str: phrase in the sentence that should be masked
    :type masked_str: string
    :param mask_token: token that is inserted in place of the phrase (e.g. '<mask>' or '[MASK]')
    :type mask_token: string
    :return: masked sentence (string)
    """
    span_end = position + len(masked_str)
    before_mask = sentence[:position]
    after_mask = sentence[span_end:]
    return before_mask + mask_token + after_mask

def punctuation_handling(masked_sent,mask_token):
    """
    Ad-hoc helper that separates punctuation and possessive markers from the mask token
    with a single space, to adhere to the AtypicalAnimacy masking approach.
    It only covers the cases that were found in the data, hence ad-hoc.
    It is applied when generating masked sentences according to our masking strategy
    for AtypicalAnimacy in experiment 2, as well as when preparing the AnthroScore
    masked sentences for the AtypicalAnimacy evaluation in experiment 1.

    The patterns are tried in a fixed order and only the FIRST pattern that occurs in
    the sentence is rewritten (all of its occurrences); later patterns are not applied.

    :param masked_sent: sentence that already contains the mask token
    :type masked_sent: string
    :param mask_token: the mask token used in the sentence (e.g. '[MASK]')
    :type mask_token: string
    :return: sentence with the first matching pattern separated from the mask token (string)
    """
    # (attached form, separated form) - the order of the rules is significant
    rewrite_rules = (
        (mask_token+'.', mask_token+' .'),
        ('-'+mask_token, '- '+mask_token),
        ('('+mask_token+')', '( '+mask_token+' )'),
        (mask_token+',', mask_token+' ,'),
        (mask_token+':', mask_token+' :'),
        (' '+mask_token+')', ' '+mask_token+' )'),
        # a bare apostrophe after the mask is rewritten as a full possessive marker
        (mask_token+"’ ", mask_token+" ’s "),
        (mask_token+"’s", mask_token+" ’s"),
        (mask_token+"' ", mask_token+" 's "),
        (mask_token+"'s", mask_token+" 's"),
    )

    for attached, separated in rewrite_rules:
        if attached in masked_sent:
            return masked_sent.replace(attached, separated)

    return masked_sent

def get_masked_sentence(model,sentence,AI_phrase,mask):
    """
    this function takes a sentence and string to be masked, and returns a list of masked versions 

    Whole-word, case-insensitive occurrences of the mask are searched in the sentence:
    - exactly one occurrence: that occurrence is masked;
    - several occurrences: the AI phrase is used to identify the right one; if that is
      not possible, one masked sentence is produced per occurrence (each with a single mask);
    - no whole-word occurrence: every literal occurrence of the mask is replaced.

    For AtypicalAnimacy, punctuation and possessive markers are additionally separated
    from the mask token; the anthroscore model expects them to stay attached.

    :param model: specifies which format to use for the mask token: 
    'anthroscore' ('<mask>') or 'AtypicalAnimacy' ('[MASK]')
    :type model: string
    :param sentence: sentence from the evaluation set
    :type sentence: string
    :param AI_phrase: entire AI phrase (including contextual components that should not be masked - 
    used for identification for when there are multiple occurrences of the mask in the sentence)
    :type AI_phrase: string
    :param mask: phrase in the sentence that should be masked
    :type mask: string
    :return: list of tuples containing the sentence and the masked sentence
    :raises ValueError: if model is neither 'anthroscore' nor 'AtypicalAnimacy'
    """     
    if model == 'anthroscore':
        mask_token = '<mask>'
    elif model == 'AtypicalAnimacy':
        mask_token = '[MASK]'
    else:
        # previously an unknown model surfaced later as an UnboundLocalError on mask_token
        raise ValueError(f"unknown model {model!r}: expected 'anthroscore' or 'AtypicalAnimacy'")

    def _finalize(masked_sentence):
        # apply the model-specific post-processing and pair the result with the original sentence
        if model == 'AtypicalAnimacy':
            masked_sentence = punctuation_handling(masked_sentence,mask_token)
        return (sentence,masked_sentence)

    mask_pattern = rf'\b{re.escape(mask)}\b'
    mask_position = [match.start() for match in re.finditer(mask_pattern, sentence, flags=re.IGNORECASE)]

    if len(mask_position) == 1: # simple case, only one occurrence of mask
        return [_finalize(mask_sentence(sentence,mask_position[0],mask,mask_token))]

    if len(mask_position) > 1: # more than one occurrence of the mask
        AI_phrase_position = [match.start() for match in re.finditer(rf'\b{re.escape(AI_phrase)}\b', sentence,flags=re.IGNORECASE)]
        # NOTE(review): unlike the two searches in the sentence, this search is case-sensitive - confirm intended
        mask_in_phrase_position = [match.start() for match in re.finditer(mask_pattern, AI_phrase)]
        if len(AI_phrase_position) == 1 and len(mask_in_phrase_position) == 1: # found mask by comparing position in AI phrase
            position = mask_in_phrase_position[0] + AI_phrase_position[0]
            return [_finalize(mask_sentence(sentence,position,mask,mask_token))]
        # cannot identify - to be safe, return one masked sentence per occurrence
        return [_finalize(mask_sentence(sentence,position,mask,mask_token)) for position in mask_position]

    # brute-force - do not replicate !!! this was done after manual revision and confirmation
    # (reached when the mask has no whole-word match, so every literal occurrence is replaced)
    return [_finalize(sentence.replace(mask, mask_token))]

def get_anthroscore_masks(filename):
    """
    this function returns the masked sentences from the AnthroScore prediction files, 
    to be used in experiment 1 of AtypicalAnimacy evaluation.

    The file is read from ../experiment_1/anthroscore/predictions/csv/<filename>.csv.
    The first row is treated as a header; in every other row the first three columns
    are read as sentence ID, sentence and masked sentence (whitespace-normalized).

    :param filename: name of the .csv file, without the extension
    :type filename: string
    :return: dictionary with unique ID as key, list of tuples (sentence-masked_sentence) as values.

    Note: only the first row found for an ID is kept, so each list holds exactly one tuple.
    """ 
    anthroscore_masks = {}

    # newline='' is what the csv module expects; it lets the reader deal with
    # line endings (including those inside quoted fields) itself
    with open(f"../experiment_1/anthroscore/predictions/csv/{filename}.csv","r",newline='') as infile:
        reader = csv.reader(infile)
        next(reader, None) # skip the header row

        for row in reader:
            if not row: # blank line in the file
                continue
            sentence_id = normalized(row[0])
            sentence = normalized(row[1])
            masked_sentence = normalized(row[2])
            if sentence_id not in anthroscore_masks:
                anthroscore_masks[sentence_id] = [(sentence,masked_sentence)]

    return anthroscore_masks

#### Deprecated: context3w(-masked) functions

The functions below were created to reconstruct all features existing in the AtypicalAnimacy dataset, including the 3-word context and masked 3-word context. However, these features are not used in the AtypicalAnimacy masking approach, which only uses a masked sentence + previous and next sentences context. The function get_context3w also assumes (and handles) that punctuation and possessive markers are attached to the mask token, which is not the case in the masking strategy taken in experiment 2.
To use this function when punctuation is separated from the mask token by spaces, simply remove the ValueError handling: that error only occurs when punctuation or possessive markers are attached to the mask token.

In [4]:
def get_context3w(masked_sentence,masked_str):
    """
    Build the 3-word context window around the '<mask>' token of a masked sentence,
    once with the original phrase restored and once with the mask kept in place.

    The sentence is split on single spaces. If no token is exactly '<mask>', the first
    token that contains '<mask>' is used instead (mask with attached punctuation or
    possessive marker), and the attached characters are carried over to the restored phrase.

    :param masked_sentence: sentence that has been masked
    :type masked_sentence: string
    :param masked_str: original phrase in the sentence that was masked
    :type masked_str: string
    :return: 3-word context and masked 3-word context (tuple of strings)
    """
    tokens = masked_sentence.split(' ')

    if "<mask>" in tokens:
        mask_str = '<mask>'
        mask_index = tokens.index("<mask>")
    else:
        # the mask is followed by punctuation or a possessive marker;
        # assumes there is exactly one such token
        mask_str = [token for token in tokens if '<mask>' in token][0]
        mask_index = tokens.index(mask_str)
        # restore punctuation and possessive marker to the masked string
        masked_str = mask_str.replace('<mask>',masked_str)

    # up to three tokens on each side; slicing clamps at the sentence boundaries
    window_start = max(0, mask_index - 3)
    left_context = ' '.join(tokens[window_start:mask_index])
    right_context = ' '.join(tokens[mask_index+1:mask_index+4])

    context_3w = ' '.join((left_context, masked_str, right_context)) # original text in the middle
    context_3w_masked = ' '.join((left_context, mask_str, right_context)) # <mask> (with or without punct) in the middle

    return context_3w,context_3w_masked

def get_masked_sentence_and_context(sentence,AI_phrase,mask):
    """
    this function takes a sentence and string to be masked, and returns a list of masked versions
    as well as 3-word context and 3-word masked context  

    The mask token is always '<mask>', which is the token get_context3w looks for.

    :param sentence: sentence from the evaluation set
    :type sentence: string
    :param AI_phrase: entire AI phrase (including contextual components that should not be masked - 
    used for identification for when there are multiple occurrences of the mask in the sentence)
    :type AI_phrase: string
    :param mask: phrase in the sentence that should be masked
    :type mask: string
    :return: list of tuples containing the sentence, the masked sentence, the 3-word context and the masked 3-word context
    """ 
    # mask_sentence requires the mask token as its fourth argument; it was previously
    # omitted here, which raised a TypeError on every call
    mask_token = "<mask>"

    def _with_context(masked_sentence):
        # attach both 3-word contexts to a masked sentence
        context_3w, context_3w_masked = get_context3w(masked_sentence,mask)
        return (sentence,masked_sentence,context_3w,context_3w_masked)

    mask_pattern = rf'\b{re.escape(mask)}\b'
    mask_position = [match.start() for match in re.finditer(mask_pattern, sentence, flags=re.IGNORECASE)]

    if len(mask_position) == 1: # simple case, only one occurrence of mask
        return [_with_context(mask_sentence(sentence,mask_position[0],mask,mask_token))]

    if len(mask_position) > 1: # more than one occurrence of the mask
        AI_phrase_position = [match.start() for match in re.finditer(rf'\b{re.escape(AI_phrase)}\b', sentence,flags=re.IGNORECASE)]
        mask_in_phrase_position = [match.start() for match in re.finditer(mask_pattern, AI_phrase)]
        if len(AI_phrase_position) == 1 and len(mask_in_phrase_position) == 1: # found mask by comparing position in AI phrase
            position = mask_in_phrase_position[0] + AI_phrase_position[0]
            return [_with_context(mask_sentence(sentence,position,mask,mask_token))]
        # cannot identify - to be safe, return one masked sentence per occurrence
        return [_with_context(mask_sentence(sentence,position,mask,mask_token)) for position in mask_position]

    # brute-force - do not replicate !!! this was done after manual revision and confirmation
    return [_with_context(sentence.replace(mask, mask_token))]

#### Create evaluation sets for AtypicalAnimacy using specified masking strategy

In [4]:
def create_evaluation_set(filename,experiment,model,include_3w_context=False):
    """
    this function takes a .csv file and writes a new .csv file with expanded information used by the AtypicalAnimacy model for evaluation: 
    in addition to the original information, it adds the previous and next sentence (if applicable), the masked sentence, 
    and optionally the 3-word context of the masked phrase as well as the 3-word context of the mask.

    Input:  ../data/evaluation_sentences_csv/<filename>.csv
    Output: ../<experiment>/<model>/expectations/csv/<filename>.csv
    The previous and next sentences are looked up in the global dataframe all_sentences_df.

    :param filename: name of the file to be processed, without the .csv extension
    :type filename: string
    :param experiment: obtains masked sentences according to the experiment. 
                        experiment_1 uses AnthroScore masking strategy and experiment_2 uses our own masking strategy.
    :type experiment: string
    :param model: name of the model; used for the output directory and, in experiment_2, 
                  to choose the mask token format
    :type model: string
    :param include_3w_context: True if (masked) 3-word contexts should be included in the generated sets. False by default.
                  NOTE: the two extra header columns are always added when True, but the 
                  rows only carry the extra values in experiment_2.
    :type include_3w_context: bool
    """ 
    missing_ids = []

    # read the AnthroScore masks before the output file is created, so that a missing
    # prediction file does not leave an empty output file behind
    if experiment == 'experiment_1':
        anthroscore_masks = get_anthroscore_masks(filename) # used in experiment_1, i.e. AnthroScore masking
    else:
        anthroscore_masks = {}

    new_header = ['id','Previous Sentence','Current Sentence','Masked Sentence','Next Sentence','AI Phrase','Suggested Mask','AI Entity',
                  'Anthropomorphic Component','Target Expression','Animated']
    if include_3w_context:
        new_header.extend(['context3w','context3wmasked'])

    # both files are closed by the with-statement (the input file used to stay open);
    # newline='' is what the csv module expects for the files it reads and writes
    with open(f"../data/evaluation_sentences_csv/{filename}.csv","r",newline='') as infile, \
         open(f"../{experiment}/{model}/expectations/csv/{filename}.csv","w",newline='') as outfile:

        reader = csv.reader(infile)
        next(reader, None) # skip the header row of the input file

        writer = csv.writer(outfile)
        writer.writerow(new_header)

        for row in reader:

            sentence_id = normalized(row[0])
            sentence = normalized(row[1])
            orig_sentence_id = '_'.join(sentence_id.split('_')[2:5]) # remove class and dataset prefix added during preprocessing

            # retrieve previous and next sentences from the all sentences dataframe
            sentence_info = all_sentences_df[all_sentences_df['SentenceID'] == orig_sentence_id]
            if not sentence_info.empty:
                prev_sent = normalized(sentence_info.iloc[0]['prevSentence'])
                next_sent = normalized(sentence_info.iloc[0]['nextSentence'])
            else: # this is only to capture errors - this does not happen
                print(f"error: the sentence with the id {sentence_id} was not found in the dataframe")
                prev_sent = ''
                next_sent = ''

            # annotation columns of the input row
            AI_phrase = normalized(row[2])
            mask = normalized(row[3])
            AI_entity = normalized(row[4])
            anthro_component = normalized(row[5])
            score = convert_annotation(normalized(row[6])) # convert p,n,inc scores to 1,0,2 scores

            if experiment == 'experiment_1': # AnthroScore masking strategy

                try:
                    masked_sentences = anthroscore_masks[sentence_id] # use sentence ID to retrieve masks
                except KeyError:
                    missing_ids.append(sentence_id)
                    continue

                for m in masked_sentences:
                    # make sure the original sentence matches
                    if sentence != m[0]:
                        print("found mismatching sentence!")
                    if len(masked_sentences) > 1:
                        print("something's strange...found more than one mask in anthroscore masks...")
                    else:
                        masked_sentence = m[1].replace('<mask>','[MASK]')
                        masked_sentence = punctuation_handling(masked_sentence,'[MASK]') # necessary for AtypicalAnimacy code to run
                        # the mask is written twice: as 'Suggested Mask' and as 'Target Expression'
                        writer.writerow([sentence_id,prev_sent,m[0],masked_sentence,next_sent,AI_phrase,mask,
                                         AI_entity,anthro_component,mask,score])

            elif experiment == 'experiment_2': # Our own masking strategy
                if include_3w_context: # only in case we want to include (masked) 3-word contexts 
                    masked_sentences = get_masked_sentence_and_context(sentence,AI_phrase,mask) 
                    for m in masked_sentences:
                        writer.writerow([sentence_id,prev_sent,m[0],m[1],next_sent,AI_phrase,mask,
                                         AI_entity,anthro_component,mask,score,m[2],m[3]])
                else: 
                    masked_sentences = get_masked_sentence(model,sentence,AI_phrase,mask) 
                    for m in masked_sentences:
                        writer.writerow([sentence_id,prev_sent,m[0],m[1],next_sent,AI_phrase,mask,AI_entity,anthro_component,mask,score])

    if missing_ids:
        print("IDs of missing sentences: ",missing_ids)


In [7]:
# Names (without extension) of the evaluation files.
# NOTE: only files_retest is processed by the loop below.
files = ["adjective_phrases_inconclusive",
         "adjective_phrases_negative",
         "adjective_phrases_positive",
         "comparisons_inconclusive",
         "noun_phrases_positive",
         "possessives_positive",
         "verb_objects_inconclusive",
         "verb_objects_negative",
         "verb_objects_positive",
         "verb_subjects_inconclusive",
         "verb_subjects_negative",
         "verb_subjects_positive"
        ]

files_retest = ["noun_phrases_positive_retest"]

for file in files_retest:
    # the set is built for the AtypicalAnimacy model (the message used to say anthroscore)
    print(f"Creating AtypicalAnimacy evaluation set for {file}...")
    create_evaluation_set(file,'experiment_2','AtypicalAnimacy')

Creating anthroscore evaluation set for noun_phrases_positive_retest...
