# Various utilities for processing and working with anthroscore files

### Obtain entities for anthroscore evaluation

The entities used for the basic anthroscore evaluation (experiment 1) are taken from the mask and then manually revised. This ensures that the plurals of LLM and LM (LLMs, LMs) are included, as the anthroscore tokenization does not recognize these as plural inflections of the singular forms.

In [None]:
import csv
import re

def normalized(string):
    """Trim *string* and squeeze every run of whitespace into one space."""
    trimmed = string.strip()
    return re.sub(pattern=r'\s+', repl=' ', string=trimmed)
    
def create_anthroscore_entity_files(filename):
    """
    Write the unique AI entities of one expectations file to an entity list.

    Reads ``../experiment_1/anthroscore/expectations/{filename}.csv``, takes
    the fourth column of every data row (the suggested AI mask), normalizes
    its whitespace and writes each distinct value on its own line to
    ``../experiment_1/anthroscore/entities/{filename}_entities.txt``.
    Entities are written in order of first appearance, so repeated runs on
    the same input produce the same file.

    :param filename: name of the file to be processed, without extension
    :type filename: string
    :raises IndexError: if a non-empty row has fewer than four columns
    """
    source_path = f"../experiment_1/anthroscore/expectations/{filename}.csv"
    target_path = f"../experiment_1/anthroscore/entities/{filename}_entities.txt"

    # Read the input completely before touching the output, so that a missing
    # or malformed input does not leave an empty entity file behind.
    with open(source_path, "r", newline="") as infile:
        reader = csv.reader(infile)
        next(reader, None)  # skip the header row
        # row[3] holds the suggested AI masks; blank lines are skipped
        entities = [normalized(row[3]) for row in reader if row]

    # newline="" lets the csv module control line endings itself
    with open(target_path, "w", newline="") as outfile:
        writer = csv.writer(outfile)
        # dict.fromkeys de-duplicates while keeping a stable order
        for entity in dict.fromkeys(entities):
            writer.writerow([entity])

# Base names (without extension) of the expectation files for which an
# entity list is generated.
files = [
    "adjective_phrases_inconclusive",
    "adjective_phrases_negative",
    "adjective_phrases_positive",
    "comparisons_inconclusive",
    "noun_phrases_positive",
    "possessives_positive",
    "verb_objects_inconclusive",
    "verb_objects_negative",
    "verb_objects_positive",
    "verb_subjects_inconclusive",
    "verb_subjects_negative",
    "verb_subjects_positive",
]

# Produce one entity file per expectation file, reporting progress as we go.
for file in files:
    print(f"Creating Anthroscore entity lists for {file}...")
    create_anthroscore_entity_files(file)

### Create prediction results from the anthroscore output such that all desired information is retained

Anthroscore writes its results into two types of files: sentence_scores, which contain the individual scores per entity, and average scores, which contain the final anthroscore for the sentence averaged over the individual entities. We are not interested in the latter, but the former contains only limited information. The script below compiles files containing more information: the masked sentence, the AI phrase, mask and entity, as well as the suspected anthropomorphic component (or non-anthropomorphic verb/adjective).

In [1]:
import csv
import re
import pandas as pd


def normalized(string):
    """Return *string* stripped, with whitespace runs replaced by one space."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', string.strip())

def convert_annotation(score):
    """
     Map an annotation label to its numeric code, returned as a string:
     negative - '0', positive - '1', inconclusive - '2'.

     An unrecognized label is reported on stdout and returned unchanged.
    """
    label_groups = (
        (('p', 'p1', 'p2', 'p3'), '1'),
        (('n1', 'n2', 'n3'), '0'),
        (('inc',), '2'),
    )

    for labels, code in label_groups:
        if score in labels:
            return code

    # No group matched: warn and hand the input back as it came in
    print("score is malformed")
    return score

def get_scores_dict(filename):
    """
    Load an expectations CSV into a dictionary keyed by sentence id.

    Reads ``../experiment_2/anthroscore/expectations/csv/{filename}.csv``,
    skips the header row and maps the first column of every data row to the
    list of its remaining columns. Blank lines are ignored.

    :param filename: name of the file to be processed, without extension
    :type filename: string
    :return: mapping of sentence id to the remaining column values of its row
    :rtype: dict
    """
    filedict = {}
    source_path = f"../experiment_2/anthroscore/expectations/csv/{filename}.csv"

    # newline="" is the csv module's recommended way to open files
    with open(source_path, "r", newline="") as csv_file:
        reader = csv.reader(csv_file)
        next(reader, None)  # skip the header row
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to index
                continue
            sentence_id = row[0]
            sentence_info = row[1:]
            # ids are expected to be unique; a repeated id overwrites the
            # earlier row
            filedict[sentence_id] = sentence_info

    return filedict

def concat_info(filename, filedict):
    """
    Merge anthroscore sentence scores with the expectation information.

    Reads
    ``../experiment_2/anthroscore/predictions/anthroscore_output/sentence_scores/{filename}.csv``
    and writes ``../experiment_2/anthroscore/predictions/csv/{filename}.csv``.
    Each output row holds the sentence id, sentence and masked sentence from
    the score file, then the last five values stored in ``filedict`` for that
    id (the final one being the expectation), and finally the anthroscore.
    All values are whitespace-normalized.

    NOTE(review): the original cell kept a commented-out variant with extra
    ``original_term``/``original_noun`` columns and a ``convert_annotation``
    call on the expectation; it is not active and not reproduced here.

    :param filename: name of the file to be processed, without extension
    :type filename: string
    :param filedict: mapping of sentence id to its expectation columns, as
        returned by ``get_scores_dict``
    :type filedict: dict
    :raises KeyError: if a sentence id of the score file is not in ``filedict``
    :raises IndexError: if a non-empty score row has fewer than five columns
    """
    scores_path = f"../experiment_2/anthroscore/predictions/anthroscore_output/sentence_scores/{filename}.csv"
    output_path = f"../experiment_2/anthroscore/predictions/csv/{filename}.csv"
    new_header = ['id','sentence','masked_sentence','AI_phrase','mask','AI_entity','component','expectation','prediction']

    # The input is opened first so that a missing score file does not
    # truncate an existing predictions file; both handles are closed on exit.
    with open(scores_path, "r", newline="") as infile, \
            open(output_path, "w", newline="") as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        next(reader, None)  # skip the header row of the score file
        writer.writerow(new_header)

        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; nothing to index
                continue

            sentence_id = normalized(row[3])
            sentence = normalized(row[1])
            masked = normalized(row[2])
            anthroscore = normalized(row[4])

            info = [normalized(x) for x in filedict[sentence_id]]
            orig_score = info[-1]  # the expectation is the last stored value

            # info[-5:-1] fills the AI_phrase, mask, AI_entity and component
            # columns of the header
            write_to_file = [sentence_id, sentence, masked] + info[-5:-1] + [orig_score, anthroscore]
            writer.writerow(write_to_file)
           

# Full set of experiment file names (without extension). Defined here but
# not used by the loop below, which only handles the retest file.
files = [
    "adjective_phrases_inconclusive",
    "adjective_phrases_negative",
    "adjective_phrases_positive",
    "comparisons_inconclusive",
    "noun_phrases_positive",
    "possessives_positive",
    "verb_objects_inconclusive",
    "verb_objects_negative",
    "verb_objects_positive",
    "verb_subjects_inconclusive",
    "verb_subjects_negative",
    "verb_subjects_positive",
]

# Files that are actually processed in this run.
files_retest = ["noun_phrases_positive-retest"]

# Load the expectations for each file, then merge them with the scores.
for file in files_retest:
    file_dict = get_scores_dict(file)
    concat_info(file, file_dict)

### Iterate over removed_sentences file to review information

(This was relevant for the manual review of the anthroscore results of experiment 1, which used anthroscore masks. Since many of those masks were irrelevant, they were removed. The script below was used to review the removed sentences and to make sure that the filtering of irrelevant masked sentences was consistent.)

In [110]:
def find_all_indices(text, substring):
    """Return the start offsets of all non-overlapping literal occurrences
    of *substring* in *text*, scanning from left to right."""
    literal = re.compile(re.escape(substring))
    positions = []
    for hit in literal.finditer(text):
        positions.append(hit.start())
    return positions

# Column layout of the tab-separated removed_sentences file (it has no
# header row, so the names are supplied here).
column_names = [
    'id', 'sentence', 'masked_sentence', 'AI_phrase', 'mask', 'AI_entity',
    'anthro_component', 'anthroscore_entity', 'anthroscore_phrase', 'score',
    'anthroscore',
]
df = pd.read_csv(
    "../experiment_1/anthroscore/predictions/txt/removed_sentences.txt",
    sep='\t',
    header=None,
    names=column_names,
    index_col=False,
)

# Print id, sentence and masked sentence of every row whose masked sentence
# contains more than one <mask> token, followed by an empty line.
for _, row in df.iterrows():
    masked = row['masked_sentence']
    if masked.count('<mask>') <= 1:
        continue
    print(row['id'])
    print(row['sentence'])
    print(masked)
    print()