### Converting AnthroScore output to expanded .csv files

In [23]:
import csv
import re
import pandas as pd


def normalized(string):
    """Trim both ends of ``string`` and squeeze every whitespace run to one space."""
    trimmed = string.strip()
    collapsed = re.sub(r'\s+', ' ', trimmed)
    return collapsed

def convert_annotation(score):
    """
     Map an annotation label to its numerical code (returned as a string):
     negative - '0', positive - '1', inconclusive - '2'.

     Any other label is reported on stdout and handed back unchanged.
    """
    positive_labels = ('p', 'p1', 'p2', 'p3')
    negative_labels = ('n1', 'n2', 'n3')

    if score in positive_labels:
        return '1'
    if score in negative_labels:
        return '0'
    if score == 'inc':
        return '2'

    # Unknown label: warn, then fall through with the input untouched.
    print("score is malformed")
    return score

def get_scores_dict(filename):
    """
    Load the expectation csv for ``filename`` into a dict keyed by sentence id.

    The file is read from
    ``../experiment_2/anthroscore/expectations/csv/{filename}.csv`` (relative
    to the current working directory). The first record is treated as a
    header and discarded.

    Parameters:
        filename: base name of the csv file, without the ``.csv`` extension.

    Returns:
        dict mapping the first column of each row (the sentence id) to the
        list of the remaining columns of that row.

    Raises:
        OSError: if the file cannot be opened.
    """
    filedict = {}

    # newline='' is what the csv module expects for file objects, so that
    # newlines embedded in quoted fields are parsed correctly.
    with open(f"../experiment_2/anthroscore/expectations/csv/{filename}.csv","r",newline='') as csv_file:
        reader = csv.reader(csv_file)
        next(reader, None)  # discard the header record (no-op on an empty file)
        for row in reader:
            if not row:
                # Blank lines yield an empty row; there is no id to index by.
                continue
            sentence_id = row[0]
            sentence_info = row[1:]
            # IDs are assumed unique; a repeated id overwrites the earlier row.
            filedict[sentence_id] = sentence_info

    return filedict

def concat_info(filename,filedict):
    """
    Join AnthroScore sentence scores with the expectation data and write the
    combined rows to the predictions csv for ``filename``.

    Reads
    ``../experiment_2/anthroscore/predictions/anthroscore_output/sentence_scores/{filename}.csv``
    and writes ``../experiment_2/anthroscore/predictions/csv/{filename}.csv``
    (both relative to the current working directory). The first record of the
    input is treated as a header and discarded.

    Parameters:
        filename: base name shared by the input and output csv files,
            without the ``.csv`` extension.
        filedict: mapping of sentence id -> list of expectation columns
            (the shape returned by ``get_scores_dict``). The last element is
            written as the expectation; the four elements before it fill the
            AI_phrase / mask / AI_entity / component columns.

    Raises:
        KeyError: if a scored sentence id is not present in ``filedict``.
        OSError: if either file cannot be opened.
    """
    in_path = f"../experiment_2/anthroscore/predictions/anthroscore_output/sentence_scores/{filename}.csv"
    out_path = f"../experiment_2/anthroscore/predictions/csv/{filename}.csv"

    new_header = ['id','sentence','masked_sentence','AI_phrase','mask','AI_entity','component','expectation','prediction']

    # The input is opened first so that a missing score file does not leave a
    # freshly truncated output behind. Both handles are closed by the `with`.
    # newline='' is what the csv module expects for its file objects.
    with open(in_path,"r",newline='') as infile, open(out_path,"w",newline='') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        next(reader, None)  # discard the input header record
        writer.writerow(new_header)

        for row in reader:
            if not row:
                # Blank lines yield an empty row with nothing to index.
                continue

            # Column positions in the score file as used here:
            # 1 = sentence, 2 = masked sentence, 3 = sentence id, 4 = score.
            sentence_id = normalized(row[3])
            sentence = normalized(row[1])
            masked = normalized(row[2])
            anthroscore = normalized(row[4])

            info = [normalized(x) for x in filedict[sentence_id]]
            # The expectation is written as stored; it is not passed through
            # convert_annotation.
            orig_score = info[-1]

            write_to_file = [sentence_id,sentence,masked]+info[-5:-1]+[orig_score,anthroscore]
            writer.writerow(write_to_file)
            

# Base names (no extension) of the per-category csv files to convert.
files = [
    "adjective_phrases_inconclusive",
    "adjective_phrases_negative",
    "adjective_phrases_positive",
    "comparisons_inconclusive",
    "noun_phrases_positive",
    "possessives_positive",
    "verb_objects_inconclusive",
    "verb_objects_negative",
    "verb_objects_positive",
    "verb_subjects_inconclusive",
    "verb_subjects_negative",
    "verb_subjects_positive",
]

# Conversion driver, currently disabled. Uncomment to rewrite the prediction
# csv for every entry in `files`:
#for file in files:
    #file_dict = get_scores_dict(file)
    #concat_info(file,file_dict)

In [91]:
def find_all_indices(text, substring):
    """Return the start offset of every non-overlapping literal match of ``substring`` in ``text``."""
    # Escape so that regex metacharacters in `substring` are matched literally.
    literal_pattern = re.escape(substring)
    positions = []
    for hit in re.finditer(literal_pattern, text):
        positions.append(hit.start())
    return positions

# Column layout applied to the header-less, tab-separated removed-sentences file.
column_names = ['id', 'sentence', 'masked_sentence', 'AI_phrase', 'mask', 'AI_entity', 'anthro_component',
                    'anthroscore_entity', 'anthroscore_phrase', 'score', 'anthroscore']
df = pd.read_csv("../experiment_1/anthroscore/predictions/txt/removed_sentences.txt", sep='\t', header=None, names=column_names,index_col=False)

# Report every row whose masked sentence holds more than one <mask> token and
# collect the affected ids once each, in order of first appearance.
ids = []
seen_ids = set()  # constant-time membership test alongside the ordered list
for row_id, masked in zip(df['id'], df['masked_sentence']):
    # An empty masked_sentence field is read by pandas as NaN (a float),
    # which has no .count(); such rows cannot contain a <mask> at all.
    if not isinstance(masked, str):
        continue
    if masked.count('<mask>') > 1:
        if row_id not in seen_ids:
            seen_ids.add(row_id)
            ids.append(row_id)
        print(f"More than one <mask> found in ID: {row_id}")

print()
print(ids)

More than one <mask> found in ID: 4_arx_2312.10766_1972574_1
More than one <mask> found in ID: 4_arx_2312.10766_1972574_1
More than one <mask> found in ID: 4_arx_2312.10766_1972574_1
More than one <mask> found in ID: 4_arx_2312.10766_1972574_1
More than one <mask> found in ID: 4_acl_147_34632_2
More than one <mask> found in ID: 4_arx_2411.14133_2196560_0
More than one <mask> found in ID: 4_arx_2411.14133_2196560_0
More than one <mask> found in ID: 7_acl_245_24395_3
More than one <mask> found in ID: 7_acl_245_24395_3
More than one <mask> found in ID: 7_arx_2308.12578_1900470_0
More than one <mask> found in ID: 7_arx_2308.12578_1900470_0
More than one <mask> found in ID: 5_acl_280_13090_8_1
More than one <mask> found in ID: 5_arx_2311.06985_1949952_2
More than one <mask> found in ID: -4_acl_117_37846_1
More than one <mask> found in ID: 6_arx_2408.10159_2130340_1
More than one <mask> found in ID: 6_arx_2408.10159_2130340_1

['4_arx_2312.10766_1972574_1', '4_acl_147_34632_2', '4_arx_2411.1