# Check wellformedness of preprocessed data sentences

This notebook provides code for making sure that no sentence was annotated with conflicting annotations, and that the .txt files used to create the evaluation sets are well-formed - i.e., the IDs contain the database prefix (used to locate them in the dataframe) and each row contains exactly seven tab-separated values (the ID followed by six further fields).

In [25]:
import re

def check_conflicting_annotations(filename,case1,case2):
    """Report sentences that occur in two differently annotated files.

    Both files are parsed with ``get_sentences_dict``. A conflict is reported
    when an ID of the first file also occurs in the second file, or when the
    fields stored for an ID of the first file are identical to the fields
    stored for some ID of the second file.

    Parameters
    ----------
    filename : str
        Prefix of the evaluation files; the files read are
        ``{filename}_{case[0]}.txt`` for each case.
    case1, case2 : tuple
        ``(label, count)`` pairs. ``label`` selects the file, ``count`` is
        forwarded unchanged to ``get_sentences_dict``.

    Returns
    -------
    bool
        ``True`` if at least one conflict was found, ``False`` otherwise.
    """
    print(f"Comparing {case1[0]} cases and {case2[0]} cases for the {filename} set...")

    conflicting_annotation = False

    dict1 = get_sentences_dict(filename,case1[0],case1[1])[0]
    dict2 = get_sentences_dict(filename,case2[0],case2[1])[0]

    # Reverse index of the second set: sentence fields -> first ID carrying them.
    # The field lists are converted to tuples because lists are not hashable.
    first_id_by_sentence = {}
    for id2,other_sent in dict2.items():
        first_id_by_sentence.setdefault(tuple(other_sent),id2)

    for id1,sent in dict1.items():
        if id1 in dict2:
            conflicting_annotation = True
            print(f"comparing {case1[0]} cases and {case2[0]} cases")
            print(f"the {case1[0]} sentence with the ID ",id1,f" appears in the {case2[0]} set with the same ID")
        elif tuple(sent) in first_id_by_sentence:
            conflicting_annotation = True
            id2 = first_id_by_sentence[tuple(sent)]
            # Print the label only (case[0]), matching the same-ID message above.
            print(f"the {case1[0]} sentence with the ID  ",id1,
                  f" appears in the {case2[0]} set with the ID ",id2)

    return conflicting_annotation

def check_duplicates(filename,case):
    """Report duplicate entries inside a single evaluation file.

    Parameters
    ----------
    filename : str
        Prefix of the evaluation file; the file inspected is
        ``{filename}_{case[0]}.txt``.
    case : tuple
        ``(label, expected_count)``. ``expected_count`` is compared with the
        number of entries kept by ``get_sentences_dict`` (duplicates are not
        kept, so this is the number of unique entries).

    Returns
    -------
    bool
        ``True`` if an ID occurs more than once or two IDs refer to the same
        sentence, ``False`` otherwise. A count mismatch is only printed; it
        does not affect the return value.
    """
    print(f"Checking for duplicate entries in {filename}_{case[0]}.txt...")

    # Parse the file once: every call re-reads it and re-prints its
    # well-formedness warnings.
    sentences_dict,duplicate_ids,duplicate_sentence_pairs = get_sentences_dict(filename,case[0],case[1])

    num_sentences = len(sentences_dict)
    if num_sentences > case[1]:
        print(f"There are more than {case[1]} sentences in {filename}_{case[0]}.txt.")
    elif num_sentences < case[1]:
        print(f"There are less than {case[1]} sentences in {filename}_{case[0]}.txt.")

    duplicates = False

    if duplicate_ids:
        duplicates = True
        print("The sentences with the following ids appear twice: ",duplicate_ids,
             f" in {filename}_{case[0]}.txt")

    if duplicate_sentence_pairs:
        duplicates = True
        print("The following ID pairs refer to the same sentence: ",duplicate_sentence_pairs,
             f" in {filename}_{case[0]}.txt")

    return duplicates
        
def get_anthro_components(filename,case):
    """Count the values found at index 4 of every entry of one evaluation file.

    The file ``{filename}_{case[0]}.txt`` is parsed with ``get_sentences_dict``
    and, for every kept entry, the element at index 4 of its field list (the
    sixth tab-separated value of the row, counting the ID) is tallied.
    NOTE(review): judging by the progress message this column holds the
    anthropomorphic word or phrase - confirm against the data files.

    Parameters
    ----------
    filename : str
        Prefix of the evaluation file.
    case : tuple
        ``(label, count)``; both are forwarded to ``get_sentences_dict``.

    Returns
    -------
    dict
        Maps each distinct value to its number of occurrences, in order of
        first appearance.
    """
    print(f"Retrieving a list of anthropomorphic words in {filename}_{case[0]}.txt:")

    parsed_entries = get_sentences_dict(filename,case[0],case[1])[0]

    counts = {}
    for fields in parsed_entries.values():
        component = fields[4]
        counts[component] = counts.get(component,0) + 1

    return counts

def get_sentences_dict(filename,score,num):
    """Parse one evaluation file and collect its entries and duplicates.

    Reads ``../preprocessed_data/evaluation_sentences/{filename}_{score}.txt``
    (relative to the current working directory). Every non-blank row is split
    on tabs; the first value is the ID, the remaining values are stored as the
    entry's field list. While reading, two well-formedness checks are run and
    violations are printed (they do not stop parsing):

    * the row must consist of exactly 7 tab-separated values;
    * the first six characters of the ID must match
      ``[1-5]_(arx|acl)_``.

    Parameters
    ----------
    filename : str
        Prefix of the evaluation file.
    score : str
        Suffix of the evaluation file (the annotation label).
    num : int
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    tuple
        ``(sentences_dict, duplicate_ids, duplicate_sentence_pairs)`` where
        ``sentences_dict`` maps each ID to its field list (first occurrence
        only), ``duplicate_ids`` lists IDs that occurred again, and
        ``duplicate_sentence_pairs`` lists ``(first_id, later_id)`` tuples for
        rows whose fields equal those of an earlier row with a different ID.
    """
    sentences_dict = {}
    duplicate_ids = []
    duplicate_sentence_pairs = []

    # Reverse index (fields as a hashable tuple -> ID that first carried them),
    # so duplicate detection does not rescan all stored entries for every row.
    first_id_by_info = {}

    # The context manager guarantees the file handle is closed.
    with open(f"../preprocessed_data/evaluation_sentences/{filename}_{score}.txt","r") as sentences:
        for raw_line in sentences:
            stripped = raw_line.strip()
            # "".split("\t") yields [""], never an empty list, so blank rows
            # have to be detected before splitting.
            if not stripped:
                continue
            line = stripped.split("\t")
            sent_id = line[0]
            sent_info = line[1:]

            # wellformedness checks
            if len(line) != 7:
                print(f"The row with the ID {sent_id} in {filename}_{score}.txt is not well-formed.")
            id_prefix = sent_id[:6]
            if not re.match(r"^[1-5]{1}_(arx|acl)_", id_prefix):
                print(f"The ID {sent_id} in {filename}_{score} is not well-formed.")

            if sent_id in sentences_dict: # the sentence appears twice with the same ID
                duplicate_ids.append(sent_id)
                continue

            info_key = tuple(sent_info)
            if info_key in first_id_by_info: # the sentence appears twice with different IDs
                duplicate_sentence_pairs.append((first_id_by_info[info_key],sent_id))
            else:
                sentences_dict[sent_id] = sent_info
                first_id_by_info[info_key] = sent_id

    return sentences_dict,duplicate_ids,duplicate_sentence_pairs

#### Check sentences for each category

Change the `filename` argument that is passed on to `get_sentences_dict` to select the category to check. The options are:
1. agent_subjects - sentences in which the AI entity is the subject of an anthropomorphic verb (nsubj)
2. agent_objects - sentences in which the AI entity is the object (agent) of an anthropomorphic verb in the passive voice (pobj)
3. nonagent_objects - sentences in which the AI entity is the object (cognizer) of an anthropomorphic verb
4. adjective_phrases - sentences in which the AI entity is part of an anthropomorphic adjectival phrase
5. noun_phrases - sentences in which the AI entity is part of an anthropomorphic noun phrase
6. possessives - sentences in which the AI entity is immediately followed by a possessive marker
7. comparisons - sentences in which the AI entity is being compared to humans explicitly

In [10]:
def conflicting_annotations_check(num_of_cases, filename="verb_subjects"):
    """Compare the positive, negative and inconclusive files pairwise.

    All three pairwise comparisons are run first via
    ``check_conflicting_annotations``; afterwards one "Resolve ..." line is
    printed for every pair that has conflicts, so that a conflict in one pair
    does not hide conflicts in another. If no pair conflicts, a confirmation
    line is printed instead.

    Parameters
    ----------
    num_of_cases : dict
        Must contain the keys ``"positive"``, ``"negative"`` and
        ``"inconclusive"``; each value is forwarded as the count of the
        corresponding case.
    filename : str, optional
        Prefix of the evaluation files to compare. Defaults to
        ``"verb_subjects"``.

    Returns
    -------
    None
    """
    positive = ("positive",num_of_cases["positive"])
    negative = ("negative",num_of_cases["negative"])
    inconclusive = ("inconclusive",num_of_cases["inconclusive"])

    pairs = [(positive,negative),(positive,inconclusive),(negative,inconclusive)]

    # Run every comparison before summarising, so the per-sentence reports
    # stay grouped ahead of the summary lines.
    conflicting_pairs = [
        (case1[0],case2[0])
        for case1,case2 in pairs
        if check_conflicting_annotations(filename,case1,case2)
    ]

    for label1,label2 in conflicting_pairs:
        print(f"Resolve conflicting annotations in the {label1} and {label2} sets!!!")

    if not conflicting_pairs:
        print("No conflicting annotations.")

    return

def duplicates_check(num_of_cases, filename="verb_subjects"):
    """Run the duplicate check on the positive, negative and inconclusive files.

    ``check_duplicates`` is run on all three files first; afterwards one
    "Resolve ..." line is printed for every file that contains duplicates, so
    that duplicates in one file do not hide duplicates in another. If no file
    has duplicates, a confirmation line is printed instead.

    Parameters
    ----------
    num_of_cases : dict
        Must contain the keys ``"positive"``, ``"negative"`` and
        ``"inconclusive"``; each value is forwarded as the expected count of
        the corresponding case.
    filename : str, optional
        Prefix of the evaluation files to check. Defaults to
        ``"verb_subjects"``.

    Returns
    -------
    None
    """
    cases = [
        ("positive",num_of_cases["positive"]),
        ("negative",num_of_cases["negative"]),
        ("inconclusive",num_of_cases["inconclusive"]),
    ]

    # Check every file before summarising, so the detailed reports stay
    # grouped ahead of the summary lines.
    sets_with_duplicates = [case[0] for case in cases if check_duplicates(filename,case)]

    for label in sets_with_duplicates:
        print(f"Resolve duplicates in the {label} set!!!")

    if not sets_with_duplicates:
        print("No duplicate utterances.")

    return

In [27]:
# Expected number of sentences per annotation label.
num_of_cases = {"positive":60,"negative":60,"inconclusive":30}

# Duplicate and conflict checks, followed by a blank separator line.
duplicates_check(num_of_cases)
conflicting_annotations_check(num_of_cases)
print()

# Tally of the anthropomorphic components per label; the counts are taken
# from num_of_cases so they are stated in one place only.
anthro_components_p = get_anthro_components("verb_subjects",("positive",num_of_cases["positive"]))
print(anthro_components_p,end="\n\n")
anthro_components_n = get_anthro_components("verb_subjects",("negative",num_of_cases["negative"]))
print(anthro_components_n,end="\n\n")
anthro_components_inc = get_anthro_components("verb_subjects",("inconclusive",num_of_cases["inconclusive"]))
print(anthro_components_inc)

Checking for duplicate entries in verb_subjects_positive.txt...
Checking for duplicate entries in verb_subjects_negative.txt...
Checking for duplicate entries in verb_subjects_inconclusive.txt...
No duplicate utterances.
Comparing positive cases and negative cases for the verb_subjects set...
Comparing positive cases and inconclusive cases for the verb_subjects set...
Comparing negative cases and inconclusive cases for the verb_subjects set...
No conflicting annotations.

Retrieving a list of anthropomorphic words in verb_subjects_positive.txt:
{'determine when to request help': 1, 'understand': 2, 'recognize': 1, 'think': 1, 'infer': 2, 'resolve a conflict': 1, 'learn from each other': 1, 'collaborate': 1, 'exhibit overconfidence': 1, 'memorize': 2, 'remember': 2, 'see': 1, 'recall': 1, 'have awareness': 1, 'decide to trust': 1, 'suffer': 1, 'believe': 1, 'understand, intepret and predict': 1, 'try to achieve': 1, 'reason': 1, 'teach': 1, 'deduce': 1, 'prefer': 1, 'ask': 1, 'distingui