# Check wellformedness of preprocessed data sentences

This notebook provides code for making sure that no sentence was annotated with conflicting annotations, and that the .txt files used to create the evaluation sets are well-formed - i.e., that the IDs contain the database prefix (used to locate them in the dataframe) and that each row contains exactly six tab-separated fields (the sentence ID followed by five values).

In [62]:
import re

def check_conflicting_annotations(filename,case1,case2):
    """Report sentences that occur in both the `case1` and `case2` files of a category.

    Both files are loaded with `get_sentences_dict(filename, case)`. A
    conflict is reported (printed) when

    * an ID of the `case1` file is also an ID of the `case2` file, or
    * the fields stored for a `case1` ID are equal to the fields stored
      for some `case2` ID (same row content under a different ID).

    Parameters
    ----------
    filename : str
        Category name, forwarded to `get_sentences_dict`.
    case1, case2 : str
        The two annotation sets to compare, forwarded to
        `get_sentences_dict` as its second argument.

    Returns
    -------
    bool
        True if at least one conflict was found, otherwise False.
    """

    print(f"Comparing {case1} cases and {case2} cases for the {filename} set...")

    conflicting_annotation = False

    dict1 = get_sentences_dict(filename,case1)
    dict2 = get_sentences_dict(filename,case2)

    for id1,sent in dict1.items():
        if id1 in dict2:
            conflicting_annotation = True
            print(f"comparing {case1} cases and {case2} cases")
            print(f"the {case1} sentence with the ID ",id1,f" appears in the {case2} set with the same ID")
            continue

        # Look up the first ID in the other set that stores identical fields.
        # (The original comprehension bound `keys` but read `key`, which
        # raised NameError as soon as such a sentence was found.)
        id2 = next((key for key,other in dict2.items() if other == sent), None)
        if id2 is not None:
            conflicting_annotation = True
            print(f"the {case1} sentence with the ID  ",id1,
                  f" appears in the {case2} set with the ID ",id2)

    return conflicting_annotation

def get_anthro_components(filename,case):
    """Tally the values found at index 4 of every row of one evaluation file.

    The rows come from `get_sentences_dict(filename, case)`; index 4 is the
    last of the five fields that follow the ID in a well-formed row
    (presumably the anthropomorphic word or phrase, judging by the message
    printed below - confirm against the data files).

    Returns a dict mapping each distinct value to the number of rows that
    carry it, in order of first appearance.
    """

    print(f"Retrieving a list of anthropomorphic words in {filename}_{case}.txt:")

    tally = {}

    for row in get_sentences_dict(filename,case).values():
        component = row[4]
        tally[component] = tally.get(component, 0) + 1

    return tally

def get_sentences_dict(filename,score):
    """Load one evaluation file into a dict and print well-formedness problems.

    Reads `../preprocessed_data/evaluation_sentences/50_{filename}_{score}.txt`
    (relative to the current working directory), one tab-separated row per
    line. The first field is used as the sentence ID; the remaining fields
    are kept as a list.

    Problems are printed, never raised:

    * rows that do not split into exactly six tab-separated fields,
    * IDs whose first six characters do not match `[1-5]_(arx|acl)_`,
    * a number of stored rows different from 50 (skipped when `score` is
      "inconclusive"),
    * IDs that occur more than once,
    * pairs of IDs whose remaining fields are identical.

    Parameters
    ----------
    filename : str
        Category part of the file name.
    score : str
        Annotation-set part of the file name.

    Returns
    -------
    dict
        Maps each ID to the list of fields that follow it. Only the first
        occurrence of an ID, and the first ID for any given list of fields,
        is stored. Ill-formed rows are reported but still stored.
    """

    sentences_dict = {}
    duplicate_ids = []
    duplicate_sentence_pairs = []
    # Reverse lookup: fields of a stored row -> ID it was stored under.
    first_id_for_info = {}

    path = f"../preprocessed_data/evaluation_sentences/50_{filename}_{score}.txt"

    # The context manager closes the file; the original left the handle open.
    with open(path,"r") as sentences:
        for raw_line in sentences:
            fields = raw_line.strip().split("\t")
            sent_id = fields[0]
            sent_info = fields[1:]

            # wellformedness checks
            if len(fields) != 6:
                print(f"The row with the ID {sent_id} in {filename}_{score}.txt is not well-formed.")
            id_prefix = sent_id[:6]
            if not re.match(r"^[1-5]{1}_(arx|acl)_", id_prefix):
                print(f"The ID {sent_id} in {filename}_{score} is missing a database prefix.")

            if sent_id in sentences_dict: # the sentence appears twice with the same ID
                duplicate_ids.append(sent_id)
                continue

            info_key = tuple(sent_info)
            if info_key in first_id_for_info: # the sentence appears twice with different IDs
                # The original looked this ID up through an undefined name
                # (`sent`), which raised NameError on the first such row.
                other_id = first_id_for_info[info_key]
                duplicate_sentence_pairs.append((other_id,sent_id))
            else:
                sentences_dict[sent_id] = sent_info
                first_id_for_info[info_key] = sent_id

    if len(sentences_dict) > 50 and score != "inconclusive":
        print(f"There are more than 50 sentences in 50_{filename}_{score}.txt.")
    elif len(sentences_dict) < 50 and score != "inconclusive":
        print(f"There are less than 50 sentences in 50_{filename}_{score}.txt.")

    if duplicate_ids:
        print("The sentences with the following ids appear twice: ",duplicate_ids,
             f" in 50_{filename}_{score}.txt")

    if duplicate_sentence_pairs:
        print("The following ID pairs refer to the same sentence: ",duplicate_sentence_pairs,
             f" in 50_{filename}_{score}.txt")

    return sentences_dict

#### Check sentences for each category

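Change the first argument (`filename`) of `check_conflicting_annotations` and `get_anthro_components` in the cell below to check a different category. The options are: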
1. agent_subjects - sentences in which the AI entity is the subject of an anthropomorphic verb (nsubj)
2. agent_objects - sentences in which the AI entity is the object (agent) of an anthropomorphic verb in the passive voice (pobj)
3. nonagent_objects - sentences in which the AI entity is the object (cognizer) of an anthropomorphic verb
4. adjective_phrases - sentences in which the AI entity is part of an anthropomorphic adjectival phrase
5. noun_phrases - sentences in which the AI entity is part of an anthropomorphic noun phrase
6. possessives - sentences in which the AI entity is immediately followed by a possessive marker
7. comparisons - sentences in which the AI entity is being compared to humans explicitly

In [65]:
# Category whose evaluation files are checked; every call below uses it.
category = "adjective_phrases"

# Run the three pairwise comparisons in a fixed order. Each call prints its
# own per-sentence diagnostics before the overall verdict is printed.
conflicting_pos_neg_annotations = check_conflicting_annotations(category, "positive", "negative")
conflicting_pos_inc_annotations = check_conflicting_annotations(category, "positive", "inconclusive")
conflicting_neg_inc_annotations = check_conflicting_annotations(category, "negative", "inconclusive")

# Only the first conflicting pair (in the order above) is named in the verdict.
if conflicting_pos_neg_annotations:
    verdict = "Resolve conflicting annotations in the positive and negative sets!!!"
elif conflicting_pos_inc_annotations:
    verdict = "Resolve conflicting annotations in the positive and inconclusive sets!!!"
elif conflicting_neg_inc_annotations:
    verdict = "Resolve conflicting annotations in the negative and inconclusive sets!!!"
else:
    verdict = "No conflicting annotations. Clean up duplicates and fix any ill-formed IDs or rows before proceeding."
print(verdict)

print()

# Tally of the index-4 field over the positive set of this category.
anthro_components = get_anthro_components(category, "positive")
print(anthro_components)

Comparing positive cases and negative cases for the adjective_phrases set...
Comparing positive cases and inconclusive cases for the adjective_phrases set...
Comparing negative cases and inconclusive cases for the adjective_phrases set...
No conflicting annotations. Clean up duplicates and fix any ill-formed IDs or rows before proceeding.

Retrieving a list of anthropomorphic words in adjective_phrases_positive.txt:
{'aware': 7, 'intelligent': 5, 'conscious': 6, 'responsive': 1, 'confident': 5, 'culturally cognizant and value-aligned': 1, 'attentive': 2, 'smart but forgetful': 1, 'malicious': 5, 'manipulative': 1, 'ethically conscious': 1, 'smart and autonomous': 1, 'eager': 1, 'creative': 1, 'blind': 2, 'sensitive': 1, 'deceptive': 1, 'emotionally intelligent': 1, 'smart autonomous': 1, 'vulnerable': 3, 'thoughtful': 1, 'confused': 1, 'vulnerable,confused': 1}
