In [2]:
from collections import Counter, defaultdict
import spacy
from emoji import UNICODE_EMOJI

In [3]:
nlp = spacy.load("en_core_web_sm")

In [8]:
%%time
import sys
sys.path.append('../../neutral_generation/')
from is_gendered import is_gendered

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 16.7 µs


In [10]:
is_gendered('he'), is_gendered('she')

('male', 'female')

In [15]:
def get_source_target(eval_set):
    with open(f'../../evaluation/{eval_set}/source.txt', 'r') as f:
        source = f.readlines()
    
    with open(f'../../evaluation/{eval_set}/target.txt', 'r') as f:
        target = f.readlines()
        
    return source, target

In [16]:
source, target = get_source_target('gendered_test_set')

In [17]:
len(source), len(target)

(500, 500)

In [18]:
token_counter = Counter()

for sent in source:
    tokens = sent.lower().split(' ')
    for token in tokens:
        token_counter[token] += 1

In [19]:
# num of unique tokens
len(token_counter)

2522

In [20]:
total_tokens = 0
for token in token_counter:
    total_tokens += token_counter[token]
total_tokens

6427

In [21]:
token_counter.most_common(20)

[('the', 274),
 ('a', 180),
 ('to', 158),
 ('he', 146),
 ('she', 136),
 ('and', 131),
 ('her', 122),
 ('in', 97),
 ('his', 95),
 ('was', 93),
 ('i', 88),
 ('of', 82),
 ('for', 63),
 ('is', 57),
 ('on', 50),
 ('that', 47),
 ('my', 46),
 ('with', 46),
 ('has', 38),
 ('you', 38)]

In [22]:
from typing import Tuple

def get_difference(old_sent: str, new_sent: str):
    old_words = old_sent.strip().split(' ')
    new_words = new_sent.strip().split(' ')
    
    removed_words = list(set(old_words) - set(new_words))
    added_words = list(set(new_words) - set(old_words))
#     removed_words = list(word for i, word in enumerate(old_words) if word != new_words[i])
#     added_words = list(word for i, word in enumerate(new_words) if word != old_words[i])
    
    return removed_words, added_words

In [23]:
def get_difference_sent_list(generation, target):
    changes = list()
    num_changes = 0

    for i in range(500):
        diffs = get_difference(generation[i], target[i])
    #     if len(diffs[0]) != len(diffs[1]):
    #         print(diffs)
    #         print(generation[i], target[i])

        changes.append(diffs)
        num_changes = num_changes + len(diffs[1])
    
    return changes, num_changes

In [24]:
# 1590 / 795 / 795 for perfect count
changes, num_changes = get_difference_sent_list(generation=source, target=target)
num_changes

743

In [34]:
# approximation, isn't perfect
# e.g. set subtraction for word diff, 
# e.g. calculation of percent of "correct changes" bc target changes aren't necessarily in source changes

def get_changes(eval_set, generation):
    with open(f'../../evaluation/{eval_set}/generations/{generation}/generation.txt', 'r') as f:
        generation = f.readlines()
    
    with open(f'../../evaluation/{eval_set}/source.txt', 'r') as f:
        source = f.readlines()
    
    with open(f'../../evaluation/{eval_set}/target.txt', 'r') as f:
        target = f.readlines()
        
    source_changes, source_num_changes = get_difference_sent_list(generation=generation, target=source)
    
    target_changes, target_num_changes = get_difference_sent_list(generation=generation, target=target)
        
    return generation, {
        'source_changes': source_changes,
        'source_num_changes': source_num_changes,
        'target_changes': target_changes,
        'target_num_changes': target_num_changes
    }

In [25]:
def display_num_changes(all_changes):
    source_num_changes, target_num_changes = all_changes['source_num_changes'], all_changes['target_num_changes']
    correct_changes = source_num_changes - target_num_changes
    
    print(source_num_changes, target_num_changes, correct_changes)
    
    if source_num_changes == 0:
        if target_num_changes == 0:
            print("no changes made")
        else:
            print("no changes made, but generation != target")
    else:
        print('percent of changes that were correct (precision): ', 1 - round(target_num_changes / source_num_changes, 3))
    
    if num_changes == 0:
        print("no changes should be made")
    else:
        print('percent of correct changes captured (recall): ', round(correct_changes / num_changes, 3))    

In [62]:
def display_mistake_counts(mistake_counts):
    print(f"Total of ({mistake_counts['total']}) words in the annotation were not in the generation. Mistakes came from ({mistake_counts['wrong_sentences']}) sentences.")
    print(f"({mistake_counts['subtotal_pronouns_verbs']}) of the mistakes tagged as pronouns / verbs, and ({mistake_counts['subtotal_other']}) tagged as other mistakes.")
    print()
    print(f"Breakdown of the ({mistake_counts['subtotal_pronouns_verbs']}) pronouns / verbs mistakes")
    print(f"\t({mistake_counts['male_pronoun'] + mistake_counts['female_pronoun']}) pronouns: ({mistake_counts['male_pronoun']}) male, ({mistake_counts['female_pronoun']}) female")
    print(f"\t({mistake_counts['auxiliary'] + mistake_counts['verbs']}) verbs: ({mistake_counts['auxiliary']}) auxiliary, ({mistake_counts['verb']}) root verbs")
    print()
    print(f"Breakdown of the ({mistake_counts['subtotal_other']}) other mistakes")
    print(f"\t({mistake_counts['emoji']}) emoji, ({mistake_counts['symbols']}) symbols, ({mistake_counts['whitespace']}) whitespace, ({mistake_counts['nonbreaking_space']}) non-breaking space")
    print(f"\t({mistake_counts['not_categorized']}) not_categorized")

In [27]:
def is_emoji(character):
    return character in UNICODE_EMOJI

In [28]:
GENDER_NEUTRAL_PRONOUNS = ['they', 'their', 'them', 'theirs', 'themself']
SYMBOLS = '!@#$%^&*()_+={}[]\|"\':;?/>.<,~`'

In [49]:
def analyze_target_changes(all_changes, generation):
    target_changes = all_changes['target_changes']
    
    mistake_counts = Counter()
    mistake_types = defaultdict(list)
    
    sentence_indices = list()
    
    for i, full_change in enumerate(target_changes):
        target_change = full_change[1]

        if len(target_change) > 0:
            mistake_counts['wrong_sentences'] += 1
            sentence_indices.append(i)
        
        for change in target_change:
            mistake_counts['total'] += 1
            categorized = False
            
            # whitespace
            if not change:
                mistake_counts['whitespace'] += 1
                mistake_types['whitespace'].append(change)
                categorized = True
#                 continue
            
            # non-breaking space \xa0
            if '\xa0' in change:
                mistake_counts['nonbreaking_space'] += 1
                mistake_types['nonbreaking_space'].append(change)
                categorized = True
#                 continue
            
            # pronoun lowercase
            for pronoun in GENDER_NEUTRAL_PRONOUNS:
                if pronoun in change.lower() and change:
                    categorized = True
                    gender = is_gendered(source[i])
    #                 print(source[i], gender)
                    if gender == 'male':
                        mistake_counts['male_pronoun'] += 1
                        mistake_types['male_pronoun'].append(change)
                    elif gender == 'female':
                        mistake_counts['female_pronoun'] += 1
                        mistake_types['female_pronoun'].append(change)
                    break
#                 continue
            
            # verb or auxiliary verb
            if change:
                doc = nlp(change)
                if doc[0].pos_ == 'VERB':
                    mistake_counts['verb'] += 1
                    mistake_types['verb'].append(change)
                    categorized = True
#                 continue
                
                if doc[0].pos_ == 'AUX':
                    mistake_counts['auxiliary'] += 1
                    mistake_types['auxiliary'].append(change)
                    categorized = True
#                     print(generation[i], target[i])
#                 continue

            for c in change:
                if is_emoji(c):
                    mistake_counts['emoji'] += 1
                    mistake_types['emoji'].append(change)
                    categorized = True
                    break

            for c in change:
                if c in SYMBOLS:
                    mistake_counts['symbols'] += 1
                    mistake_types['symbols'].append(change)
                    categorized = True
                    break
            
            if not categorized:
                mistake_counts['not_categorized'] += 1
                mistake_types['not_categorized'].append(change)

    mistake_counts['subtotal_pronouns_verbs'] = mistake_counts['auxiliary'] + mistake_counts['verb'] + \
                                        mistake_counts['male_pronoun'] + mistake_counts['female_pronoun']
    mistake_counts['subtotal_other'] = mistake_counts['symbols'] + mistake_counts['emoji'] + mistake_counts['whitespace'] + \
                                mistake_counts['not_categorized'] + mistake_counts['nonbreaking_space']
                
    return mistake_counts, mistake_types, sentence_indices

In [50]:
source, target = get_source_target('gendered_test_set')
changes, num_changes = get_difference_sent_list(generation=source, target=target)

In [65]:
generation, model_changes = get_changes(eval_set='gendered_test_set', generation='model_sa_nt_10_3')
display_num_changes(model_changes)
print()

mistake_counts, mistake_types, sentence_indices = analyze_target_changes(model_changes, generation)
display_mistake_counts(mistake_counts)

767 73 694
percent of changes that were correct (precision):  0.905
percent of correct changes captured (recall):  0.934

Total of (73) words in the annotation were not in the generation. Mistakes came from (60) sentences.
(34) of the mistakes tagged as pronouns / verbs, and (45) tagged as other mistakes.

Breakdown of the (34) pronouns / verbs mistakes
	(16) pronouns: (4) male, (12) female
	(9) verbs: (9) auxiliary, (9) root verbs

Breakdown of the (45) other mistakes
	(12) emoji, (14) symbols, (11) whitespace, (1) non-breaking space
	(7) not_categorized


In [64]:
mistake_types

defaultdict(list,
            {'male_pronoun': ['their',
              "they're",
              'THEIR',
              'their',
              'mountains?"They'],
             'symbols': ["they're",
              "They're",
              '*them',
              'was.',
              'say,',
              'that?"They',
              'say:',
              'say,',
              'mountains?"They'],
             'female_pronoun': ['their',
              'them',
              'them',
              "They're",
              'them',
              '*them',
              'them',
              'them',
              'their',
              'them',
              'that?"They'],
             'auxiliary': ['be',
              'was',
              'was',
              'was.',
              'were',
              'was',
              'is',
              'is',
              'have'],
             'not_categorized': ['spokesperson', 'love'],
             'verb': ['say,', 'go', 'look', 'decide', 'say:', 'tell', 

In [56]:
for idx in sentence_indices:
    print(model_changes['target_changes'][idx])
    print('generation: ', generation[idx])
    print('annotation: ', target[idx])
    print('---')

(['mind不ợ#theydontknowrealmusic'], ['#theydontknowrealmusic', 'mind😂😩'])
generation:  @callmedollar 10 years from now they are going to feel sorry they didn't pay them any mind不ợ#theydontknowrealmusic

annotation:  @callmedollar 10 years from now they are going to feel sorry they didn't pay them any mind😂😩 #theydontknowrealmusic

---
(['theirs'], ['their'])
generation:  Inspired by theirs own… https://t.co/pyYUsxXtKv

annotation:  Inspired by their own… https://t.co/pyYUsxXtKv

---
(['THEIRS'], ['', 'THEIR'])
generation:  GETTIN THEIRS PRAISE ON #LATEPOST @ Bethel Jerusalem Apostolic Temple https://t.co/vvQShYPiky

annotation:  GETTIN THEIR PRAISE ON  #LATEPOST @ Bethel Jerusalem Apostolic Temple https://t.co/vvQShYPiky

---
(['begins…'], ['begin…'])
generation:  Excited to see what God has in store for them as they begins… https://t.co/5OzgBlQX5S

annotation:  Excited to see what God has in store for them as they begin… https://t.co/5OzgBlQX5S

---
([], [''])
generation:  HUSBAINER (h