In [1]:
from collections import Counter, defaultdict
import re
import spacy
import matplotlib.pyplot as plt

In [2]:
# Load spaCy's small English pipeline (the en_core_web_sm model must be installed).
# NOTE(review): `nlp` is not referenced by any other cell visible here — confirm it is still needed.
nlp = spacy.load("en_core_web_sm")

In [144]:
MALE_PRONOUNS = ['he', 'him', 'his', 'himself']
FEMALE_PRONOUNS = ['she', 'her', 'hers', 'herself']


def _pronoun_regex(pronouns):
    """Compile a single whole-word pattern that matches any entry of `pronouns`."""
    alternatives = '|'.join(re.escape(pronoun) for pronoun in pronouns)
    return re.compile(r'\b(?:{})\b'.format(alternatives))


# Built once when this cell runs, so the per-sentence check performs two
# searches instead of formatting and looking up eight patterns on every call.
# The patterns reflect the pronoun lists as they are at definition time.
_MALE_PRONOUN_RE = _pronoun_regex(MALE_PRONOUNS)
_FEMALE_PRONOUN_RE = _pronoun_regex(FEMALE_PRONOUNS)


def is_gendered(sentence):
    """Classify a sentence by the gendered pronouns it contains.

    The sentence is lower-cased and searched for whole-word occurrences of
    the male and female pronoun lists.

    Args:
        sentence: Text to inspect.

    Returns:
        "male" if only male pronouns occur, "female" if only female pronouns
        occur, and False when neither or both kinds are present.
    """
    sentence = sentence.lower()
    contains_male = _MALE_PRONOUN_RE.search(sentence) is not None
    contains_female = _FEMALE_PRONOUN_RE.search(sentence) is not None
    if contains_male and not contains_female:
        return "male"
    if contains_female and not contains_male:
        return "female"
    # Mixed or pronoun-free sentences are deliberately reported as False.
    return False

In [3]:
def get_source_target(
    source_path='/home/jupyter/wikipedia_training/original/wikipedia_gendered.source',
    target_path='/home/jupyter/wikipedia_training/original/wikipedia_gendered.target',
    encoding=None,
):
    """Read the source and target corpus files into two lists of lines.

    Args:
        source_path: File holding the source sentences, one per line.
            Defaults to the original hard-coded location.
        target_path: File holding the target sentences, one per line.
            Defaults to the original hard-coded location.
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default, which is what the function used before.

    Returns:
        A ``(source, target)`` tuple of lists; each element is one line,
        trailing newline included (as produced by ``readlines``).
    """
    with open(source_path, 'r', encoding=encoding) as f:
        source = f.readlines()

    with open(target_path, 'r', encoding=encoding) as f:
        target = f.readlines()

    return source, target

In [4]:
# Load both corpora fully into memory; later cells pair target[i] with source[i] by index.
source, target = get_source_target()

In [5]:
# Sanity check: the index-based pairing used later needs both lists to have the same length.
len(source), len(target)

(15000000, 15000000)

In [12]:
def clean_sent(sent):
    """Replace every non-breaking space (U+00A0) in `sent` with a plain space.

    Args:
        sent: Sentence text; any trailing newline is left in place.

    Returns:
        The sentence with each '\\xa0' character turned into ' '.
    """
    # A literal one-character swap needs no regex; str.replace avoids
    # compiling (or cache-looking-up) a pattern on every call in the big loops.
    return sent.replace('\xa0', ' ')

In [13]:
# Spot-check the cleaner on the first source sentence.
clean_sent(source[0])

'In 1961, his single "Water Boy" reached No. 40 on the Billboard Hot 100 and stayed on the chart for 14 weeks.\n'

In [26]:
# # ordering of pronouns matters (so that we don't replace "them" part of "themself" when doing regex)

# male_mapping = {
#     'they': 'he',
#     'themself': 'himself',
#     'them': 'him',
#     'theirs': 'his',
#     'their': 'his',
# }

# female_mapping = {
#     'they': 'she',
#     'themself': 'herself',
#     'them': 'her',
#     'theirs': 'hers',
#     'their': 'her',
# }

# verbs_mapping = {
#     'were': 'was',
#     'are': 'is',
#     'have': 'has'
# }

In [28]:
# def augment_sent(sent, replace_gender):
#     if replace_gender == 'male':
#         for neutral, pronoun in male_mapping.items():
#             for case in [(neutral, pronoun), (neutral.capitalize(), pronoun.capitalize()), (neutral.upper(), pronoun.upper())]:
#                 pattern = re.compile(case[0])
#                 sent = pattern.sub(case[1], sent)
    
#     elif replace_gender == 'female':
#         for neutral, pronoun in female_mapping.items():
#             for case in [(neutral, pronoun), (neutral.capitalize(), pronoun.capitalize()), (neutral.upper(), pronoun.upper())]:
#                 pattern = re.compile(case[0])
#                 sent = pattern.sub(case[1], sent)
    
#     return sent

In [42]:
# augment_sent('They THEM THEIRS hello THEMSELF their', 'female')

In [37]:
# augmentation_list = list()

# for i in range(100):
#     gender = is_gendered(source[i])
#     if gender != 'male' and gender != 'female':
#         print(f'{i}: not gendered (probably a mistake)')
#         continue
        
#     sent = clean_sent(target[i])
    
#     if gender == 'male':
#         replace_gender = 'female'
#     elif gender == 'female':
#         replace_gender = 'male'
    
#     augmentation = augment_sent(sent=sent, replace_gender=replace_gender)
    
#     augmentation_list.append(source[i])
#     augmentation_list.append(augmentation)

In [40]:
# Pronoun inventories, one list per grammatical gender.
MALE_PRONOUNS = [
    'he',
    'him',
    'his',
    'himself',
]
FEMALE_PRONOUNS = [
    'she',
    'her',
    'hers',
    'herself',
]
NEUTRAL_PRONOUNS = [
    'they',
    'them',
    'their',
    'theirs',
    'themself',
]

In [57]:
# gender_mapping = [
#     ('he', 'she'),
#     ('himself', 'herself'),
#     ('him', 'her'),
#     ('his', 'her'),
# ]

In [92]:
# Pronoun swap tables, one per direction. Insertion order is the order in
# which augment_sent applies the substitutions.
# TODO: also try downsampling instead of augmentation (cleaner sentences;
# augmentation produces noisy samples).

female_to_male = dict(
    she='he',
    herself='himself',
    hers='his',
    her='his',  # not ideal, missing "him" (1.1% of cases)
)

male_to_female = dict(
    he='she',
    himself='herself',
    him='her',
    his='her',  # not ideal, missing "hers" (1.4% of cases)
)

In [116]:
def augment_sent(sent, replace_gender):
    """Rewrite the gendered pronouns of `sent` toward `replace_gender`.

    Args:
        sent: Sentence to rewrite.
        replace_gender: 'male' applies the female_to_male table, 'female'
            applies the male_to_female table. Any other value leaves the
            sentence untouched.

    Returns:
        The sentence after all whole-word substitutions. Each table entry is
        applied in three casings: as written, Capitalized and UPPER.
    """
    if replace_gender == 'male':
        swaps = female_to_male
    elif replace_gender == 'female':
        swaps = male_to_female
    else:
        return sent

    for old_word, new_word in swaps.items():
        casings = (
            (old_word, new_word),
            (old_word.capitalize(), new_word.capitalize()),
            (old_word.upper(), new_word.upper()),
        )
        for find, put in casings:
            # \b on both sides restricts the match to whole words.
            sent = re.sub(r'\b{}\b'.format(find), put, sent)

    return sent

In [96]:
# Smoke test mixing neutral, male and female pronouns in several casings.
# NOTE(review): the output recorded below leaves "he", "Him" and "himself" unchanged, which the
# augment_sent defined above (whole-word match, three casings) would convert — it looks like it
# came from an earlier definition; re-run this cell to confirm.
test_sent = 'They THEM THEIRS THEMSELF their test he Him HIS himself test She Her hers HERSELF'
augment_sent(test_sent, 'female')

'They THEM THEIRS THEMSELF their test he Him HER himself test She Her hers HERSELF'

In [117]:
# Example with both a capitalised and a lowercase "he"; both should be swapped.
augment_sent('He was born in Monopoli near Bari, he was orphaned of father and impoverished.\n', 'female')

'She was born in Monopoli near Bari, she was orphaned of father and impoverished.\n'

In [145]:
# Tally how many source sentences are labelled male vs. female.
male_count = 0
female_count = 0

for i, line in enumerate(source):
    gender = is_gendered(line)
    if gender == 'male':
        male_count += 1
    elif gender == 'female':
        female_count += 1
    else:
        # Every line of this corpus is expected to carry a gender label.
        print(f'{i}: not gendered (probably a mistake)')

In [146]:
# Raw counts of male- and female-labelled source sentences.
male_count, female_count

(11809891, 3190109)

In [148]:
# Same counts as a share of all source sentences, rounded to three decimals.
round(male_count / len(source), 3), round(female_count / len(source), 3)

(0.787, 0.213)

In [139]:
%%time
# Build the augmented corpus: every gendered source sentence is kept and
# followed by a copy rewritten toward the opposite gender; the matching
# target sentence is emitted once per copy so indices stay aligned.
augmentation_source = list()
augmentation_target = list()

# Gender to rewrite toward, keyed by the gender detected in the sentence.
opposite_gender = {'male': 'female', 'female': 'male'}

for i, raw_source in enumerate(source):
    gender = is_gendered(raw_source)
    if gender not in opposite_gender:
        print(f'{i}: not gendered (probably a mistake)')
        continue

    sent = clean_sent(raw_source)
    replace_gender = opposite_gender[gender]
    augmentation = augment_sent(sent=sent, replace_gender=replace_gender)

    augmentation_source.append(sent)
    augmentation_source.append(augmentation)

    # Clean the target once and reuse it for both source variants
    # (previously the same line was cleaned twice per iteration).
    cleaned_target = clean_sent(target[i])
    augmentation_target.append(cleaned_target)
    augmentation_target.append(cleaned_target)

CPU times: user 20min 59s, sys: 3.99 s, total: 21min 3s
Wall time: 21min 3s


In [140]:
# Two entries are appended per kept input line (original + swapped), so expect twice the corpus size.
print(len(augmentation_source))
augmentation_source[:10]

30000000


['In 1961, his single "Water Boy" reached No. 40 on the Billboard Hot 100 and stayed on the chart for 14 weeks.\n',
 'In 1961, her single "Water Boy" reached No. 40 on the Billboard Hot 100 and stayed on the chart for 14 weeks.\n',
 'He was born in Monopoli near Bari, he was orphaned of father and impoverished.\n',
 'She was born in Monopoli near Bari, she was orphaned of father and impoverished.\n',
 "He was a member of India's gold-medal winning hockey team at the 2014 Asian Games in Incheon, South Korea.\n",
 "She was a member of India's gold-medal winning hockey team at the 2014 Asian Games in Incheon, South Korea.\n",
 'He was a polyglot, being well versed in Kannada, Marathi and Urdu, Tamil, Telugu besides English.\n',
 'She was a polyglot, being well versed in Kannada, Marathi and Urdu, Tamil, Telugu besides English.\n',
 'Effectively blacklisted for working on a studio project during the 1988 Writers Guild of America strike, he finally got financial backing for a feature from i

In [141]:
# Each target is appended twice so both source variants have a match at the same index.
# NOTE(review): the first recorded target below reads "theirs single", which looks like a
# possessive error in the upstream target file — verify against how the targets were generated.
print(len(augmentation_target))
augmentation_target[:10]

30000000


['In 1961, theirs single "Water Boy" reached No. 40 on the Billboard Hot 100 and stayed on the chart for 14 weeks.\n',
 'In 1961, theirs single "Water Boy" reached No. 40 on the Billboard Hot 100 and stayed on the chart for 14 weeks.\n',
 'They were born in Monopoli near Bari, they were orphaned of father and impoverished.\n',
 'They were born in Monopoli near Bari, they were orphaned of father and impoverished.\n',
 "They were a member of India's gold-medal winning hockey team at the 2014 Asian Games in Incheon, South Korea.\n",
 "They were a member of India's gold-medal winning hockey team at the 2014 Asian Games in Incheon, South Korea.\n",
 'They were a polyglot, being well versed in Kannada, Marathi and Urdu, Tamil, Telugu besides English.\n',
 'They were a polyglot, being well versed in Kannada, Marathi and Urdu, Tamil, Telugu besides English.\n',
 'Effectively blacklisted for working on a studio project during the 1988 Writers Guild of America strike, they finally got financial 

In [142]:
# Persist the augmented source sentences; each entry already ends with its own newline.
with open('/home/jupyter/wikipedia_training/simple_augmentation/simple_augmentation.source', 'w') as f:
    f.writelines(augmentation_source)

In [143]:
# Persist the matching target sentences, in the same order as the source file.
with open('/home/jupyter/wikipedia_training/simple_augmentation/simple_augmentation.target', 'w') as f:
    f.writelines(augmentation_target)

In [132]:
# Make the project's neutral_generation modules importable from this notebook.
# NOTE(review): absolute, machine-specific path — this cell will fail on any other machine.
import sys
sys.path.append('/home/jupyter/tony-sun-intern-project/neutral_generation')
# NOTE(review): the wildcard import hides which names come from `constants` and can silently
# overwrite notebook globals of the same name; prefer importing the needed names explicitly.
from constants import *
from smart_convert import convert

In [133]:
%%time
# Time one call of the project's `convert` on a swapped sentence (recorded result below uses "their").
convert('In 1961, her single "Water Boy" reached No. 40 on the Billboard Hot 100 and stayed on the chart for 14 weeks.\n')

CPU times: user 24 ms, sys: 0 ns, total: 24 ms
Wall time: 170 ms


'In 1961, their single "Water Boy" reached No. 40 on the Billboard Hot 100 and stayed on the chart for 14 weeks.\n'