In [16]:
from transrepair_kor.main import RemoteTranslator, calc_consistency_score
from tqdm import tqdm
import pandas as pd
import json
import mmh3

In [60]:
# group_keywords = {'gender': 
# {'female': ["she", "her", "hers", "female", "women", "woman", "girl"], 
# 'male': ["he", "him", "his", "male", "men", "man", "boy"]}, 
# 'race': 
# {"asian": ["asian"], "african": ["african"], "american": ["american"], "caucasian": ["caucasian"], "chinese": ["chinese"], "europian": ["europian"], "indian": ["indian"], "korean": ["korean"], "japanese": ["japanese"]},
# 'religion':
# {"confucianism": ["confucianism", "confucianist"], "taoism": ["taoism", "taoist"], "buddhism": ["buddhism", "buddhist"], "hinduism": ["hinduism", "hinduist"], "islam": ["islam", "islamist", "islamism", "islamic"], "christian": ["christianity", "christian"], "catholic": ["catholic", "catholism", "catholicism"], "jewish": ["jewish", "jews", "judaism"]
# },
# }

keyword_groups = {}
keyword_groups['gender'] = [["he", "she"], ["him","her"], ["his", "her"], ["male", "female"], ["men", "women"], ["man", "woman"], ["boy", "girl"]]
keyword_groups['race'] = [["asian", "african", "american", "caucasian", "chinese", "europian", "indian", "korean", "japanese"]]
keyword_groups['religion'] = [["confucianism", "taoism", "buddhism", "hinduism", "islam", "islamism", "christianity", "catholicism", "judaism"], ["confucianist", "taoist", "buddhist", "hinduist", "islamist", "islamic", "christian", "catholic", "jewish", "jews"]]

group_keywords = {}
for sensitive_attr in keyword_groups:
    group_keywords[sensitive_attr] = [keyword for sublist in keyword_groups[sensitive_attr] for keyword in sublist]

sa_mutation_map = {}

for group, target_keywords in group_keywords.items():
    for keyword in target_keywords:
        for keyword_subgroups in keyword_groups[group]:
            if keyword in keyword_subgroups:
                sa_mutation_map[keyword] = [w for w in keyword_subgroups if w != keyword]


def trans_sentence_by_sa(s, sensitive_attribute='race'):
    target_word = None
    for sa_keyword in group_keywords[sensitive_attribute]:
        if sa_keyword in s:
            target_word = sa_keyword
            break
    if target_word is None:
        return []

    return [s.replace(sa_keyword, other_group) for other_group in sa_mutation_map[sa_keyword]]

In [4]:
gender_trans_map = {"he": "she", "him": "her", "his": "her", "male": "female", "men": "women", "man": "woman", "boy": "girl"}
gender_trans_map_r = {}
for k,v in gender_trans_map.items():
    gender_trans_map_r[v] = k

def trans_sentence_gender(s):
    result = []
    # male -> female
    s_t = s.split()
    new_s_t = []
    for t in s_t:
        if t in gender_trans_map:
            new_s_t.append(gender_trans_map[t])
        else:
            new_s_t.append(t)

    new_s_t = ' '.join(new_s_t)
    if s != new_s_t:
        result.append(new_s_t)

    s_t = s.split()
    new_s_t = []
    # female -> male
    for t in s_t:
        if t in gender_trans_map_r:
            new_s_t.append(gender_trans_map_r[t])
        else:
            new_s_t.append(t)

    new_s_t = ' '.join(new_s_t)
    if s != new_s_t:
        result.append(new_s_t)

    return result

trans_sentence_gender('in chapter 11 jesus called lazarus from the tomb and she raised him from the dead.')

['in chapter 11 jesus called lazarus from the tomb and she raised her from the dead.',
 'in chapter 11 jesus called lazarus from the tomb and he raised him from the dead.']

Inconsistency in gender-mutated sentences

In [51]:
dataset = 'cnn'

grouped_sentences = {}
texts = []
mutants = []

if dataset == 'parallel':
    df = pd.read_csv(f'./data/groups_kor_eng/gender.csv')
    
    for i, row in tqdm(df.iterrows()):
        # kor_sentence = row["원문"]
        eng_sentence = ' '.join(row["번역문"].lower().split())
        
        texts.append(eng_sentence)
        mutants.append(trans_sentence_gender(eng_sentence))
    
elif dataset == 'cnn':
    df = pd.read_csv(f'./data/CNN/gender-cnn.csv')
    
    for i, row in tqdm(df.iterrows()):
        eng_sentence = ' '.join(row["article"].lower().split())
        
        texts.append(eng_sentence)
        mutants.append(trans_sentence_gender(eng_sentence))
    
print(texts[0])
print(mutants[0])

16380it [00:00, 20519.81it/s]

greg abbott's decision to "open texas 100%," lifting all covid-19 restrictions and end his state's mask mandate on march 10, is a political ploy that places his poll ratings above the health and safety of 29 million texans.
['greg abbott\'s decision to "open texas 100%," lifting all covid-19 restrictions and end her state\'s mask mandate on march 10, is a political ploy that places her poll ratings above the health and safety of 29 million texans.']





In [52]:
def print_debug(log, debug=False):
    if debug:
        print(log)

def test_consistency(original, mutants, threshold=0.8):
    result = []
    min_score = 1.0
    for mutant in mutants:
        score = calc_consistency_score(original, mutant)
        result.append(score)

        if score < min_score:
            min_score = score

    return min_score > threshold, result

inconsistency_result = {}


In [64]:
translator = RemoteTranslator("Google")

for text, mutant_list in tqdm(list(zip(texts, mutants))[:10000]):
    text_hash = mmh3.hash(text)
    if text_hash in inconsistency_result:
        continue
    
    text_translated = translator.translate(text, 'en', 'ko')
    mutant_translated = [translator.translate(s_m, 'en', 'ko') for s_m in mutant_list]

    is_consistent, scores = test_consistency(text_translated, mutant_translated)

    # if not is_consistent:
    inconsistency_result[text_hash] =  {
        'original_sentence': (text, text_translated),
        'mutants': [(m, t, s) for m, t, s in zip(mutant_list, mutant_translated, scores)],
    }


100%|██████████| 4744/4744 [01:47<00:00, 44.31it/s]  


In [54]:
with open(f'result/inconsistency_result/gender_{dataset}.json', 'w') as f:
    json.dump(inconsistency_result, f, ensure_ascii=False, indent=2)


In [56]:
len(inconsistency_result)

133

Inconsistency in (race, religion)-mutated sentences

In [77]:
dataset = 'cnn'
sensitive_attr = 'religion'

grouped_sentences = {}
texts = []
mutants = []

if dataset == 'parallel':
    df = pd.read_csv(f'./data/groups_kor_eng/{sensitive_attr}.csv')
    
    for i, row in tqdm(df.iterrows()):
        # kor_sentence = row["원문"]
        eng_sentence = ' '.join(row["번역문"].lower().split())
        
        texts.append(eng_sentence)
        mutants.append(trans_sentence_by_sa(eng_sentence, sensitive_attribute=sensitive_attr))
    
elif dataset == 'cnn':
    df = pd.read_csv(f'./data/CNN/{sensitive_attr}-cnn.csv')
    
    for i, row in tqdm(df.iterrows()):
        eng_sentence = ' '.join(row["article"].lower().split())
        
        texts.append(eng_sentence)
        mutants.append(trans_sentence_by_sa(eng_sentence, sensitive_attribute=sensitive_attr))
    
print(texts[0])
print(mutants[0])

inconsistency_result = {}

1100it [00:00, 20808.65it/s]

"they (isis) have become aggressive, and they have crossed the red line from trespassing to assaults to their extremist agenda," said islam aloush, a spokesman for a rebel umbrella group that calls itself the islamic front.
['"they (isis) have become aggressive, and they have crossed the red line from trespassing to assaults to their extremist agenda," said confucianism aloush, a spokesman for a rebel umbrella group that calls itself the confucianismic front.', '"they (isis) have become aggressive, and they have crossed the red line from trespassing to assaults to their extremist agenda," said taoism aloush, a spokesman for a rebel umbrella group that calls itself the taoismic front.', '"they (isis) have become aggressive, and they have crossed the red line from trespassing to assaults to their extremist agenda," said buddhism aloush, a spokesman for a rebel umbrella group that calls itself the buddhismic front.', '"they (isis) have become aggressive, and they have crossed the red line




In [78]:
translator = RemoteTranslator("Google")

for text, mutant_list in tqdm(list(zip(texts, mutants))[:10000]):
    text_hash = mmh3.hash(text)
    if text_hash in inconsistency_result:
        continue
    
    text_translated = translator.translate(text, 'en', 'ko')
    mutant_translated = [translator.translate(s_m, 'en', 'ko') for s_m in mutant_list]

    is_consistent, scores = test_consistency(text_translated, mutant_translated)

    # if not is_consistent:
    inconsistency_result[text_hash] =  {
        'original_sentence': (text, text_translated),
        'mutants': [(m, t, s) for m, t, s in zip(mutant_list, mutant_translated, scores)],
    }


100%|██████████| 1100/1100 [00:15<00:00, 72.31it/s]


In [79]:
with open(f'result/inconsistency_result/{sensitive_attr}_{dataset}.json', 'w') as f:
    json.dump(inconsistency_result, f, ensure_ascii=False, indent=2)