In [1]:
pwd

'/private/home/ccaucheteux/wordemb-gender-bias'

In [2]:
from pathlib import Path
import fasttext
import numpy as np

In [3]:
path_to_fasttext_models = Path("../socio_wordvec/notebooks")

# Functions

In [4]:
def similarity(x, y):
    sim = np.dot(x, y)/(np.sqrt(np.dot(x, x))*np.sqrt(np.dot(y, y)))
    return sim

def score(attribute, target_set, model):
    sim = [similarity(model.get_word_vector(attribute), model[target]) for target in target_set]
    avg_sim = np.mean(sim)
    return avg_sim

def association(w, A, B, model):
    return score(w, A, model) - score(w, B, model)

def weat_score(X, Y, A, B, model):
    ass_x = [association(w, A, B, model) for w in X]
    ass_y = [association(w, A, B, model) for w in Y]
    
    weat = np.mean(ass_x) - np.mean(ass_y)
    weat_es = weat / np.std(ass_x + ass_y)
    
    return weat, weat_es

def load_model(model_name):
    model_path = path_to_fasttext_models / model_name
    assert Path(model_path).is_file(), f"{model_path} does not exist"
    print("Loading model from ", model_path)
    ft = fasttext.load_model(str(model_path))
    return ft

# Test with english model

In [5]:
%time
lang = "en"
model_name = f"cc.{lang}.300.bin" # the model file (should be the same, ending by '.bin')
model = load_model(model_name)

CPU times: user 0 ns, sys: 2 µs, total: 2 µs
Wall time: 5.72 µs
Loading model from  ../socio_wordvec/notebooks/cc.en.300.bin




In [6]:
A = ["man"]
B = ["woman"]
X = ["career"]
Y = ["wedding"]

weat, weat_es = weat_score(X, Y, A, B, model)
weat, weat_es

(0.059203893, 2.0)

In [7]:
A = ["man"]
B = ["woman"]
X = ["career"]
Y = ["family"]

weat, weat_es = weat_score(X, Y, A, B, model)
weat, weat_es

(0.0009842217, 2.0)

In [8]:
A = ["man"]
B = ["woman"]
X = ["career"]
Y = ["children"]

weat, weat_es = weat_score(X, Y, A, B, model)
weat, weat_es

(0.08110222, 2.0)

In [9]:
x = model ["man"]
y = model ["children"]
similarity(x,y)

0.2529125

In [10]:
A = ["man", "he"]
B = ["woman", "she"]
X = ["salary", "career"]
Y = ["home", "wedding"]

weat, weat_es = weat_score(X, Y, A, B, model)
weat, weat_es

(0.09027621, 1.7388039)

In [11]:
A = ["male", "man", "boy", "brother", "he", "him", "his", "son"]
B = ["female", "woman", "girl", "sister", "she", "her", "hers", "daughter"]
X = ["executive", "management", "professional", "corporation", "salary", "office", "business", "career"]
Y = ["home", "parents", "children", "family", "cousins", "marriage", "wedding", "relatives"]

weat, weat_es = weat_score(X, Y, A, B, model)
weat, weat_es

(0.042019784, 1.359282)

In [12]:
X = ["male", "man", "boy", "brother", "he", "him", "his", "son"]
Y = ["female", "woman", "girl", "sister", "she", "her", "hers", "daughter"]
A = ["executive", "management", "professional", "corporation", "salary", "office", "business", "career"]
B = ["home", "parents", "children", "family", "cousins", "marriage", "wedding", "relatives"]

weat, weat_es = weat_score(X, Y, A, B, model)
weat, weat_es

(0.042019784, 0.39894468)

In [13]:
A = ["man"]
B = ["woman"]
X = ["career", "management"]
Y = ["wedding", "parents"]

weat, weat_es = weat_score(X, Y, A, B, model)
weat, weat_es

(0.08375169, 1.7685586)

In [14]:
%time
lang="en" # Select the language here, english. 
model_name = f"cc.{lang}.300.bin" # the model file (should be the same, ending by '.bin')
model = load_model(model_name)

CPU times: user 1 µs, sys: 2 µs, total: 3 µs
Wall time: 7.39 µs
Loading model from  ../socio_wordvec/notebooks/cc.en.300.bin




In [15]:
A = ["male", "man", "boy", "brother", "he", "him", "his", "son"]
B = ["female", "woman", "girl", "sister", "she", "her", "hers", "daughter"]
X = ["executive", "management", "professional", "corporation", "salary", "office", "business", "career"]
Y = ["home", "parents", "children", "family", "cousins", "marriage", "wedding", "relatives"]

weat, weat_es = weat_score(X, Y, A, B, model)
weat, weat_es

(0.042019784, 1.359282)

In [16]:
X = ["male", "man", "boy", "brother", "he", "him", "his", "son"]
Y = ["female", "woman", "girl", "sister", "she", "her", "hers", "daughter"]
A = ["executive", "management", "professional", "corporation", "salary", "office", "business", "career"]
B = ["home", "parents", "children", "family", "cousins", "marriage", "wedding", "relatives"]

weat, weat_es = weat_score(X, Y, A, B, model)
weat, weat_es

(0.042019784, 0.39894468)

# Generalize to other languages

## Functions

In [17]:
from deep_translator import GoogleTranslator
def run_exp_google_tr(lang, X, Y, A, B):
    model_name = f"cc.{lang}.300.bin" # the model file (should be the same, ending by '.bin')
    model = load_model(model_name)

    # Traduction
    trs = {}
    for name, words in zip(["A", "B", "X", "Y"], [A, B, X, Y]):
        out = []
        for w in words:
            tw = GoogleTranslator(source='en', target=lang).translate(text=w)
            tw = tw.strip()
            out.append(tw)
        print(name, out)
        trs[name] = out

    weat, weat_es = weat_score(trs["X"], 
                               trs["Y"], 
                               trs["A"], 
                               trs["B"], 
                               model)
    return weat, weat_es, trs

## Example for italian

In [18]:
X = ["male", "man", "boy", "brother", "he", "him", "his", "son"]
Y = ["female", "woman", "girl", "sister", "she", "her", "hers", "daughter"]
A = ["executive", "management", "professional", "corporation", "salary", "office", "business", "career"]
B = ["home", "parents", "children", "family", "cousins", "marriage", "wedding", "relatives"]

weat, weat_es, translations = run_exp_google_tr("it", X, Y, A, B)

print("weat", weat)
print("weat_es", weat_es)
print("translations", translations)

Loading model from  ../socio_wordvec/notebooks/cc.it.300.bin




A ['esecutivo', 'gestione', 'professionale', 'società', 'stipendio', 'ufficio', 'attività commerciale', 'carriera']
B ['casa', 'genitori', 'bambini', 'famiglia', 'cugini', 'matrimonio', 'nozze', 'parenti']
X ['maschio', 'uomo', 'ragazzo', 'fratello', 'lui', 'lui', 'la sua', 'figlio']
Y ['femmina', 'donna', 'ragazza', 'sorella', 'lei', 'sua', 'la sua', 'figlia']
weat 0.0123478845
weat_es 0.12605372
translations {'A': ['esecutivo', 'gestione', 'professionale', 'società', 'stipendio', 'ufficio', 'attività commerciale', 'carriera'], 'B': ['casa', 'genitori', 'bambini', 'famiglia', 'cugini', 'matrimonio', 'nozze', 'parenti'], 'X': ['maschio', 'uomo', 'ragazzo', 'fratello', 'lui', 'lui', 'la sua', 'figlio'], 'Y': ['femmina', 'donna', 'ragazza', 'sorella', 'lei', 'sua', 'la sua', 'figlia']}


## All languages

Select languages

In [20]:
all_languages = ["af", "sq", "ar", "hy", "az", "be", "bn", "bs", "bg", "my", "hr", "cs", "da", "nl", "en", "et", "fi", "fr", "gl", "ka", "de", "el", "hi", "hu", "is", "id", "ga", "it", "ja", "kk", "km", "ky", "ko", "ku", "lv", "lt", "lb", "mk", "mg", "ms", "mt", "ne", "no", "ps", "fa", "pl", "pt", "qu", "ro", "ru", "gd", "sd", "sr", "sl", "sk", "so", "es", "th", "vi", "uk", "uz", "he", "sv", "tl", "tr", "ur", "zh", "th", "vi", "uk", "uz", "he", "sv", "tl", "tr", "ur", "zh"]


Check languages that are compatible with translations

In [21]:
tr = GoogleTranslator()
all_google_languages = tr.get_supported_languages(as_dict=True)
all_google_languages = all_google_languages.values()
missing_google_languages = [x for x in all_languages if x not in all_google_languages]
print(f"Missing translations for lang {missing_google_languages}")

Missing translations for lang ['he', 'zh', 'he', 'zh']


Check that models exists

In [22]:
all_models = path_to_fasttext_models.glob("cc.*.bin")
all_models = [p.stem.split(".")[1] for p in all_models]
missing_models = [l for l in all_languages if l not in all_models]
print(f"Missing models for lang {missing_models}")

Missing models for lang []


Select intersection

In [23]:
sel_languages = [lang for lang in all_languages if (lang in all_models) and lang in all_google_languages]
print(f"{len(sel_languages)} languages out of {len(all_languages)}")

73 languages out of 77


## Run on cluster

In [26]:
def submit_job(lang, reduced=False):
    if reduced:
        A = ["man", "he"]
        B = ["woman", "she"]
        X = ["salary", "career"]
        Y = ["home", "wedding"]
    else:
        X = ["male", "man", "boy", "brother", "he", "him", "his", "son"]
        Y = ["female", "woman", "girl", "sister", "she", "her", "hers", "daughter"]
        A = ["executive", "management", "professional", "corporation", "salary", "office", "business", "career"]
        B = ["home", "parents", "children", "family", "cousins", "marriage", "wedding", "relatives"]
    weat, weat_es, translations = run_exp_google_tr(lang, X, Y, A, B)
    return weat, weat_es, translations

In [27]:
weat, weat_es, translations = submit_job("it")

Loading model from  ../socio_wordvec/notebooks/cc.it.300.bin




A ['esecutivo', 'gestione', 'professionale', 'società', 'stipendio', 'ufficio', 'attività commerciale', 'carriera']
B ['casa', 'genitori', 'bambini', 'famiglia', 'cugini', 'matrimonio', 'nozze', 'parenti']
X ['maschio', 'uomo', 'ragazzo', 'fratello', 'lui', 'lui', 'la sua', 'figlio']
Y ['femmina', 'donna', 'ragazza', 'sorella', 'lei', 'sua', 'la sua', 'figlia']


In [28]:
import submitit
from submitit import AutoExecutor
executor = AutoExecutor(
            f"submitit_jobs/submitit_jobs/wordemb_bias")
executor.update_parameters(
    slurm_partition="devlab",
    slurm_array_parallelism=200,
    timeout_min=60,
    name="wordemb_bias",
    mem_gb=40,
    cpus_per_task=2,
    gpus_per_node=0,
)

jobs_all_words = executor.map_array(submit_job, all_languages)

In [29]:
jobs_reduced = executor.map_array(submit_job, all_languages, [True]*len(all_languages))

## Load results

In [43]:
jobs = jobs_reduced
is_completed = np.array([j.state=="COMPLETED" for j in jobs])
is_failed = np.array([j.state=="FAILED" for j in jobs])
print(f"{is_failed.sum()} FAILED jobs, {is_completed.sum()} COMPLETED out of {len(jobs)} jobs")
print(f"FAILED languages: {np.array(all_languages)[is_failed]}")
langs = np.array(all_languages)[is_completed]
results = [j.results()[0] for (i, j) in enumerate(jobs) if is_completed[i]]

4 FAILED jobs, 72 COMPLETED out of 77 jobs
FAILED languages: ['he' 'zh' 'he' 'zh']


In [44]:
import pandas as pd
weat = [r[0] for r in results]
weat_std = [r[1] for r in results]
trads = {k: [r[-1][k] for r in results] for k in ["A", "B", "X", "Y"]}

In [45]:
df = pd.DataFrame({"lang": langs, "weat": weat, "weat_std": weat_std, **trads})

In [46]:
print(df.to_csv())

,lang,weat,weat_std,A,B,X,Y
0,af,0.015783235,0.43804803,"['man', 'hy']","['vrou', 'sy']","['salaris', 'loopbaan']","['tuis', 'troue']"
1,sq,0.10622085,1.9631861,"['njeri', 'ai']","['grua', 'ajo']","['paga', 'karrierës']","['shtëpi', 'dasma']"
2,ar,0.06379421,0.94665915,"['رجل', 'هو']","['النساء', 'هي']","['راتب', 'مسار مهني مسار وظيفي']","['الصفحة الرئيسية', 'حفل زواج']"
3,hy,0.0022929087,0.076396756,"['մարդ', 'նա']","['կին', 'նա']","['աշխատավարձ', 'կարիերա']","['տուն', 'հարսանիք']"
4,az,0.005019672,0.15148214,"['adam', 'o']","['qadın', 'o']","['maaş', 'karyera']","['ev', 'toy']"
5,be,0.029921457,0.59501183,"['чалавек', 'ён']","['жанчына', 'яна']","['зарплата', ""кар'ера""]","['дадому', 'вяселле']"
6,bn,0.074292004,1.3269458,"['মানুষ', 'তিনি']","['মহিলা', 'সে']","['বেতন', 'কর্মজীবন']","['বাড়ি', 'বিবাহ']"
7,bs,-0.10816908,-1.9941343,"['covece', 'on']","['zena', 'ona']","['plata', 'karijera']","['Dom', 'vjenčanje']"
8,bg,-0.08184309,-0.895407,"['човек', 'той']","['жена', 'тя']","['запла

## Download missing models

In [None]:
import fasttext.util
sel_languages = "af", "sq", "ar", "hy", "az", "be", "bn", "bs", "bg", "my", "hr", "cs", "da", "nl", "en", "et", "fi", "fr", "gl", "ka", "de", "el", "hi", "hu", "is", "id", "ga", "it", "ja", "kk", "km", "ky", "ko", "ku", "lv", "lt", "lb", "mk", "mg", "ms", "mt", "ne", "no", "ps", "fa", "pl", "pt", "qu", "ro", "ru", "gd", "sd", "sr", "sl", "sk", "so", "es", "th", "vi", "uk", "uz", "lo", "he", "sv", "tl", "tr", "ur", "zh", "zu", "th", "vi", "uk", "uz", "lo", "he", "sv", "tl", "tr", "ur", "zh", "zu"
for lang in sel_languages:
    print(lang)
    fasttext.util.download_model(lang, if_exists='ignore') #'en', if_exists='ignore')  # English