In [179]:
import json
import os

import numpy as np
import pandas as pd
from gensim.models import Word2Vec

from wefat import WEFAT

### Load Word2Vec models and target words

In [21]:
def create_model_dict(embeddings_path):
    """Load every Word2Vec model saved as a ``.bin`` file in a directory.

    Parameters
    ----------
    embeddings_path : str
        Directory to scan. Only its direct entries are looked at; entries
        whose extension is not ``.bin`` are ignored.

    Returns
    -------
    dict
        Maps each file name without its ``.bin`` extension to the object
        returned by ``Word2Vec.load`` for that file.
    """
    model_dict = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the load
    # order (and the resulting dict order) the same on every machine.
    for file in sorted(os.listdir(embeddings_path)):
        name, ext = os.path.splitext(file)
        if ext != '.bin':
            continue
        print(f"Loading Word2Vec from {file}")
        # Key on the whole stem so names containing extra dots
        # (e.g. "a.v1.bin" / "a.v2.bin") do not overwrite each other.
        model_dict[name] = Word2Vec.load(os.path.join(embeddings_path, file))
    return model_dict

In [22]:
# Relative path of the directory that holds the trained Word2Vec `.bin` files.
embeddings_path = "../data/embeddings"
# Load each model once; the dict is keyed by a corpus name derived from the file name.
model_dict = create_model_dict(embeddings_path)

Loading Word2Vec from blackladies.bin
Loading Word2Vec from braincels.bin
Loading Word2Vec from feminisms.bin
Loading Word2Vec from feminismuncensored.bin
Loading Word2Vec from feminism_2015_2017.bin
Loading Word2Vec from feminism_2017_2019.bin
Loading Word2Vec from feminism_2019_2021.bin
Loading Word2Vec from feminism_2021_2023.bin
Loading Word2Vec from feminism_full.bin
Loading Word2Vec from fireyfemmes.bin
Loading Word2Vec from fourthwavewomen.bin
Loading Word2Vec from incels.bin
Loading Word2Vec from incels_full.bin
Loading Word2Vec from mensrights.bin
Loading Word2Vec from trufemcels.bin
Loading Word2Vec from women.bin


In [None]:
def read_dict_from_file(file_path):
    """Parse a JSON file and return its content.

    Parameters
    ----------
    file_path : str
        Path of a text file containing JSON.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file (a dict when the top
        level of the file is a JSON object).
    """
    # Explicit UTF-8 so decoding does not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Return directly instead of binding a local named `dict`, which
        # shadowed the builtin.
        return json.load(file)

In [None]:
# The file is parsed as JSON despite the .txt extension; calculate_wefat later
# indexes the loaded object by corpus name.
target_word_file = "target_words.txt"
target_words = read_dict_from_file(target_word_file)

### Create embeddings and run WEFAT

In [128]:
def create_embeddings_dict(corpora_model, all_words):
    """Look up the embedding of every word in every word group.

    Parameters
    ----------
    corpora_model : object
        Model whose ``wv`` attribute can be indexed by word.
    all_words : dict
        Maps a group name to an iterable of words.

    Returns
    -------
    dict
        Maps each group name to ``{word: float32 numpy array}``. Words the
        model does not know are reported on stdout and left out.
    """
    embedding_dict = {}
    for key, words in all_words.items():
        embeddings = {}
        for word in words:
            # Only a missing key means "word absent"; any other error
            # (or Ctrl-C) must surface instead of being reported as a
            # vocabulary miss.
            try:
                vector = corpora_model.wv[word]
            except KeyError:
                print(f"{word} is not in the corpus")
                continue
            embeddings[word] = np.array(vector, dtype='float32')
        embedding_dict[key] = embeddings
    return embedding_dict
    

In [161]:
# https://github.com/e-mckinnie/WEAT
# authored by Elizabeth McKinnie, 2022
# from main.py
def test_wefat(embedded_data):
    """Build a WEFAT test from grouped embeddings and return its effect sizes.

    ``embedded_data`` must have the keys 'target', 'attribute_1' and
    'attribute_2'. The target group is handed over as-is, while the two
    attribute groups are reduced to lists of their values.
    """
    targets = embedded_data['target']
    first_attribute_vectors = list(embedded_data['attribute_1'].values())
    second_attribute_vectors = list(embedded_data['attribute_2'].values())
    wefat = WEFAT(targets, first_attribute_vectors, second_attribute_vectors)
    return wefat.all_effect_sizes()

In [None]:
def calculate_wefat(target_words, model_dict, attributes):
    """Compute WEFAT effect sizes for every corpus in ``model_dict``.

    For each corpus the attribute word sets are copied, the corpus's own
    target words are added under 'target', the words are embedded with that
    corpus's model, and the result of ``test_wefat`` is stored.

    Returns a dict mapping corpus name to that corpus's effect sizes.
    """
    def score_corpus(corpus):
        print(f"Corpus: {corpus}")
        # Shallow copy so the shared attribute dict never gains a 'target' key.
        word_sets = attributes.copy()
        word_sets['target'] = target_words[corpus]
        embedded = create_embeddings_dict(model_dict[corpus], word_sets)
        return test_wefat(embedded)

    return {corpus: score_corpus(corpus) for corpus in model_dict.keys()}

In [150]:
# safe/dangerous
safe_dangerous = {
    # attribute_1: the "safe" pole (positive scores, per the sign note further down).
    'attribute_1': [
        "peculiarly", "hill", "terminology", "childhood", "accord",
        "cotton", "sleeping", "nap", "calm", "softness",
        "natural", "serenity", "reassurance", "peace", "angel",
        "harmonious", "honest", "blessing", "trustworthy", "wisdom",
    ],
    # attribute_2: the "dangerous" pole (negative scores).
    'attribute_2': [
        "homicide", "wrath", "terrifying", "poisonous", "masochism",
        "obsessive", "acid", "claw", "hospital", "weld",
        "bankruptcy", "suffering", "mistreated", "abused", "misbehave",
        "suicidal", "chaos", "toxic", "bloodbath", "killer",
        "murderer", "assassinate", "earthquake", "missile", "firearm",
        "firing", "firearms",
    ],
}

In [151]:
# weak/powerful
weak_powerful = {
    # attribute_1: the "weak" pole (positive scores, per the sign note further down).
    'attribute_1': [
        "disagreeable", "shitload", "deceased", "abandoned", "depressed",
        "decay", "penniless", "sorrow", "feeble", "weak",
        "void", "idle", "nothing", "slow", "yawn",
        "tiny", "small", "meek", "sofa", "empty",
        "inactivity", "emptiness", "dryness", "drool", "bladder",
    ],
    # attribute_2: the "powerful" pole (negative scores).
    'attribute_2': [
        "rearrange", "superintendent", "desire", "immense", "passion",
        "excite", "superpower", "champion", "almighty", "success",
        "generous", "perfect", "freedom", "excellence", "prestigious",
        "winning", "greatness", "triumph", "victorious", "mighty",
        "conquering", "conquer", "adrenaline", "intensity", "competitor",
        "wrestler", "warrior", "dominant", "exorbitant",
    ],
}

In [163]:
# Score each corpus's target words against the safe (attribute_1) / dangerous (attribute_2) word sets.
safe_dangerous_wefat = calculate_wefat(target_words, model_dict, safe_dangerous)

Corpus: blackladies
peculiarly is not in the corpus
masochism is not in the corpus
mistreated is not in the corpus
bloodbath is not in the corpus
firearms is not in the corpus
normie is not in the corpus
Corpus: braincels
peculiarly is not in the corpus
onlyfans is not in the corpus
Corpus: feminisms
peculiarly is not in the corpus
hill is not in the corpus
terminology is not in the corpus
cotton is not in the corpus
sleeping is not in the corpus
nap is not in the corpus
softness is not in the corpus
serenity is not in the corpus
reassurance is not in the corpus
angel is not in the corpus
harmonious is not in the corpus
blessing is not in the corpus
trustworthy is not in the corpus
wisdom is not in the corpus
wrath is not in the corpus
terrifying is not in the corpus
poisonous is not in the corpus
masochism is not in the corpus
obsessive is not in the corpus
acid is not in the corpus
claw is not in the corpus
weld is not in the corpus
bankruptcy is not in the corpus
suffering is not in

In [164]:
# Score each corpus's target words against the weak (attribute_1) / powerful (attribute_2) word sets.
weak_powerful_wefat = calculate_wefat(target_words, model_dict, weak_powerful)

Corpus: blackladies
disagreeable is not in the corpus
abandoned is not in the corpus
penniless is not in the corpus
inactivity is not in the corpus
almighty is not in the corpus
conquering is not in the corpus
normie is not in the corpus
Corpus: braincels
onlyfans is not in the corpus
Corpus: feminisms
disagreeable is not in the corpus
shitload is not in the corpus
deceased is not in the corpus
abandoned is not in the corpus
depressed is not in the corpus
decay is not in the corpus
penniless is not in the corpus
sorrow is not in the corpus
feeble is not in the corpus
void is not in the corpus
idle is not in the corpus
yawn is not in the corpus
meek is not in the corpus
sofa is not in the corpus
inactivity is not in the corpus
emptiness is not in the corpus
dryness is not in the corpus
drool is not in the corpus
bladder is not in the corpus
rearrange is not in the corpus
superintendent is not in the corpus
immense is not in the corpus
passion is not in the corpus
excite is not in the co

### Explore results

In [169]:
# safe/dangerous: > 0 => safe, < 0 => dangerous
# weak/powerful: > 0 => weak, < 0 => powerful
# in general, association with attribute 1 is > 0, with attribute 2 is < 0
def get_top_n_words(word_dict, n, is_attribute_1):
    """Return the ``n`` entries of ``word_dict`` at one end of the score range.

    Entries are ordered by value: descending when ``is_attribute_1`` is true
    (largest scores first), ascending otherwise (smallest scores first). The
    result is a dict that keeps that ordering.
    """
    def score(entry):
        return entry[1]

    ranked = sorted(word_dict.items(), key=score, reverse=is_attribute_1)
    return dict(ranked[:n])


In [170]:
# Five highest-scoring target words for the 'feminism_full' corpus (True => sort descending).
get_top_n_words(safe_dangerous_wefat['feminism_full'], 5, True)

{'love': 0.7123157,
 'gender': 0.49187315,
 'queer': 0.34608305,
 'feminine': 0.34277618,
 'personality': 0.29224607}

In [189]:
# fills dictionary so every word has a value (None if not in dictionary)
# and returns dictionary as dataframe
def fill_dict(wefat_dict, potential_words):
    """Pad every corpus's scores to a common word list and return a DataFrame.

    Parameters
    ----------
    wefat_dict : dict
        Maps corpus name to a ``{word: score}`` mapping. It is not modified.
    potential_words : iterable of str
        Words that must be present for every corpus.

    Returns
    -------
    pandas.DataFrame
        Built with ``DataFrame.from_dict`` from the padded mappings: one
        column per corpus, one row per word. Words a corpus had no score for
        are filled in with ``None``.
    """
    filled = {}
    for corpus, scores in wefat_dict.items():
        # Pad a copy: writing the None placeholders into the caller's dict
        # would break later numeric sorting of those scores and make this
        # cell behave differently on a second run.
        padded = dict(scores)
        for word in potential_words:
            padded.setdefault(word, None)
        filled[corpus] = padded

    return pd.DataFrame.from_dict(filled)
    

In [None]:
# Words every corpus should have a row for in the exported tables.
potential_words = "chad, ugly, virgin, incel, normie, beta, abortion, sexist, feminist, gender, trans, whore, virgin, rape, birth_control, porn, sexual_assault, love, abuse, prostitution, slut_shaming, onlyfans, bodily_autonomy, taylor_swift, hillary_clinton, cuck, consent, period, sex, drag, gay, queer, pleasure, man, woman, male, female, feminine, masculine, foid, femoid, marginalized, black, white, vagina, menstruation, girl, boy, guy, becky, stacy, karen, femcel, slut, promiscuous, short, attractive, personality"
# Converting to a set collapses the repeated "virgin" entry into one.
potential_words = set(potential_words.split(", "))
# One table per attribute pair: words as rows, corpora as columns.
weak_powerful_df = fill_dict(weak_powerful_wefat, potential_words)
safe_dangerous_df = fill_dict(safe_dangerous_wefat, potential_words)

### Save dataframes to csv

In [190]:
# Write both tables to the current working directory, keeping the word index and the corpus header row.
weak_powerful_df.to_csv('weak_powerful_wefat.csv', index=True, header=True)
safe_dangerous_df.to_csv('safe_dangerous_wefat.csv', index=True, header=True)