In [179]:
import json
import os

import numpy as np
import pandas as pd
from gensim.models import Word2Vec

from weat import WEAT
from wefat import WEFAT

### Load Word2Vec models

In [21]:
def create_model_dict(embeddings_path):
    """Load every saved Word2Vec model (``*.bin``) found in a directory.

    Parameters
    ----------
    embeddings_path : str
        Directory that contains the saved models.

    Returns
    -------
    dict
        Maps each file name without its ``.bin`` extension to the object
        returned by ``Word2Vec.load``. Files with any other extension are
        ignored.
    """
    model_dict = {}
    # os.listdir returns entries in arbitrary order; sort so the load order
    # (and the printed log) is the same on every run and platform.
    for file in sorted(os.listdir(embeddings_path)):
        name, ext = os.path.splitext(file)
        if ext != '.bin':
            continue
        print(f"Loading Word2Vec from {file}")
        # Key on the splitext stem so names containing extra dots
        # (e.g. "corpus.v2.bin") stay distinct instead of collapsing to "corpus".
        model_dict[name] = Word2Vec.load(os.path.join(embeddings_path, file))
    return model_dict

In [22]:
# Load every saved model in the embeddings directory. Later cells look models
# up in model_dict by file name without the extension (e.g. 'feminism_full').
embeddings_path = "../data/embeddings"
model_dict = create_model_dict(embeddings_path)

Loading Word2Vec from blackladies.bin
Loading Word2Vec from braincels.bin
Loading Word2Vec from feminisms.bin
Loading Word2Vec from feminismuncensored.bin
Loading Word2Vec from feminism_2015_2017.bin
Loading Word2Vec from feminism_2017_2019.bin
Loading Word2Vec from feminism_2019_2021.bin
Loading Word2Vec from feminism_2021_2023.bin
Loading Word2Vec from feminism_full.bin
Loading Word2Vec from fireyfemmes.bin
Loading Word2Vec from fourthwavewomen.bin
Loading Word2Vec from incels.bin
Loading Word2Vec from incels_full.bin
Loading Word2Vec from mensrights.bin
Loading Word2Vec from trufemcels.bin
Loading Word2Vec from women.bin


### Functions to create embeddings and run WEAT and WEFAT

In [128]:
def create_embeddings_dict(corpora_model, all_words):
    """Look up the embedding of every word in each word group.

    Parameters
    ----------
    corpora_model
        Object exposing a ``wv`` mapping from word to vector (the notebook
        passes the loaded Word2Vec models).
    all_words : dict
        Maps a group name (e.g. ``'target_1'``, ``'attribute_1'``) to an
        iterable of words.

    Returns
    -------
    dict
        Maps each group name to a ``{word: float32 numpy array}`` dict. Words
        missing from the model are reported on stdout and left out, so a group
        may end up with fewer entries than words requested.
    """
    embedding_dict = {}
    for key, words in all_words.items():
        embeddings = {}
        for word in words:
            try:
                embeddings[word] = np.array(corpora_model.wv[word], dtype='float32')
            except KeyError:
                # Only an out-of-vocabulary lookup is expected here; any other
                # error (including KeyboardInterrupt) must not be reported as
                # a missing word.
                print(f"{word} is not in the corpus")
        embedding_dict[key] = embeddings
    return embedding_dict
    

In [129]:
# https://github.com/e-mckinnie/WEAT
# authored by Elizabeth McKinnie, 2022
# from main.py
def test_weat(embedded_data, iterations, distribution_type):
    test = WEAT(list(embedded_data['target_1'].values()), list(embedded_data['target_2'].values()), list(embedded_data['attribute_1'].values()), list(embedded_data['attribute_2'].values()))

    d = test.effect_size()
    print(f'\teffect size: {d}')

    p_value = test.p_value(iterations, distribution_type)
    print(f'\tp_value: {p_value}')

In [161]:
# https://github.com/e-mckinnie/WEAT
# authored by Elizabeth McKinnie, 2022
# from main.py
def test_wefat(embedded_data):
    test = WEFAT(embedded_data['target'], list(embedded_data['attribute_1'].values()), list(embedded_data['attribute_2'].values()))
    return test.all_effect_sizes()

### Experiment 1: All incel subreddits vs r/feminism

In [89]:
# Target groups (target_1 / target_2) and positive / negative attribute words.
# NOTE(review): `all_words` is reassigned twice more in the following cells
# before it is first passed to create_embeddings_dict, so on a top-to-bottom
# run this version is never used.
all_words = {'target_1': ["feminist", "girl", "woman"], 
             'target_2': ["incel", "boy", "man"],
             'attribute_1': ["nice", "beautiful", "cheerful", "wonderful"],
             'attribute_2': ["mean", "ugly", "rude", "awful"]}

In [116]:
# safe (attribute_1) vs. dangerous (attribute_2)
# Words that are not in the corpora were replaced by a related form:
# napping -> nap
# harmoniously -> harmonious
# missiles -> missile
# NOTE(review): attribute_2 contains "homosexual", which the WEFAT
# safe/dangerous list later in the notebook omits -- confirm which is intended.
# NOTE(review): the next cell reassigns `all_words` before it is used.
all_words = {'target_1': ["feminist", "girl", "woman"], 
             'target_2': ["incel", "boy", "man"],
             'attribute_1': ["peculiarly", "hill", "terminology", "childhood", "accord", "cotton", "sleeping", "nap", "calm", "softness", "natural", "serenity", "reassurance", "peace", "angel", "harmonious", "honest", "blessing", "trustworthy", "wisdom"],
             'attribute_2': ["homicide", "wrath", "terrifying", "poisonous", "masochism", "obsessive", "homosexual", "acid", "claw", "hospital", "weld", "bankruptcy", "suffering", "mistreated", "abused", "misbehave", "suicidal", "chaos", "toxic", "bloodbath", "killer", "murderer", "assassinate", "earthquake", "missile", "firearm", "firing", "firearms"]}

In [111]:
# weak (attribute_1) vs. powerful (attribute_2)
# Words replaced by a related form:
# decayed -> decay
# nothingness -> nothing
# rearrangement -> rearrange
# desiring -> desire
# excitability -> excite
# This is the last assignment of `all_words` before the embeddings cell below,
# so it is the version used there on a top-to-bottom run.
all_words = {'target_1': ["feminist", "girl", "woman"], 
             'target_2': ["incel", "boy", "man"],
             'attribute_1': ["disagreeable", "shitload", "deceased", "abandoned", "depressed", "decay", "penniless", "sorrow", "feeble", "weak", "void", "idle", "nothing", "slow", "yawn", "tiny", "small", "meek", "sofa", "empty", "inactivity", "emptiness", "dryness", "drool", "bladder"],
             'attribute_2': ["rearrange", "superintendent", "desire", "immense", "passion", "excite", "superpower", "champion", "almighty", "success", "generous", "perfect", "freedom", "excellence", "prestigious", "winning", "greatness", "triumph", "victorious", "mighty", "conquering", "conquer", "adrenaline", "intensity", "competitor", "wrestler", "warrior", "dominant", "exorbitant"]}

In [136]:
# Create embeddings: look up the current `all_words` in the two full-corpus
# models. Words missing from a model are printed and skipped.
feminism_full_embeddings_dict = create_embeddings_dict(model_dict['feminism_full'], all_words)
incels_full_embeddings_dict = create_embeddings_dict(model_dict['incels_full'], all_words)


In [113]:
# Run WEAT on the feminism_full embeddings; 1000 and 'normal' are passed on to
# WEAT.p_value as the iteration count and distribution type.
print("Results for feminism_full:")
test_weat(feminism_full_embeddings_dict, 1000, 'normal')


Results for feminism_full:
	effect size: -1.0255852937698364
	p_value: 0.8755169969506597


In [114]:
# Same WEAT settings, applied to the incels_full embeddings.
print("Results for incels_full:")
test_weat(incels_full_embeddings_dict, 1000, 'normal')

Results for incels_full:
	effect size: -0.7051849365234375
	p_value: 0.7802936547308923


### Experiment 2: Incel subreddits across time

In [60]:
# Word groups for Experiment 2: the same two target groups, with positive
# (attribute_1) and negative (attribute_2) adjectives.
all_words = {'target_1': ["feminist", "girl", "woman"], 
             'target_2': ["incel", "boy", "man"],
             'attribute_1': ["nice", "beautiful", "cheerful", "wonderful"],
             'attribute_2': ["mean", "ugly", "rude", "awful"]}

In [64]:
# Embed the Experiment 2 word groups once per model; each call returns
# {group name: {word: vector}} for the words that model knows.
braincels_embeddings_dict = create_embeddings_dict(model_dict['braincels'], all_words)
incels_embeddings_dict = create_embeddings_dict(model_dict['incels'], all_words)
mensrights_embeddings_dict = create_embeddings_dict(model_dict['mensrights'], all_words)
trufemcels_embeddings_dict = create_embeddings_dict(model_dict['trufemcels'], all_words)

In [65]:
# WEAT for the braincels model (1000 iterations, 'normal' distribution type).
print("Results for braincels:")
test_weat(braincels_embeddings_dict, 1000, 'normal')

Results for braincels:
	effect size: 0.6003853678703308
	p_value: 0.2625427021179424


In [66]:
# WEAT for the incels model (1000 iterations, 'normal' distribution type).
print("Results for incels:")
test_weat(incels_embeddings_dict, 1000, 'normal')

Results for incels:
	effect size: 0.4175088703632355
	p_value: 0.32537145953079594


In [68]:
# WEAT for the mensrights model (1000 iterations, 'normal' distribution type).
print("Results for mensrights:")
test_weat(mensrights_embeddings_dict, 1000, 'normal')

Results for mensrights:
	effect size: 0.8146820068359375
	p_value: 0.16984705290506352


In [69]:
# WEAT for the trufemcels model (1000 iterations, 'normal' distribution type).
print("Results for trufemcels:")
test_weat(trufemcels_embeddings_dict, 1000, 'normal')

Results for trufemcels:
	effect size: 0.999513566493988
	p_value: 0.12003415399244433


### Experiment 3: r/feminism across time

In [71]:
# Word groups for Experiment 3.
all_words = {'target_1': ["feminist", "girl", "woman"], 
             'target_2': ["incel", "boy", "man"],
             'attribute_1': ["nice", "beautiful", "happy", "wonderful"],
             'attribute_2': ["mean", "ugly", "sad", "awful"]}
# Note: compared with Experiment 2, "cheerful" is replaced by "happy" and, to
# keep the two attribute sets balanced, "rude" is replaced by "sad".

In [72]:
# Embed the Experiment 3 word groups in each of the four feminism models that
# are split by time period (the periods are part of the model names).
feminism_2015_2017_embeddings_dict = create_embeddings_dict(model_dict['feminism_2015_2017'], all_words)
feminism_2017_2019_embeddings_dict = create_embeddings_dict(model_dict['feminism_2017_2019'], all_words)
feminism_2019_2021_embeddings_dict = create_embeddings_dict(model_dict['feminism_2019_2021'], all_words)
feminism_2021_2023_embeddings_dict = create_embeddings_dict(model_dict['feminism_2021_2023'], all_words)

In [73]:
# WEAT for the feminism_2015_2017 model (1000 iterations, 'normal' distribution type).
print("Results for feminism_2015_2017:")
test_weat(feminism_2015_2017_embeddings_dict, 1000, 'normal')

Results for feminism_2015_2017:
	effect size: 0.9019179940223694
	p_value: 0.16171066156932157


In [74]:
# WEAT for the feminism_2017_2019 model (1000 iterations, 'normal' distribution type).
print("Results for feminism_2017_2019:")
test_weat(feminism_2017_2019_embeddings_dict, 1000, 'normal')

Results for feminism_2017_2019:
	effect size: 0.8591486215591431
	p_value: 0.15840957857267302


In [75]:
# WEAT for the feminism_2019_2021 model (1000 iterations, 'normal' distribution type).
print("Results for feminism_2019_2021:")
test_weat(feminism_2019_2021_embeddings_dict, 1000, 'normal')

Results for feminism_2019_2021:
	effect size: 1.2137731313705444
	p_value: 0.0991475629290598


In [76]:
# WEAT for the feminism_2021_2023 model (1000 iterations, 'normal' distribution type).
print("Results for feminism_2021_2023:")
test_weat(feminism_2021_2023_embeddings_dict, 1000, 'normal')

Results for feminism_2021_2023:
	effect size: 0.6987370252609253
	p_value: 0.21980526004633716


### Experiment 4: Between feminism subreddits

In [79]:
# Word groups for Experiment 4 (the same lists as in Experiment 3).
all_words = {'target_1': ["feminist", "girl", "woman"], 
             'target_2': ["incel", "boy", "man"],
             'attribute_1': ["nice", "beautiful", "happy", "wonderful"],
             'attribute_2': ["mean", "ugly", "sad", "awful"]}
# Note: compared with Experiment 2, "cheerful" is replaced by "happy" and, to
# keep the two attribute sets balanced, "rude" is replaced by "sad".

In [80]:
# Build the Experiment 4 embeddings from the word lists defined just above.
# feminism_full is rebuilt here as well: the dict of the same name created in
# Experiment 1 was made from that experiment's attribute words, so reusing it
# on a top-to-bottom run would report WEAT results for the wrong word sets.
feminism_full_embeddings_dict = create_embeddings_dict(model_dict['feminism_full'], all_words)
blackladies_embeddings_dict = create_embeddings_dict(model_dict['blackladies'], all_words)
fourthwavewomen_embeddings_dict = create_embeddings_dict(model_dict['fourthwavewomen'], all_words)
women_embeddings_dict = create_embeddings_dict(model_dict['women'], all_words)

In [81]:
# WEAT for the feminism_full model (1000 iterations, 'normal' distribution type).
print("Results for feminism_full:")
test_weat(feminism_full_embeddings_dict, 1000, 'normal')

Results for feminism_full:
	effect size: 1.0739195346832275
	p_value: 0.10982385947538886


In [82]:
# WEAT for the blackladies model (1000 iterations, 'normal' distribution type).
print("Results for blackladies:")
test_weat(blackladies_embeddings_dict, 1000, 'normal')

Results for blackladies:
	effect size: 1.1685402393341064
	p_value: 0.10169695354007424


In [83]:
# WEAT for the fourthwavewomen model (1000 iterations, 'normal' distribution type).
print("Results for fourthwavewomen:")
test_weat(fourthwavewomen_embeddings_dict, 1000, 'normal')

Results for fourthwavewomen:
	effect size: 1.0381433963775635
	p_value: 0.1111839942242322


In [84]:
# WEAT for the women model (1000 iterations, 'normal' distribution type).
print("Results for women:")
test_weat(women_embeddings_dict, 1000, 'normal')

Results for women:
	effect size: 0.6590870022773743
	p_value: 0.24115049434301095


### WEFAT

In [150]:
# safe/dangerous attribute words for WEFAT:
# attribute_1 holds the "safe" words, attribute_2 the "dangerous" words.
safe_dangerous = {'attribute_1': ["peculiarly", "hill", "terminology", "childhood", "accord", "cotton", "sleeping", "nap", "calm", "softness", "natural", "serenity", "reassurance", "peace", "angel", "harmonious", "honest", "blessing", "trustworthy", "wisdom"],
                  'attribute_2': ["homicide", "wrath", "terrifying", "poisonous", "masochism", "obsessive", "acid", "claw", "hospital", "weld", "bankruptcy", "suffering", "mistreated", "abused", "misbehave", "suicidal", "chaos", "toxic", "bloodbath", "killer", "murderer", "assassinate", "earthquake", "missile", "firearm", "firing", "firearms"]}

In [151]:
# weak/powerful attribute words for WEFAT:
# attribute_1 holds the "weak" words, attribute_2 the "powerful" words.
weak_powerful = {'attribute_1': ["disagreeable", "shitload", "deceased", "abandoned", "depressed", "decay", "penniless", "sorrow", "feeble", "weak", "void", "idle", "nothing", "slow", "yawn", "tiny", "small", "meek", "sofa", "empty", "inactivity", "emptiness", "dryness", "drool", "bladder"],
                 'attribute_2': ["rearrange", "superintendent", "desire", "immense", "passion", "excite", "superpower", "champion", "almighty", "success", "generous", "perfect", "freedom", "excellence", "prestigious", "winning", "greatness", "triumph", "victorious", "mighty", "conquering", "conquer", "adrenaline", "intensity", "competitor", "wrestler", "warrior", "dominant", "exorbitant"]}

In [152]:
def read_dict_from_file(file_path):
    """Parse a JSON file and return its content.

    Parameters
    ----------
    file_path : str
        Path to a UTF-8 encoded JSON file.

    Returns
    -------
    object
        The decoded JSON value. The notebook indexes the result by corpus
        name, so its word file is expected to hold a JSON object.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding. The result is kept in `data` rather than `dict` so the
    # built-in type is not shadowed.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

In [153]:
# The target-word file is parsed as JSON despite its .txt extension.
# calculate_wefat later indexes the result by every key of model_dict, so the
# file needs one entry per loaded model.
target_word_file = "target_words.txt"
target_words = read_dict_from_file(target_word_file)

In [162]:
def calculate_wefat(target_words, model_dict, attributes):
    """Run WEFAT for every corpus model against one pair of attribute sets.

    For each corpus name in ``model_dict`` this prints the name, combines a
    copy of ``attributes`` with that corpus's entry of ``target_words`` under
    the key 'target', embeds the words with the corpus model and collects the
    effect sizes returned by ``test_wefat``.

    Returns a dict mapping corpus name to those effect sizes. A corpus that is
    missing from ``target_words`` raises KeyError.
    """
    def run_one(corpus, model):
        print(f"Corpus: {corpus}")
        word_groups = attributes.copy()
        word_groups['target'] = target_words[corpus]
        return test_wefat(create_embeddings_dict(model, word_groups))

    return {corpus: run_one(corpus, model) for corpus, model in model_dict.items()}

In [163]:
# WEFAT effect sizes of each corpus's target words against the safe/dangerous
# attribute words; words missing from a corpus are listed in the cell output.
safe_dangerous_wefat = calculate_wefat(target_words, model_dict, safe_dangerous)

Corpus: blackladies
peculiarly is not in the corpus
masochism is not in the corpus
mistreated is not in the corpus
bloodbath is not in the corpus
firearms is not in the corpus
normie is not in the corpus
Corpus: braincels
peculiarly is not in the corpus
onlyfans is not in the corpus
Corpus: feminisms
peculiarly is not in the corpus
hill is not in the corpus
terminology is not in the corpus
cotton is not in the corpus
sleeping is not in the corpus
nap is not in the corpus
softness is not in the corpus
serenity is not in the corpus
reassurance is not in the corpus
angel is not in the corpus
harmonious is not in the corpus
blessing is not in the corpus
trustworthy is not in the corpus
wisdom is not in the corpus
wrath is not in the corpus
terrifying is not in the corpus
poisonous is not in the corpus
masochism is not in the corpus
obsessive is not in the corpus
acid is not in the corpus
claw is not in the corpus
weld is not in the corpus
bankruptcy is not in the corpus
suffering is not in

In [164]:
# WEFAT effect sizes of each corpus's target words against the weak/powerful
# attribute words; words missing from a corpus are listed in the cell output.
weak_powerful_wefat = calculate_wefat(target_words, model_dict, weak_powerful)

Corpus: blackladies
disagreeable is not in the corpus
abandoned is not in the corpus
penniless is not in the corpus
inactivity is not in the corpus
almighty is not in the corpus
conquering is not in the corpus
normie is not in the corpus
Corpus: braincels
onlyfans is not in the corpus
Corpus: feminisms
disagreeable is not in the corpus
shitload is not in the corpus
deceased is not in the corpus
abandoned is not in the corpus
depressed is not in the corpus
decay is not in the corpus
penniless is not in the corpus
sorrow is not in the corpus
feeble is not in the corpus
void is not in the corpus
idle is not in the corpus
yawn is not in the corpus
meek is not in the corpus
sofa is not in the corpus
inactivity is not in the corpus
emptiness is not in the corpus
dryness is not in the corpus
drool is not in the corpus
bladder is not in the corpus
rearrange is not in the corpus
superintendent is not in the corpus
immense is not in the corpus
passion is not in the corpus
excite is not in the co

In [169]:
# Sign convention of the WEFAT effect sizes:
#   safe/dangerous: > 0 => safe, < 0 => dangerous
#   weak/powerful:  > 0 => weak, < 0 => powerful
# In general a positive score means association with attribute 1 and a
# negative score means association with attribute 2.
def get_top_n_words(word_dict, n, is_attribute_1):
    """Return the ``n`` words most associated with one attribute.

    ``word_dict`` maps word to effect size. With ``is_attribute_1`` true the
    words are ranked from highest to lowest score, otherwise from lowest to
    highest; the first ``n`` are returned as a dict in that rank order.
    """
    ranked = sorted(word_dict.items(), key=lambda pair: pair[1], reverse=is_attribute_1)
    return dict(ranked[:n])


In [170]:
# Five feminism_full target words with the highest safe/dangerous scores
# (most associated with the "safe" words).
get_top_n_words(safe_dangerous_wefat['feminism_full'], 5, True)

{'love': 0.7123157,
 'gender': 0.49187315,
 'queer': 0.34608305,
 'feminine': 0.34277618,
 'personality': 0.29224607}

In [171]:
# Five feminism_full target words with the lowest safe/dangerous scores
# (most associated with the "dangerous" words).
get_top_n_words(safe_dangerous_wefat['feminism_full'], 5, False)

{'abuse': -1.0854567,
 'rape': -1.04738,
 'sexual_assault': -0.94468856,
 'woman': -0.6860135,
 'incel': -0.6776244}

In [173]:
# Five feminism_full target words with the highest weak/powerful scores
# (most associated with the "weak" words).
get_top_n_words(weak_powerful_wefat['feminism_full'], 5, True)

{'cuck': 0.68850154,
 'incel': 0.67160153,
 'gay': 0.653799,
 'vagina': 0.38201907,
 'guy': 0.38177663}

In [174]:
# Five feminism_full target words with the lowest weak/powerful scores
# (most associated with the "powerful" words).
get_top_n_words(weak_powerful_wefat['feminism_full'], 5, False)

{'woman': -0.5603854,
 'female': -0.5469336,
 'male': -0.51924366,
 'hillary_clinton': -0.43403992,
 'white': -0.38744622}

In [177]:
# Full list of candidate target words, written as one ", "-separated string.
# Converting to a set drops the duplicate ("virgin" appears twice); note that
# the name is rebound from a str to a set here.
potential_words = "chad, ugly, virgin, incel, normie, beta, abortion, sexist, feminist, gender, trans, whore, virgin, rape, birth_control, porn, sexual_assault, love, abuse, prostitution, slut_shaming, onlyfans, bodily_autonomy, taylor_swift, hillary_clinton, cuck, consent, period, sex, drag, gay, queer, pleasure, man, woman, male, female, feminine, masculine, foid, femoid, marginalized, black, white, vagina, menstruation, girl, boy, guy, becky, stacy, karen, femcel, slut, promiscuous, short, attractive, personality"
potential_words = set(potential_words.split(", "))

In [189]:
def fill_dict(wefat_dict, file_name, potential_words):
    """Tabulate WEFAT effect sizes per corpus and save them as CSV.

    Parameters
    ----------
    wefat_dict : dict
        Maps corpus name to a ``{word: effect size}`` dict.
    file_name : str
        Path of the CSV file to write.
    potential_words : iterable of str
        Words that must appear as rows for every corpus; a word without a
        score in a corpus gets a missing value.

    Returns
    -------
    pandas.DataFrame
        One row per word, one column per corpus. The same table is written to
        ``file_name`` with index and header.
    """
    # Sort once so the order of padded rows does not depend on the iteration
    # order of a set, which can differ between runs.
    ordered_words = sorted(potential_words)

    # Pad copies rather than the caller's dicts, so the raw WEFAT results keep
    # only the scores that were actually computed and the cell can be re-run.
    filled = {}
    for corpus, scores in wefat_dict.items():
        padded = dict(scores)
        for word in ordered_words:
            padded.setdefault(word, None)
        filled[corpus] = padded

    df = pd.DataFrame.from_dict(filled)
    df.to_csv(file_name, index=True, header=True)
    return df
    

In [190]:
# fill_dict is the helper defined above: it pads missing words, writes the CSV
# and returns the DataFrame. `write_dict_to_csv` is not defined in this notebook.
weak_powerful_df = fill_dict(weak_powerful_wefat, 'weak_powerful_wefat.csv', potential_words)

In [191]:
# fill_dict is the helper defined above: it pads missing words, writes the CSV
# and returns the DataFrame. `write_dict_to_csv` is not defined in this notebook.
safe_dangerous_df = fill_dict(safe_dangerous_wefat, 'safe_dangerous_wefat.csv', potential_words)

In [192]:
# Display the weak/powerful WEFAT table (rows: words, columns: corpora).
weak_powerful_df
