This notebook explores whether the sensitivity of a word can be determined by measuring the similarity of the input word to one or more social-justice buzzwords.

In [50]:
# Read the macht.sprache term list and split its lemmas by language.
import pandas as pd

input_words_en_de = pd.read_json('macht.sprache_words.json')

# Select the 'lemma' column for each language with a boolean row mask, then
# renumber the rows from 0 so the lemmas can be addressed by position.
input_words_en = input_words_en_de.loc[
    input_words_en_de['lemma_lang'].eq('en'), 'lemma'
].reset_index(drop=True)
input_words_de = input_words_en_de.loc[
    input_words_en_de['lemma_lang'].eq('de'), 'lemma'
].reset_index(drop=True)
print(input_words_en)

0                                           ability
1                                       able-bodied
2                                           ableism
3      Aboriginal and Torres Strait Islander People
4                                         Afrikaner
                           ...                     
171                                 white supremacy
172                                       Whiteness
173                                  whitesplaining
174                                            woke
175                                            yeke
Name: lemma, Length: 176, dtype: object


1. Using the pre-trained GloVe Twitter embeddings ("glove-twitter-25")

In [2]:
# Set up the lexicon: load the "glove-twitter-25" embeddings through gensim's
# downloader API. The resulting object is queried further down via
# most_similar() and similarity().
# NOTE(review): per the gensim downloader docs the data is presumably fetched
# on first use and cached locally afterwards — confirm before running offline.
import gensim.downloader as api
glove_vectors = api.load("glove-twitter-25")

In [74]:
def mean_buzzword_similarity(vectors, word, buzzwords):
    """Return the mean similarity between `word` and every buzzword.

    Parameters
    ----------
    vectors : object exposing ``similarity(word_a, word_b)``
        The embedding lookup to query (here: ``glove_vectors``).
    word : str
        The word to score.
    buzzwords : sequence of str
        Reference words; must be non-empty.

    Returns
    -------
    The sum of ``vectors.similarity(word, buzzword)`` over all buzzwords,
    divided by the number of buzzwords.

    Raises
    ------
    ValueError
        If `buzzwords` is empty (previously this surfaced as an opaque
        ZeroDivisionError).
    """
    if not buzzwords:
        raise ValueError("buzzwords must contain at least one word")
    # Built-in sum() starts at 0 and adds left to right, i.e. the same
    # accumulation order as an explicit running total.
    return sum(vectors.similarity(word, buzzword) for buzzword in buzzwords) / len(buzzwords)


input_word = input_words_en.iloc[7]  # The word you want to find similar words for
n = 20  # The number of most similar words you want
print("input word: ", input_word)

# Find the n most similar words to the specified word, as (word, similarity) pairs.
# NOTE(review): most_similar() raises KeyError for words missing from the
# vocabulary; multi-word or capitalised lemmas in the input list are likely
# affected — verify before looping over all lemmas.
most_similar_words = glove_vectors.most_similar(input_word, topn=n)

# To inspect the raw neighbours and their scores, iterate over `most_similar_words`.

# Rank the list of similar words according to their mean similarity to the buzzwords.
buzzwords = ['discrimination', 'power', 'political']

print("--- buzzword similarities ---")
buzzword_similarities = [
    (similar_word, mean_buzzword_similarity(glove_vectors, similar_word, buzzwords))
    for similar_word, _ in most_similar_words
]

# Build the ranking without mutating a frame in place, so re-running the cell
# always starts from the freshly computed scores.
df = pd.DataFrame(
    buzzword_similarities, columns=['similar_word', 'buzzword_similarity']
).sort_values(by=['buzzword_similarity'], ascending=False)
print(df)



input word:  ancestors
--- buzzword similarities ---
    similar_word  buzzword_similarity
1         slaves             0.576186
19          jews             0.513533
16      believed             0.475029
7   commandments             0.458929
6        priests             0.420096
0       prophets             0.411606
5         unborn             0.401501
13       orphans             0.401005
8         greeks             0.398529
17     glorified             0.388898
11    sacrificed             0.370196
3   missionaries             0.365527
4       worships             0.359310
10       witches             0.354220
18     egyptians             0.334991
14   forefathers             0.326398
9    worshippers             0.323238
2      disciples             0.309377
12    worshipped             0.290851
15      apostles             0.231268


2. Using a word2vec model that was self-trained on Reddit comments

In [78]:
import gensim

# Load the previously trained Word2Vec model saved as "word2vec_test.model"
# (per the section heading: self-trained on reddit comments) and keep only its
# word vectors (`.wv`), which is what the most_similar()/similarity() queries use.
# NOTE(review): gensim's load() is pickle-based according to its documentation —
# only load model files that come from a trusted source.
w2v = gensim.models.Word2Vec.load("word2vec_test.model").wv

In [79]:
# Query term and neighbourhood size for the word2vec lookup.
n = 20  # how many nearest neighbours to fetch
input_word = input_words_en.iloc[7]  # lemma whose neighbours get ranked
print("input word: ", input_word)

# Nearest neighbours of the query term as (word, similarity) pairs.
most_similar_words = w2v.most_similar(input_word, topn=n)

# To inspect the raw neighbours and their scores, iterate over `most_similar_words`.

# Reference words against which every neighbour is scored.
buzzwords = ['discrimination', 'power', 'political']

print("--- buzzword similarities ---")
# For each neighbour: add up its similarity to every buzzword (left to right,
# starting from 0) and divide by the number of buzzwords.
buzzword_similarities = [
    (
        neighbour,
        sum(w2v.similarity(neighbour, reference) for reference in buzzwords) / len(buzzwords),
    )
    for neighbour, _score in most_similar_words
]

# Tabulate the scores and order them from most to least buzzword-like.
df = pd.DataFrame(
    buzzword_similarities,
    columns=['similar_word', 'buzzword_similarity'],
).sort_values('buzzword_similarity', ascending=False)
print(df)

input word:  ancestors
--- buzzword similarities ---
   similar_word  buzzword_similarity
10  forefathers             0.447843
4          mede             0.432698
18       iranic             0.414727
16      seljuks             0.409344
15    sumerians             0.396964
11       serers             0.388622
17    anatolian             0.383567
3         medes             0.367793
12      afghans             0.364946
14   decendents             0.360088
2       mongols             0.349856
13        herat             0.349758
9      persians             0.346132
8   descendents             0.345633
6         avars             0.344050
7     assyrians             0.338645
19       persia             0.301108
1     descended             0.286256
5      ancestor             0.238327
0   descendants             0.232567
