# Measure the coherence of the seed words (NOI) according to Maria Antoniak

In [1]:
import pandas as pd
from scipy.stats import rankdata

In [2]:
# Directories holding the cosine-similarity CSVs, one per embedding model.
# All of them live under the same root folder.
_SIMILARITIES_ROOT = "ethnicity_similarities"

w2v_cos_sim_Scores_path = f"{_SIMILARITIES_ROOT}/word2vec"
glove_twitter_cos_sim_scores_path = f"{_SIMILARITIES_ROOT}/glove_twitter"
glove_wk_cos_sim_scores_path = f"{_SIMILARITIES_ROOT}/glove-wk"
ud_cos_sim_scores_path = f"{_SIMILARITIES_ROOT}/UD"
chan_cos_sim_scores_path = f"{_SIMILARITIES_ROOT}/chan"

In [3]:
# Read the majority- and minority-word similarity CSVs from the word2vec directory.
w2v_cos_sim_majority_df = pd.read_csv(f"{w2v_cos_sim_Scores_path}/majority_words_sim_to_profane_vector.csv")
w2v_cos_sim_minority_df = pd.read_csv(f"{w2v_cos_sim_Scores_path}/minority_words_sim_to_profane_vector.csv")

In [4]:
# Read the majority- and minority-word similarity CSVs from the glove-twitter directory.
glove_twitter_cos_sim_majority_df = pd.read_csv(f"{glove_twitter_cos_sim_scores_path}/majority_words_sim_to_profane_vector.csv")
glove_twitter_cos_sim_minority_df = pd.read_csv(f"{glove_twitter_cos_sim_scores_path}/minority_words_sim_to_profane_vector.csv")

In [5]:
# Read the majority- and minority-word similarity CSVs from the glove-wk directory.
glove_wk_cos_sim_majority_df = pd.read_csv(f"{glove_wk_cos_sim_scores_path}/majority_words_sim_to_profane_vector.csv")
glove_wk_cos_sim_minority_df = pd.read_csv(f"{glove_wk_cos_sim_scores_path}/minority_words_sim_to_profane_vector.csv")

In [6]:
# Read the majority- and minority-word similarity CSVs from the UD directory.
ud_cos_sim_majority_df = pd.read_csv(f"{ud_cos_sim_scores_path}/majority_words_sim_to_profane_vector.csv")
ud_cos_sim_minority_df = pd.read_csv(f"{ud_cos_sim_scores_path}/minority_words_sim_to_profane_vector.csv")

In [7]:
# Read the majority- and minority-word similarity CSVs from the chan directory.
chan_cos_sim_majority_df = pd.read_csv(f"{chan_cos_sim_scores_path}/majority_words_sim_to_profane_vector.csv")
chan_cos_sim_minority_df = pd.read_csv(f"{chan_cos_sim_scores_path}/minority_words_sim_to_profane_vector.csv")

In [8]:
# Seed word lists. The first three are passed as the "minority" lists and the
# last three as the matching "majority" lists when measuring coherence.

# --- minority lists ---
lgtb_words = [
    "lesbian",
    "gay",
    "bisexual",
    "transgender",
    "tran",
    "queer",
    "lgbt",
    "lgbtq",
    "homosexual",
    "non-binary",
]

women_words = [
    "woman",
    "female",
    "girl",
    "wife",
    "sister",
    "daughter",
    "mother",
]

eth_words = [
    "african",
    "african american",
    "asian",
    "black",
    "hispanic",
    "latin",
    "mexican",
    "indian",
    "middle eastern",
    "arab",
]

# --- majority lists ---
# NOTE(review): "hetrosexual" looks like a misspelling of "heterosexual".
# Confirm against the "words" column of the similarity CSVs before changing it,
# since these strings are matched exactly.
straight_words = [
    "hetrosexual",
    "cisgender",
]

men_words = [
    "man",
    "male",
    "boy",
    "son",
    "father",
    "husband",
    "brother",
]

white_eth_words = [
    "white",
    "caucasian",
    "european american",
    "european",
    "norwegian",
    "canadian",
    "german",
    "australian",
    "english",
    "french",
    "american",
    "swedish",
    "dutch",
]

In [9]:
def normalize_data(a):
    """Min-max scale the values of ``a`` into the range [0, 1].

    Args:
        a: Iterable of numbers. It is materialised once, so generators and
            other single-pass iterables are accepted.

    Returns:
        A list with one value per input element, computed as
        ``(x - min) / (max - min)``. An empty input yields an empty list.
        When every element is equal (so ``max - min`` is zero) each element
        maps to ``0.0`` instead of dividing by zero.
    """
    values = list(a)
    if not values:
        return []

    amin, amax = min(values), max(values)
    span = amax - amin
    if span == 0:
        # Constant input: there is no spread to scale by, so avoid 0 / 0.
        return [0.0 for _ in values]

    return [(value - amin) / span for value in values]

In [32]:
def measure_coherence_word_lists(model_minority_cos_sim_df, model_majority_cos_sim_df, minority_word_list, majority_word_list):
    """Return the gap between the mean scores of two word lists.

    Args:
        model_minority_cos_sim_df: DataFrame with a ``"words"`` column and a
            ``"cos_sim_scores"`` column, used for the minority words.
        model_majority_cos_sim_df: DataFrame with the same two columns, used
            for the majority words.
        minority_word_list: Words to select from the minority DataFrame.
        majority_word_list: Words to select from the majority DataFrame.

    Returns:
        Mean ``"cos_sim_scores"`` of the selected minority rows minus the mean
        ``"cos_sim_scores"`` of the selected majority rows. If no row matches
        one of the lists, that mean is NaN and so is the result.
    """

    def _mean_score(scores_df, word_list):
        # Keep only the rows whose word is in the list, then average their scores.
        selected = scores_df["words"].isin(word_list)
        return scores_df.loc[selected, "cos_sim_scores"].mean()

    minority_mean = _mean_score(model_minority_cos_sim_df, minority_word_list)
    majority_mean = _mean_score(model_majority_cos_sim_df, majority_word_list)
    return minority_mean - majority_mean
    

## Measure coherence in w2v models

In [33]:
w2v_sex_or = measure_coherence_word_lists(w2v_cos_sim_minority_df, w2v_cos_sim_majority_df, lgtb_words, straight_words)

In [34]:
w2v_gender = measure_coherence_word_lists(w2v_cos_sim_minority_df, w2v_cos_sim_majority_df, women_words, men_words)

In [35]:
w2v_eth = measure_coherence_word_lists(w2v_cos_sim_minority_df, w2v_cos_sim_majority_df, eth_words, white_eth_words)

In [36]:
normalized_coherence = normalize_data ([w2v_sex_or, w2v_gender, w2v_eth])

In [37]:
normalized_coherence

[0.0, 0.9888198353926599, 1.0]

## Measure coherence in glove-wk models

In [38]:
glove_wk_sex_or = measure_coherence_word_lists(glove_wk_cos_sim_minority_df, glove_wk_cos_sim_majority_df, lgtb_words, straight_words)

In [39]:
glove_wk_gender = measure_coherence_word_lists(glove_wk_cos_sim_minority_df, glove_wk_cos_sim_majority_df, women_words, men_words)

In [40]:
glove_wk_eth = measure_coherence_word_lists(glove_wk_cos_sim_minority_df, glove_wk_cos_sim_majority_df, eth_words, white_eth_words)

In [41]:
normalized_coherence = normalize_data ([glove_wk_sex_or,glove_wk_gender,glove_wk_eth])
normalized_coherence

[0.0, 0.9187169910981976, 1.0]

## Measure coherence in glove-twitter models

In [42]:
# glove-twitter, LGBT vs. straight word lists (value is displayed, not stored).
measure_coherence_word_lists(
    model_minority_cos_sim_df=glove_twitter_cos_sim_minority_df,
    model_majority_cos_sim_df=glove_twitter_cos_sim_majority_df,
    minority_word_list=lgtb_words,
    majority_word_list=straight_words,
)

nan

In [43]:
# glove-twitter, women vs. men word lists (value is displayed, not stored).
measure_coherence_word_lists(
    model_minority_cos_sim_df=glove_twitter_cos_sim_minority_df,
    model_majority_cos_sim_df=glove_twitter_cos_sim_majority_df,
    minority_word_list=women_words,
    majority_word_list=men_words,
)

0.10214580595493317

In [44]:
# glove-twitter, ethnicity word lists (value is displayed, not stored).
measure_coherence_word_lists(
    model_minority_cos_sim_df=glove_twitter_cos_sim_minority_df,
    model_majority_cos_sim_df=glove_twitter_cos_sim_majority_df,
    minority_word_list=eth_words,
    majority_word_list=white_eth_words,
)

0.017385903745889664

## Measure coherence in UD models

In [45]:
ud_sex_or = measure_coherence_word_lists(ud_cos_sim_minority_df, ud_cos_sim_majority_df, lgtb_words, straight_words)

In [46]:
ud_gender = measure_coherence_word_lists(ud_cos_sim_minority_df, ud_cos_sim_majority_df, women_words, men_words)

In [47]:
ud_eth = measure_coherence_word_lists(ud_cos_sim_minority_df, ud_cos_sim_majority_df, eth_words, white_eth_words)

In [48]:
normalized_coherence = normalize_data ([ud_sex_or,ud_gender,ud_eth])
normalized_coherence

[1.0, 0.028543336924576182, 0.0]

## Measure coherence in chan models

In [49]:
chan_sex_or = measure_coherence_word_lists(chan_cos_sim_minority_df, chan_cos_sim_majority_df, lgtb_words, straight_words)

In [50]:
chan_gender =  measure_coherence_word_lists(chan_cos_sim_minority_df, chan_cos_sim_majority_df, women_words, men_words)

In [51]:
chan_eth = measure_coherence_word_lists(chan_cos_sim_minority_df, chan_cos_sim_majority_df, eth_words, white_eth_words)

In [52]:
normalized_coherence = normalize_data ([chan_sex_or,chan_gender,chan_eth])
normalized_coherence

[0.8845330761073148, 0.0, 1.0]