# Organization of words into ontology

Last updated: 06212024  
By: Lauren Liao  
Purpose: get word similarity matrix

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
from tqdm import tqdm
from io import StringIO
import io
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
import seaborn as sns
import pickle
import scipy
import scipy.cluster.hierarchy as sch

%matplotlib inline

**Constants and functions**

SEED_LST: 10 random seeds of 5 digit primes  
LONG_SEED_LST: 100 random seed of 6 digit  
KEYTERMS: main key terms of interest  
KEYTERM_CAT: categories relevant to the KEYTERMS

_get_similarity_df_ make the similarity matrix into a pandas dataframe  
_get_similarity_across_seeds_ make the similarity dataframe across the seeds

In [13]:
# set random seed lists
SEED_LST = [11113, 12211, 13999, 18947, 49999, 54787, 65537, 69427, 99989, 99991]
LONG_SEED_LST = [756839, 757577, 757727, 765169, 767857, 769591, 773147, 773251, 
                 773473, 774023, 776887, 777643, 777677, 777743, 778879, 779693, 
                 780721, 780799, 783137, 786431, 786983, 787757, 788999, 789101, 
                 789121, 790879, 791663, 792397, 797161, 797581, 800801, 803989, 
                 805723, 806041, 809461, 823541, 823547, 823553, 823679, 823799, 
                 824609, 826669, 826699, 832411, 833719, 835399, 841069, 844427, 
                 853211, 859433, 864203, 864307, 864449, 875491, 880667, 880949, 
                 887113, 894689, 895553, 896723, 900001, 904117, 908909, 909091, 
                 909287, 913579, 914491, 919799, 920209, 923471, 938351, 944689, 
                 945457, 946489, 946801, 946949, 949391, 953593, 953983, 955223, 
                 956401, 960889, 961273, 961397, 962963, 965401, 966727, 968041, 
                 969407, 971389, 971767, 974123, 974159, 974213, 974849, 975313, 
                 975379, 977923, 978403, 980641]

# set keyterms
KEYTERMS = ['health', 'doctor', 'resource', 'class', 
            'race', 'black', 'white', 'bame', 'minority', 
            'racism', 'disadvantaged',
            'discrimination', 'wealth', 'poverty', 'power', 'bias', 'structural', 
            'possibly',
            'perhaps', 'probably', 'likely']

# set category for keyterms
KEYTERM_CAT = [cat_term for sublist in [['basic_terms']*4, 
                                        ['race_related']*5, 
                                        ['racism_or_power_related']*8, 
                                        ['close_reading_informed']*4] for cat_term in sublist]


KEYTERMS2 = ['health', 'patient', 'doctor', 'resource', 'class', 
            'care', 'medical',
            'population', 'social', 'community', 'system',
            'race', 'black', 'white', 'bame', 'minority', 
            'racism', 'disadvantaged',
            'discrimination', 'wealth', 'poverty', 'power', 'bias', 'structural', 
            'ethnic_minority', 'african_american', 'human_right', 'risk_factor',
            'structural_racism', 'racial_ethnic', 'social_determinant', 'life_expectancy']

In [3]:
# get similarity matrix
def get_similarity_df(model):
    """
    Take the Word2Vec model
    Return the similarity matrix for each specification
    """
    vec_by_keys = np.vstack([model.wv.get_vector(k) for k in model.wv.index_to_key])
    word_similarity_df = pd.DataFrame(cosine_similarity(vec_by_keys), 
                                       index=model.wv.index_to_key, 
                                       columns=model.wv.index_to_key)
    
    return(word_similarity_df)

In [4]:
def get_similarity_across_seeds(sentences, 
                                window=5, 
                                min_count=5, 
                                vector_size=128, 
                                seed_lst=SEED_LST,
                                save_to_pickle=False
                               ):
    """
    Goes through different Word2Vec model seed specifications
    Store the resulting dataframes inside of a datafame
    """
    
    # creating an empty data frame
    similarity_across_seeds_df = pd.DataFrame({'seed': seed_lst,
                                               'similarity_df': [None] * len(seed_lst)})

    similarity_across_seeds_df['similarity_df'] = [get_similarity_df(Word2Vec(sentences=sentences,
                                                                              window=window, 
                                                                              min_count=min_count,
                                                                              vector_size=vector_size,
                                                                              seed=seed)) for seed in tqdm(seed_lst)]
    sim_vals = np.zeros((similarity_across_seeds_df.shape[0],
                         similarity_across_seeds_df['similarity_df'][0].shape[0], 
                         similarity_across_seeds_df['similarity_df'][0].shape[0]))
    
    for i, df in enumerate(similarity_across_seeds_df['similarity_df']):
        sim_vals[i] = df.to_numpy()
    
    mean_sim_vals = sim_vals.mean(axis=0)
    std_sim_vals = sim_vals.std(axis=0)

    mean_sim_vals_df = pd.DataFrame(mean_sim_vals, 
                                    index=similarity_across_seeds_df['similarity_df'][0].columns.values,
                                    columns=similarity_across_seeds_df['similarity_df'][0].columns.values)
    std_sim_vals_df = pd.DataFrame(std_sim_vals, 
                                    index=similarity_across_seeds_df['similarity_df'][0].columns.values,
                                    columns=similarity_across_seeds_df['similarity_df'][0].columns.values)
    
    if(save_to_pickle):
        mean_sim_vals_df.to_pickle('../../data/similarity_df_mean.pkl')
        std_sim_vals_df.to_pickle('../../data/similarity_df_std.pkl')

    return(mean_sim_vals_df, std_sim_vals_df)

**Data loading**  
the data starts with the preprocessed text (pickled) file

In [2]:
# read processed text from previous
full_text = pd.read_pickle('../../data/full_data_w_processedtext.pkl')
processed_text = full_text['processed_text'] 

# only working with the this particular column moving forward
sentences = [x for x in processed_text]

## take average similarity across 30 random seeds

get the average and standard deviation then save this 

In [None]:
%%time
mean_similarity_df, std_similarity_df = get_similarity_across_seeds(sentences, seed_lst=LONG_SEED_LST[:30], save_to_pickle=True)

100%|███████████████████████████████████████████| 30/30 [01:38<00:00,  3.27s/it]


### Example usage

In [5]:
mean_similarity_df = pd.read_pickle("../../data/similarity_df_mean.pkl")
std_similarity_df = pd.read_pickle("../../data/similarity_df_std.pkl")

In [5]:
word_interest = "interpersonal"
mean_similarity_df.sort_values(by= word_interest, ascending=False).iloc[:31][word_interest]

interpersonal            1.000000
exclusion                0.958830
margin_ation             0.957773
stigma                   0.954079
homophobia               0.944167
biologic                 0.939472
persistent               0.939158
profound                 0.936795
shaped                   0.935668
systemic                 0.933694
oppression               0.932175
injustice                0.932056
implicit                 0.926844
interplay                0.926209
socio_cultural           0.924985
construct                0.922460
tackling                 0.921764
stereotype               0.921271
influence                0.920708
identity                 0.919835
racial_discrimination    0.919200
interact                 0.918830
pervasive                0.916720
exist                    0.916429
disadvantage             0.914985
negative                 0.914592
sexual_orientation       0.912542
overt                    0.911081
persist                  0.910432
driver        

### Preliminary visualizations

In [14]:
results = [({key: value for key, value in 
            zip(mean_similarity_df.sort_values(by= word_interest, 
                                               ascending=False).iloc[1:11][word_interest].index,
                mean_similarity_df.sort_values(by= word_interest, 
                                               ascending=False).iloc[1:11][word_interest])})
           for word_interest in tqdm(KEYTERMS2)] #previously tqdm(KEYTERMS)


  0%|                                                    | 0/32 [00:00<?, ?it/s][A
  3%|█▍                                          | 1/32 [00:01<00:36,  1.18s/it][A
  6%|██▊                                         | 2/32 [00:02<00:34,  1.14s/it][A
  9%|████▏                                       | 3/32 [00:03<00:29,  1.01s/it][A
 12%|█████▌                                      | 4/32 [00:04<00:28,  1.02s/it][A
 16%|██████▉                                     | 5/32 [00:05<00:27,  1.03s/it][A
 19%|████████▎                                   | 6/32 [00:06<00:26,  1.03s/it][A
 22%|█████████▋                                  | 7/32 [00:07<00:26,  1.05s/it][A
 25%|███████████                                 | 8/32 [00:08<00:25,  1.05s/it][A
 28%|████████████▍                               | 9/32 [00:09<00:24,  1.04s/it][A
 31%|█████████████▍                             | 10/32 [00:10<00:22,  1.04s/it][A
 34%|██████████████▊                            | 11/32 [00:11<00:21,  1.04

In [1]:
# def save_topn_results(n):
#     filename = f"../results/top{n}_mean2.pkl"
#     with open(filename, 'wb') as file:
#         pickle.dump([(word_interest,
#             {key: value for key, value in 
#             zip(mean_similarity_df.sort_values(by= word_interest, 
#                                                ascending=False).iloc[1:(n+1)][word_interest].index,
#                 mean_similarity_df.sort_values(by= word_interest, 
#                                                ascending=False).iloc[1:(n+1)][word_interest])})
#            for word_interest in tqdm(KEYTERMS2)], file)

In [8]:
def save_topn_results(n):
    filename = f"../results/top{n}_mean_all.pkl"
    with open(filename, 'wb') as file:
        pickle.dump([(word_interest,
            {key: value for key, value in 
            zip(mean_similarity_df.sort_values(by= word_interest, 
                                               ascending=False).iloc[1:(n+1)][word_interest].index,
                mean_similarity_df.sort_values(by= word_interest, 
                                               ascending=False).iloc[1:(n+1)][word_interest])})
           for word_interest in tqdm(mean_similarity_df.columns)], file)

In [10]:
save_topn_results(30)


  0%|                                                  | 0/9242 [00:00<?, ?it/s][A
  0%|                                        | 1/9242 [00:00<1:36:30,  1.60it/s][A
  0%|                                        | 2/9242 [00:01<1:31:52,  1.68it/s][A
  0%|                                        | 3/9242 [00:02<2:02:50,  1.25it/s][A
  0%|                                        | 4/9242 [00:03<2:07:17,  1.21it/s][A
  0%|                                        | 5/9242 [00:03<2:10:21,  1.18it/s][A
  0%|                                        | 6/9242 [00:04<2:11:33,  1.17it/s][A
  0%|                                        | 7/9242 [00:05<2:12:31,  1.16it/s][A
  0%|                                        | 8/9242 [00:06<2:13:08,  1.16it/s][A
  0%|                                        | 9/9242 [00:07<2:13:36,  1.15it/s][A
  0%|                                       | 10/9242 [00:08<2:14:07,  1.15it/s][A
  0%|                                       | 11/9242 [00:09<2:14:04,  1.15