In [2]:
import numpy as np
import scipy as sp
from tqdm import tqdm
from gensim.models import KeyedVectors, Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.metrics.pairwise import cosine_similarity
import sys
sys.path.append('../src')
from models import get_matrices_from_term_lists, \
    filter_terms_not_in_wemodel

In [3]:
# Configuration of the pre-trained embedding to load.
# NOTE(review): the directory name suggests skip-gram, 300 dimensions,
# min count 100, window 5 -- confirm against the training script.
we_model_name = "sg_dim300_min100_win5"
we_vector_size = 300
we_model_dir = '../data/external/wiki-english/wiki-english-20171001/%s' % we_model_name

# Load the gensim model from disk and report its vocabulary size.
we_model = Word2Vec.load(we_model_dir+'/model.gensim')
print('loading done!')
print(f'Total words: {len(we_model.wv.vocab)}')

# Alternative embedding (Caliskan GloVe), disabled.
# This is kept as real comments instead of a bare triple-quoted string: a
# string literal that ends a cell is an expression, so Jupyter echoes it back
# as the cell's Out[] value and clutters the notebook.
# glove_file = '../data/external/glove.6B/glove.6B.50d.txt'
# _ = glove2word2vec(glove_file, '../data/interim/tmp.txt')
# we_model = KeyedVectors.load_word2vec_format('../data/interim/tmp.txt')
# print('loading done!')
# print(f'Total words: {len(we_model.wv.vocab)}')

loading done!
Total words: 312425


"\n# Caliskan GloVe\nglove_file = '../data/external/glove.6B/glove.6B.50d.txt'\n_ = glove2word2vec(glove_file, '../data/interim/tmp.txt')\nwe_model = KeyedVectors.load_word2vec_format('../data/interim/tmp.txt')\nprint('loading done!')\nprint(f'Total words: {len(we_model.wv.vocab)}')\n"

In [4]:
# Target and attribute word lists for the flowers-vs-insects /
# pleasant-vs-unpleasant association test.
# NOTE(review): these appear to be the Greenwald et al. (1998) IAT stimuli as
# reused by Caliskan et al. (2017); two entries were corrected to match that
# published list: 'penny' -> 'peony' (flowers) and 'laugher' -> 'laughter'
# (pleasant). Both misspellings are themselves valid vocabulary words, so the
# filter below never flagged them.

# Target set X: flowers.
X_terms = ['aster', 'clover', 'hyacinth', 'marigold', 'poppy', 'azalea',
           'crocus', 'iris', 'orchid', 'rose', 'bluebell', 'daffodil','lilac',
           'pansy','tulip','buttercup','daisy','lily','peony','violet','carnation', 'gladiola',
           'magnolia','petunia','zinnia']
# Target set Y: insects.
Y_terms = ['ant','caterpillar','flea','locust','spider','bedbug','centipede','fly',
          'maggot','tarantula','bee','cockroach','gnat','mosquito','termite','beetle',
          'cricket','hornet','moth','wasp','blackfly','dragonfly','horsefly','roach',
          'weevil']
# Attribute set A: pleasant words.
A_terms = ['caress','freedom','health','love','peace','cheer','friend','heaven',
           'loyal','pleasure','diamond','gentle','honest','lucky','rainbow','diploma',
           'gift','honor','miracle','sunrise','family','happy','laughter','paradise',
           'vacation']
# Attribute set B: unpleasant words.
B_terms = ['abuse','crash','filth','murder','sickness','accident','death','grief',
          'poison','stink','assault','disaster','hatred','pollute','tragedy',
          'divorce','jail','poverty','ugly','cancer','kill','rotten','vomit','agony',
          'prison']

# Drop terms missing from the embedding vocabulary. Judging by the messages it
# prints, the helper also trims the longer list so each pair has equal length.
X_terms, Y_terms = filter_terms_not_in_wemodel(we_model, X_terms, Y_terms)
A_terms, B_terms = filter_terms_not_in_wemodel(we_model, A_terms, B_terms)

The following terms were removed from the list first_list because they were not found in the we_model: ['gladiola']
The following terms were removed from the list second_list because they were not found in the we_model: []
The following terms were removed from the second list to balance the length of the lists: ['ant']
The following terms were removed from the list first_list because they were not found in the we_model: []
The following terms were removed from the list second_list because they were not found in the we_model: []


In [32]:
# Timing benchmark for the fastest formulation: 10000 repetitions of the
# full test-statistic computation take roughly one minute.
for _rep in tqdm(range(10000)):
    X_mtx, Y_mtx, A_mtx, B_mtx = get_matrices_from_term_lists(
        we_model, X_terms, Y_terms, A_terms, B_terms
    )

    # Association of every X term: mean cosine to A minus mean cosine to B.
    cosine_sim_XA = cosine_similarity(X_mtx, A_mtx)
    cosine_sim_XB = cosine_similarity(X_mtx, B_mtx)
    mean_over_Xa = cosine_sim_XA.mean(axis=1)
    mean_over_Xb = cosine_sim_XB.mean(axis=1)
    # One entry per X term, i.e. shape (|X_terms|,) -- (24,) here.
    s_for_X_words = mean_over_Xa - mean_over_Xb

    # Same computation for every Y term.
    cosine_sim_YA = cosine_similarity(Y_mtx, A_mtx)
    cosine_sim_YB = cosine_similarity(Y_mtx, B_mtx)
    mean_over_Ya = cosine_sim_YA.mean(axis=1)
    mean_over_Yb = cosine_sim_YB.mean(axis=1)
    s_for_Y_words = mean_over_Ya - mean_over_Yb

    # Test statistic: difference between the two mean associations.
    test_stat = s_for_X_words.mean() - s_for_Y_words.mean()

100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [01:05<00:00, 153.43it/s]


In [33]:
# This cell works too. It takes twice as long as the cell above,
# but if we want to try to vectorize the outer loop, then 
# we will probably have to use this version
def calculate_association_metric_for_target_word(word_vec, A_mtx, B_mtx):
    '''Return the association metric s(w, A, B) for one target word.

    The result is the mean cosine similarity between word_vec and the rows
    of A_mtx, minus the mean cosine similarity between word_vec and the rows
    of B_mtx.

    word_vec: 1-D word vector
    A_mtx, B_mtx: 2-D word vector arrays (one attribute word per row)'''
    target_norm = np.linalg.norm(word_vec)

    def _mean_cosine(attribute_mtx):
        # Cosine of each row with the target: dot product over product of norms.
        dot_products = np.dot(attribute_mtx, word_vec)
        norm_products = np.linalg.norm(attribute_mtx, axis=1) * target_norm
        return np.mean(dot_products / norm_products)

    return _mean_cosine(A_mtx) - _mean_cosine(B_mtx)

# Timing benchmark for the per-word formulation (10000 repetitions).
for _rep in tqdm(range(10000)):
    X_mtx, Y_mtx, A_mtx, B_mtx = get_matrices_from_term_lists(
        we_model, X_terms, Y_terms, A_terms, B_terms
    )

    def _association_with_attributes(target_vec):
        # Binds the current attribute matrices so the helper takes one row.
        return calculate_association_metric_for_target_word(target_vec, A_mtx, B_mtx)

    # Apply the per-word metric to every row (axis 1) of each target matrix.
    X_associations = np.apply_along_axis(_association_with_attributes, 1, X_mtx)
    Y_associations = np.apply_along_axis(_association_with_attributes, 1, Y_mtx)
    m = X_associations.mean() - Y_associations.mean()


 12%|████████▊                                                                   | 1166/10000 [00:12<01:15, 116.98it/s]

KeyboardInterrupt: 

 12%|████████▊                                                                   | 1166/10000 [00:30<01:15, 116.98it/s]

In [1]:
def get_complements(x_union_y):
    '''Generator function that yields pairs of disjoint subsets that together
    cover x_union_y.

    x_union_y: a set (or frozenset) of terms. Any other iterable is accepted
        and is de-duplicated into a set first.

    Yields (seq, complement) where seq is a tuple of len(x_union_y)//2
    elements and complement is a frozenset of the remaining elements. The two
    halves have equal size when len(x_union_y) is even; for an odd size the
    complement holds one extra element. Every split is produced in both
    orientations, i.e. (S, rest) and later (rest, S).'''
    # Imported locally because no cell in this notebook imports itertools;
    # without it the first next() on this generator raises NameError.
    from itertools import combinations

    # .difference() below needs a real set type.
    if not isinstance(x_union_y, (set, frozenset)):
        x_union_y = set(x_union_y)

    for seq in combinations(x_union_y, len(x_union_y)//2):
        complement = frozenset(x_union_y.difference(seq))
        yield (seq, complement)

def produce_2ndorder_p_value(wv_obj, X_terms, Y_terms, A_terms, B_terms):
    '''Generates the p-value for a set of terms with the word-vector object.
    High-level function; this function should be directly imported into
    notebooks for experimentation.

    wv_obj: word-vector object, passed through to produce_test_statistic.
    X_terms, Y_terms: target term collections; their union is re-split into
        every pair of complementary halves via get_complements.
    A_terms, B_terms: attribute term collections, passed through unchanged.

    Returns the upper-tail probability of the observed test statistic under a
    normal distribution fitted (mean and sample standard deviation, ddof=1)
    to the test statistics of all re-splits.

    NOTE(review): the number of re-splits is C(n, n//2), which grows very
    quickly with n = |X union Y|; exhaustive enumeration is only practical
    for small term lists.'''
    # Local imports: the notebook only does `import scipy as sp`, which does
    # not guarantee that the scipy.stats / scipy.special submodules are loaded.
    from scipy.special import comb
    from scipy.stats import norm

    x_union_y = set(X_terms).union(set(Y_terms))
    total_terms = len(x_union_y)
    comparison_statistic = produce_test_statistic(wv_obj, X_terms, Y_terms, A_terms, B_terms)

    # Progress-bar total. Integer division matches the subset size used by
    # get_complements (the original passed the float total_terms/2).
    n_splits = comb(total_terms, total_terms // 2, exact=True)

    # Collect in a list and convert once; np.append inside the loop copies the
    # whole array on every iteration.
    dist = np.array(
        [
            produce_test_statistic(wv_obj, X_i_terms, Y_i_terms, A_terms, B_terms)
            for (X_i_terms, Y_i_terms) in tqdm(get_complements(x_union_y), total=n_splits)
        ],
        dtype=float,
    )

    # Survival function == 1 - cdf, but without the cancellation that rounds
    # very small p-values to exactly 0.
    return norm.sf(comparison_statistic, loc=np.mean(dist), scale=np.std(dist, ddof=1))

In [23]:
# Sanity-check the array shapes involved in the cosine-similarity computation.
X_mtx, Y_mtx, A_mtx, B_mtx = get_matrices_from_term_lists(
    we_model, X_terms, Y_terms, A_terms, B_terms
)
cosine_sim = cosine_similarity(X_mtx, A_mtx)
print(f'Shape of X: {X_mtx.shape}')
print(f'Cosine similarity matrix shape: {cosine_sim.shape}')
# Averaging over axis 0 collapses the X rows and leaves one value per A term.
print(f'Mean axis 0 shape: {cosine_sim.mean(axis=0).shape}')

Shape of X: (24, 300)
Cosine similarity matrix shape: (24, 25)
Mean axis 0 shape: (25,)


In [28]:
# Repeat of the cosine_similarity-based timing benchmark (10000 repetitions).
for _rep in tqdm(range(10000)):
    X_mtx, Y_mtx, A_mtx, B_mtx = get_matrices_from_term_lists(
        we_model, X_terms, Y_terms, A_terms, B_terms
    )

    # X targets: row-wise mean similarity to A and to B, then their gap.
    cosine_sim_XA = cosine_similarity(X_mtx, A_mtx)
    cosine_sim_XB = cosine_similarity(X_mtx, B_mtx)
    mean_over_Xa = cosine_sim_XA.mean(axis=1)
    mean_over_Xb = cosine_sim_XB.mean(axis=1)
    # One entry per X term, i.e. shape (|X|,) -- (24,) here.
    s_for_X_words = mean_over_Xa - mean_over_Xb

    # Y targets: identical computation.
    cosine_sim_YA = cosine_similarity(Y_mtx, A_mtx)
    cosine_sim_YB = cosine_similarity(Y_mtx, B_mtx)
    mean_over_Ya = cosine_sim_YA.mean(axis=1)
    mean_over_Yb = cosine_sim_YB.mean(axis=1)
    s_for_Y_words = mean_over_Ya - mean_over_Yb

    # Difference of the mean associations of the two target sets.
    test_stat = s_for_X_words.mean() - s_for_Y_words.mean()

100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [01:03<00:00, 158.49it/s]


In [13]:
# Toy inputs for checking the output shape of cosine_similarity:
# for a of shape (n, d) and b of shape (m, d) the result has shape (n, m).
a = np.array([[1, 1],
              [3, 4]])
b = np.ones((3, 2), dtype=int)
print(a, b, sep='\n')

[[1 1]
 [3 4]]
[[1 1]
 [1 1]
 [1 1]]


In [14]:
# Entry [i, j] is the cosine between row i of a and row j of b, so the
# result for the toy arrays has shape (2, 3).
cosine_similarity(X=a, Y=b)

array([[1.        , 1.        , 1.        ],
       [0.98994949, 0.98994949, 0.98994949]])