In [2]:
%load_ext autoreload
%autoreload 2
import numpy as np
import scipy as sp
from tqdm import tqdm
from gensim.models import KeyedVectors, Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import random
random.seed(5)
import sys
sys.path.append('../src')
from models import get_matrices_from_term_lists, \
    filter_terms_not_in_wemodel, \
    save_pickle, open_pickle, \
    save_experiment_arbitrary_label

In [3]:
SCALERS_FILEPATH = '../data/processed/scalers.pickle'
RESULTS_FILEPATH = '../data/interim/association_metric_exps.pickle'

we_model_name = "sg_dim300_min100_win5"
we_vector_size = 300
we_model_dir = '../data/external/wiki-english/wiki-english-20171001/%s' % we_model_name

we_model = Word2Vec.load(we_model_dir+'/model.gensim')
print ('loading done!')
print(f'Total words: {len(we_model.wv.vocab)}')

'''
# Caliskan GloVe
glove_file = '../data/external/glove.6B/glove.6B.50d.txt'
_ = glove2word2vec(glove_file, '../data/interim/tmp.txt')
we_model = KeyedVectors.load_word2vec_format('../data/interim/tmp.txt')
print('loading done!')
print(f'Total words: {len(we_model.wv.vocab)}')
'''

loading done!
Total words: 312425


"\n# Caliskan GloVe\nglove_file = '../data/external/glove.6B/glove.6B.50d.txt'\n_ = glove2word2vec(glove_file, '../data/interim/tmp.txt')\nwe_model = KeyedVectors.load_word2vec_format('../data/interim/tmp.txt')\nprint('loading done!')\nprint(f'Total words: {len(we_model.wv.vocab)}')\n"

In [4]:
X_terms = ['aster', 'clover', 'hyacinth', 'marigold', 'poppy', 'azalea', 
           'crocus', 'iris', 'orchid', 'rose', 'bluebell', 'daffodil','lilac',
           'pansy','tulip','buttercup','daisy','lily','penny','violet','carnation', 'gladiola',
           'magnolia','petunia','zinnia']
Y_terms = ['ant','caterpillar','flea','locust','spider','bedbug','centipede','fly',
          'maggot','tarantula','bee','cockroach','gnat','mosquito','termite','beetle',
          'cricket','hornet','moth','wasp','blackfly','dragonfly','horsefly','roach',
          'weevil']
A_terms = ['caress','freedom','health','love','peace','cheer','friend','heaven',
           'loyal','pleasure','diamond','gentle','honest','lucky','rainbow','diploma',
           'gift','honor','miracle','sunrise','family','happy','laugher','paradise',
           'vacation']
B_terms = ['abuse','crash','filth','murder','sickness','accident','death','grief',
          'poison','stink','assault','disaster','hatred','pollute','tragedy',
          'divorce','jail','poverty','ugly','cancer','kill','rotten','vomit','agony',
          'prison']
X_terms, Y_terms = filter_terms_not_in_wemodel(we_model, X_terms, Y_terms)
A_terms, B_terms = filter_terms_not_in_wemodel(we_model, A_terms, B_terms)

The following terms were removed from the list first_list because they were not found in the we_model: ['gladiola']
The following terms were removed from the list second_list because they were not found in the we_model: []
The following terms were removed from the second list to balance the length of the lists: ['ant']
The following terms were removed from the list first_list because they were not found in the we_model: []
The following terms were removed from the list second_list because they were not found in the we_model: []


# Generating Distributions

In [6]:
# Fastest version, 10000 words -> 1 minute
# (Possible TODO) May be able to add minimal speedup with itemgetter 
# (see https://stackoverflow.com/questions/18453566/python-dictionary-get-list-of-values-for-list-of-keys)
# to speed up creation of word matrices in get_matrices_from_term_lists
def get_test_stat(wv_obj, X_terms, Y_terms, A_terms, B_terms):  
    [X_mtx, Y_mtx, A_mtx, B_mtx] = get_matrices_from_term_lists(we_model, X_terms, Y_terms, A_terms, B_terms)
    cosine_sim_XA = cosine_similarity(X_mtx, A_mtx)
    cosine_sim_XB = cosine_similarity(X_mtx, B_mtx)
    mean_over_Xa = np.mean(cosine_sim_XA, axis=1)
    mean_over_Xb = np.mean(cosine_sim_XB, axis=1)
    s_for_X_words = mean_over_Xa - mean_over_Xb
    # shape is (24,) or (|X_terms|,)

    cosine_sim_YA = cosine_similarity(Y_mtx, A_mtx)
    cosine_sim_YB = cosine_similarity(Y_mtx, B_mtx)
    mean_over_Ya = np.mean(cosine_sim_YA, axis=1)
    mean_over_Yb = np.mean(cosine_sim_YB, axis=1)
    s_for_Y_words = mean_over_Ya - mean_over_Yb
    test_stat = np.mean(s_for_X_words) - np.mean(s_for_Y_words)
    return test_stat
get_test_stat(we_model, X_terms, Y_terms, A_terms, B_terms)

0.049525782

In [6]:
# This cell works too. It takes twice as long as the cell above,
# but if we want to try to vectorize the outer loop, then 
# we will probably have to use this version
def calculate_association_metric_for_target_word(word_vec, A_mtx, B_mtx):
    '''Computes the association metric, s(w,A,B).
    word_vec: 1-D word vector
    A_mtx, B_mtx: 2-D word vector arrays'''
    A_dot_v = np.dot(A_mtx, word_vec)
    B_dot_v = np.dot(B_mtx, word_vec)
    A_norms = np.multiply(np.linalg.norm(A_mtx, axis=1), np.linalg.norm(word_vec))
    B_norms = np.multiply(np.linalg.norm(B_mtx, axis=1), np.linalg.norm(word_vec))
    A_cosines = np.divide(A_dot_v, A_norms)
    B_cosines = np.divide(B_dot_v, B_norms)
    return np.mean(A_cosines) - np.mean(B_cosines)

for i in tqdm(range(10000)):
    [X_mtx, Y_mtx, A_mtx, B_mtx] = get_matrices_from_term_lists(we_model, X_terms, Y_terms, A_terms, B_terms)
    X_associations = np.apply_along_axis(lambda x_vec: calculate_association_metric_for_target_word(x_vec, A_mtx, B_mtx), 1, X_mtx)
    Y_associations = np.apply_along_axis(lambda y_vec: calculate_association_metric_for_target_word(y_vec, A_mtx, B_mtx), 1, Y_mtx)
    m = np.mean(X_associations) - np.mean(Y_associations)


100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [01:50<00:00, 90.22it/s]


In [11]:
def get_n_test_stats(wv_obj, X_terms, Y_terms, A_terms, B_terms, n_samples=100):
    sigtest_dist_1 = []
    sigtest_dist_2 = []
    sigtest_dist_3 = []
    n_targets = len(X_terms)
    n_attributes = len(A_terms)
    assert len(X_terms) == len(Y_terms)
    assert len(A_terms) == len(B_terms)
    vocab_list = list(wv_obj.wv.vocab)
    random.seed(5)
    for i in tqdm(range(n_samples)):
        X_sample = random.sample(vocab_list, k=n_targets)
        Y_sample = random.sample(vocab_list, k=n_targets)
        sigtest_dist_1.append(get_test_stat(wv_obj, X_sample, Y_sample, A_terms, B_terms))
        sigtest_dist_2.append(get_test_stat(wv_obj, X_terms, Y_sample, A_terms, B_terms))
        sigtest_dist_3.append(get_test_stat(wv_obj, X_sample, Y_terms, A_terms, B_terms))
    return np.array(sigtest_dist_1), np.array(sigtest_dist_2), np.array(sigtest_dist_3)
#a,b,c = get_n_test_stats(we_model, X_terms, Y_terms, A_terms, B_terms)

In [13]:
# Reminder that if you run this cell with a lower number of n_samples, 
# It will overwrite what's currently in the dictionary
FILEPATH = '../data/interim/association_metric_exps.pickle'
EXPERIMENT_DEFINITION_PATH = '../data/interim/experiment_definitions.pickle'
def run_all_sigtests(order='second'):
    exps = open_pickle(EXPERIMENT_DEFINITION_PATH)
    scalers_dict = open_pickle(SCALERS_FILEPATH)
    print(f'ORDER = {order}')
    for exp_num, exp in exps.items():
        print('******************************')
        print(f'Experiment: {exp_num}')
        X_terms = exp['X_terms']
        Y_terms = exp['Y_terms']
        A_terms = exp['A_terms']
        B_terms = exp['B_terms']
        if order == 'second':
            #scaler = scalers_dict[exp_num][order]
            comparison_statistic = get_test_stat(we_model, X_terms, Y_terms, A_terms, B_terms)
            dist_1, dist_2, dist_3 = get_n_test_stats(we_model, X_terms, Y_terms, A_terms, B_terms, n_samples=10000)
            #[dist_1, dist_2, dist_3] = [scaler.transform(dist.reshape(-1,1)).reshape(len(dist)) for dist in [dist_1, dist_2, dist_3]]
            save_experiment_arbitrary_label(FILEPATH, exp_num, order,
                                            'sigtest_dist_1', dist_1)
            save_experiment_arbitrary_label(FILEPATH, exp_num, order,
                                            'sigtest_dist_2', dist_2)
            save_experiment_arbitrary_label(FILEPATH, exp_num, order,
                                            'sigtest_dist_3', dist_3)
        else:
            #TODO
            raise NotImplementedError
            run_exps_1storder(X_terms, Y_terms, A_terms, B_terms, exp_num)
run_all_sigtests(order='second')

ORDER = second
******************************
Experiment: 1


100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [02:25<00:00, 68.71it/s]


Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [1][second][sigtest_dist_1]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [1][second][sigtest_dist_2]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [1][second][sigtest_dist_3]
******************************
Experiment: 2


100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [02:14<00:00, 74.10it/s]


Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [2][second][sigtest_dist_1]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [2][second][sigtest_dist_2]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [2][second][sigtest_dist_3]
******************************
Experiment: 3


100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [02:30<00:00, 66.45it/s]


Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [3][second][sigtest_dist_1]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [3][second][sigtest_dist_2]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [3][second][sigtest_dist_3]
******************************
Experiment: 4


100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [02:19<00:00, 71.57it/s]


Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [4][second][sigtest_dist_1]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [4][second][sigtest_dist_2]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [4][second][sigtest_dist_3]
******************************
Experiment: 5


100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [01:38<00:00, 101.64it/s]


Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [5][second][sigtest_dist_1]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [5][second][sigtest_dist_2]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [5][second][sigtest_dist_3]
******************************
Experiment: 6


100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [01:23<00:00, 120.00it/s]


Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [6][second][sigtest_dist_1]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [6][second][sigtest_dist_2]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [6][second][sigtest_dist_3]
******************************
Experiment: 7


100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [01:31<00:00, 109.04it/s]


Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [7][second][sigtest_dist_1]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [7][second][sigtest_dist_2]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [7][second][sigtest_dist_3]
******************************
Experiment: 8


100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [01:33<00:00, 107.02it/s]


Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [8][second][sigtest_dist_1]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [8][second][sigtest_dist_2]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [8][second][sigtest_dist_3]
******************************
Experiment: 9


100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [01:39<00:00, 100.92it/s]


Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [9][second][sigtest_dist_1]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [9][second][sigtest_dist_2]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [9][second][sigtest_dist_3]
******************************
Experiment: 10


100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [01:39<00:00, 100.16it/s]


Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [10][second][sigtest_dist_1]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [10][second][sigtest_dist_2]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [10][second][sigtest_dist_3]


# Fitting Distributions

In [34]:
RESULTS_FILEPATH = '../data/interim/association_metric_exps.pickle'
d = open_pickle(RESULTS_FILEPATH)

In [38]:
d[1]

defaultdict(dict,
            {'first': {'X_array': array([ 0.08113076,  0.18911934, -0.0954811 ,  0.13401467,  0.09897012,
                      0.19287561,  0.11558053,  0.06996465,  0.12096066,  0.12797187,
                      0.11942707,  0.18256408,  0.17806363,  0.17999095,  0.11687332,
                      0.08817419,  0.13236946,  0.10520089,  0.17782208,  0.09271343,
                      0.14632878,  0.13019669,  0.14671066,  0.15014505], dtype=float32),
              'Y_array': array([-0.04801004, -0.06460989, -0.08932857, -0.06297787, -0.40200305,
                     -0.1162259 ,  0.05139589, -0.29557508, -0.14763156,  0.11352445,
                     -0.1613886 , -0.0787251 , -0.19676779, -0.14766207, -0.03967397,
                      0.07359396, -0.04168847,  0.08253389, -0.0736915 , -0.06181172,
                      0.0839064 ,  0.08104831,  0.03454182, -0.13286161], dtype=float32),
              'X_mean': 0.12423697,
              'Y_mean': -0.068337,
            

In [10]:
from scipy.stats import norm
FILEPATH = '../data/interim/association_metric_exps.pickle'
EXPERIMENT_DEFINITION_PATH = '../data/interim/experiment_definitions.pickle'
def calculate_all_sigtest_metrics(order='second'):
    exps = open_pickle(EXPERIMENT_DEFINITION_PATH)
    scalers_dict = open_pickle(SCALERS_FILEPATH)
    results_dict = open_pickle(RESULTS_FILEPATH)
    print(f'ORDER = {order}')
    for exp_num, exp in exps.items():
        print('******************************')
        print(f'Experiment: {exp_num}')
        X_terms = exp['X_terms']
        Y_terms = exp['Y_terms']
        A_terms = exp['A_terms']
        B_terms = exp['B_terms']
        if order == 'second':
            comparison_statistic = get_test_stat(we_model, X_terms, Y_terms, A_terms, B_terms)
            
            dist_1, dist_2, dist_3 = [results_dict[exp_num][order][f'sigtest_dist_{n}'] for n in [1,2,3]]
            loc_1, loc_2, loc_3 = [np.mean(dist) for dist in [dist_1, dist_2, dist_3]]
            scale_1, scale_2, scale_3 = [np.std(dist) for dist in [dist_1, dist_2, dist_3]]
            # If you want to play around with statistics of the distributions,
            # Add code and print statements here, e.g.
            # print(f'90% CI for dist 1: {norm.ppf(0.1, loc=loc_1, scale=scale_1)}')
            
            save_experiment_arbitrary_label(FILEPATH, exp_num, order, 'test_statistic', comparison_statistic)
            
            '''
            save_experiment_arbitrary_label(FILEPATH, exp_num, order,
                                            'ST1_80CI', [norm.ppf(0.1, loc=loc_1, scale=scale_1),
                                                         norm.ppf(0.9, loc=loc_1, scale=scale_1)])
            save_experiment_arbitrary_label(FILEPATH, exp_num, order,
                                            'ST1_90CI', [norm.ppf(0.05, loc=loc_1, scale=scale_1),
                                                        norm.ppf(0.95, loc=loc_1, scale=scale_1)])
            save_experiment_arbitrary_label(FILEPATH, exp_num, order,
                                            'ST1_95CI', [norm.ppf(0.025, loc=loc_1, scale=scale_1),
                                                        norm.ppf(0.975, loc=loc_1, scale=scale_1)])
            '''
            save_experiment_arbitrary_label(FILEPATH, exp_num, order,
                                            'ST1_p-value', [norm.ppf(0.025, loc=loc_1, scale=scale_1),
                                                        norm.ppf(0.975, loc=loc_1, scale=scale_1)])
            
        else:
            #TODO
            raise NotImplementedError
            run_exps_1storder(X_terms, Y_terms, A_terms, B_terms, exp_num)
calculate_all_sigtest_metrics(order='second')

ORDER = second
******************************
Experiment: 1
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [1][second][test_statistic]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [1][second][ST1_80CI]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [1][second][ST1_90CI]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [1][second][ST1_95CI]
******************************
Experiment: 2
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [2][second][test_statistic]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [2][second][ST1_80CI]
Results array successfully saved to file ../data/interim/association_metric_exps.pickle under    keys [2][second][ST1_90CI]
Results array s