In [16]:
%load_ext autoreload
%autoreload 2
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from gensim.test.utils import datapath
from gensim.models import KeyedVectors, Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from collections import defaultdict
import sys
sys.path.append('../src')
from models import filter_terms_not_in_wemodel, \
    get_2ndorder_association_metric_list_for_target_list, \
    get_matrices_from_term_lists, \
    save_arrays, open_pickle, save_pickle, \
    save_scalers

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Glove model fast load
we_model = KeyedVectors.load('../data/interim/glove_840_normed', mmap='r')
print('loading done!')
print(f'Total words: {len(we_model.wv.vocab)}')

loading done!
Total words: 2196016


  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
EXPERIMENT_DEFINITION_PATH = '../data/interim/glove_840B_experiment_definitions.pickle'
RESULTS_FILEPATH = '../data/interim/glove_840B_association_metric_exps.pickle'
SCALERS_FILEPATH = '../data/processed/glove_840B_scalers.pickle'
THRESHOLD_BIASES_PATH_2NDORDER = '../data/processed/glove_840B_threshold_biases_2ndorder.pickle'
THRESHOLD_BIASES_PATH_1STORDER = '../data/processed/glove_840B_threshold_biases_1storder.pickle'


# Second-Order

In [29]:
def calculate_cosines_for_target_word_unscaled(word_vec, A_mtx, B_mtx):
    A_dot_v = np.dot(A_mtx, word_vec)
    B_dot_v = np.dot(B_mtx, word_vec)
    A_norms = np.multiply(np.linalg.norm(A_mtx, axis=1), np.linalg.norm(word_vec))
    B_norms = np.multiply(np.linalg.norm(B_mtx, axis=1), np.linalg.norm(word_vec))
    A_cosines = np.divide(A_dot_v, A_norms)
    B_cosines = np.divide(B_dot_v, B_norms)
    return np.mean(A_cosines), np.mean(B_cosines)

def calculate_cosines_for_all_words_unscaled(we_model, A_mtx, B_mtx):
    '''Computes the association metric, s(w,A,B).
    word_vec: 1-D word vector
    A_mtx, B_mtx: 2-D word vector arrays'''
    #A_cosines_apply = np.apply_along_axis(lambda row: 1-cosine_distance(row, word_vec), 1, A_mtx)
    #B_cosines_apply = np.apply_along_axis(lambda row: 1-cosine_distance(row, word_vec), 1, B_mtx)
    A_mtx_norm = A_mtx/np.linalg.norm(A_mtx, axis=1).reshape(-1,1)
    B_mtx_norm = B_mtx/np.linalg.norm(B_mtx, axis=1).reshape(-1,1)
    all_mtx_norm = we_model.wv.vectors/np.linalg.norm(we_model.wv.vectors, axis=1).reshape(-1,1)
    
    all_associations_to_A = np.dot(A_mtx_norm, all_mtx_norm.T)
    all_associations_to_B = np.dot(B_mtx_norm, all_mtx_norm.T)
    
    return np.mean(all_associations_to_A, axis=0), np.mean(all_associations_to_B, axis=0)


def get_2ndorder_association_metric_list_for_target_list(target_list, A_terms, B_terms, we_model, exp_num):
    
    [X_mtx, _, A_mtx, B_mtx] = get_matrices_from_term_lists(we_model, target_list, target_list, A_terms, B_terms)
    
    # A_associations, B_associations are associations for all words    
    A_associations, B_associations = calculate_cosines_for_all_words_unscaled(we_model, A_mtx, B_mtx)
    
    
    all_associations = np.concatenate((A_associations, B_associations))
    scaler = MinMaxScaler(feature_range=(0,1))
    scaler.fit(all_associations.reshape(-1,1))
    save_scalers(SCALERS_FILEPATH, exp_num, 'second', scaler)
    
    _th = np.mean(np.abs(A_associations - B_associations))
    _th = scaler.transform(_th.reshape(-1, 1))[0,0]
    
    '''
    threshold_biases = open_pickle(THRESHOLD_BIASES_PATH_2NDORDER)
    threshold_biases = scaler.transform(threshold_biases.reshape(-1,1))
    pct_5 = np.percentile(threshold_biases, 5)
    pct_95 = np.percentile(threshold_biases, 95)
    '''
    
    biases = A_associations - B_associations
    '''
    biases = scaler.transform(biases.reshape(-1, 1))
    lower_bound = np.percentile(biases, 5)
    print(f'Lower bound: {lower_bound}')
    upper_bound = np.percentile(biases, 95)
    print(f'Upper bound: {upper_bound}')
    '''
    
    target_associations = np.apply_along_axis(lambda x_vec: calculate_cosines_for_target_word_unscaled(x_vec, A_mtx, B_mtx), 1, X_mtx)
    
    target_biases = []
    A_biases = []
    for _assoc in target_associations:
        _A_assoc = scaler.transform(_assoc[0].reshape(-1, 1))[0,0]
        _B_assoc = scaler.transform(_assoc[1].reshape(-1, 1))[0,0]
        _bias = _A_assoc - _B_assoc
        target_biases.append(_bias)
        A_biases.append(_A_assoc)
    #return np.array(target_biases), _th, pct_5, pct_95, np.array(A_biases), lower_bound, upper_bound
    return np.array(target_biases), _th, None, None, np.array(A_biases), None, None

def run_exps_2ndorder(X_terms, Y_terms, A_terms, B_terms, exp_num):
    order='second'
    X_metrics, _th, pct_5, pct_95, A_biases, lower_bound, upper_bound = get_2ndorder_association_metric_list_for_target_list(X_terms, A_terms, B_terms, we_model, exp_num)
    Y_metrics, _th, pct_5, pct_95, A_biases, lower_bound, upper_bound = get_2ndorder_association_metric_list_for_target_list(Y_terms, A_terms, B_terms, we_model, exp_num)
    print (X_metrics)
    print (Y_metrics)

    print ('mean bias to X', np.mean(X_metrics))
    print ('mean bias to Y', np.mean(Y_metrics))

    print ('Bias threshold', _th)
    print ('5th percentile', pct_5)
    print ('95th percentile', pct_95)

    order = 'second'
    threshold = _th
    #save_arrays(RESULTS_FILEPATH, exp_num, order, X_metrics, Y_metrics, threshold, pct_5, pct_95, A_biases, lower_bound, upper_bound)
    save_arrays(RESULTS_FILEPATH, exp_num, order, X_metrics, Y_metrics, threshold, None, None, A_biases, None, None)
#run_exps_2ndorder(X_terms, Y_terms, A_terms, B_terms, exp_num)

In [30]:
def run_all_exps(order='second'):
    exps = open_pickle(EXPERIMENT_DEFINITION_PATH)
    print(f'ORDER = {order}')
    for exp_num, exp in exps.items():
        print('***********************************')
        print(f'Experiment: {exp_num}')
        X_terms = exp['X_terms']
        Y_terms = exp['Y_terms']
        A_terms = exp['A_terms']
        B_terms = exp['B_terms']
        if order == 'second':
            run_exps_2ndorder(X_terms, Y_terms, A_terms, B_terms, exp_num)
        else:
            run_exps_1storder(X_terms, Y_terms, A_terms, B_terms, exp_num)
run_all_exps(order='second')

ORDER = second
***********************************
Experiment: 1




[ 0.04218954  0.07744247 -0.00571454  0.05228046  0.04121482  0.01642609
  0.05941069  0.08293772  0.10962468  0.06767094  0.05554709  0.08660334
  0.11914104  0.02486253  0.09582818  0.06356439  0.10746822  0.09457076
  0.0782218   0.06955409  0.07021624  0.00056103  0.07109076  0.00462493
  0.01719075]
[-0.06156641 -0.02580431 -0.01672459 -0.09525567 -0.04476053 -0.16175112
 -0.07126629  0.05551386 -0.14847207 -0.04637542  0.02886164 -0.11517835
 -0.08661395 -0.07366097 -0.09728244 -0.07608503  0.02309245 -0.10237056
 -0.07108605 -0.11936021 -0.1044682   0.08603638 -0.07104123 -0.0895586
 -0.11381534]
mean bias to X 0.060101118
mean bias to Y -0.063959725
Bias threshold 0.43816024
5th percentile None
95th percentile None
Results array successfully saved to file ../data/interim/glove_840B_association_metric_exps.pickle under keys [1][second]
***********************************
Experiment: 2




[0.02112871 0.10340279 0.09794074 0.03156099 0.06695956 0.08279988
 0.08799821 0.07756364 0.09256029 0.0854587  0.02969325 0.03033215
 0.09486365 0.04832196 0.06213605 0.07350785 0.04589254 0.04254091
 0.14756882 0.07185456 0.04977226 0.13195184 0.04830742 0.07840472]
[ 0.13220632 -0.08497709 -0.10385516 -0.02845621 -0.07789075 -0.04371208
 -0.09026426 -0.03039348 -0.0064081   0.00885332 -0.05281079 -0.11493045
 -0.01990819 -0.02996206 -0.16547555 -0.11600381 -0.02762115 -0.0789417
 -0.15496811 -0.01919436 -0.13672858 -0.04515535 -0.02813044 -0.00747687]
mean bias to X 0.0709384
mean bias to Y -0.055091873
Bias threshold 0.43816024
5th percentile None
95th percentile None
Results array successfully saved to file ../data/interim/glove_840B_association_metric_exps.pickle under keys [2][second]
***********************************
Experiment: 3




[0.04947376 0.03819066 0.06075794 0.03681493 0.04013211 0.00934345
 0.04949635 0.03551841 0.0395602  0.04871958 0.02307355 0.013138
 0.0571239  0.03247112 0.04018152 0.01846474 0.05561113 0.04540902
 0.03290111 0.07932207 0.06089175 0.09034175 0.06679013 0.08316827
 0.03810418 0.06469381 0.04321086 0.09191966 0.07615888 0.08920604
 0.05901968 0.07435572]
[-0.04334632 -0.05183366  0.03639939 -0.00722542  0.03240851  0.00173777
 -0.04019764 -0.05799341  0.01127321  0.01092386 -0.01803887 -0.05119988
 -0.03436151 -0.03272891 -0.01318696 -0.06054419 -0.05950689 -0.07549354
 -0.02034172  0.03173459 -0.00107506  0.04394093 -0.07391176 -0.10550284
  0.09901994 -0.04825634  0.03441891 -0.02459928 -0.05353639  0.02470395
 -0.02235052  0.01960972]
mean bias to X 0.051361386
mean bias to Y -0.017158136
Bias threshold 0.45130557
5th percentile None
95th percentile None
Results array successfully saved to file ../data/interim/glove_840B_association_metric_exps.pickle under keys [3][second]
********



[ 0.05068097  0.01993805 -0.02059141  0.0236299   0.00840753  0.01258183
  0.03077859 -0.01993155  0.02975827  0.04305631  0.05260944  0.07919616
  0.05781418  0.05057943  0.02023917  0.03302085]
[-0.05321178 -0.0120869  -0.01012674  0.00173041 -0.03187305 -0.00158376
 -0.03198817 -0.06319198 -0.01371768  0.03941283 -0.00934198 -0.03070247
 -0.0141753  -0.04654589 -0.01330665 -0.03882402]
mean bias to X 0.029485483
mean bias to Y -0.02059582
Bias threshold 0.43801668
5th percentile None
95th percentile None
Results array successfully saved to file ../data/interim/glove_840B_association_metric_exps.pickle under keys [4][second]
***********************************
Experiment: 5




[0.02997595 0.02777767 0.0562219  0.03991467 0.05625299 0.06538895
 0.03081465 0.09509486 0.10302895 0.10436741 0.1357091  0.13586634
 0.1445094  0.08173317 0.10956419]
[ 0.00755104  0.05193174  0.00831422  0.01326728  0.01684302  0.05267057
  0.01058781 -0.0105519   0.0463689   0.04227883  0.04774749  0.0175404
  0.03432521  0.05756029  0.0054355 ]
mean bias to X 0.08108135
mean bias to Y 0.02679136
Bias threshold 0.41365448
5th percentile None
95th percentile None
Results array successfully saved to file ../data/interim/glove_840B_association_metric_exps.pickle under keys [5][second]
***********************************
Experiment: 6




[-0.02403039 -0.00274336  0.03149205  0.00241226  0.06785169  0.02037591
  0.03925794  0.03305089]
[-0.1275298  -0.11192524 -0.09374321 -0.19913763 -0.14690241 -0.16776764
 -0.14576101 -0.10901403]
mean bias to X 0.020958375
mean bias to Y -0.13772263
Bias threshold 0.397994
5th percentile None
95th percentile None
Results array successfully saved to file ../data/interim/glove_840B_association_metric_exps.pickle under keys [6][second]
***********************************
Experiment: 7




[ 0.00286573  0.00294164  0.00115371  0.02871773  0.00278935  0.01474109
  0.03175566 -0.00981426]
[-0.02410835  0.0049789  -0.04747242 -0.01069218 -0.03350341  0.02037662
 -0.01521182  0.00030255]
mean bias to X 0.00939383
mean bias to Y -0.013166264
Bias threshold 0.39708924
5th percentile None
95th percentile None
Results array successfully saved to file ../data/interim/glove_840B_association_metric_exps.pickle under keys [7][second]
***********************************
Experiment: 8




[ 0.03385812  0.03231093  0.05526206 -0.00139216  0.03719294  0.02329931
  0.01163203  0.04156119]
[-0.01482975  0.01632476  0.01545653 -0.04080719 -0.00069809 -0.02520853
  0.02052143 -0.00741225]
mean bias to X 0.029215552
mean bias to Y -0.0045816377
Bias threshold 0.34878048
5th percentile None
95th percentile None
Results array successfully saved to file ../data/interim/glove_840B_association_metric_exps.pickle under keys [8][second]
***********************************
Experiment: 9




[-0.07468641 -0.09706289 -0.00810719  0.03295875 -0.09705055 -0.12483597]
[-0.18552452 -0.17207026 -0.11537635 -0.22701114 -0.14478397 -0.19616002]
mean bias to X -0.06146404
mean bias to Y -0.17348771
Bias threshold 0.41640112
5th percentile None
95th percentile None
Results array successfully saved to file ../data/interim/glove_840B_association_metric_exps.pickle under keys [9][second]
***********************************
Experiment: 10




[0.07385314 0.05609781 0.04313481 0.0367108  0.03212434 0.05113655
 0.0641565  0.01035607]
[ 0.01141104  0.04575011  0.00337946  0.03290766 -0.01298252 -0.03145593
 -0.02803424 -0.00526515]
mean bias to X 0.04594625
mean bias to Y 0.0019638054
Bias threshold 0.41365448
5th percentile None
95th percentile None
Results array successfully saved to file ../data/interim/glove_840B_association_metric_exps.pickle under keys [10][second]
