In [82]:
import os
import copy
import pickle
import pandas as pd
import copy
from statistics import mean

import numpy as np
import scipy
from tqdm import tqdm

from gensim.models import Word2Vec
from gensim import utils
from gensim.corpora import Dictionary
from gensim.models.lsimodel import LsiModel
from sklearn.preprocessing import MinMaxScaler

import sys
sys.path.append('../src')
from models import get_matrices_from_term_lists, produce_effect_size

In [8]:
# Name of the pre-trained embedding run; it is also the last component of the model directory.
we_model_name = "sg_dim300_min100_win5"
# Embedding dimensionality (presumably mirrors the "dim300" part of the run name — confirm).
we_vector_size = 300
# Directory from which later cells load model.gensim and the pickled E_* objects.
we_model_dir = '../data/external/wiki-english/wiki-english-20171001/%s' % we_model_name

In [29]:
# Restore the trained Word2Vec model saved as model.gensim inside we_model_dir.
we_model = Word2Vec.load(we_model_dir + '/model.gensim')
print('loading done!')

loading done!


In [67]:
# Look the embedding up through the KeyedVectors object (`.wv`): indexing the
# Word2Vec model itself is deprecated in gensim and triggers a warning.
we_model.wv['car']

  """Entry point for launching an IPython kernel.


array([ 0.12733914, -0.08342427,  0.16369039, -0.21538447,  0.14228454,
        0.00829218,  0.18249302,  0.03723789, -0.07433749, -0.17181633,
       -0.2403415 , -0.31628737,  0.00648428, -0.1407163 , -0.18559778,
       -0.2550604 ,  0.04433366,  0.01166777, -0.10130986, -0.13285924,
       -0.06508999, -0.00890678,  0.17122649,  0.30153406,  0.10102404,
       -0.16058509, -0.01278191,  0.01183818, -0.0335123 , -0.15908985,
       -0.01373081, -0.14647457,  0.15658577,  0.05838944, -0.15460949,
        0.01621989,  0.02162932, -0.18378492,  0.32068068, -0.00999494,
       -0.27018073,  0.2592168 , -0.0147164 , -0.10615241, -0.5176828 ,
       -0.03581639, -0.00578303,  0.00869335,  0.47046474,  0.18723992,
        0.20179759, -0.15939656, -0.19770053, -0.1105013 ,  0.04323676,
        0.261503  ,  0.21725641,  0.11865173, -0.08013157,  0.11658183,
        0.0284771 , -0.12727183,  0.28431818,  0.05774978, -0.03682013,
       -0.1408243 , -0.09530429,  0.2631356 ,  0.0708516 ,  0.12

In [10]:
# Load E_ctx_vec and E_wrd_vec, the two pickled objects that get_expSG_vecs
# uses to normalize its scores (E_wrd_vec is indexed there by vocabulary index;
# E_ctx_vec is presumably the matching context-side quantity — confirm).
# NOTE(review): pickle.load can execute arbitrary code; only load files you produced or trust.
with open(we_model_dir+'/E_ctx_vec.pkl', 'rb') as fr:
    E_ctx_vec = pickle.load(fr)
with open(we_model_dir+'/E_wrd_vec.pkl', 'rb') as fr:
    E_wrd_vec = pickle.load(fr)

In [11]:
def get_expSG_vecs(words):
    """Compute the explicit SkipGram vector for every word in `words`.

    For each word, the product of the model's `syn1neg` matrix with the
    word's own vector is passed through the logistic sigmoid and then divided
    in place by `sqrt(E_ctx_vec * E_wrd_vec[word_index])`.

    Relies on the notebook-level `we_model`, `E_ctx_vec` and `E_wrd_vec`.
    Every word is looked up in `we_model.wv.vocab` without a membership
    check, so a word that is not in the vocabulary makes that lookup fail.

    Returns a dict mapping each word to its resulting vector.
    """
    vecs_by_word = {}
    for word in words:
        word_index = we_model.wv.vocab[word].index
        word_vec = we_model.wv.vectors[word_index]

        # explicit SkipGram: sigmoid of the output-layer scores ...
        scores = scipy.special.expit(np.dot(we_model.trainables.syn1neg, word_vec))
        # ... normalized in place so the array keeps its dtype.
        scores /= np.sqrt(E_ctx_vec * E_wrd_vec[word_index])
        vecs_by_word[word] = scores

    return vecs_by_word

get_expSG_vecs(['book'])

{'book': array([0.24737692, 0.56118023, 0.68307143, ..., 0.09694416, 0.18521136,
        0.15946515], dtype=float32)}

In [12]:
def get_expSG_1storder_relation(word_from, words_to):
    """Read the first-order relation values from `word_from` to `words_to`.

    The explicit SkipGram vector of `word_from` is computed once and then
    indexed by the vocabulary index of each target word. Target words that
    are not in `we_model.wv.vocab` are skipped.

    Returns a dict mapping each retained target word to its value.
    """
    source_vec = get_expSG_vecs([word_from])[word_from]

    return {
        target: source_vec[we_model.wv.vocab[target].index]
        for target in words_to
        if target in we_model.wv.vocab
    }

get_expSG_1storder_relation('book', ['book', 'library'])

{'book': 11.011071, 'library': 3.6249995}

In [84]:
def get_expSG_1storder_relation_scaled(word_from, words_to):
    """Like the unscaled first-order relation lookup, but min-max scaled.

    A `MinMaxScaler` (default settings) is fitted on the full explicit
    SkipGram vector of `word_from`; the value read for each target word is
    then passed through that scaler. Target words that are not in
    `we_model.wv.vocab` are skipped.

    Returns a dict mapping each retained target word to its scaled value.
    """
    source_vec = get_expSG_vecs([word_from])[word_from]

    # Fit on the whole vector, as a single feature column.
    scaler = MinMaxScaler()
    scaler.fit(source_vec.reshape(-1, 1))

    def _rescale(raw_value):
        # The scaler expects a 2-D input; unwrap the single scaled entry again.
        return scaler.transform(np.array([raw_value]).reshape(1, -1))[0][0]

    return {
        target: _rescale(source_vec[we_model.wv.vocab[target].index])
        for target in words_to
        if target in we_model.wv.vocab
    }



# Calculating *d*

In [35]:
# Small example term lists for trying out the metric functions.
# The WEAT cells further down reassign all four names.
A_terms = ['book','library']
B_terms = ['car','garage']
X_terms = ['female','woman']
Y_terms = ['male','man']

In [46]:
# This function is untested
def get_1storder_association_metric(word, A_terms, B_terms):
    """Mean first-order relation of `word` to `A_terms` minus that to `B_terms`.

    Both means are taken over the values returned by
    `get_expSG_1storder_relation`, so terms it skips (not in the vocabulary)
    do not enter either mean.
    """
    mean_to_A = mean(get_expSG_1storder_relation(word, A_terms).values())
    mean_to_B = mean(get_expSG_1storder_relation(word, B_terms).values())
    return mean_to_A - mean_to_B
get_1storder_association_metric('book', ['book','library'], ['car','garage'])

6.6768265

In [33]:
# Inspect the raw first-order relation values from 'book' to 'car' and 'garage'.
get_expSG_1storder_relation('book',['car','garage'])

{'car': 0.3352654, 'garage': 0.9471519}

In [86]:
# WEAT 6: first names as targets (X, Y); career vs. family terms as attributes (A, B).
X_terms = ['John', 'Paul','Mike','Kevin','Steve','Greg','Jeff','Bill']
Y_terms = ['Amy','Joan','Lisa','Sarah','Diana','Kate','Ann','Donna']
A_terms = ['executive','management','professional','corporation',
               'salary','office','business','career']
B_terms = ['home','parents','children','family',
               'cousins','marriage','wedding','relatives']
# Lower-case every term and rebind all four lists (A_terms included).
X_terms, Y_terms, A_terms, B_terms = [
    [term.lower() for term in terms]
    for terms in (X_terms, Y_terms, A_terms, B_terms)
]

In [88]:
# WEAT 7: math vs. arts terms as targets (X, Y); male vs. female terms as attributes (A, B).
# NOTE(review): this cell used to be labelled "using Google News word2vec model",
# but the model loaded in this notebook comes from the wiki-english directory — confirm.
X_terms = ['math','algebra','geometry','calculus',
             'equations','computation','numbers','addition']
Y_terms = ['poetry','art','dance','literature',
             'novel','symphony','drama','sculpture']
A_terms = ['male','man','boy','brother',
              'he','him','his','son']
B_terms = ['female','woman','girl','sister',
               'she','her','hers','daughter']
# Lower-case every term and rebind all four lists (A_terms included).
X_terms, Y_terms, A_terms, B_terms = [
    [term.lower() for term in terms]
    for terms in (X_terms, Y_terms, A_terms, B_terms)
]

In [90]:
# WEAT 8: science vs. arts terms as targets (X, Y); male vs. female terms as attributes (A, B).
A_terms = ['brother','father','uncle','grandfather',
          'son','he','his','him']
B_terms = ['sister','mother','aunt','grandmother',
          'daughter','she','hers','her']
X_terms = ['science','technology','physics','chemistry',
          'Einstein','NASA','experiment','astronomy']
Y_terms = ['poetry','art','Shakespeare','dance',
          'literature','novel','symphony','drama']
# Lower-case every term (e.g. 'Einstein', 'NASA', 'Shakespeare') and rebind
# all four lists (A_terms included).
X_terms, Y_terms, A_terms, B_terms = [
    [term.lower() for term in terms]
    for terms in (X_terms, Y_terms, A_terms, B_terms)
]

In [80]:
def produce_1storder_effect_size_unnormalized(X_terms, Y_terms, A_terms, B_terms,
                                              association_fn=None):
    """Effect size between two target sets, from first-order associations.

    Computes one association score per target term and returns

        (mean(x_scores) - mean(y_scores)) / std(x_scores + y_scores, ddof=1)

    Parameters
    ----------
    X_terms, Y_terms : sequences of target words. They may differ in length;
        every term of both sequences contributes to the result.
    A_terms, B_terms : attribute word lists, forwarded to `association_fn`.
    association_fn : optional callable `(word, A_terms, B_terms) -> number`.
        Defaults to `get_1storder_association_metric` (the unscaled metric).

    Returns
    -------
    The effect size as a NumPy float.
    """
    if association_fn is None:
        association_fn = get_1storder_association_metric

    # Score each target set on its own: pairing the two lists with zip() would
    # silently drop the surplus terms of the longer one.
    x_associations = np.array([association_fn(x, A_terms, B_terms) for x in X_terms], dtype=float)
    y_associations = np.array([association_fn(y, A_terms, B_terms) for y in Y_terms], dtype=float)

    all_associations = np.concatenate([x_associations, y_associations])
    # Sample standard deviation (ddof=1) over the pooled scores.
    return (np.mean(x_associations) - np.mean(y_associations)) / np.std(all_associations, ddof=1)
# First-order (unscaled) effect size for whichever term lists were assigned most recently.
produce_1storder_effect_size_unnormalized(X_terms, Y_terms, A_terms, B_terms)

1.4705007085670068

Results: WEAT 6: 1.8156 (1.89), WEAT 7: 1.648, WEAT 8: 1.471 (1.24)

# Second-Order

In [75]:
# Effect size from the project's `produce_effect_size` helper (imported from `models`)
# for the same term lists; reported in this notebook under the "Second-Order" heading.
produce_effect_size(we_model, X_terms, Y_terms, A_terms, B_terms)

1.4183265688072304

Results: WEAT 6: 1.774, WEAT 7: 1.595, WEAT 8: 1.4183

# Normalized

In [78]:
def get_1storder_association_metric_scaled(word, A_terms, B_terms):
    """Mean scaled first-order relation of `word` to `A_terms` minus that to `B_terms`.

    Both means are taken over the values returned by
    `get_expSG_1storder_relation_scaled`, so terms it skips (not in the
    vocabulary) do not enter either mean.
    """
    mean_to_A = mean(get_expSG_1storder_relation_scaled(word, A_terms).values())
    mean_to_B = mean(get_expSG_1storder_relation_scaled(word, B_terms).values())
    return mean_to_A - mean_to_B

In [92]:
def produce_1storder_effect_size_scaled(X_terms, Y_terms, A_terms, B_terms,
                                        association_fn=None):
    """Effect size between two target sets, from scaled first-order associations.

    Computes one association score per target term and returns

        (mean(x_scores) - mean(y_scores)) / std(x_scores + y_scores, ddof=1)

    Parameters
    ----------
    X_terms, Y_terms : sequences of target words. They may differ in length;
        every term of both sequences contributes to the result.
    A_terms, B_terms : attribute word lists, forwarded to `association_fn`.
    association_fn : optional callable `(word, A_terms, B_terms) -> number`.
        Defaults to `get_1storder_association_metric_scaled`.

    Returns
    -------
    The effect size as a NumPy float.
    """
    if association_fn is None:
        association_fn = get_1storder_association_metric_scaled

    # Score each target set on its own: pairing the two lists with zip() would
    # silently drop the surplus terms of the longer one.
    x_associations = np.array([association_fn(x, A_terms, B_terms) for x in X_terms], dtype=float)
    y_associations = np.array([association_fn(y, A_terms, B_terms) for y in Y_terms], dtype=float)

    all_associations = np.concatenate([x_associations, y_associations])
    # Sample standard deviation (ddof=1) over the pooled scores.
    return (np.mean(x_associations) - np.mean(y_associations)) / np.std(all_associations, ddof=1)
# First-order (min-max scaled) effect size for whichever term lists were assigned most recently.
produce_1storder_effect_size_scaled(X_terms, Y_terms, A_terms, B_terms)

1.4517385120046582

Summary of effect sizes, given as (2nd order, 1st order unscaled, 1st order scaled):
WEAT 6: (1.774, 1.816, 1.759)
WEAT 7: (1.595, 1.648, 1.335)
WEAT 8: (1.4183, 1.471, 1.451)