In [1]:
import gensim
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load Google's pre-trained Word2Vec vectors (GoogleNews, binary word2vec format).
model = gensim.models.KeyedVectors.load_word2vec_format(
    fname='GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)
# L2-normalise the vectors (replace=True keeps only the normalised copies);
# this is needed to reproduce the published results.
model.init_sims(replace=True)

In [3]:
def check_full_list(counterpart, obj, subj, topn=10, keyed_vectors=None):
    '''
    Rank the best answers to the analogy  subj : obj :: counterpart : ??

    That is, look for the words ?? such that

    vec(subj) - vec(obj) ~= vec(counterpart) - vec(??)

    e.g. vec(he) - vec(king) ~= vec(she) - vec(??), where ?? is expected to be "queen".

    The candidates are the nearest neighbours (by cosine similarity) of the query vector

    vec(counterpart) - vec(subj) + vec(obj)        e.g. vec(she) - vec(he) + vec(king)

    as returned by `most_similar`. `most_similar` never returns the query words
    themselves, so `obj` is scored against the same query vector here and added
    to the list, to show where it would have ranked.

    Args:
        counterpart (str): The counterpart of the subject in the analogy ("she" above)
        obj (str): The object whose relationship with the subject is of interest
                   ("king" above)
        subj (str): The subject paired with the object ("he" above)
        topn (int): Number of neighbours requested from `most_similar`. Defaults to 10.
        keyed_vectors: Object exposing `most_similar` and `get_vector`. Defaults to
                       the notebook-level `model`.

    Returns:
        A list of at most topn + 1 `(word, cosine similarity)` tuples, sorted by
        decreasing similarity: the `most_similar` neighbours plus one entry for `obj`.
        The similarity computed for `obj` is a plain Python float.

    '''
    vectors = model if keyed_vectors is None else keyed_vectors

    def _unit(word):
        # most_similar combines unit-length word vectors (see the gensim docs), so the
        # same normalisation is applied here; it is a no-op when the vectors were
        # already normalised in place.
        vec = np.asarray(vectors.get_vector(word))
        length = np.linalg.norm(vec)
        return vec / length if length else vec

    ranked = list(vectors.most_similar(positive=[counterpart, obj], negative=[subj], topn=topn))

    obj_vec = _unit(obj)
    query = _unit(counterpart) + obj_vec - _unit(subj)
    # Cosine similarity on the flat vectors, so any embedding dimension works.
    # A zero-length vector scores 0.0 instead of dividing by zero.
    norm_product = np.linalg.norm(query) * np.linalg.norm(obj_vec)
    obj_similarity = float(np.dot(query, obj_vec) / norm_product) if norm_product else 0.0

    ranked.append((obj, obj_similarity))
    return sorted(ranked, key=lambda pair: pair[1], reverse=True)

## Demonstration of how word embeddings can solve analogies, as in Mikolov *et al*. (2013)

$\vec{\text{Paris}} - \vec{\text{France}} \approx \vec{\text{Tokyo}} - \vec{\text{??}}$

In [28]:
check_full_list(subj='Paris', obj='France', counterpart='Tokyo')

[('Japan', 0.8167769908905029),
 ('Japanese', 0.648090124130249),
 ('South_Korea', 0.6141558885574341),
 ('Japans', 0.6117385029792786),
 ('Shizuoka', 0.5742497444152832),
 ('Aomori_Prefecture', 0.5598059892654419),
 ('northernmost_prefecture', 0.5524747967720032),
 ('Kyushu', 0.5514252185821533),
 ('captain_Makoto_Hasebe', 0.5508174896240234),
 ('Shimane', 0.5497493743896484),
 ('France', 0.53038573)]

## An appropriate *she-he* analogy

$\vec{\text{he}} - \vec{\text{king}} \approx \vec{\text{she}} - \vec{\text{??}}$

In [6]:
check_full_list(subj='he', obj='king', counterpart='she')

[('queen', 0.7633836269378662),
 ('king', 0.6967242),
 ('princess', 0.6342117786407471),
 ('queens', 0.5744965076446533),
 ('monarch', 0.5577754974365234),
 ('goddess', 0.5278830528259277),
 ('princesses', 0.5202734470367432),
 ('Queen_Consort', 0.5134546756744385),
 ('very_pampered_McElhatton', 0.5131746530532837),
 ('empress', 0.5119600892066956),
 ('queendom', 0.5091063380241394)]

$\vec{\text{he}} - \vec{\text{brother}} \approx \vec{\text{she}} - \vec{\text{??}}$

In [26]:
check_full_list(subj='he', obj='brother', counterpart='she')

[('sister', 0.7946048378944397),
 ('daughter', 0.7510346174240112),
 ('mother', 0.7232228517532349),
 ('brother', 0.70276976),
 ('husband', 0.702451765537262),
 ('niece', 0.6873900890350342),
 ('aunt', 0.661358118057251),
 ('eldest_daughter', 0.6575361490249634),
 ('sisters', 0.6565696597099304),
 ('twin_sister', 0.635208010673523),
 ('neice', 0.6324375867843628)]

## Examples of gender-stereotyped *she-he* analogies reported in Bolukbasi *et al*. (2016)

### *sewing-carpentry*

$\vec{\text{she}} - \vec{\text{sewing}} \approx \vec{\text{he}} - \vec{\text{??}}$

In [8]:
check_full_list(subj='she', obj='sewing', counterpart='he')

[('sewing', 0.66992193),
 ('woodworking', 0.5794616937637329),
 ('sew', 0.531487226486206),
 ('carpentry', 0.5230334997177124),
 ('woodcarving', 0.49166664481163025),
 ('wood_carving', 0.4753499925136566),
 ('leatherworking', 0.4700630009174347),
 ('Sewing', 0.4634256660938263),
 ('knitting', 0.46304479241371155),
 ('spinning_weaving', 0.4606916308403015),
 ('woodworking_shop', 0.45343706011772156)]

$\vec{\text{he}} - \vec{\text{sewing}} \approx \vec{\text{she}} - \vec{\text{??}}$

In [7]:
check_full_list(subj='he', obj='sewing', counterpart='she')

[('sewing', 0.8904124),
 ('knitting', 0.6953117251396179),
 ('quilting', 0.6564728021621704),
 ('crocheting', 0.6488434076309204),
 ('Sewing', 0.6432007551193237),
 ('sew', 0.6409136652946472),
 ('needlework', 0.6230053305625916),
 ('sewing_embroidery', 0.6086386442184448),
 ('crochet', 0.6023696064949036),
 ('embroidery', 0.6006345152854919),
 ('sewing_machine', 0.5801945924758911)]

### *nurse-surgeon*

$\vec{\text{she}} - \vec{\text{nurse}} \approx \vec{\text{he}} - \vec{\text{??}}$

In [11]:
check_full_list(subj='she', obj='nurse', counterpart='he')

[('nurse', 0.6655272),
 ('doctor', 0.5559605360031128),
 ('medic', 0.5425376892089844),
 ('physician', 0.5394270420074463),
 ('x_ray_technician', 0.5355567932128906),
 ('surgeon', 0.516014575958252),
 ('nurses', 0.49741458892822266),
 ('paramedic', 0.4924110770225525),
 ('anesthetist', 0.4886544942855835),
 ('patient', 0.46544167399406433),
 ('doctors', 0.4639861583709717)]

$\vec{\text{she}} - \vec{\text{nurse}} \approx \vec{\text{she}} - \vec{\text{??}}$

In [15]:
check_full_list(subj='she', obj='nurse', counterpart='she')

[('nurse', 1.0),
 ('registered_nurse', 0.7907712459564209),
 ('nurses', 0.738167405128479),
 ('nurse_practitioner', 0.699310302734375),
 ('midwife', 0.6727651953697205),
 ('respiratory_therapist', 0.6620449423789978),
 ('Nurse', 0.6428854465484619),
 ('nursing', 0.6424654722213745),
 ('doctor', 0.6319522857666016),
 ('neonatal_nurse', 0.6193329691886902),
 ('x_ray_technician', 0.6126095056533813)]

$\vec{\text{he}} - \vec{\text{nurse}} \approx \vec{\text{she}} - \vec{\text{??}}$

In [12]:
check_full_list(subj='he', obj='nurse', counterpart='she')

[('nurse', 0.82805425),
 ('registered_nurse', 0.7027999758720398),
 ('nurse_practitioner', 0.6314352750778198),
 ('midwife', 0.6205434203147888),
 ('nurses', 0.6066274046897888),
 ('certified_lactation_counselor', 0.5820688605308533),
 ('nurse_midwife', 0.5799018144607544),
 ('birth_doula', 0.5744963884353638),
 ('neonatal_nurse', 0.5663833022117615),
 ('dental_hygienist', 0.551973819732666),
 ('lactation_consultant', 0.5466617345809937)]

$\vec{\text{he}} - \vec{\text{surgeon}} \approx \vec{\text{she}} - \vec{\text{??}}$

In [14]:
check_full_list(subj='he', obj='surgeon', counterpart='she')

[('surgeon', 0.723608),
 ('gynecologist', 0.6337347030639648),
 ('surgeons', 0.6173180341720581),
 ('plastic_surgeon', 0.5985187888145447),
 ('nurse', 0.5903362035751343),
 ('cosmetic_surgeon', 0.5817878246307373),
 ('hysterectomy', 0.5799472332000732),
 ('obstetrician', 0.5660049319267273),
 ('sonographer', 0.5639315843582153),
 ('midwife', 0.5569908618927002),
 ('MRI_technologist', 0.546063244342804)]

In [27]:
check_full_list(subj='he', obj='surgeon', counterpart='he')

[('surgeon', 1.0000001),
 ('surgeons', 0.7731136083602905),
 ('neurosurgeon', 0.7661304473876953),
 ('Surgeon', 0.7248443365097046),
 ('orthopedic_surgeon', 0.7162246704101562),
 ('plastic_surgeon', 0.70467209815979),
 ('doctor', 0.6793397665023804),
 ('urologist', 0.6659057140350342),
 ('vascular_surgeon', 0.6651214361190796),
 ('thoracic_surgeon', 0.6579000949859619),
 ('physician', 0.6561126708984375)]

### *feminism-conservatism*

$\vec{\text{she}} - \vec{\text{feminism}} \approx \vec{\text{he}} - \vec{\text{??}}$

In [16]:
check_full_list(subj='she', obj='feminism', counterpart='he')

[('feminism', 0.6571188),
 ('liberalism', 0.5454636812210083),
 ('liberationist', 0.529727578163147),
 ('conservatism', 0.5211994051933289),
 ('neoconservativism', 0.5119233131408691),
 ('anarchism', 0.5116571187973022),
 ('leftism', 0.5088927745819092),
 ('intellectualism', 0.5075400471687317),
 ('Neo_conservatism', 0.5073385238647461),
 ('postmodernism', 0.4998423755168915),
 ('Hegelian_dialectic', 0.49135786294937134)]

$\vec{\text{she}} - \vec{\text{feminism}} \approx \vec{\text{she}} - \vec{\text{??}}$

In [23]:
check_full_list(subj='she', obj='feminism', counterpart='she')

[('feminism', 1.0000001),
 ('feminist', 0.8082043528556824),
 ('feminists', 0.7639337778091431),
 ('feminist_movement', 0.7451367974281311),
 ('Feminism', 0.7141621112823486),
 ('radical_feminism', 0.6737356781959534),
 ('radical_feminist', 0.6637553572654724),
 ('Betty_Freidan', 0.6418492794036865),
 ('feminisms', 0.6368929743766785),
 ('womanist', 0.6334027051925659),
 ('women_libbers', 0.6325899958610535)]

### *lovely-brilliant*

$\vec{\text{she}} - \vec{\text{lovely}} \approx \vec{\text{he}} - \vec{\text{??}}$

In [21]:
check_full_list(subj='she', obj='lovely', counterpart='he')

[('lovely', 0.69995606),
 ('magnificent', 0.6248228549957275),
 ('marvelous', 0.6054928302764893),
 ('splendid', 0.5995590686798096),
 ('nice', 0.5869458913803101),
 ('fantastic', 0.5587064027786255),
 ('delightful', 0.5561120510101318),
 ('terrific', 0.5524159669876099),
 ('wonderful', 0.5481390953063965),
 ('brilliant', 0.5460425615310669),
 ('beautiful', 0.545063316822052)]

In [22]:
check_full_list(subj='she', obj='lovely', counterpart='she')

[('lovely', 1.0),
 ('beautiful', 0.8106936812400818),
 ('gorgeous', 0.8014094233512878),
 ('delightful', 0.7586833238601685),
 ('wonderful', 0.7320095896720886),
 ('fabulous', 0.712957501411438),
 ('marvelous', 0.6729508638381958),
 ('nice', 0.6676310300827026),
 ('charming', 0.6509542465209961),
 ('magnificent', 0.650709867477417),
 ('splendid', 0.6399756669998169)]

### *charming-affable*
$\vec{\text{she}} - \vec{\text{charming}} \approx \vec{\text{he}} - \vec{\text{??}}$

In [17]:
check_full_list(subj='she', obj='charming', counterpart='he')

[('charming', 0.72905624),
 ('genial', 0.6243418455123901),
 ('affable', 0.6227308511734009),
 ('likeable', 0.6079907417297363),
 ('amiable', 0.5914740562438965),
 ('likable', 0.5785181522369385),
 ('unassuming', 0.5687059760093689),
 ('urbane', 0.5588454604148865),
 ('jovial', 0.5488089323043823),
 ('unpretentious', 0.5404201745986938),
 ('suave', 0.5303424596786499)]

### *cosmetics-pharmaceuticals*
$\vec{\text{she}} - \vec{\text{cosmetics}} \approx \vec{\text{he}} - \vec{\text{??}}$

In [24]:
check_full_list(subj='she', obj='cosmetics', counterpart='he')

[('cosmetics', 0.65411854),
 ('cosmetics_perfumes', 0.499711811542511),
 ('skin_creams', 0.4964258670806885),
 ('haircare', 0.49254095554351807),
 ('pharmaceuticals', 0.4858422577381134),
 ('cosmetic', 0.4819394648075104),
 ('maker_Shiseido', 0.46269580721855164),
 ('Clarins_SA', 0.4574735760688782),
 ('creams_soaps', 0.45667964220046997),
 ('skincare', 0.45505884289741516),
 ('perfumes', 0.4498598575592041)]

In [25]:
check_full_list(subj='she', obj='cosmetics', counterpart='she')

[('cosmetics', 1.0),
 ('Cosmetics', 0.7287227511405945),
 ('skincare', 0.6951978206634521),
 ('skin_creams', 0.6833450794219971),
 ('cosmetic', 0.6811989545822144),
 ('maker_Shiseido', 0.6809768676757812),
 ('haircare', 0.6790057420730591),
 ('perfumes', 0.6575888991355896),
 ('creams_soaps', 0.6494388580322266),
 ('bodycare', 0.6485159397125244),
 ('perfume', 0.6439985036849976)]

# Run till here

In [18]:
model.most_similar(positive=['he', 'homemaker'], negative=['she'])

[('carpenter', 0.5112387537956238),
 ('tinkerer', 0.47657930850982666),
 ('machinist', 0.47604185342788696),
 ('mechanical_engineer', 0.4732446074485779),
 ('lifelong_resident', 0.471080482006073),
 ('avid_fisherman', 0.4508781135082245),
 ('laborer', 0.4494982957839966),
 ('retired', 0.4469570219516754),
 ('businessman', 0.4450863301753998),
 ('retired_schoolteacher', 0.44203752279281616)]

In [19]:
# Cosine similarity of 'nurse' with each pronoun: 'he' first, then 'she'.
for pronoun in ('he', 'she'):
    print(model.similarity(pronoun, 'nurse'))

0.12233140540741333
0.36942589567514705


In [20]:
# Nearest answers to "she : nurse :: he : ??"; reused by the scoring cells below.
nurse_list = model.most_similar(
    positive=['he', 'nurse'],
    negative=['she'],
)
nurse_list

[('doctor', 0.5559605360031128),
 ('medic', 0.5425376892089844),
 ('physician', 0.5394270420074463),
 ('x_ray_technician', 0.5355567932128906),
 ('surgeon', 0.516014575958252),
 ('nurses', 0.49741458892822266),
 ('paramedic', 0.4924110770225525),
 ('anesthetist', 0.4886544942855835),
 ('patient', 0.46544167399406433),
 ('doctors', 0.4639861583709717)]

In [21]:
# Score 'nurse' and every word in nurse_list against the analogy query
# vec(he) + vec(nurse) - vec(she), using cosine similarity.
# 'nurse' is prepended because most_similar() left it out of nurse_list;
# the 0 paired with it is only a placeholder and is never read.
# The query vector does not depend on the candidate word, so it is built once.
query_vec = (model.get_vector('he') + model.get_vector('nurse') - model.get_vector('she')).reshape(1, -1)
for word, _reported in [('nurse', 0)] + nurse_list:
    candidate_vec = model.get_vector(word).reshape(1, -1)
    print('{}: {}'.format(word, cosine_similarity(query_vec, candidate_vec)[0][0]))

nurse: 0.6655272245407104
doctor: 0.5559605360031128
medic: 0.5425377488136292
physician: 0.5394271612167358
x_ray_technician: 0.5355568528175354
surgeon: 0.5160146951675415
nurses: 0.49741464853286743
paramedic: 0.4924110770225525
anesthetist: 0.4886545240879059
patient: 0.46544164419174194
doctors: 0.46398621797561646


In [22]:
# Nearest answers to "she : homemaker :: he : ??"; reused by the scoring cell below.
homemaker_list = model.most_similar(
    positive=['he', 'homemaker'],
    negative=['she'],
)
homemaker_list

[('carpenter', 0.5112387537956238),
 ('tinkerer', 0.47657930850982666),
 ('machinist', 0.47604185342788696),
 ('mechanical_engineer', 0.4732446074485779),
 ('lifelong_resident', 0.471080482006073),
 ('avid_fisherman', 0.4508781135082245),
 ('laborer', 0.4494982957839966),
 ('retired', 0.4469570219516754),
 ('businessman', 0.4450863301753998),
 ('retired_schoolteacher', 0.44203752279281616)]

In [23]:
# Score 'homemaker' and every word in homemaker_list against the analogy query
# vec(he) + vec(homemaker) - vec(she), using cosine similarity.
# 'homemaker' is prepended because most_similar() left it out of homemaker_list;
# the 0 paired with it is only a placeholder and is never read.
# The query vector does not depend on the candidate word, so it is built once.
query_vec = (model.get_vector('he') + model.get_vector('homemaker') - model.get_vector('she')).reshape(1, -1)
for word, _reported in [('homemaker', 0)] + homemaker_list:
    candidate_vec = model.get_vector(word).reshape(1, -1)
    print('{}: {}'.format(word, cosine_similarity(query_vec, candidate_vec)[0][0]))

homemaker: 0.657961905002594
carpenter: 0.5112388730049133
tinkerer: 0.47657933831214905
machinist: 0.47604191303253174
mechanical_engineer: 0.4732445478439331
lifelong_resident: 0.4710805118083954
avid_fisherman: 0.4508780837059021
laborer: 0.4494982659816742
retired: 0.4469570815563202
businessman: 0.44508635997772217
retired_schoolteacher: 0.4420374631881714


In [74]:
# Compare the two sides of the analogy directly as offset vectors:
# cos( vec(nurse) - vec(she), vec(word) - vec(he) ) for 'nurse' and every word in nurse_list.
# The left-hand offset does not depend on the candidate word, so it is built once.
she_offset = (model.get_vector('nurse') - model.get_vector('she')).reshape(1, -1)
for word, _reported in [('nurse', 0)] + nurse_list:
    he_offset = (model.get_vector(word) - model.get_vector('he')).reshape(1, -1)
    print('{}: {}'.format(word, cosine_similarity(she_offset, he_offset)[0][0]))

nurse: 0.8459365367889404
doctor: 0.6182307600975037
medic: 0.6041296720504761
physician: 0.6366410255432129
x_ray_technician: 0.6145442128181458
surgeon: 0.5559583902359009
nurses: 0.6771997213363647
paramedic: 0.579715371131897
anesthetist: 0.5961609482765198
patient: 0.568503201007843
doctors: 0.5336419343948364


In [73]:
# Same offset-vector comparison as the get_vector cell above, but looking the
# vectors up with model.word_vec(); the recorded outputs of the two cells match.
# NOTE(review): word_vec looks like the older spelling of get_vector - confirm
# against the installed gensim version before relying on it.
# The left-hand offset does not depend on the candidate word, so it is built once.
she_offset = (model.word_vec('nurse') - model.word_vec('she')).reshape(1, -1)
for word, _reported in [('nurse', 0)] + nurse_list:
    he_offset = (model.word_vec(word) - model.word_vec('he')).reshape(1, -1)
    print('{}: {}'.format(word, cosine_similarity(she_offset, he_offset)[0][0]))

nurse: 0.8459365367889404
doctor: 0.6182307600975037
medic: 0.6041296720504761
physician: 0.6366410255432129
x_ray_technician: 0.6145442128181458
surgeon: 0.5559583902359009
nurses: 0.6771997213363647
paramedic: 0.579715371131897
anesthetist: 0.5961609482765198
patient: 0.568503201007843
doctors: 0.5336419343948364


In [27]:
# NOTE(review): interactive help() lookup left over from exploration; nothing else
# in the notebook uses its result. Consider removing it before sharing.
help(model)

Help on Word2VecKeyedVectors in module gensim.models.keyedvectors object:

class Word2VecKeyedVectors(WordEmbeddingsKeyedVectors)
 |  Class to contain vectors and vocab for word2vec model.
 |  Used to perform operations on the vectors such as vector lookup, distance, similarity etc.
 |  
 |  Method resolution order:
 |      Word2VecKeyedVectors
 |      WordEmbeddingsKeyedVectors
 |      BaseKeyedVectors
 |      gensim.utils.SaveLoad
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  get_keras_embedding(self, train_embeddings=False)
 |      Return a Keras 'Embedding' layer with weights set as the Word2Vec model's learned word embeddings
 |      
 |      Parameters
 |      ----------
 |      train_embeddings : bool
 |          If False, the weights are frozen and stopped from being updated.
 |          If True, the weights can/will be further trained/updated.
 |      
 |      Returns
 |      -------
 |      :obj: `keras.layers.Embedding`
 |          Embedding layer
 |  
 |  

In [15]:
model.most_similar(positive=['woman', 'nurse'], negative=['man'])

[('registered_nurse', 0.7375059127807617),
 ('nurse_practitioner', 0.6650707721710205),
 ('midwife', 0.6506887078285217),
 ('nurses', 0.6448696851730347),
 ('nurse_midwife', 0.6239830255508423),
 ('birth_doula', 0.5852459669113159),
 ('neonatal_nurse', 0.5670715570449829),
 ('dental_hygienist', 0.5668443441390991),
 ('lactation_consultant', 0.5667990446090698),
 ('respiratory_therapist', 0.5652168989181519)]

In [7]:
# NOTE(review): second help(model) call, identical to the one in an earlier cell;
# leftover exploration. Consider removing it before sharing.
help(model)

Help on Word2VecKeyedVectors in module gensim.models.keyedvectors object:

class Word2VecKeyedVectors(WordEmbeddingsKeyedVectors)
 |  Class to contain vectors and vocab for word2vec model.
 |  Used to perform operations on the vectors such as vector lookup, distance, similarity etc.
 |  
 |  Method resolution order:
 |      Word2VecKeyedVectors
 |      WordEmbeddingsKeyedVectors
 |      BaseKeyedVectors
 |      gensim.utils.SaveLoad
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  get_keras_embedding(self, train_embeddings=False)
 |      Return a Keras 'Embedding' layer with weights set as the Word2Vec model's learned word embeddings
 |      
 |      Parameters
 |      ----------
 |      train_embeddings : bool
 |          If False, the weights are frozen and stopped from being updated.
 |          If True, the weights can/will be further trained/updated.
 |      
 |      Returns
 |      -------
 |      :obj: `keras.layers.Embedding`
 |          Embedding layer
 |  
 |  

In [None]:
    
#     in the case in word2vec, is vec(queen). Note that the input
    
#     check_full_list(counterpart='she', obj='king', subj='he')
    
#     would return the results from
    
#     model.most_similar(positive=[counterpart, obj], negative=[subj])
    
#     together with the cosine similarity between
    
#     vec(counterpart) - vec(subj) + vec(obj)
    
#     and
    
#     vec(obj)
    
#     for comparison.