# Setup
https://radimrehurek.com/gensim/models/word2vec.html#pretrained-models

In [1]:
import gensim
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
import numpy as np

In [2]:
model = gensim.downloader.load('word2vec-google-news-300')

# Introduction to Word2Vec

In [3]:
paragraph = "fruit, the fleshy or dry ripened ovary of a flowering plant, enclosing the seed or seeds. Thus, apricots, bananas, and grapes, as well as bean pods, corn grains, tomatoes, cucumbers, and (in their shells) acorns and almonds, are all technically fruits. Popularly, however, the term is restricted to the ripened ovaries that are sweet and either succulent or pulpy. For treatment of the cultivation of fruits, see fruit farming. For treatment of the nutrient composition and processing of fruits, see fruit processing. Botanically, a fruit is a mature ovary and its associated parts. It usually contains seeds, which have developed from the enclosed ovule after fertilization, although development without fertilization, called parthenocarpy, is known, for example, in bananas. Fertilization induces various changes in a flower: the anthers and stigma wither, the petals drop off, and the sepals may be shed or undergo modifications; the ovary enlarges, and the ovules develop into seeds, each containing an embryo plant. The principal purpose of the fruit is the protection and dissemination of the seed. (See also seed.) Fruits are important sources of dietary fibre, vitamins (especially vitamin C), and antioxidants. Although fresh fruits are subject to spoilage, their shelf life can be extended by refrigeration or by the removal of oxygen from their storage or packaging containers. Fruits can be processed into juices, jams, and jellies and preserved by dehydration, canning, fermentation, and pickling. Waxes, such as those from bayberries (wax myrtles), and vegetable ivory from the hard fruits of a South American palm species (Phytelephas macrocarpa) are important fruit-derived products. Various drugs come from fruits, such as morphine from the fruit of the opium poppy."
sentences = []
for sentence in paragraph.split("."):
    test = sentence.split(" ")
    str_list = list(filter(None, test))
    if str_list:
        sentences.append(str_list)

sentences

[['fruit,',
  'the',
  'fleshy',
  'or',
  'dry',
  'ripened',
  'ovary',
  'of',
  'a',
  'flowering',
  'plant,',
  'enclosing',
  'the',
  'seed',
  'or',
  'seeds'],
 ['Thus,',
  'apricots,',
  'bananas,',
  'and',
  'grapes,',
  'as',
  'well',
  'as',
  'bean',
  'pods,',
  'corn',
  'grains,',
  'tomatoes,',
  'cucumbers,',
  'and',
  '(in',
  'their',
  'shells)',
  'acorns',
  'and',
  'almonds,',
  'are',
  'all',
  'technically',
  'fruits'],
 ['Popularly,',
  'however,',
  'the',
  'term',
  'is',
  'restricted',
  'to',
  'the',
  'ripened',
  'ovaries',
  'that',
  'are',
  'sweet',
  'and',
  'either',
  'succulent',
  'or',
  'pulpy'],
 ['For',
  'treatment',
  'of',
  'the',
  'cultivation',
  'of',
  'fruits,',
  'see',
  'fruit',
  'farming'],
 ['For',
  'treatment',
  'of',
  'the',
  'nutrient',
  'composition',
  'and',
  'processing',
  'of',
  'fruits,',
  'see',
  'fruit',
  'processing'],
 ['Botanically,',
  'a',
  'fruit',
  'is',
  'a',
  'mature',
  'ovary'

In [4]:
basic_model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)
# Show basic word associations
basic_model.wv.most_similar("fruit")

[('farming', 0.3491923213005066),
 ('ripened', 0.3042016327381134),
 ('ovule', 0.24979040026664734),
 ('cucumbers,', 0.22287946939468384),
 ('seeds,', 0.17807547748088837),
 ('by', 0.16559582948684692),
 ('dehydration,', 0.16434580087661743),
 ('treatment', 0.16303157806396484),
 ('mature', 0.15538841485977173),
 ('grapes,', 0.15352429449558258)]

In [5]:
model.most_similar(positive=["France","Berlin"],negative=["Germany"])

[('Paris', 0.7672389149665833),
 ('French', 0.6049168705940247),
 ('Parisian', 0.5810437202453613),
 ('Colombes', 0.5599984526634216),
 ('Hopital_Europeen_Georges_Pompidou', 0.555890679359436),
 ('Melun', 0.5512701272964478),
 ('Dinard', 0.5451847314834595),
 ('Brussels', 0.5420990586280823),
 ('Mairie_de', 0.533744752407074),
 ('Cagnes_sur_Mer', 0.5312464237213135)]

In [6]:
model.most_similar(positive=["woman","king"],negative=["man"])

[('queen', 0.7118193507194519),
 ('monarch', 0.6189674735069275),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.518113374710083),
 ('sultan', 0.5098593831062317),
 ('monarchy', 0.5087411999702454)]

In [7]:
model.most_similar(positive=["woman","computer_programmer"],negative=["man"])

[('homemaker', 0.5627118945121765),
 ('housewife', 0.5105046629905701),
 ('graphic_designer', 0.505180299282074),
 ('schoolteacher', 0.497949481010437),
 ('businesswoman', 0.493489146232605),
 ('paralegal', 0.49255111813545227),
 ('registered_nurse', 0.4907974600791931),
 ('saleswoman', 0.48816272616386414),
 ('electrical_engineer', 0.4797726571559906),
 ('mechanical_engineer', 0.4755399227142334)]

In [8]:
model.most_similar(positive=["girl","computer_programmer"],negative=["boy"])

[('homemaker', 0.5472404360771179),
 ('graphic_designer', 0.5406057834625244),
 ('programmer', 0.5336480140686035),
 ('schoolteacher', 0.5153871774673462),
 ('housewife', 0.5088576674461365),
 ('electrical_engineer', 0.5082751512527466),
 ('businesswoman', 0.5011307597160339),
 ('mechanical_engineer', 0.4983106851577759),
 ('keypunch_operator', 0.48656004667282104),
 ('saleswoman', 0.4847942590713501)]

In [9]:
model.most_similar(positive=["black","manager"],negative=["white"])

[('vice_president', 0.599061131477356),
 ('director', 0.5975303053855896),
 ('mananger', 0.5540252327919006),
 ('manger', 0.5326608419418335),
 ('Manager', 0.5218765735626221),
 ('managing_director', 0.5175216794013977),
 ('coordinator', 0.5150246024131775),
 ('supervisor', 0.512852132320404),
 ('vp', 0.5084112882614136),
 ('mana_ger', 0.5057909488677979)]

In [10]:
print(model.similarity("black","criminal"))
print(model.similarity("white","criminal"))

0.083807915
0.041078076


In [11]:
print(model.similarity("muslim","terrorist"))
print(model.most_similar("muslim"))
print(model.similarity("christian","terrorist"))
print(model.similarity("chinese","uber"))

0.3733778
[('muslims', 0.7917742729187012), ('Muslim', 0.7285441756248474), ('Moslem', 0.7245643138885498), ('islamic', 0.7151464223861694), ('moslem', 0.6917665004730225), ('christian', 0.676012396812439), ('islam', 0.6733973026275635), ('jewish', 0.652587890625), ('Muslims', 0.640570878982544), ('christians', 0.6398838758468628)]
0.23209321
0.15567985


In [12]:
gender_pairs = [
    ["man","woman"],
    ["boy","girl"],
    ["male","female"],
    ["masculine","feminine"],
    ["father","mother"]
]

male_dom_occupations = ["computer_programmer","trucker","plumber"]
female_dom_occupations = ["teacher","secretary","nurse"]

In [13]:
for occupation in male_dom_occupations:
    print("OCCUPATION: ",occupation)
    for pair in gender_pairs:
#         print(pair[0], " to ", occupation, ": ",model.similarity(pair[0],occupation))
#         print(pair[1], " to ", occupation, ": ",model.similarity(pair[1],occupation))
        print(model.most_similar(positive=[pair[1],occupation],negative=[pair[0]])[:5])

OCCUPATION:  computer_programmer
[('homemaker', 0.5627118945121765), ('housewife', 0.5105046629905701), ('graphic_designer', 0.505180299282074), ('schoolteacher', 0.497949481010437), ('businesswoman', 0.493489146232605)]
[('homemaker', 0.5472404360771179), ('graphic_designer', 0.5406057834625244), ('programmer', 0.5336480140686035), ('schoolteacher', 0.5153871774673462), ('housewife', 0.5088576674461365)]
[('mechanical_engineer', 0.531915545463562), ('schoolteacher', 0.5259362459182739), ('graphic_designer', 0.5201236009597778), ('programmer', 0.5128348469734192), ('electrical_engineer', 0.49813708662986755)]
[('homemaker', 0.5341627597808838), ('graphic_designer', 0.5120885372161865), ('schoolteacher', 0.5013217926025391), ('mechanical_engineer', 0.48311248421669006), ('housewife', 0.4744608998298645)]
[('homemaker', 0.555813729763031), ('graphic_designer', 0.5386900901794434), ('housewife', 0.5224640965461731), ('schoolteacher', 0.5075528025627136), ('registered_nurse', 0.50568699836

In [14]:
for occupation in female_dom_occupations:
    print("OCCUPATION: ",occupation)
    for pair in gender_pairs:
        print(model.most_similar(positive=[pair[0],occupation],negative=[pair[1]])[:5])

OCCUPATION:  teacher
[('teachers', 0.5810958743095398), ('PE_teacher', 0.556725800037384), ('headmaster', 0.5553570985794067), ('Teacher', 0.5439296364784241), ('pupil', 0.5400640368461609)]
[('teachers', 0.6545760035514832), ('pupil', 0.598250150680542), ('elementary', 0.596847653388977), ('headmaster', 0.5935094952583313), ('school', 0.5858757495880127)]
[('teachers', 0.6461228728294373), ('Teacher', 0.6198713779449463), ('guidance_counselor', 0.6190987229347229), ('elementary', 0.614936888217926), ('headmaster', 0.6035487651824951)]
[('teachers', 0.6065839529037476), ('Teacher', 0.5751699805259705), ('guidance_counselor', 0.5584845542907715), ('PE_teacher', 0.5518540740013123), ('pupil', 0.5505715608596802)]
[('headmaster', 0.6184214353561401), ('phys_ed_teacher', 0.5788652300834656), ('PE_teacher', 0.5780734419822693), ('teachers', 0.5714814066886902), ('Teacher', 0.5488411784172058)]
OCCUPATION:  secretary
[('Secretary', 0.5865545272827148), ('undersecretary', 0.5803497433662415),

In [15]:
religions = ["christian","muslim","jewish","hindu","buddhist"]
stereotypes = ["terrorist","philanthropist","evil","racist","frugal"]

In [16]:
for stereotype in stereotypes:
    print("STEREOTYPE: ", stereotype)
    for religion in religions:
        print(model.similarity(stereotype,religion))

STEREOTYPE:  terrorist
0.23209321
0.3733778
0.23063853
0.203477
0.104290895
STEREOTYPE:  philanthropist
0.11935555
0.102111265
0.13718978
0.032020073
0.1172961
STEREOTYPE:  evil
0.29157263
0.29019472
0.25371784
0.25275838
0.2395346
STEREOTYPE:  racist
0.37873843
0.3851766
0.40511343
0.28848082
0.1873832
STEREOTYPE:  frugal
0.11521689
0.08947105
0.09060955
0.038437527
0.12853253


In [17]:
def cos_similarity(a,b):
    return np.dot(a,b) / (np.linalg.norm(a) + np.linalg.norm(b))

In [18]:
auth = model["power"]

white = model["white"]
black = model["black"]

woman = model["woman"]
man = model["man"]

print(cos_similarity(auth,white))
print(cos_similarity(auth,black))
print(cos_similarity(auth,man))
print(cos_similarity(auth,woman))
print(cos_similarity(auth,white+man))
print(cos_similarity(auth,white+woman))
print(cos_similarity(auth,black+man))
print(cos_similarity(auth,black+woman))

0.10423214
0.08569487
0.119437315
0.055025052
0.17521991
0.12227048
0.16047885
0.10850615


In [19]:
print(model.similarity("power","man"))
print(model.similarity("power","woman"))

0.10005455
0.043001562
