# Creating Word Vectors with word2vec

In this notebook, we create word vectors from a corpus of public-domain books, a selection from [Project Gutenberg](https://www.gutenberg.org/).

#### Load dependencies

In [1]:
import nltk
from nltk import word_tokenize, sent_tokenize
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import show, figure

In [2]:
# Punkt is the English-language sentence tokenizer model; it is needed because
# not all periods end sentences and not all sentences start with a capital letter.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

#### Load data

In [3]:
# Fetch NLTK's Project Gutenberg selection (read below through nltk.corpus.gutenberg)
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [4]:
from nltk.corpus import gutenberg

In [5]:
# Number of texts (file IDs) in the corpus
len(gutenberg.fileids())

18

In [6]:
# List the file ID of every text in the corpus
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

#### Tokenize text

In [7]:
# Split the raw corpus text into a list of sentence strings
gberg_sent_tokens = sent_tokenize(gutenberg.raw())

In [8]:
# Peek at the first six sentences; note the newline characters left inside the strings
gberg_sent_tokens[0:6]

['[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.',
 "She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period.",
 'Her mother\nhad died too long ago for her to have more than an indistinct\nremembrance of her caresses; and her place had been supplied\nby an excellent woman as governess, who had fallen little short\nof a mother in affection.',
 "Sixteen years had Miss Taylor been in Mr. Woodhouse's family,\nless as a governess than a friend, very fond of both daughters,\nbut particularly of Emma.",
 'Between _them_ it was more the intimacy\nof sisters.',
 "Even before Miss Taylor had ceased to hold the nominal

In [9]:
# A single sentence, still one untokenized string
gberg_sent_tokens[1]

"She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period."

In [10]:
# Break that sentence into word and punctuation tokens
word_tokenize(gberg_sent_tokens[1])

['She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'s",
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

In [11]:
# Tokens can be indexed by position; index 14 of this sentence is 'father'
word_tokenize(gberg_sent_tokens[1])[14]

'father'

In [12]:
# gutenberg.sents() is a convenience method that handles newlines and tokenizes
# the corpus into sentences and words in one shot
gberg_sents = gutenberg.sents()

In [13]:
# First six sentences, each one a list of tokens
gberg_sents[0:6]

[['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']'],
 ['VOLUME', 'I'],
 ['CHAPTER', 'I'],
 ['Emma',
  'Woodhouse',
  ',',
  'handsome',
  ',',
  'clever',
  ',',
  'and',
  'rich',
  ',',
  'with',
  'a',
  'comfortable',
  'home',
  'and',
  'happy',
  'disposition',
  ',',
  'seemed',
  'to',
  'unite',
  'some',
  'of',
  'the',
  'best',
  'blessings',
  'of',
  'existence',
  ';',
  'and',
  'had',
  'lived',
  'nearly',
  'twenty',
  '-',
  'one',
  'years',
  'in',
  'the',
  'world',
  'with',
  'very',
  'little',
  'to',
  'distress',
  'or',
  'vex',
  'her',
  '.'],
 ['She',
  'was',
  'the',
  'youngest',
  'of',
  'the',
  'two',
  'daughters',
  'of',
  'a',
  'most',
  'affectionate',
  ',',
  'indulgent',
  'father',
  ';',
  'and',
  'had',
  ',',
  'in',
  'consequence',
  'of',
  'her',
  'sister',
  "'",
  's',
  'marriage',
  ',',
  'been',
  'mistress',
  'of',
  'his',
  'house',
  'from',
  'a',
  'very',
  'early',
  'period',
  '.'],
 ['Her',
  'mother',
  'h

In [14]:
# Same sentence as gberg_sent_tokens[1]; here the possessive is split into an
# apostrophe token and an s token, where word_tokenize produced a single token
gberg_sents[4]

['She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'",
 's',
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

In [15]:
# Index 14 of this sentence is again 'father'
gberg_sents[4][14]

'father'

In [16]:
# Another convenience method, not needed immediately: the whole corpus as one
# flat sequence of tokens
gutenberg.words()

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...]

In [None]:
# gutenberg.words() is analogous to the following line, which need not be run: 
# word_tokenize(gutenberg.raw())

In [17]:
# Our Gutenberg corpus is about 2.6 million tokens long (punctuation marks
# such as '[' are counted as tokens):
len(gutenberg.words())

2621613

#### Run word2vec

In [18]:
#model = Word2Vec(sentences=gberg_sents, size=64, sg=1, window=10, min_count=5, workers=4)

In [19]:
#model.save('raw_gutenberg_model.w2v')

#### Explore model

In [20]:
# Reuse the saved model when it is on disk (skips re-training). If the file is
# missing, train a 64-dimensional skip-gram model (sg=1) on the tokenized
# sentences and save it so later runs can load it instead.
# `size=` is the gensim 3.x keyword, consistent with the `wv.vocab` API used below.
MODEL_PATH = 'raw_gutenberg_model.w2v'
try:
    model = gensim.models.Word2Vec.load(MODEL_PATH)
except FileNotFoundError:
    model = Word2Vec(sentences=gberg_sents, size=64, sg=1, window=10, min_count=5, workers=4)
    model.save(MODEL_PATH)

In [21]:
# Look up the learned vector for a single word
model.wv['dog']

array([-4.1466146e-03, -3.5718462e-01, -4.2911971e-01,  3.7844461e-01,
       -1.1017207e-01, -1.4578715e-01, -4.6452686e-01,  5.3687042e-01,
       -2.2535205e-01,  3.3747920e-01,  1.4483279e-01,  2.3419902e-01,
        3.2451406e-01, -3.7555844e-01,  5.8527547e-01,  1.0486481e-01,
       -6.8573672e-01, -1.7641590e-04,  2.5709322e-02, -1.4250915e-01,
        2.0193768e-01,  6.1191045e-02, -2.3672807e-01, -3.8559446e-01,
        1.1439926e-01,  1.3206565e-01, -7.2057921e-01,  2.7011576e-01,
       -3.5893741e-01, -1.1442479e-01,  3.5064089e-01, -9.4010234e-02,
       -4.1445348e-01,  6.7528862e-01,  1.3502000e-01,  1.4942047e-02,
       -5.6294747e-02,  4.4102054e-02, -2.1106522e-01, -9.8071322e-02,
        9.8927870e-02, -1.2789845e-01, -9.3406275e-02,  4.5863111e-02,
        7.5735897e-02, -2.1813367e-01, -2.9274407e-01, -5.5584860e-01,
       -5.2432626e-01,  1.0827631e-01, -1.9963792e-01, -9.3112975e-02,
        2.8974520e-02, -3.7829745e-01, -1.5835789e-01, -1.1071358e-01,
      

In [22]:
len(model.wv['dog'])  # the vector space has 64 dimensions, so each word vector has 64 components

64

In [23]:
model.wv.most_similar('dog')  # nearest words to 'dog' with their similarity scores (higher = more similar)

[('puppy', 0.8245944976806641),
 ('sweeper', 0.8165087699890137),
 ('cage', 0.7668992280960083),
 ('broth', 0.7634357213973999),
 ('chimney', 0.753870964050293),
 ('string', 0.7537059187889099),
 ('butcher', 0.7504701018333435),
 ('bullet', 0.7493541240692139),
 ('kick', 0.741381824016571),
 ('loaf', 0.7398046255111694)]

In [24]:
# Nearest neighbours of a verb
model.wv.most_similar('think')

[('Mamma', 0.8459633588790894),
 ('pretend', 0.8424856662750244),
 ('hesitate', 0.8378000855445862),
 ('manage', 0.8295996189117432),
 ('interfere', 0.8291898965835571),
 ('suppose', 0.8220747113227844),
 ('impertinent', 0.8152815103530884),
 ('contradict', 0.8148210048675537),
 ('shouldn', 0.8141474723815918),
 ('really', 0.8135504722595215)]

In [25]:
# Nearest neighbours of 'day'
model.wv.most_similar('day')

[('morning', 0.7659273743629456),
 ('time', 0.7555131912231445),
 ('month', 0.7306933999061584),
 ('week', 0.7261040210723877),
 ('night', 0.7117187976837158),
 ('Adar', 0.7099676728248596),
 ('feasting', 0.705502986907959),
 ('Saturday', 0.7037047147750854),
 ('fourteenth', 0.7035631537437439),
 ('evening', 0.6883141994476318)]

In [26]:
# Nearest neighbours of 'father'
model.wv.most_similar('father')

[('mother', 0.8738216161727905),
 ('brother', 0.862291157245636),
 ('sister', 0.8117022514343262),
 ('wife', 0.7752429246902466),
 ('Tamar', 0.7580605149269104),
 ('daughter', 0.7566810846328735),
 ('Amnon', 0.7541859745979309),
 ('bondwoman', 0.7321372032165527),
 ('master', 0.7210524678230286),
 ('younger', 0.719154953956604)]

In [27]:
# Ask the model which word in the list does not belong with the others
model.wv.doesnt_match("mother father daughter dog".split())

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'dog'

In [28]:
# Similarity score between two specific words
model.wv.similarity('father', 'dog')

0.4699294

In [29]:
# Word-vector arithmetic: father - man + woman.
# Close, but not quite; the results are distinctly in the female direction:
model.wv.most_similar(positive=['father', 'woman'], negative=['man'])

[('sister', 0.8063517808914185),
 ('wife', 0.7883333563804626),
 ('mother', 0.7879281044006348),
 ('daughter', 0.7695640921592712),
 ('husband', 0.7552887201309204),
 ('brother', 0.7475597858428955),
 ('Tamar', 0.7241297960281372),
 ('Sarah', 0.7213785648345947),
 ('Sarai', 0.7129601240158081),
 ('Rachel', 0.6995742321014404)]

In [30]:
# Word-vector arithmetic: son - man + woman.
# More confident about this one:
model.wv.most_similar(positive=['son', 'woman'], negative=['man'])

[('Rachel', 0.7442491054534912),
 ('daughter', 0.7441835403442383),
 ('wife', 0.743617594242096),
 ('Sarah', 0.7401350140571594),
 ('Sarai', 0.7389209866523743),
 ('Abram', 0.7351948022842407),
 ('Bethuel', 0.7261578440666199),
 ('Tamar', 0.7256032824516296),
 ('Leah', 0.7206445336341858),
 ('Hagar', 0.717329204082489)]

In [31]:
# Word-vector arithmetic: husband - man + woman
model.wv.most_similar(positive=['husband', 'woman'], negative=['man'])

[('wife', 0.7488378286361694),
 ('sister', 0.7259776592254639),
 ('daughter', 0.6787654161453247),
 ('child', 0.6704478859901428),
 ('conceived', 0.6633370518684387),
 ('widow', 0.659957766532898),
 ('mother', 0.6573160886764526),
 ('maid', 0.6554161310195923),
 ('Rachel', 0.6345469951629639),
 ('nurse', 0.6308585405349731)]

In [32]:
# Word-vector arithmetic: king - man + woman, widened to the top 30 results
model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=30)

[('Rachel', 0.7287677526473999),
 ('Sarah', 0.6979366540908813),
 ('Abram', 0.6922593116760254),
 ('Leah', 0.6809811592102051),
 ('Laban', 0.678038477897644),
 ('Padanaram', 0.6767112016677856),
 ('Sarai', 0.6715230941772461),
 ('Hagar', 0.6702522039413452),
 ('Bethuel', 0.6674023270606995),
 ('daughter', 0.6622584462165833),
 ('Solomon', 0.6600829362869263),
 ('Joseph', 0.6573799252510071),
 ('Judah', 0.650351345539093),
 ('Pharaoh', 0.6434875130653381),
 ('Rebekah', 0.642324686050415),
 ('Hamor', 0.6420037746429443),
 ('household', 0.6405993103981018),
 ('Ephron', 0.6393378973007202),
 ('Lot', 0.6368026733398438),
 ('birthright', 0.636254072189331),
 ('Ur', 0.6331424117088318),
 ('Tamar', 0.6325780153274536),
 ('Babylon', 0.6315199136734009),
 ('Onan', 0.6306878924369812),
 ('queen', 0.6305269002914429),
 ('Shechem', 0.6292922496795654),
 ('Bilhah', 0.6286901235580444),
 ('Esau', 0.6285772919654846),
 ('conceived', 0.6227713227272034),
 ('kindred', 0.6225327253341675)]

In [None]:
# impressive for such a small data set, without any cleaning, e.g., to lower case (covered next)

#### Reduce word vector dimensionality with t-SNE

In [33]:
# Number of tokens in the model's vocabulary
len(model.wv.vocab)

17011

In [34]:
# Look up the vector of every vocabulary token at once; X is the input to t-SNE below
# NOTE(review): presumably one row per token, in the iteration order of model.wv.vocab - confirm
X = model.wv[model.wv.vocab]

In [35]:
# n_iter should be at least 250; 1000 is the default. random_state is fixed so
# that the 2-D layout can be reproduced from run to run.
tsne = TSNE(n_components=2, n_iter=1000, random_state=42)

In [36]:
# Project the word vectors down to two dimensions. This step may take a while,
# but the cells below need X_2d to exist on a fresh kernel.
X_2d = tsne.fit_transform(X)

In [37]:
# First five 2-D points
X_2d[0:5]

array([[ 31.446829 , -45.304195 ],
       [-37.153385 , -51.418427 ],
       [-57.356754 ,   5.9145713],
       [-38.755898 , -50.66579  ],
       [ 31.36291  , -45.269886 ]], dtype=float32)

In [38]:
# Create a DataFrame for storing results and plotting: the 2-D coordinates plus
# the token each row belongs to. The vocabulary keys are materialised into a
# list so the column assignment does not depend on how the installed pandas
# version treats a set-like dict view.
coords_df = pd.DataFrame(X_2d, columns=['x', 'y'])
coords_df['token'] = list(model.wv.vocab.keys())

In [39]:
# Sanity-check the first few rows: x, y and token columns
coords_df.head()

Unnamed: 0,x,y,token
0,31.446829,-45.304195,[
1,-37.153385,-51.418427,Emma
2,-57.356754,5.914571,by
3,-38.755898,-50.665791,Jane
4,31.362909,-45.269886,]


In [40]:
# Persist the coordinates so the visualization section reloads a file that this
# notebook itself produced
coords_df.to_csv('raw_gutenberg_tsne.csv', index=False)

#### Visualize 2D representation of word vectors

In [41]:
# Reload the saved coordinates. keep_default_na=False stops pandas from turning
# tokens that look like missing-value markers (e.g. 'null', 'NA', 'nan') into NaN.
coords_df = pd.read_csv('raw_gutenberg_tsne.csv', keep_default_na=False)

In [42]:
output_notebook()  # render Bokeh plots inline in the notebook

In [43]:
# Draw a random 5,000-row subset of the coordinates for plotting; the fixed
# random_state makes the same subset come back on every run
subset_df = coords_df.sample(n=5000, random_state=42)

In [44]:
# 800 x 800 canvas; each sampled token is drawn as a text glyph at its (x, y) position
p = figure(plot_width=800, plot_height=800)
_ = p.text(
    x=subset_df['x'],
    y=subset_df['y'],
    text=subset_df['token'],
)

In [45]:
# Display the figure
show(p)