# Creating Word Vectors with word2vec

In this notebook, we create word vectors from a corpus of public-domain books, a selection from [Project Gutenberg](https://www.gutenberg.org/).

In [1]:
# Load the watermark extension and record provenance for this run: author (-a),
# last-updated stamp (-u), date (-d), Python/IPython versions (-v), and the
# installed versions of the listed packages (-p).
%load_ext watermark
%watermark -a 'Gopala KR' -u -d -v -p watermark,numpy,pandas,matplotlib,nltk,sklearn,tensorflow,theano,mxnet,chainer,seaborn,keras,tflearn,bokeh,gensim


Using TensorFlow backend.
  if d.decorator_argspec is not None), _inspect.getargspec(target))


Gopala KR 
last updated: 2018-02-03 

CPython 3.6.3
IPython 6.2.1

watermark 1.6.0
numpy 1.13.1
pandas 0.20.3
matplotlib 2.0.2
nltk 3.2.5
sklearn 0.19.0
tensorflow 1.3.0
theano 1.0.1
mxnet 1.0.0
chainer 3.3.0
seaborn 0.8.1
keras 2.1.3
tflearn n
bokeh 0.12.13
gensim 3.3.0


scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


#### Load dependencies

In [2]:
import nltk
from nltk import word_tokenize, sent_tokenize
import gensim
from gensim.models.word2vec import Word2Vec
from sklearn.manifold import TSNE
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import show, figure
%matplotlib inline

In [3]:
nltk.download('punkt') # English-language sentence tokenizer (not all periods end sentences; not all sentences start with a capital letter)

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Load data

In [4]:
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /home/jovyan/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [5]:
from nltk.corpus import gutenberg

In [6]:
len(gutenberg.fileids())

18

In [7]:
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

#### Tokenize text

In [8]:
# split the raw corpus text into a list of sentence strings (newlines are left in place)
gberg_sent_tokens = sent_tokenize(gutenberg.raw())

In [9]:
gberg_sent_tokens[0:5]

['[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.',
 "She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period.",
 'Her mother\nhad died too long ago for her to have more than an indistinct\nremembrance of her caresses; and her place had been supplied\nby an excellent woman as governess, who had fallen little short\nof a mother in affection.',
 "Sixteen years had Miss Taylor been in Mr. Woodhouse's family,\nless as a governess than a friend, very fond of both daughters,\nbut particularly of Emma.",
 'Between _them_ it was more the intimacy\nof sisters.']

In [10]:
gberg_sent_tokens[1]

"She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period."

In [11]:
word_tokenize(gberg_sent_tokens[1])

['She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'s",
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

In [12]:
word_tokenize(gberg_sent_tokens[1])[14]

'father'

In [13]:
# a convenient method that handles newlines, as well as tokenizing sentences and words in one shot
gberg_sents = gutenberg.sents()

In [14]:
gberg_sents[0:5]

[['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']'],
 ['VOLUME', 'I'],
 ['CHAPTER', 'I'],
 ['Emma',
  'Woodhouse',
  ',',
  'handsome',
  ',',
  'clever',
  ',',
  'and',
  'rich',
  ',',
  'with',
  'a',
  'comfortable',
  'home',
  'and',
  'happy',
  'disposition',
  ',',
  'seemed',
  'to',
  'unite',
  'some',
  'of',
  'the',
  'best',
  'blessings',
  'of',
  'existence',
  ';',
  'and',
  'had',
  'lived',
  'nearly',
  'twenty',
  '-',
  'one',
  'years',
  'in',
  'the',
  'world',
  'with',
  'very',
  'little',
  'to',
  'distress',
  'or',
  'vex',
  'her',
  '.'],
 ['She',
  'was',
  'the',
  'youngest',
  'of',
  'the',
  'two',
  'daughters',
  'of',
  'a',
  'most',
  'affectionate',
  ',',
  'indulgent',
  'father',
  ';',
  'and',
  'had',
  ',',
  'in',
  'consequence',
  'of',
  'her',
  'sister',
  "'",
  's',
  'marriage',
  ',',
  'been',
  'mistress',
  'of',
  'his',
  'house',
  'from',
  'a',
  'very',
  'early',
  'period',
  '.']]

In [15]:
gberg_sents[4]

['She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'",
 's',
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

In [16]:
gberg_sents[4][14]

'father'

In [17]:
# another convenient method that we don't immediately need: 
gutenberg.words() 

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...]

In [18]:
# gutenberg.words() is analogous to the following line, which need not be run: 
# word_tokenize(gutenberg.raw())

In [19]:
# our Gutenberg corpus is 2.6m words in length: 
len(gutenberg.words())

2621613

#### Run word2vec

In [23]:
# Train word2vec on the tokenized sentences:
#   size=64      -> dimensionality of each word vector (see the 64-element vector below)
#   sg=1         -> skip-gram architecture (sg=0 would select CBOW)
#   window=10    -> maximum distance between a target word and its context words
#   min_count=5  -> tokens seen fewer than 5 times are dropped from the vocabulary
#   workers=8    -> number of training threads
# NOTE(review): seed=42 alone probably does not make training reproducible while
# workers=8; gensim documents that deterministic runs need workers=1 -- confirm.
model = Word2Vec(sentences=gberg_sents, size=64, sg=1, window=10, min_count=5, seed=42, workers=8)

In [24]:
# persist the trained model to disk so it can be reloaded later without re-training
model.save('raw_gutenberg_model.w2v')

  if hasattr(self, attrib):
  asides[attrib] = getattr(self, attrib)
  delattr(self, attrib)
  setattr(obj, attrib, val)


#### Explore model

In [25]:
# reload the saved model from disk; start from this cell to avoid re-training:
model = Word2Vec.load('raw_gutenberg_model.w2v')

  setattr(self, attrib, None)


In [26]:
# look up the learned vector for one token; index through model.wv because
# indexing the Word2Vec object directly is deprecated and emits a warning
model.wv['dog']

  """Entry point for launching an IPython kernel.


array([ 0.09566437,  0.17259854, -0.34505808,  0.02892219,  0.07668917,
        0.50993335,  0.69135612, -0.1312692 , -0.15725566,  0.74906576,
        0.20732881,  0.52538496, -0.57578909, -0.22764318, -0.11770665,
        0.11812929, -0.01914781,  0.01099039,  0.20910075, -0.41322687,
       -0.00547709,  0.18781266, -0.11802397,  0.2500563 , -0.03084208,
        0.21888663, -0.2254716 ,  0.10392255, -0.38144153,  0.37585607,
        0.54878187,  0.2205839 , -0.03381898, -0.06954347,  0.35287818,
       -0.04519901,  0.09778392, -0.089872  ,  0.18964697, -0.30727661,
        0.09791969,  0.2675367 , -0.06997165,  0.22762068, -0.21050346,
       -0.21686473,  0.64204437,  0.27400666, -0.03136293,  0.31337935,
        0.59558082, -0.27501032,  0.43628675,  0.08534899,  0.00154245,
       -0.27216321,  0.07710008, -0.07775867, -0.19987388, -0.15788484,
        0.06609637,  0.14038439, -0.25328976,  0.27570012], dtype=float32)

In [27]:
# vector length equals the `size` the model was trained with;
# model.wv[...] replaces the deprecated model[...] lookup
len(model.wv['dog'])

  """Entry point for launching an IPython kernel.


64

In [28]:
# nearest neighbours of 'dog'; the scores are cosine similarities
# (higher = more similar), not distances
model.wv.most_similar('dog')

  """Entry point for launching an IPython kernel.


[('puppy', 0.8430217504501343),
 ('cage', 0.7972114086151123),
 ('thief', 0.7930775880813599),
 ('sweeper', 0.7754759192466736),
 ('pig', 0.7575479745864868),
 ('broth', 0.7552684545516968),
 ('shell', 0.7501285672187805),
 ('pet', 0.749057412147522),
 ('bullet', 0.7415783405303955),
 ('chimney', 0.7369805574417114)]

In [29]:
# nearest neighbours of 'think' (queried on model.wv; the model-level method is deprecated)
model.wv.most_similar('think')

  """Entry point for launching an IPython kernel.


[('suppose', 0.8603721261024475),
 ('manage', 0.846663236618042),
 ('contradict', 0.8429285287857056),
 ('downright', 0.8252444863319397),
 ('know', 0.8132942914962769),
 ('_you_', 0.8116717338562012),
 ('NOW', 0.8102172613143921),
 ('anyhow', 0.8093609809875488),
 ('hesitate', 0.8079324960708618),
 ('argue', 0.80599445104599)]

In [30]:
# nearest neighbours of 'day' (queried on model.wv; the model-level method is deprecated)
model.wv.most_similar('day')

  """Entry point for launching an IPython kernel.


[('morning', 0.7945852279663086),
 ('night', 0.7583019733428955),
 ('time', 0.7528664469718933),
 ('month', 0.7266057729721069),
 ('evening', 0.7166042327880859),
 ('sabbath', 0.68549644947052),
 ('morrow', 0.6776953339576721),
 ('week', 0.6754008531570435),
 ('fourteenth', 0.6640895009040833),
 ('fortnight', 0.6488529443740845)]

In [31]:
# nearest neighbours of 'father' (queried on model.wv; the model-level method is deprecated)
model.wv.most_similar('father')

  """Entry point for launching an IPython kernel.


[('mother', 0.863183319568634),
 ('brother', 0.8527032136917114),
 ('sister', 0.8112391233444214),
 ('Amnon', 0.7896923422813416),
 ('wife', 0.7768486738204956),
 ('daughter', 0.7695256471633911),
 ('bondwoman', 0.7412916421890259),
 ('uncle', 0.7265669107437134),
 ('Tamar', 0.7233585119247437),
 ('Dinah', 0.7155628800392151)]

In [32]:
# pick the word that fits least with the others
# (called on model.wv; the model-level method is deprecated)
model.wv.doesnt_match("mother father daughter dog".split())

  """Entry point for launching an IPython kernel.


'dog'

In [33]:
# cosine similarity between two tokens
# (called on model.wv; the model-level method is deprecated)
model.wv.similarity('father', 'dog')

  """Entry point for launching an IPython kernel.


0.47669729590400789

In [34]:
# analogy query: father - man + woman
# close, but not quite; distinctly in female direction:
model.wv.most_similar(positive=['father', 'woman'], negative=['man'])

  


[('sister', 0.8015695214271545),
 ('mother', 0.7793022990226746),
 ('wife', 0.7551862001419067),
 ('daughter', 0.7405765056610107),
 ('husband', 0.7301034927368164),
 ('brother', 0.7262083292007446),
 ('Sarah', 0.7168509364128113),
 ('Sarai', 0.7141744494438171),
 ('daughters', 0.7034985423088074),
 ('Amnon', 0.6891980171203613)]

In [35]:
# analogy query: son - man + woman
# more confident about this one:
model.wv.most_similar(positive=['son', 'woman'], negative=['man'])

  


[('Sarah', 0.7418930530548096),
 ('Leah', 0.7276014685630798),
 ('wife', 0.7269520163536072),
 ('Sarai', 0.7248057126998901),
 ('Hagar', 0.7211147546768188),
 ('Bethuel', 0.7210967540740967),
 ('Abram', 0.7164820432662964),
 ('Hittite', 0.7120416164398193),
 ('conceived', 0.7098621129989624),
 ('Nahor', 0.7070516347885132)]

In [36]:
# analogy query: husband - man + woman
# (called on model.wv; the model-level method is deprecated)
model.wv.most_similar(positive=['husband', 'woman'], negative=['man'])

  """Entry point for launching an IPython kernel.


[('wife', 0.7307157516479492),
 ('sister', 0.7228026986122131),
 ('conceived', 0.6862730383872986),
 ('child', 0.6779206395149231),
 ('mother', 0.6650990843772888),
 ('daughter', 0.647702157497406),
 ('widow', 0.646721601486206),
 ('nurse', 0.6304333806037903),
 ('daughters', 0.6292895674705505),
 ('adultery', 0.627461850643158)]

In [37]:
# analogy query: king - man + woman, widened to the 30 closest tokens
# (called on model.wv; the model-level method is deprecated)
model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=30)

  """Entry point for launching an IPython kernel.


[('Sarah', 0.7333846092224121),
 ('Rachel', 0.7045056819915771),
 ('Pharaoh', 0.7034600973129272),
 ('Sarai', 0.6914916634559631),
 ('Solomon', 0.6911053657531738),
 ('Abram', 0.6873542070388794),
 ('Leah', 0.6831889152526855),
 ('Judah', 0.6798198819160461),
 ('Laban', 0.6793984770774841),
 ('Hagar', 0.6764636039733887),
 ('Bethuel', 0.6697091460227966),
 ('Rebekah', 0.6482451558113098),
 ('Padanaram', 0.6475045680999756),
 ('Bilhah', 0.645843505859375),
 ('birthright', 0.6420023441314697),
 ('Zilpah', 0.6408153772354126),
 ('Ephron', 0.6394474506378174),
 ('damsel', 0.6388168334960938),
 ('Esau', 0.6376100778579712),
 ('Babylon', 0.6368862986564636),
 ('tribute', 0.6367999315261841),
 ('Lot', 0.6338628530502319),
 ('Mephibosheth', 0.6329473257064819),
 ('David', 0.6314777731895447),
 ('Hittite', 0.6299331784248352),
 ('Uriah', 0.6295434832572937),
 ('Hamor', 0.6294640302658081),
 ('ministered', 0.6281746029853821),
 ('princes', 0.6280882358551025),
 ('Jerubbaal', 0.6260474920272827)]

In [38]:
# impressive for such a small data set, without any cleaning, e.g., to lower case (covered next)

#### Reduce word vector dimensionality with t-SNE

In [39]:
model.wv.vocab

{'[': <gensim.models.keyedvectors.Vocab at 0x7fc4264c4cc0>,
 'Emma': <gensim.models.keyedvectors.Vocab at 0x7fc4264c4550>,
 'by': <gensim.models.keyedvectors.Vocab at 0x7fc4264c4fd0>,
 'Jane': <gensim.models.keyedvectors.Vocab at 0x7fc4264c4518>,
 ']': <gensim.models.keyedvectors.Vocab at 0x7fc4264c4b70>,
 'I': <gensim.models.keyedvectors.Vocab at 0x7fc4264c4a58>,
 'CHAPTER': <gensim.models.keyedvectors.Vocab at 0x7fc4264c4ba8>,
 'Woodhouse': <gensim.models.keyedvectors.Vocab at 0x7fc4264c41d0>,
 ',': <gensim.models.keyedvectors.Vocab at 0x7fc4264c4668>,
 'handsome': <gensim.models.keyedvectors.Vocab at 0x7fc4264c4f28>,
 'clever': <gensim.models.keyedvectors.Vocab at 0x7fc4264c49b0>,
 'and': <gensim.models.keyedvectors.Vocab at 0x7fc4264c45f8>,
 'rich': <gensim.models.keyedvectors.Vocab at 0x7fc4264c4710>,
 'with': <gensim.models.keyedvectors.Vocab at 0x7fc4264c4e48>,
 'a': <gensim.models.keyedvectors.Vocab at 0x7fc4264c4a90>,
 'comfortable': <gensim.models.keyedvectors.Vocab at 0x7fc4

In [40]:
len(model.wv.vocab)

17011

In [41]:
# stack the vector of every vocabulary token into one matrix, one row per token in
# vocab iteration order; index through model.wv because model[...] is deprecated
X = model.wv[model.wv.vocab]

  """Entry point for launching an IPython kernel.


In [42]:
# n_iter should be at least 250 (default is 1000);
# random_state pins the random initialisation so re-runs give the same embedding
tsne = TSNE(n_components=2, n_iter=1000, random_state=42)

In [None]:
X_2d = tsne.fit_transform(X)

In [None]:
X_2d[0:5]

In [None]:
# create DataFrame for storing results and plotting:
# one row per token holding its 2-D t-SNE coordinates. The token labels come from
# iterating the same vocab dict that was used to build X, so they are expected to
# line up row-for-row with X_2d -- verify if the vocab is modified in between.
coords_df = pd.DataFrame(X_2d, columns=['x','y'])
coords_df['token'] = model.wv.vocab.keys()

In [None]:
coords_df.head()

In [None]:
# cache the t-SNE coordinates on disk so the visualization cells can reload them
# without re-fitting t-SNE (and so the read_csv below works on a fresh run)
coords_df.to_csv('raw_gutenberg_tsne.csv', index=False)

#### Visualize 2D representation of word vectors

In [None]:
# load previously saved t-SNE coordinates from disk; 'raw_gutenberg_tsne.csv' must already exist
coords_df = pd.read_csv('raw_gutenberg_tsne.csv')

In [None]:
# static scatter of every token's 2-D position; tiny translucent markers limit overplotting
_ = coords_df.plot.scatter(
    x='x',
    y='y',
    figsize=(12, 12),
    marker='.',
    s=10,
    alpha=0.2,
)

In [None]:
output_notebook() # output bokeh plots inline in notebook

In [None]:
# label only a random subset of tokens so the text plot stays legible;
# n is capped at the frame size (sample raises when n exceeds it without replacement)
# and random_state makes the same subset come back on every run
subset_df = coords_df.sample(n=min(5000, len(coords_df)), random_state=42)

In [None]:
# build an 800x800 bokeh figure and draw each sampled token as a text glyph at its 2-D coordinates
p = figure(plot_width=800, plot_height=800)
_ = p.text(x=subset_df.x, y=subset_df.y, text=subset_df.token)

In [None]:
show(p)

In [None]:
# marker cell: confirms the notebook ran through to the end
print('test complete')