In [1]:
import numpy as np

sentence = 'The quick brown for jumped over the tree?'
word_list = sentence.split()
token_index = {} #used to map word to int

#use set to remove duplicate words
for word in set(word_list):
    if word not in token_index:
        token_index[word] = len(token_index) 

token_index

{'The': 4,
 'brown': 3,
 'for': 2,
 'jumped': 5,
 'over': 6,
 'quick': 0,
 'the': 1,
 'tree?': 7}

In [2]:
from gensim.models import Word2Vec #prebuilt word to vec implementation
import glob #finds all pathnames matching a pattern, like regex
import codecs #unicode support when reading files
from multiprocessing import cpu_count #use to get number of cpus on host machine
from gensim.utils import simple_preprocess,simple_tokenize #text processing
from string import punctuation #string  containing all puncuation



In [3]:
book_filenames = sorted(glob.glob("book/*.txt"))
print("Found books:")
book_filenames

Found books:


['book\\Pride and Prejudice.txt']

In [4]:
corpus_raw = u""
#for each book, read it, open it un utf 8 format, 
#add it to the raw corpus
for book_filename in book_filenames:
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        corpus_raw += book_file.read()

print("Corpus is {0} characters long".format(len(corpus_raw)))

Corpus is 717029 characters long


In [5]:
table=str.maketrans("","",punctuation)     #create translation table
text = corpus_raw.translate(table) # remove puncuation
sentences = text.split('\n') #split into sentences
sentences = list(filter(None,sentences)) #remove empty strings
for i,sentence in enumerate(sentences):
    sentences[i] = sentence.lower().split() #lower case and split into words

In [6]:
sentences = corpus_raw.split('\n') #split at new lines
sentences =  filter(None, sentences) # remove empty strings
sentences =  list(map(simple_preprocess,sentences)) #clean text 

In [7]:
workers = cpu_count()

In [8]:
model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=workers) #fit model

In [9]:
len(model.wv.vocab) #size of vocab

2117

In [10]:
model.wv.vectors.shape  # how we can access the word embeddings matrix

(2117, 100)

In [11]:
'word'  in model.wv.vocab #check if word in vocab

True

In [12]:
model.wv['man'] #get word vector for man

array([-0.19864893, -0.20620547,  0.13557515,  0.6751277 ,  0.19841546,
       -0.30418634, -0.00583177, -0.16800143,  0.06088564, -0.12502596,
       -0.1372419 ,  0.08277331, -0.18771084, -0.12487579,  0.1958325 ,
       -0.8932824 , -0.03137023, -0.25765646,  0.13894023,  0.58164805,
        0.24447006, -0.5220212 , -0.26179418, -0.18274675, -0.32865795,
       -0.0870305 , -0.09695656,  0.00582173,  0.37042493, -0.22931446,
       -0.06636467,  0.06597514,  0.31375328, -0.2373373 ,  0.4282515 ,
       -0.24698992, -0.00165689,  0.21547677, -0.02950837,  0.18457799,
       -0.41804236, -0.36304793,  0.49508312,  0.1282711 , -0.3950212 ,
       -0.20290186,  0.38320732, -0.12128406,  0.67209566, -0.10775404,
        0.29955307,  0.16264375,  0.225446  , -0.5443164 , -0.09479618,
       -0.16551189, -0.0403454 , -0.6080351 ,  0.25760886,  0.4514904 ,
       -0.14883967,  0.03961065,  0.04853184, -0.1672887 ,  0.03205734,
       -0.02569219,  0.02184038, -0.09513454,  0.50605434,  0.25

In [13]:
model.wv.most_similar('man')  # find most similar words

[('something', 0.9996433258056641),
 ('person', 0.9995888471603394),
 ('about', 0.9995883703231812),
 ('our', 0.9995846748352051),
 ('woman', 0.9995682239532471),
 ('want', 0.999567985534668),
 ('good', 0.999547004699707),
 ('men', 0.9995465278625488),
 ('people', 0.9995226860046387),
 ('anything', 0.9995101690292358)]

In [14]:
model.wv.most_similar(positive=['woman', 'king'], negative=['man']) 

[('de', 0.9979858994483948),
 ('lucas', 0.997879147529602),
 ('elizabeth', 0.9978147745132446),
 ('bourgh', 0.9977259635925293),
 ('eyes', 0.9976943731307983),
 ('turned', 0.997608482837677),
 ('returned', 0.9975976347923279),
 ('eliza', 0.9975690245628357),
 ('away', 0.9974694848060608),
 ('sat', 0.9974604845046997)]

In [15]:
from sklearn.manifold import TSNE #from dimensionality reduction
import pandas as pd 

In [16]:
from sklearn.manifold import MDS

In [17]:
n = 1000 #only use first 1000 vectors

In [18]:
tsne = TSNE(n_components=2, perplexity=3,random_state=0)
tsne_vectors = tsne.fit_transform(model.wv.vectors[:n])

In [19]:
words = model.wv.index2word[:n] #get first n words from model

In [20]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, value
output_notebook()

In [21]:
#create a dataframe to plot with
df = pd.DataFrame(tsne_vectors,index=words,columns=['x_coord','y_coord'])
df.index.name = 'word'
df.head()

Unnamed: 0_level_0,x_coord,y_coord
word,Unnamed: 1_level_1,Unnamed: 2_level_1
the,-17.91184,-13.161377
to,1.054723,7.896479
of,-17.263145,-12.283184
and,-14.452357,-16.845068
her,-15.770219,-16.054001


In [22]:
# add our DataFrame as a ColumnDataSource for Bokeh
plot_data = ColumnDataSource(df)

# create the plot and configure the
# title, dimensions, and tools
tsne_plot = figure(title=u't-SNE Word Embeddings',
                   plot_width = 800,
                   plot_height = 800,
                   tools= (u'pan, wheel_zoom, box_zoom,'
                           u'box_select, reset'),
                   active_scroll=u'wheel_zoom')


In [23]:

# add a hover tool to display words on roll-over
tsne_plot.add_tools( HoverTool(tooltips = u'@word') )

# draw the words as circles on the plot
tsne_plot.circle(u'x_coord', u'y_coord', source=plot_data,
                 color=u'blue', line_alpha=0.2, fill_alpha=0.1,
                 size=10, hover_line_color=u'black')

# configure visual elements of the plot
tsne_plot.title.text_font_size = value(u'16pt')
tsne_plot.xaxis.visible = False
tsne_plot.yaxis.visible = False
tsne_plot.grid.grid_line_color = None
tsne_plot.outline_line_color = None

# plot!
show(tsne_plot);