In [1]:
# code for loading the format (styling) for the notebook
import os

# path : store the current working directory so that a later cell
# can switch back to it with os.chdir(path)
path = os.getcwd()
# move into the directory that holds the `formats` module so the import
# below can be resolved; presumably load_style also reads its files
# relative to this directory -- TODO confirm against formats.py
os.chdir('../../notebook_format')
from formats import load_style
# kept as the last expression of the cell so whatever it returns
# is displayed by the notebook
load_style()

In [3]:
# switch back to the working directory that was stored in `path`
# before the styling cell changed it
os.chdir(path)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# plt.rcParams['figure.figsize'] = 8, 6 # change default figure size

# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
#    (autoreload mode 2 reloads all modules before executing a cell)
%matplotlib inline
%load_ext watermark
%load_ext autoreload 
%autoreload 2

import joblib
from operator import itemgetter
from gensim.models import Word2Vec
from sklearn.datasets import fetch_20newsgroups

# record the author, date, time, python version and the
# versions of the listed packages for reproducibility
%watermark -a 'Ethen' -d -t -v -p numpy,pandas,gensim,sklearn,joblib

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Ethen 2017-04-07 20:27:18 

CPython 3.5.2
IPython 5.3.0

numpy 1.12.1
pandas 0.19.2
gensim 1.0.1
tqdm 4.11.2
numba 0.31.0
requests 2.13.0


# Word2vec

`Word2Vec` is an unsupervised learning algorithm that uses a shallow neural network (with one hidden layer) to learn vector representations of all the terms (words/phrases) in a given corpus. The advantage that word2vec offers is that it tries to preserve the semantic meaning behind those terms. For example, a document may employ the words "dog" and "canine" to mean the same thing, but never use them together in a sentence. Ideally, the word2vec algorithm would be able to learn from the context and place them close together in the vector space.

gensim’s `Word2Vec` expects a sequence of sentences as its input, where each sentence is a list of words.

In [10]:
# a toy corpus: every sentence is represented as a list of tokens
sentences = [
    ['first', 'sentence'],
    ['second', 'sentence'],
]

# fit a model on the toy corpus
model = Word2Vec(sentences=sentences, size=200, min_count=2, workers=4)

The model also accepts several key parameters that affect both training speed and quality.

- `min_count`: For pruning the internal dictionary. Words that appear only once or twice in a billion-word corpus are probably uninteresting typos and garbage. In addition, there’s not enough data to make any meaningful training on those words, so it’s best to ignore them. A reasonable value for min_count is between 0-100, depending on the size of the dataset.
- `size`: Refers to the size of the hidden layer, i.e. the dimensionality of the learned word vectors. Bigger size values require more training data, but can lead to better (more accurate) models. Reasonable values are in the tens to hundreds.
- `workers`: Number of cores/threads used for training
- `window`: Only terms that occur within a window-neighbourhood of a term, in a sentence, are associated with it during training. gensim's default value is 5. Unless your text contains very long sentences, leave it at that.
- `sg`: Defines the training algorithm. If equal to 1, the skip-gram technique is used; otherwise (the default, 0) the CBOW method is employed.

The full list of parameters can be obtained [here](http://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec)

---

In the example above, keeping the input as a Python built-in list is convenient, but can use up a lot of RAM when the input is large.

Gensim only requires that the input must provide sentences sequentially, hence if our input files are scattered across several different places then instead of loading everything into an in-memory list, we can process the input file by file, line by line:

In [8]:
class Sentences:
    """
    Iterate over the files in a directory, and read in each line
    as a list of words. Used with gensim's Word2Vec

    Every call to iter() starts a fresh pass over the directory,
    so the same object can be consumed multiple times. Files are
    visited in sorted filename order to make the sentence order
    reproducible, and entries that are not regular files
    (e.g. sub-directories) are skipped.

    Parameters
    ----------
    dirname: str
        directory that contains the files of text

    encoding: str, default None
        encoding that is passed to open when reading the files,
        None falls back to the platform's default encoding

    Example
    -------
    # a memory-friendly iterator
    dirname = 'test'
    sentences = Sentences(dirname)
    model = Word2Vec(sentences)
    """
    def __init__(self, dirname, encoding = None):
        self.dirname = dirname
        self.encoding = encoding

    def __iter__(self):
        # os.listdir returns entries in arbitrary order, sort them
        # so that every pass yields the sentences in the same order
        for file in sorted(os.listdir(self.dirname)):
            fname = os.path.join(self.dirname, file)

            # os.listdir also returns sub-directories, calling
            # open on them would raise an error, thus skip them
            if not os.path.isfile(fname):
                continue

            with open(fname, encoding = self.encoding) as f:
                for line in f:
                    # we can also do other text preprocessing
                    # such as remove stop words, lower-case
                    # the strings, etc. here
                    yield line.split()

In [68]:
# load the training split of the 20 newsgroups corpus
# and peek at the raw text of the very first post
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_train.data[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [39]:
# location on disk where the trained model is persisted
word2vec_model_path = 'mymodel'

# whitespace-tokenize every post (split with no argument already
# discards leading/trailing whitespace) and train with as many
# worker threads as joblib reports cpus
documents = [doc.split() for doc in newsgroups_train.data]
workers = joblib.cpu_count()

# train the model, then round-trip it through disk
# to demonstrate saving and loading
model = Word2Vec(sentences=documents, size=200, min_count=2, workers=workers)
model.save(word2vec_model_path)
model = Word2Vec.load(word2vec_model_path)

In [61]:
# syn0norm (the L2-normalized word vectors) is only filled in by
# init_sims, which the similarity queries call lazily; on a freshly
# loaded model it is still None, so compute it explicitly to keep this
# cell from depending on a similarity query having been run beforehand
# (the call is a no-op when the normalized vectors already exist)
model.wv.init_sims()

# access the vocabulary attribute of
# the word vector to build a list of the
# terms, integer indices and term counts from model's vocabulary
ordered_vocab = [(term, info.index, info.count)
                 for term, info in model.wv.vocab.items()]

# sort by the term counts, so the most common terms appear first
ordered_vocab = sorted(ordered_vocab, key = itemgetter(2), reverse = True)

# unzip the terms, integer indices, and counts into separate tuples
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

# create a DataFrame with the learnt (normalized) vectors as data,
# and the terms as row labels; the indices are converted to a list
# so that numpy unambiguously treats them as row positions to gather
word_vectors = pd.DataFrame(model.wv.syn0norm[list(term_indices), :],
                            index = ordered_terms)
print(word_vectors.shape)
word_vectors.head()

(118461, 200)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
the,0.114811,0.059105,0.052656,0.040745,0.02385,-0.118442,0.119151,0.066576,0.063207,-0.016908,...,0.044477,-0.011778,0.0032,0.070038,-0.172101,0.089449,-0.018589,-0.010821,-0.104495,-0.055577
to,0.057079,0.097144,0.017742,0.057017,0.011044,-0.136049,0.099916,-0.064588,-0.067024,0.004666,...,-0.091574,-0.021603,-0.000227,-0.008603,-0.033955,0.041081,-0.003849,0.083453,-0.118852,-0.06327
of,0.015743,0.057191,-0.051213,0.092693,-0.098047,-0.018424,0.047705,0.018564,0.045533,-0.062696,...,0.000653,0.008302,0.000931,0.057572,-0.080246,0.073219,0.082902,-0.070216,-0.046113,-0.053717
a,0.133431,0.064562,0.143721,0.075947,-0.060838,-0.09563,0.044568,-0.082651,0.074516,0.056178,...,-0.009355,0.026335,0.000145,0.022681,0.013456,-0.068525,-0.038995,0.065747,0.004777,-0.082451
and,0.165271,0.184787,0.010603,0.068729,-0.00645,-0.031344,-0.060679,0.042055,-0.044052,0.037975,...,-0.037406,0.010678,0.020201,-0.054917,0.005005,0.11852,-0.019917,0.003818,-0.051698,-0.028886


One thing we can use the learned word vectors for is to look up related words and phrases (words that have similar semantic meaning) for a given term of interest.

In [67]:
# query the 5 terms whose vectors are
# most similar to the one of 'computer'
model.wv.most_similar(positive=['computer'], topn=5)

[('keyboard', 0.8721893429756165),
 ('application', 0.8527505993843079),
 ('manual', 0.8435473442077637),
 ('network', 0.8413074016571045),
 ('hardware', 0.8397271633148193)]

# Reference

- [Blog: Word2vec API Tutorial](http://rare-technologies.com/word2vec-tutorial/)