<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Word2vec" data-toc-modified-id="Word2vec-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Word2vec</a></span><ul class="toc-item"><li><span><a href="#Tensorflow-Word2vec" data-toc-modified-id="Tensorflow-Word2vec-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Tensorflow Word2vec</a></span></li><li><span><a href="#Gensim-Word2vec" data-toc-modified-id="Gensim-Word2vec-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Gensim Word2vec</a></span></li></ul></li><li><span><a href="#Reference" data-toc-modified-id="Reference-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Reference</a></span></li></ul></div>

In [1]:
# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
# 4. magic to enable retina (high resolution) plots
# https://gist.github.com/minrk/3301035
%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format='retina'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# notebook-wide matplotlib defaults: every figure created afterwards uses
# an 8 x 6 figure size and a font size of 12 unless a cell overrides them
plt.rcParams['figure.figsize'] = 8, 6
plt.rcParams['font.size'] = 12

%watermark -a 'Ethen' -d -t -v -p numpy,pandas,sklearn,matplotlib

Ethen 2019-06-05 19:00:27 

CPython 3.6.4
IPython 7.5.0

numpy 1.16.3
pandas 0.24.2
sklearn 0.20.3
matplotlib 3.0.3


# Word2vec

In [2]:
import os


class MovieDataUtils:
    """
    Utility class that downloads the polarity data to disk and loads it into memory.

    The archive is only downloaded when the extracted folder is not already
    present under the save folder, so repeated calls reuse the copy on disk.
    """

    MOVIE_FOLDER_NAME = 'rt-polaritydata'
    POS_FILE_NAME = 'rt-polarity.pos'
    NEG_FILE_NAME = 'rt-polarity.neg'
    MOVIE_DATA_URL = 'http://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz'

    # seconds to wait on the server before giving up instead of hanging the notebook
    DOWNLOAD_TIMEOUT = 60

    def load_data(self, save_folder_name: str='data'):
        """
        Load the positive and negative reviews, downloading them first if needed.

        Parameters
        ----------
        save_folder_name : str, default 'data'
            Folder that holds (or will hold) the downloaded archive and the
            extracted ``rt-polaritydata`` folder. Created if it does not exist.

        Returns
        -------
        texts : list[str]
            One raw line per review, positive reviews first. Lines are returned
            exactly as read, i.e. they keep their trailing newline.

        target : list[int]
            Label aligned with ``texts``: 1 for a positive review, 0 for a negative one.
        """
        movie_folder_path = os.path.join(save_folder_name, MovieDataUtils.MOVIE_FOLDER_NAME)
        pos_file_path = os.path.join(movie_folder_path, MovieDataUtils.POS_FILE_NAME)
        neg_file_path = os.path.join(movie_folder_path, MovieDataUtils.NEG_FILE_NAME)

        # exist_ok makes a separate existence check unnecessary
        os.makedirs(save_folder_name, exist_ok=True)

        if not os.path.exists(movie_folder_path):
            self.download_and_extract_tar_gz_data(save_folder_name)

        pos_data = self._read_lines(pos_file_path)
        neg_data = self._read_lines(neg_file_path)

        texts = pos_data + neg_data
        target = [1] * len(pos_data) + [0] * len(neg_data)
        return texts, target

    @staticmethod
    def _read_lines(file_path: str):
        """Read every line of a latin-1 encoded text file, newlines included."""
        with open(file_path, encoding='latin-1') as f:
            return f.readlines()

    def download_and_extract_tar_gz_data(self, save_folder_name: str):
        """
        Download the polarity archive into ``save_folder_name`` and extract it there.

        Parameters
        ----------
        save_folder_name : str
            Existing folder that receives both the ``.tar.gz`` file and its
            extracted content.

        Returns
        -------
        self

        Raises
        ------
        requests.HTTPError
            When the server answers with an error status. Nothing is written
            to disk in that case.
        """
        import tarfile
        import requests

        tar_gz_file = os.path.join(save_folder_name, MovieDataUtils.MOVIE_FOLDER_NAME + '.tar.gz')

        # the context manager releases the streamed connection even on failure
        with requests.get(MovieDataUtils.MOVIE_DATA_URL, stream=True,
                          timeout=MovieDataUtils.DOWNLOAD_TIMEOUT) as response:
            # fail loudly on 4xx/5xx instead of saving an error page as the
            # archive, which would only surface later as an obscure tarfile error
            response.raise_for_status()
            with open(tar_gz_file, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)

        # NOTE(review): extractall trusts the member paths stored in the archive,
        # and the archive is fetched over plain http; only point MOVIE_DATA_URL
        # at a source you trust.
        with tarfile.open(tar_gz_file, 'r:gz') as tar:
            tar.extractall(path=save_folder_name)

        return self

In [3]:
movie_data_utils = MovieDataUtils()
texts, target = movie_data_utils.load_data()
print('total sample size:', len(texts))
print('example text: ', texts[0])
print('corresponding target:', target[0])

total sample size: 10662
example text:  the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . 

corresponding target: 1


Create a function that normalizes/cleans the text.

In [4]:
import string


def normalize_texts(texts, stop_words=None, min_token_len=2, remove_digits=True, remove_punctuation=True):
    """
    Lower-case each text, split it on single spaces and drop unwanted tokens.

    Parameters
    ----------
    texts : iterable[str]
        Raw texts, one document per element.

    stop_words : iterable[str], default None
        Tokens to drop. None means no stop-word filtering.

    min_token_len : int, default 2
        Tokens shorter than this number of characters are dropped.

    remove_digits : bool, default True
        Drop tokens that are exactly one digit character. Digits inside
        longer tokens (e.g. '21st') are kept.

    remove_punctuation : bool, default True
        Drop tokens that are exactly one punctuation character. Punctuation
        inside longer tokens (e.g. "he's", '--') is kept.

    Returns
    -------
    normed_texts : list[str]
        The surviving tokens of each text joined back with a single space,
        in the same order as ``texts``.
    """
    # the membership test runs once per token, so use a set rather than
    # scanning a (possibly long) stop-word list every time
    stop_words = set(stop_words) if stop_words is not None else set()

    single_chars_to_drop = set()
    if remove_digits:
        single_chars_to_drop |= set(string.digits)

    if remove_punctuation:
        single_chars_to_drop |= set(string.punctuation)

    normed_texts = []
    for text in texts:
        tokens = text.lower().strip().split(' ')
        kept_tokens = [
            token for token in tokens
            if len(token) >= min_token_len
            and token not in stop_words
            and token not in single_chars_to_drop
        ]
        normed_texts.append(' '.join(kept_tokens))

    return normed_texts

In [5]:
from nltk.corpus import stopwords

# English stop-word list from NLTK; tokens found in it are dropped during normalization
stops_words = stopwords.words('english')
# NOTE: `texts` is overwritten with its normalized version, so re-running this
# cell normalizes already-normalized text; re-run the loading cell first to
# start again from the raw reviews
texts = normalize_texts(texts, stops_words)
print('example normalized text: ', texts[0])

example normalized text:  rock destined 21st century's new conan he's going make splash even greater arnold schwarzenegger jean-claud van damme steven segal


## Tensorflow Word2vec

In [6]:
# Initial hyperparameters for the Tensorflow word2vec section.
# NOTE(review): batch_size, vocab_size and window_size are reassigned in a later
# cell before the model is built, and the remaining names are not referenced by
# any code visible in this notebook - confirm whether this cell is still needed.
batch_size = 50 
embedding_size = 200 
vocab_size = 10000 
generations = 50000 
print_loss_every = 500 
# half of the batch size (integer division)
num_sampled = batch_size // 2
window_size = 2  
print_valid_every = 2000 
valid_words = ['cliche', 'love', 'hate', 'silly', 'sad']

In [7]:
from collections import Counter

def build_vocab(texts, vocab_size):
    """
    Build a vocabulary capped at the ``vocab_size`` most frequent words.

    NOTE(review): a later cell defines another ``build_vocab`` that takes
    ``min_count`` instead; once that cell runs it shadows this definition.

    Parameters
    ----------
    texts: list[str]
        Whitespace separated sentences.

    vocab_size : int
        Maximum size of the vocabulary, counting the reserved 'UNK' entry.

    Returns
    -------
    word2index : dict[str, int]
        Each kept word in the corpus gets mapped to a numeric index.
        Index 0 is reserved for the unknown token, the rest are ordered
        from most to least frequent. e.g. {'UNK': 0, 'film': 1}

    index2word : list[str]
        Inverse mapping, ``index2word[word2index[word]] == word``.
    """
    unknown_token = 'UNK'

    # count every whitespace separated token across all sentences
    counts = Counter(token for text in texts for token in text.split())

    # 'UNK' already owns index 0; if the corpus happens to contain that literal
    # token, listing it again would overwrite its index and make the words
    # after it collide on the same index
    counts.pop(unknown_token, None)

    index2word = [unknown_token]
    index2word.extend(word for word, _ in counts.most_common(vocab_size - 1))
    word2index = {word: index for index, word in enumerate(index2word)}
    return word2index, index2word

In [8]:
def build_vocab(texts, min_count):
    """
    Build a vocabulary out of the words that occur at least ``min_count`` times.

    Parameters
    ----------
    texts: list[str]
        Whitespace separated sentences.

    min_count : int
        Words that appear fewer times than this across all texts are left out
        of the vocabulary.

    Returns
    -------
    word2index : dict[str, int]
        Each kept word in the corpus gets mapped to a numeric index.
        Index 0 is reserved for the unknown token, the rest follow the order
        in which the words first appear. e.g. {'UNK': 0, 'film': 1}

    index2word : list[str]
        Inverse mapping, ``index2word[word2index[word]] == word``.
    """
    unknown_token = 'UNK'

    word_count = {}
    for text in texts:
        for token in text.split():
            word_count[token] = word_count.get(token, 0) + 1

    index2word = [unknown_token]
    for word, count in word_count.items():
        # 'UNK' already owns index 0; adding a literal 'UNK' token from the
        # corpus again would overwrite its index and make the words after it
        # collide on the same index
        if count >= min_count and word != unknown_token:
            index2word.append(word)

    word2index = {word: index for index, word in enumerate(index2word)}
    return word2index, index2word

In [9]:
def texts_to_index(texts, word2index):
    """
    Replace every whitespace separated word of every sentence by its index.

    Parameters
    ----------
    texts : list[str]
        Whitespace separated sentences.

    word2index : dict[str, int]
        Word to index lookup. Words missing from it are mapped to index 0.

    Returns
    -------
    texts_index : list[list[int]]
        e.g. [[0, 2], [3, 1]]
        One inner list per sentence, e.g. [0, 2], holding the numeric index
        of each of its words in order.
    """
    lookup = word2index.get
    return [[lookup(word, 0) for word in sentence.split()] for sentence in texts]

In [10]:
# keep only the words that occur at least 5 times in the corpus; every other
# word is looked up as index 0 ('UNK') when the texts are converted to indices
word2index, index2word = build_vocab(texts, min_count=5)
indexed_texts = texts_to_index(texts, word2index)
# the reported size includes the 'UNK' placeholder entry
print('vocabulary size: ', len(word2index))

vocabulary size:  4296


TODO: double-check that the batch generation below produces the intended (center word, context word) pairs.

In [11]:
def create_window_and_label(indexed_texts, window_size, i):
    """
    Slice the window of tokens surrounding position ``i``.

    Parameters
    ----------
    indexed_texts : list[int]
        A sequence of word indices to slice from.

    window_size : int
        Number of tokens to take on each side of position ``i``.

    i : int
        Position of the center token within ``indexed_texts``.

    Returns
    -------
    window : list[int]
        Up to ``window_size`` tokens before position ``i``, the token at ``i``
        and up to ``window_size`` tokens after it. The window is shorter when
        ``i`` is close to either end of the sequence.

    label : int
        Position of the center token inside ``window``.
    """
    left = max(0, i - window_size)
    right = i + window_size + 1
    # the center sits at offset i while the window is cut off on the left,
    # and at offset window_size once a full left context is available
    return indexed_texts[left:right], min(i, window_size)

In [12]:
# sanity checks for create_window_and_label on a hand-written sentence of word indices
rand_indexed_texts = [148, 4, 101, 29, 53, 6956, 9, 207, 518]

# first position: nothing on the left, so the window starts at the center token itself
window, label = create_window_and_label(rand_indexed_texts, window_size=2, i=0)
assert window == [148, 4, 101]
assert label == 0

# interior position: full window of 2 tokens on each side, center sits at offset 2
window, label = create_window_and_label(rand_indexed_texts, window_size=2, i=3)
assert window == [4, 101, 29, 53, 6956]
assert label == 2

In [13]:
def generate_batch_data(indexed_texts, batch_size, window_size, method='skip_gram'):
    """
    Sample one batch of (input word, target word) index pairs.

    Sentences are drawn at random (with replacement) and every position of a
    drawn sentence contributes one pair per word in its surrounding window,
    until at least ``batch_size`` pairs are collected; the surplus is trimmed.

    Parameters
    ----------
    indexed_texts : list[list[int]]
        Sentences represented as lists of word indices.

    batch_size : int
        Number of pairs to return.

    window_size : int
        Number of words to consider on each side of the center word.

    method : {'skip_gram', 'cbow'}, default 'skip_gram'
        'skip_gram' uses the center word as the input and each context word
        as the target; 'cbow' uses each context word as the input and the
        center word as the target.

    Returns
    -------
    batch_data : np.ndarray, shape [batch_size]
        Input word indices.

    batch_label : np.ndarray, shape [batch_size, 1]
        Target word index for the input at the same position.

    Raises
    ------
    ValueError
        If ``method`` is unknown, or if a non-empty batch is requested but no
        pair can ever be produced (``window_size`` smaller than 1, or no
        sentence with at least two words).
    """
    if method not in ('skip_gram', 'cbow'):
        raise ValueError("method must be 'skip_gram' or 'cbow', got {!r}".format(method))

    # without these guards the sampling loop below would never terminate,
    # because no sentence could ever contribute a pair
    if batch_size > 0:
        if window_size < 1:
            raise ValueError('window_size must be at least 1, got {!r}'.format(window_size))

        if not any(len(indexed_text) > 1 for indexed_text in indexed_texts):
            raise ValueError('indexed_texts must contain at least one sentence with two or more words')

    batch_data = []
    batch_label = []
    while len(batch_data) < batch_size:
        # list[int]
        # draw a position instead of handing the nested list to np.random.choice:
        # choice first converts its argument to an array and rejects anything
        # that is not 1-dimensional, which is what sentences of equal length
        # turn into
        rand_indexed_texts = indexed_texts[np.random.randint(len(indexed_texts))]

        for i in range(len(rand_indexed_texts)):
            window, label = create_window_and_label(rand_indexed_texts, window_size, i)
            center, context = window[label], window[:label] + window[(label + 1):]
            # repeat the center word so that it lines up with each context word
            center = [center] * len(context)
            if method == 'skip_gram':
                batch_data.extend(center)
                batch_label.extend(context)
            else:
                batch_data.extend(context)
                batch_label.extend(center)

    # trim batch and label at the end and convert to numpy array
    batch_data = np.array(batch_data[:batch_size])
    batch_label = np.array(batch_label[:batch_size]).reshape(-1, 1)
    return batch_data, batch_label

In [14]:
# draw one skip-gram batch (the default method) of 32 pairs with a window size of 2;
# sentences are sampled at random, so the printed indices differ between runs
batch_data, batch_label = generate_batch_data(indexed_texts, 32, 2)
print(batch_data)
batch_data.shape

[1644 1644  789  789  789 1052 1052 1052 1052  391  391  391  391   27
   27   27   27  657  657  657  657 4066 4066 4066 4066  197  197  197
  197  489  489  489]


(32,)

In [19]:
# Hyperparameters handed to TfWord2vec in the next cell.
# NOTE: batch_size, vocab_size and window_size replace the values assigned in
# the earlier hyperparameter cell.
learning_rate = 0.05
method = 'skip_gram'
# NOTE(review): presumably the number of training iterations (the progress bar
# of the fit call counts up to this value) - confirm against tf_word2vec
epochs = 10000

batch_size = 32 
embed_size = 100 
# the vocabulary built above, 'UNK' entry included
vocab_size = len(word2index) 
# NOTE(review): presumably the number of negative samples per example - confirm against tf_word2vec
num_neg_samples = 5
window_size = 5  

In [20]:
from tf_word2vec import TfWord2vec

# TfWord2vec comes from the local tf_word2vec module, which is not shown in this
# notebook; the arguments are positional, so their order has to match its signature
model_tf_word2vec = TfWord2vec(batch_size, embed_size, vocab_size, window_size,
                               num_neg_samples, epochs, learning_rate, method)
model_tf_word2vec.fit(indexed_texts)

100%|██████████| 10000/10000 [01:15<00:00, 132.75it/s]


<tf_word2vec.TfWord2vec at 0x1395b7ef0>

In [28]:
from tf_word2vec import most_similar


# words closest to 'good' according to the model's embed_in_ attribute
# NOTE(review): presumably the input-side embedding matrix - confirm against tf_word2vec
most_similar(model_tf_word2vec.embed_in_, word2index, index2word, ['good'])

[('movie', 0.9388548),
 ('UNK', 0.93558717),
 ('film', 0.93402964),
 ('like', 0.9303813),
 ('one', 0.9191068),
 ('story', 0.91847056),
 ('--', 0.89264494),
 ('much', 0.8769109),
 ('may', 0.8745483),
 ('funny', 0.8658905)]

In [29]:
# same query against the model's embed_out_ attribute, for comparison with embed_in_
# NOTE(review): presumably the output-side embedding matrix - confirm against tf_word2vec
most_similar(model_tf_word2vec.embed_out_, word2index, index2word, ['good'])

[('around', 0.4175861),
 ('dig', 0.3923855),
 ('edge', 0.3674501),
 ('ease', 0.35598972),
 ('allen', 0.33458835),
 ('[the', 0.3266014),
 ('cartoons', 0.31436068),
 ('attractive', 0.3120032),
 ("moore's", 0.31183684),
 ('interaction', 0.31126958)]

## Gensim Word2vec

In [23]:
from gensim.models.word2vec import Word2Vec
from gensim.utils import save_as_line_sentence

# write the tokenized, normalized reviews to disk so that Word2Vec can be
# trained from a file path (its corpus_file argument) in the following cells
corpus_file = 'polaritydata.txt'
corpus = [text.split(' ') for text in texts]
save_as_line_sentence(corpus, corpus_file)

In [26]:
import time
from joblib import cpu_count

# time a CBOW run (sg=0) of gensim's Word2Vec, using every available core
# NOTE(review): `iter`, `size` and `wv.vocab` are gensim 3.x names; gensim 4
# renamed them (epochs, vector_size, wv.key_to_index) - confirm the installed version
start_time = time.time()
model_word2vec = Word2Vec(corpus_file=corpus_file, iter=10, size=100, alpha=0.05, sg=0,
                          workers=cpu_count())
elapse_time = time.time() - start_time

print('vocabulary size: ', len(model_word2vec.wv.vocab))
print('elapse time: ', elapse_time)

vocabulary size:  4295
elapse time:  2.1541221141815186


In [27]:
# words closest to 'good' according to the gensim model trained in the previous cell
model_word2vec.wv.most_similar(positive=['good'])

  if np.issubdtype(vec.dtype, np.int):


[('idea', 0.8201919794082642),
 ('intentions', 0.7895775437355042),
 ('job', 0.7637943029403687),
 ('pretty', 0.7432918548583984),
 ('except', 0.7155461311340332),
 ('sign', 0.7142113447189331),
 ('talent', 0.7096766233444214),
 ('playing', 0.6989737749099731),
 ('girl', 0.6977887153625488),
 ('funnier', 0.6914594173431396)]

In [30]:
# same experiment with skip-gram (sg=1) and 30 passes instead of 10;
# NOTE: this rebinds model_word2vec, so the CBOW model from the earlier cell is replaced
start_time = time.time()
model_word2vec = Word2Vec(corpus_file=corpus_file, iter=30, size=100, alpha=0.05, sg=1,
                          workers=cpu_count())
elapse_time = time.time() - start_time

print('vocabulary size: ', len(model_word2vec.wv.vocab))
print('elapse time: ', elapse_time)
model_word2vec.wv.most_similar(positive=['good'])

vocabulary size:  4295
elapse time:  6.305377006530762


  if np.issubdtype(vec.dtype, np.int):


[('intentions', 0.5170398950576782),
 ('ops', 0.48557227849960327),
 ('clean', 0.45620492100715637),
 ('club', 0.429222971200943),
 ('deliver', 0.42575153708457947),
 ('idea', 0.4245274066925049),
 ('rousing', 0.41510117053985596),
 ('tell', 0.4081522524356842),
 ('dynamic', 0.40439286828041077),
 ('necessarily', 0.4020853042602539)]

CBOW (continuous bag-of-words): similar to skip-gram, except that we predict a single target word from the surrounding window of context words, rather than predicting the context words from the target word.

# Reference

- [Jupyter Notebook: Gensim Docs - 2Vec File-based Training: API Tutorial](https://nbviewer.jupyter.org/github/RaRe-Technologies/gensim/blob/develop/docs/notebooks/Any2Vec_Filebased.ipynb)