## Learning Word Embeddings

In [1]:
%matplotlib inline
import collections 
import math
import numpy as np
import os
import random
import tensorflow as tf
from matplotlib import pylab
#download the data
import bz2
from six.moves import range
from six.moves.urllib.request import urlretrieve
#visualise the word
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

import nltk # standard preprocessing
import operator #sorting items in dictionary by value

from math import ceil
import csv

  from ._conv import register_converters as _register_converters


## Dataset
This code downloads a [dataset](http://www.evanjones.ca/software/wikipedia2text.html) consisting of several Wikipedia articles totaling up to roughly 61 megabytes. Additionally the code makes sure the file has the correct size after downloading it.

In [3]:
# Base URL of the site that hosts the dataset archive.
url = 'http://www.evanjones.ca/software/'

def download_data(filename, expected_bytes):
    """Make sure `filename` exists locally and has exactly `expected_bytes` bytes.

    If no file exists at `filename`, it is fetched from `url + filename`.
    An existing file is never re-downloaded; it is only size-checked.

    Args:
        filename: Name of the file, relative to both `url` and the local
            working directory.
        expected_bytes: Size in bytes the local file must have.

    Returns:
        The local path of the verified file.

    Raises:
        Exception: If the size on disk differs from `expected_bytes`
            (the actual size is printed first).
    """
    already_present = os.path.exists(filename)
    if not already_present:
        print('Attempting to download:', filename)
        # urlretrieve returns (local_path, headers); only the path is needed
        filename = urlretrieve(url + filename, filename)[0]
        print('\nDownload complete!')

    actual_bytes = os.stat(filename).st_size
    if actual_bytes != expected_bytes:
        print(actual_bytes)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?.'
        )

    print('Found and verified %s' % filename)
    return filename

# Fetch the archive (or reuse an existing local copy) and check it against
# the expected size in bytes; `filename` is the verified local path.
filename = download_data('wikipedia2text-extracted.txt.bz2', 18377035)

Attempting to download: wikipedia2text-extracted.txt.bz2

Download complete!
Found and verified wikipedia2text-extracted.txt.bz2


## Read data without processing
Reads the data as-is into a single string, tokenizes it with NLTK's `word_tokenize`, and returns a list of words. No other preprocessing (such as lower-casing) is applied.

In [None]:
def read_data(filename):
    """Read a bz2-compressed text file and return its contents as a list of tokens.

    The whole archive is decompressed and decoded as UTF-8 in one go and then
    tokenized with `nltk.word_tokenize`; no other preprocessing is applied.
    """
    with bz2.BZ2File(filename) as archive:
        raw_text = archive.read().decode('utf-8')

    return list(nltk.word_tokenize(raw_text))

# Load the whole corpus as a token list, then show its size and a sample
# of ten tokens from each end.
words = read_data(filename)
print('Data size %d' % len(words))
print('Example words (start): ', words[:10])
print('Example words (end): ', words[-10:])

## Read Data with Preprocessing with NLTK
Reads the data into a string, converts it to lower case, and tokenizes it with the NLTK library, returning a list of words. The code reads the data in 1 MB portions, because processing the full text at once slows the task down. You will have to download the necessary NLTK tokenizer first (e.g. `nltk.download('punkt')`).

In [None]:
def read_data(filename, chunk_size=1024 * 1024):
    """Read a bz2-compressed UTF-8 text file as a list of lower-cased tokens.

    The file is decompressed and processed in portions of `chunk_size`
    characters, because tokenizing the full text at once is slow. Reading
    continues until the end of the decompressed stream, so the amount of text
    returned does not depend on the size of the compressed file on disk.

    Args:
        filename: Path to the .bz2 archive.
        chunk_size: Number of decoded characters to read per portion
            (default: 1024 * 1024).

    Returns:
        A list of tokens produced by `nltk.word_tokenize` on the lower-cased
        text.

    Raises:
        ValueError: If `chunk_size` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError('chunk_size must be a positive number of characters')

    data = []
    # Trailing partial word of the previous portion, held back so that a word
    # straddling two portions is tokenized as one word.
    carry = ''
    print('Reading data...')

    # Text mode decodes incrementally, so a multi-byte UTF-8 character is
    # never cut in half at a portion boundary.
    with bz2.open(filename, mode='rt', encoding='utf-8') as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break  # end of the decompressed stream

            text = carry + chunk
            # Cut just after the last whitespace character; everything after
            # it may be an incomplete word and is carried to the next round.
            # (Portions are split at whitespace, not at sentence boundaries.)
            cut = len(text)
            while cut > 0 and not text[cut - 1].isspace():
                cut -= 1
            carry = text[cut:]

            if cut:
                # tokenize the lower-cased portion into a list of words
                data.extend(nltk.word_tokenize(text[:cut].lower()))

    if carry:
        # the file did not end in whitespace: flush the final word(s)
        data.extend(nltk.word_tokenize(carry.lower()))

    return data

# Re-read the corpus with the lower-casing reader defined just above (it
# replaces the earlier function of the same name), rebinding `words`, then
# show its size and a sample of ten tokens from each end.
words = read_data(filename)
print('Data size %d' % len(words))
print('Example words (start): ', words[:10])
print('Example words (end): ', words[-10:])

## Building the Dictionaries
Builds the following. To understand each of these elements, let us also assume the text "I like to go to school"
* `dictionary`: maps a string word to an ID (e.g. {I:0, like:1, to:2, go:3, school:4})
* `reverse_dictionary`: maps an ID to a string word (e.g. {0:I, 1:like, 2:to, 3:go, 4:school})
* `count`: list of (word, frequency) elements (e.g. [(I,1),(like,1),(to,2),(go,1),(school,1)])
* `data` : Contain the string of text we read, where string words are replaced with word IDs (e.g. [0, 1, 2, 3, 2, 4])

It also introduces an additional special token `UNK` to denote words that are too rare to make use of.

In [None]:
# we restrict our vocabulary size to 50000
vocabulary_size = 50000

def build_dataset(words):
    """Encode a token list as word IDs and build the vocabulary lookup tables.

    The `vocabulary_size - 1` most frequent words receive IDs 1, 2, ... in
    order of decreasing frequency. ID 0 is reserved for the special token
    'UNK', which stands in for every word outside that vocabulary. A literal
    'UNK' token in the input is treated as unknown as well, so that IDs stay
    unique.

    Args:
        words: List of string tokens (it is traversed more than once).

    Returns:
        A tuple `(data, count, dictionary, reverse_dictionary)`:
            data: list of word IDs, one per input token.
            count: `['UNK', n_unknown]` followed by `(word, frequency)` tuples
                for the vocabulary words, most frequent first.
            dictionary: maps word -> ID.
            reverse_dictionary: maps ID -> word.
    """
    word_counts = collections.Counter(words)
    # 'UNK' already owns ID 0; ranking it again would re-assign its ID and
    # make the following word share that ID.
    word_counts.pop('UNK', None)

    count = [['UNK', -1]]
    # Gets only the vocabulary_size most common words as the vocabulary
    # All the other words will be replaced with UNK token
    count.extend(word_counts.most_common(vocabulary_size - 1))

    # A word's ID is its position in `count`
    dictionary = {word: word_id for word_id, (word, _) in enumerate(count)}

    # Replace every token by its ID; out-of-vocabulary tokens become 0 (UNK)
    data = [dictionary.get(word, 0) for word in words]

    # update the count variable with the number of UNK occurrences
    count[0][1] = data.count(0)

    reverse_dictionary = {word_id: word for word, word_id in dictionary.items()}

    # A corpus with few distinct words yields a smaller vocabulary; it must
    # never exceed the configured size.
    assert len(dictionary) <= vocabulary_size

    return data, count, dictionary, reverse_dictionary

# Encode the corpus as word IDs and build the vocabulary lookup tables,
# then preview the ten leading `count` entries and the first ten IDs.
data, count, dictionary, reverse_dictionary = build_dataset(words)
print('Most common words (+UNK)', count[:10])
print('Sample data', data[:10])
# Hint to reduce memory. NOTE: once `words` is deleted, re-running this cell
# on its own raises NameError until the corpus is read again.
del words

## Generating Batches of Data for Skip-Gram
Generates a batch of target words (`batch`) and a batch of corresponding context words (`labels`). It reads `2*window_size+1` words at a time (called a span) and creates `2*window_size` data points from a single span. The function continues in this manner until `batch_size` data points are created. Every time we reach the end of the word sequence, we start again from the beginning.

In [17]:
# Position in `data` from which the next batch starts reading.
data_index = 0

def generate_batch_skip_gram(batch_size, window_size):
    """Produce one skip-gram batch of (target word, context word) ID pairs.

    A span of `2 * window_size + 1` consecutive word IDs is read from the
    module-level `data`. The word in the middle is the target and every other
    word in the span is one of its context words, which gives
    `2 * window_size` pairs per span. The span then slides forward by one
    word, until `batch_size` pairs have been produced. Reading wraps around to
    the start of `data` when its end is reached.

    The module-level `data_index` is advanced so that the next call continues
    with the target word that directly follows the last one used here.

    Args:
        batch_size: Number of pairs to produce; must be a multiple of
            `2 * window_size`.
        window_size: Number of context words on each side of the target
            (at least 1).

    Returns:
        A tuple `(batch, labels)` of int32 arrays with shapes `(batch_size,)`
        and `(batch_size, 1)` holding target IDs and context IDs respectively.

    Raises:
        ValueError: If `window_size` is smaller than 1 or `batch_size` is not
            a multiple of `2 * window_size`.
    """
    global data_index

    if window_size < 1:
        raise ValueError('window_size must be at least 1')

    span = 2 * window_size + 1
    # this is the number of context words we sample for a single target word
    num_samples = 2 * window_size

    # Every span contributes exactly num_samples pairs; any other batch size
    # would leave the tail of the output arrays unfilled.
    if batch_size % num_samples != 0:
        raise ValueError('batch_size must be a multiple of 2 * window_size')

    # two numpy arrays to hold target words (batch) and context words (labels)
    batch = np.empty(batch_size, dtype=np.int32)
    labels = np.empty((batch_size, 1), dtype=np.int32)

    # the buffer holds the data contained within the span
    buffer = collections.deque(maxlen=span)
    # fill the buffer and update the data_index
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    # every position in the span except the target word itself
    context_offsets = [j for j in range(span) if j != window_size]

    for i in range(batch_size // num_samples):
        start = i * num_samples
        batch[start:start + num_samples] = buffer[window_size]
        labels[start:start + num_samples, 0] = [buffer[j] for j in context_offsets]
        # All pairs of this span are emitted: slide the span forward by one
        # word (the deque drops its oldest element automatically).
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    # The cursor is now one full span ahead of the first unused span. Rewind
    # it so the next call does not skip `span` words as targets.
    data_index = (data_index - span) % len(data)
    return batch, labels

# Show the first eight corpus words, then the batches generated from them
# for two different window sizes.
print('data:', [reverse_dictionary[word_id] for word_id in data[:8]])

for window_size in (1, 2):
    # restart from the beginning of the corpus for every window size
    data_index = 0
    batch, labels = generate_batch_skip_gram(batch_size=8, window_size=window_size)
    print('\nwith window_size = %d:' %window_size)
    print('    batch:', [reverse_dictionary[target_id] for target_id in batch])
    print('    labels:', [reverse_dictionary[context_id] for context_id in labels.reshape(8)])

data: ['propaganda', 'is', 'a', 'concerted', 'set', 'of', 'messages', 'aimed']

with window_size = 1:
    batch: ['is', 'is', 'a', 'a', 'concerted', 'concerted', 'set', 'set']
    labels: ['propaganda', 'a', 'is', 'concerted', 'a', 'set', 'concerted', 'of']

with window_size = 2:
    batch: ['a', 'a', 'a', 'a', 'concerted', 'concerted', 'concerted', 'concerted']
    labels: ['propaganda', 'is', 'concerted', 'set', 'is', 'a', 'set', 'of']


## Skip-Gram Algorithm
### Defining Hyperparameters
Here we define several hyperparameters, including `batch_size` (number of samples in a single batch), `embedding_size` (size of the embedding vectors), and `window_size` (context window size).

In [27]:
batch_size = 128      # number of data points in a single batch
embedding_size = 128  # dimension of each embedding vector
window_size = 4       # words considered to the left and to the right of a target

# Validation words whose nearest neighbours are inspected
valid_size = 16    # number of word IDs drawn from each ID range below
valid_window = 50  # width of each ID range that is sampled from

# Draw `valid_size` distinct IDs from [0, valid_window) and another
# `valid_size` from [1000, 1000 + valid_window). Word IDs are assigned in
# order of decreasing frequency, so this mixes very frequent words with
# moderately rare ones.
valid_examples = np.concatenate((
    random.sample(range(valid_window), valid_size),
    random.sample(range(1000, 1000 + valid_window), valid_size),
))

num_sampled = 32  # number of negative examples to sample

### Defining Inputs and Outputs
Here we define placeholders for feeding in training inputs and outputs (each of size batch_size) and a constant tensor to contain validation examples.

In [28]:
# Clear the default graph before defining the model's tensors.
tf.reset_default_graph()

# Target word IDs of one training batch.
train_dataset = tf.placeholder(dtype=tf.int32, shape=[batch_size])
# Context word ID paired with each target word (one column per data point).
train_labels = tf.placeholder(dtype=tf.int32, shape=[batch_size, 1])
# The validation word IDs were already chosen, so they are stored in the
# graph as a constant instead of being fed through a placeholder.
valid_dataset = tf.constant(value=valid_examples, dtype=tf.int32)

TypeError: Expected DataType for argument 'dtype' not 128.

### Defining Model Parameters and Other Variables
We now define several TensorFlow variables such as an embedding layer (embeddings) and neural network parameters (softmax_weights and softmax_biases)