### Resources:
- [TensorFlow - Word2Vec Tutorial](https://www.tensorflow.org/versions/r0.11/tutorials/word2vec/index.html)
- [TensorFlow - GitHub](https://github.com/tensorflow/tensorflow/blob/r0.11/tensorflow/examples/tutorials/word2vec/word2vec_basic.py)

# Goal
- Build a template for learning word embeddings with a basic word2vec skip-gram model.

In [1]:
import tensorflow as tf

### Learning Parameters

In [3]:
LEARNING_RATE = 1.0   # Step size passed to the gradient-descent optimizer
EPISODES = 300         # Total number of training iterations (one batch each)
PRINT = 50            # Print progress every PRINT iterations

# Data

### Import Data

In [4]:
from six.moves import urllib
import os

# Downloads data
# The corpus is fetched only when no file exists at `path`, so re-running
# this cell reuses the local copy instead of downloading again.
path = "victorian-jokes.txt"
if not os.path.isfile(path):
    urllib.request.urlretrieve("https://raw.githubusercontent.com/dylanjorgensen/datasets/master/victorian/victorian-jokes.txt", path)

In [5]:
# Reads text to string
# The whole file is read into memory as one string; the context manager
# closes the file handle as soon as the read finishes.
with open(path, 'r') as fp:
    raw_txt = fp.read()
    
# Bare expression: shows the raw text as the cell output
raw_txt

'The Battle of the Nile. Two naval officers were disputing as to the importance of Lord Nelson\'s victories. They wereunable to agree in opin\n\nThe Latter. Days of Bonaparte. At the close of the year 1820, Napoleon\'s health began to fail so as to excite the greatest apprehensions.\n\nAn Unrehearsed Stage Effect. A good story is told of a certain actor whose fate it was to represent the inferior personages in the drama, s\n\n" What\'s in a Name." Iremember, says an old writer,a school-fellow of mine who wasa striking instance of the inconvenience ofa remarkableCh\n\nThe Blessing of Forgiveness. The brave only know how to forgive; it is the most refined and generous pitch of virtue human nature can arriv\n\n\n\n'

### List: Simple List Each Word

In [6]:
# List of each word
# The text is split on whitespace only, so punctuation stays attached to
# its word (e.g. 'Nile.') and capitalisation is preserved.
words_list = tf.compat.as_str(raw_txt).split()

# Bare expression: shows the container type and the number of tokens
type(words_list), len(words_list) #data

(list, 121)

In [7]:
# Vocabulary size
# NOTE(review): len(words_list) counts every token, repeats included, so
# this is an upper bound on the number of distinct words rather than the
# distinct-word count itself. It caps most_common() when the frequency
# table is built and sets the row count of the embedding and NCE
# weight/bias variables.
vocabulary_size = len(words_list)

# Bare expression: shows the value as the cell output
vocabulary_size

121

### Tuple: Count Word Frequency

In [8]:
# Counts instances
import collections

# Frequency table, most common first. The leading ['UNK', -1] entry is a
# mutable placeholder for unknown words; the remaining entries are the
# (word, occurrences) pairs produced by Counter.most_common().
count = [['UNK', -1]] + collections.Counter(words_list).most_common(vocabulary_size - 1)

# Bare expression: shows the container type and number of entries
type(count), len(count), # count
#  ('to', 6),
#  ('The', 4),
#  ('in', 3),

(list, 92)

### Dictionary: Maps Unique Words to Unique Values

In [9]:
# Word -> integer id. Ids are handed out in the order of `count`, so
# 'UNK' receives 0 and more frequent words receive smaller ids.
dictionary = {}
for entry in count:
    word = entry[0]
    # The next free id is simply the current number of stored words
    dictionary[word] = len(dictionary)

import operator
# sorted(dictionary.items(), key=operator.itemgetter(0)) # Sorted by key
# sorted(dictionary.items(), key=operator.itemgetter(1)) # Sorted by value

#  'A': 52,
#  'An': 51,
#  'At': 75,

### List: Replace Text with Integers

In [10]:
# Encode the corpus as integer ids, one per word, in reading order
data = []
unk_count = 0

for word in words_list:
    if word not in dictionary:
        # No id for this word: use 0, the id held by 'UNK', and count it
        index = 0
        unk_count += 1
    else:
        index = dictionary[word]

    data.append(index)

# Bare expression: shows the number of encoded words (as a 1-tuple)
len(data), # data
# [4, 50, 1, 2, 78,]

(121,)

In [11]:
# Store the number of unknown words in the 'UNK' entry (the first, mutable
# entry of the count list), replacing its -1 placeholder
count[0][1] = unk_count

#  ('to', 6),
#  ('The', 4),
#  ('in', 3),

### Reverse Dictionary

In [12]:
# Inverse mapping: integer id -> word
reverse_dictionary = {index: word for word, index in dictionary.items()}

# Preview of the ten lowest ids and their words (the result is not displayed
# because the cell ends with print)
{k: reverse_dictionary[k] for k in sorted(reverse_dictionary)[:10]}
# First ten encoded ids next to the words they decode to
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

Sample data [4, 80, 2, 1, 34, 15, 60, 68, 57, 89] ['The', 'Battle', 'of', 'the', 'Nile.', 'Two', 'naval', 'officers', 'were', 'disputing']


# Training Batch
- This function will be called each iteration. 
- It's built in a way that provides the right data for the skip-gram model.

In [13]:
import numpy as np
import random

# Read cursor into the global `data` list; generate_batch advances it
# (wrapping around at the end) so successive calls walk through the corpus.
data_index = 0


def generate_batch(batch_size, num_skips, skip_window):
    """Build one skip-gram training batch from the global `data` list.

    A window of ``2 * skip_window + 1`` consecutive word ids slides over
    ``data`` starting at the global ``data_index``. The id in the middle of
    the window is the input word; ``num_skips`` distinct other positions in
    the window are drawn at random and used as its labels.

    Args:
        batch_size: Number of (input, label) pairs to produce. Must be a
            multiple of ``num_skips``.
        num_skips: Number of labels generated per input word. Must not
            exceed ``2 * skip_window``.
        skip_window: Number of words considered on each side of the input.

    Returns:
        A tuple ``(batch, labels)`` of int32 NumPy arrays. ``batch`` has shape
        ``(batch_size,)`` and holds each input id repeated ``num_skips``
        times; ``labels`` has shape ``(batch_size, 1)`` and holds the matching
        context ids.

    Raises:
        AssertionError: If the constraints on ``batch_size`` / ``num_skips``
            above are violated.

    Side effects:
        Advances the global ``data_index`` so that the next call continues
        with the word right after the last input word of this batch.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    # Prime the sliding window with the first `span` word ids
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            # Re-draw until we hit a window position that is neither the
            # centre nor an already-used context position
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        # Slide the window one word to the right (maxlen drops the oldest)
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # Backtrack by one window. Without this the next call would prime its
    # buffer with fresh words, and the `span` words following this batch's
    # last input word would never be used as inputs themselves.
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

In [14]:
# Step 4: Build and train a skip-gram model.
batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by construction are also the most frequent.
# The window is capped at the dictionary size so every sampled id belongs to
# a real word; an id beyond the dictionary has no entry in reverse_dictionary.
# The sample size is capped at the window because sampling is without
# replacement.
valid_window = min(100, len(dictionary))  # Only pick dev samples in the head of the distribution.
valid_size = min(16, valid_window)        # Random set of words to evaluate similarity on.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64    # Number of negative examples to sample.

In [15]:
# Draw a small demo batch and print every (input -> label) pair as
# "id word -> id word". Note that this call advances the global data_index
# used by generate_batch.
batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for center_id, context_id in zip(batch, labels[:, 0]):
    print(center_id, reverse_dictionary[center_id], '->', context_id, reverse_dictionary[context_id])

80 Battle -> 4 The
80 Battle -> 2 of
2 of -> 80 Battle
2 of -> 1 the
1 the -> 34 Nile.
1 the -> 2 of
34 Nile. -> 15 Two
34 Nile. -> 1 the


# Graph

### Layer 0 (Input)

In [16]:
# Input data.
# Ids of the input (centre) words for one batch; fed from generate_batch's
# first return value. The shape is fixed to [batch_size].
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
# Id of the context word to predict for each input; fed from generate_batch's
# second return value, one column per example.
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
# The sampled validation ids as a graph constant. In the visible cells it is
# only referenced by the commented-out similarity code.
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

### Layer 1 (Output)

In [17]:
# Look up embeddings for inputs.
# Trainable matrix with one row of `embedding_size` values per vocabulary id,
# initialised from a uniform distribution between -1.0 and 1.0.
embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
# Rows of `embeddings` selected by the ids in train_inputs
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

In [18]:
import math

# Construct the variables for the NCE loss
# Weights: one row of `embedding_size` values per vocabulary id, drawn from a
# truncated normal with stddev 1 / sqrt(embedding_size).
nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
# Biases: one per vocabulary id, starting at zero.
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

### Cost

In [19]:
# Compute the average NCE loss for the batch.
# tf.nn.nce_loss automatically draws a new sample of the negative labels each
# time we evaluate the loss.
# Arguments are passed by keyword: the positional order of `inputs` and
# `labels` is not the same in every TensorFlow release, so naming them keeps
# the embedded inputs and the target ids from being silently swapped.
cost = tf.reduce_mean(
    tf.nn.nce_loss(
        weights=nce_weights,
        biases=nce_biases,
        inputs=embed,
        labels=train_labels,
        num_sampled=num_sampled,
        num_classes=vocabulary_size,
    )
)

### Optimizer

In [20]:
# Optimizer: SGD
# Gradient descent with step size LEARNING_RATE; running the `optimizer` op
# applies one update that minimizes `cost`.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=LEARNING_RATE).minimize(cost)

In [21]:
# # Compute the cosine similarity between minibatch examples and all embeddings.
# norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
# normalized_embeddings = embeddings / norm
# valid_embeddings = tf.nn.embedding_lookup(
#   normalized_embeddings, valid_dataset)
# similarity = tf.matmul(
#   valid_embeddings, normalized_embeddings, transpose_b=True)

# Training

In [22]:
# Interactive TensorFlow session.
# `sess` is the handle the following cells use to initialise the variables
# and to run the training ops.
sess = tf.InteractiveSession()

In [23]:
# Initialize all vars
# Runs the initializer op for the variables defined so far (embeddings,
# nce_weights, nce_biases) so they hold their starting values before training.
sess.run(tf.initialize_all_variables())

In [24]:
num_steps = 10  # NOTE: not read by the loop below, which runs for EPISODES iterations

# Train & Log
for ep in range(EPISODES):

    # Fetch the next skip-gram batch and bind it to the graph's placeholders
    batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
    batch_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

    # Apply one optimizer step and fetch the batch loss in the same run call.
    # Evaluating `cost` in separate run calls would repeat the forward pass
    # (with freshly sampled negatives each time) just to obtain a number to print.
    cost_val = sess.run([optimizer, cost], feed_dict=batch_dict)[1]

    # Print on the first episode and then every PRINT episodes
    if (ep+1) == 1 or (ep+1) % PRINT == 0:
        print("Ep:", ep+1)
        print("Cost", cost_val)

Ep: 1
Cost 44.8232
Ep: 50
Cost 2.55298
Ep: 100
Cost 2.34584
Ep: 150
Cost 1.99374
Ep: 200
Cost 2.15693
Ep: 250
Cost 2.09207
Ep: 300
Cost 1.99474


### WARNING!!! - TSNE Issue Update (Wait for fix)
- https://github.com/scikit-learn/scikit-learn/issues/6665

In [25]:
# Step 6: Visualize the embeddings.

# def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
#   assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
#   plt.figure(figsize=(18, 18))  #in inches
#   for i, label in enumerate(labels):
#     x, y = low_dim_embs[i,:]
#     plt.scatter(x, y)
#     plt.annotate(label,
#                  xy=(x, y),
#                  xytext=(5, 2),
#                  textcoords='offset points',
#                  ha='right',
#                  va='bottom')

#   plt.savefig(filename)

# try:
#   from sklearn.manifold import TSNE
#   import matplotlib.pyplot as plt

#   tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
#   plot_only = 500
#   low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only,:])
#   labels = [reverse_dictionary[i] for i in xrange(plot_only)]
#   plot_with_labels(low_dim_embs, labels)

# except ImportError:
#   print("Please install sklearn, matplotlib, and scipy to visualize embeddings.")