# TensorFlow for Natural Language Processing with Neural Networks

Python 3, TensorFlow 1.12

### 1. Word Embeddings

Word embeddings, or word vectors, provide a way of mapping words from a vocabulary into a low-dimensional space, where words with similar meanings are grouped together. Here, we will look at the implementation of the two main methods to train word embeddings in an unsupervised manner: skip-gram and CBOW models. 

#### Preparing the text data
#1. Download the training corpus (e.g., Wikipedia dump, GoogleNews, Bookcorpus);
#2. Build a vocabulary by choosing the most frequent words (can be as many as a billion);
#3. Build the two dictionaries: ixtoword, wordtoix;
#4. Split the corpus into short text sequences (e.g. five consecutive words).

In [None]:
#5. Get the input and output lists for training
from random import randint

sequences = [
    ['i', 'love', 'coding', 'in', 'tensorflow'],
    ['the', 'dog', 'barked', 'at', 'him'],
]

# Skip-gram model: the middle word of each sequence is the input and one
# randomly drawn word from the rest of the sequence is the output.
skip_input = []
skip_output = []
for tokens in sequences:
    centre = len(tokens) // 2
    context = tokens[:centre] + tokens[centre + 1:]
    skip_input.append(tokens[centre])
    skip_output.append(context[randint(0, len(context) - 1)])
print('skip-gram training input: ', skip_input)
print('skip-gram training output: ', skip_output)

# CBOW model: the roles are swapped - all surrounding words form the input
# and the middle word is the output.
cbow_input = [
    tokens[:len(tokens) // 2] + tokens[len(tokens) // 2 + 1:]
    for tokens in sequences
]
cbow_output = [tokens[len(tokens) // 2] for tokens in sequences]
print('CBOW training input: ', cbow_input)
print('CBOW training output: ', cbow_output)

# Convert the input and output lists to indices (e.g. with the wordtoix dictionary)

In [None]:
# Write the Skip-gram model
import tensorflow as tf
import math

# Model hyper-parameters.
vocabulary_size = 10000
embedding_size = 300
batch_size = 128

# setup TensorFlow placeholders:
#   train_inputs - one input word id per example
#   train_labels - one target word id per example, kept as a [batch_size, 1] column
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

# Look up embeddings for inputs (one embedding_size-dimensional row per input id)
embeddings = tf.Variable(tf.random_uniform(
    [vocabulary_size, embedding_size], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

# Construct the variables for the softmax; hidden_out holds one logit per
# vocabulary word for every example.
weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                          stddev=1.0 / math.sqrt(embedding_size)))
biases = tf.Variable(tf.zeros([vocabulary_size]))
hidden_out = tf.matmul(embed, tf.transpose(weights)) + biases

# convert train_labels to a one-hot format
# NOTE(review): train_labels is [batch_size, 1], so the one-hot tensor gets an
# extra middle axis (see the printed shape) while hidden_out is rank 2 -
# confirm the loss op reconciles the two shapes as intended.
train_one_hot = tf.one_hot(train_labels, vocabulary_size)
print('train_one_hot shape: %s'%(train_one_hot.get_shape().as_list()))
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hidden_out, 
    labels=train_one_hot))

# Construct the SGD optimizer using a learning rate of 1.0.
optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(cross_entropy)

In [None]:
# Write the CBOW model
vocabulary_size = 10000
embedding_size = 300
batch_size = 128
skip_window = 2 # how many words to consider left and right.

# setup TensorFlow placeholders: every example carries the ids of its
# 2*skip_window context words plus the id of the word to predict.
train_inputs = tf.placeholder(tf.int32, shape=[batch_size, 2*skip_window])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

# Look up embeddings for inputs
embeddings = tf.Variable(tf.random_uniform(
    [vocabulary_size, embedding_size], -1.0, 1.0))

# Stack the embedding of each context position along a new trailing axis so
# that the context can be averaged in a single reduction afterwards.
embeds = None
for i in range(2*skip_window):
    embedding_i = tf.nn.embedding_lookup(embeddings, train_inputs[:,i])
    emb_x, emb_y = embedding_i.get_shape().as_list()
    embedding_i = tf.reshape(embedding_i, [emb_x, emb_y, 1])
    if embeds is None:
        embeds = embedding_i
    else:
        embeds = tf.concat([embeds, embedding_i], 2)
    print('embedding shape: %s'%(embeds.get_shape().as_list()))

# Average over the context axis. (The deprecated `keep_dims=False` argument
# is dropped; not keeping the reduced axis is already the default.)
avg_embed = tf.reduce_mean(embeds, axis=2)
print('avg_embed shape: %s'%(avg_embed.get_shape().as_list()))

# Construct the variables for the softmax
weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],
                         stddev=1.0 / math.sqrt(embedding_size)))
biases = tf.Variable(tf.zeros([vocabulary_size]))
# The logits must be computed from the averaged context embedding; using
# `embed` here would silently reuse the skip-gram tensor from the previous
# cell (or raise NameError if that cell was never run).
hidden_out = tf.matmul(avg_embed, tf.transpose(weights)) + biases

# convert train_labels to a one-hot format
train_one_hot = tf.one_hot(train_labels, vocabulary_size)
print('train_one_hot shape: %s'%(train_one_hot.get_shape().as_list()))
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hidden_out, 
    labels=train_one_hot))

# Construct the Adagrad optimizer using a learning rate of 1.0.
optimizer = tf.train.AdagradOptimizer(1.0).minimize(cross_entropy)

##### A toy example

In [None]:
import tensorflow as tf
import numpy as np
# choose which GPU to use
import os

# Index of the GPU to use; it is exported as text because environment
# variables can only hold strings.
GPUID = 1
os.environ['CUDA_VISIBLE_DEVICES'] = '{}'.format(GPUID)

In [None]:
corpus_raw = 'He is the king . The king is royal . She is the royal  queen '
# corpus_raw = 'He is the king . The king is royal . She is the queen. The queen is royal '
# convert to lower case
corpus_raw = corpus_raw.lower()

# Unique vocabulary; '.' only separates sentences and is not treated as a word.
words = {token for token in corpus_raw.split() if token != '.'}
vocab_size = len(words) # gives the total number of unique words

# Assign ids in sorted order: iterating the set directly would give a
# different word <-> id mapping on every interpreter run (string hashing is
# randomised), which makes the trained vectors and plots irreproducible.
word2int = {word: index for index, word in enumerate(sorted(words))}
int2word = {index: word for word, index in word2int.items()}

In [None]:
# Split the corpus into '.'-delimited sentences and tokenise each on whitespace.
raw_sentences = corpus_raw.split('.')
sentences = [raw_sentence.split() for raw_sentence in raw_sentences]

WINDOW_SIZE = 2

# Pair every word with each different word found at most WINDOW_SIZE
# positions to its left or right within the same sentence.
data = []
for tokens in sentences:
    for centre, target in enumerate(tokens):
        first = max(centre - WINDOW_SIZE, 0)
        last = min(centre + WINDOW_SIZE, len(tokens))
        data.extend(
            [target, neighbour]
            for neighbour in tokens[first:last + 1]
            if neighbour != target
        )

print(data)

In [None]:
# Turn a vocabulary index into its one-hot vector.
def to_one_hot(data_point_index, vocab_size):
    """Return a float array of length ``vocab_size`` that is 1 at ``data_point_index`` and 0 elsewhere."""
    one_hot = np.zeros(vocab_size)
    one_hot[data_point_index] = 1
    return one_hot

# One-hot encode both sides of every [input word, output word] pair and
# collect each side into a NumPy array.
x_train = np.asarray(
    [to_one_hot(word2int[pair[0]], vocab_size) for pair in data])  # input word
y_train = np.asarray(
    [to_one_hot(word2int[pair[1]], vocab_size) for pair in data])  # output word

# Show the first few encoded examples as a sanity check.
print('x:', x_train[:4])
print('y:', y_train[:4])

In [None]:
# making placeholders for x_train and y_train; both take rows of length
# vocab_size with a free batch dimension
x = tf.placeholder(tf.float32, shape=(None, vocab_size))
y_label = tf.placeholder(tf.float32, shape=(None, vocab_size))

# Hidden (embedding) layer: a linear projection from vocab_size inputs down to
# EMBEDDING_DIM values, with no non-linearity applied.
EMBEDDING_DIM = 5 # you can choose your own number
W1 = tf.Variable(tf.random_normal([vocab_size, EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([EMBEDDING_DIM])) #bias
hidden_representation = tf.add(tf.matmul(x,W1), b1)
# Bias-free alternative:
# hidden_representation = tf.matmul(x,W1)

# Output layer: project back to vocab_size scores and normalise them with a
# softmax so each row of `prediction` sums to 1.
W2 = tf.Variable(tf.random_normal([EMBEDDING_DIM, vocab_size]))
b2 = tf.Variable(tf.random_normal([vocab_size]))
prediction = tf.nn.softmax(tf.add( tf.matmul(hidden_representation, W2), b2))

In [None]:
# Session configuration: fall back to another device when an op cannot be
# placed as requested, and cap this process at 10% of the GPU memory.
config = tf.ConfigProto(
    log_device_placement=False,
    allow_soft_placement=True,
    graph_options=tf.GraphOptions(build_cost_model=1))
# config.gpu_options.allow_growth = True
config.gpu_options.per_process_gpu_memory_fraction = 0.1

# define the loss function: mean over the batch of the cross-entropy between
# the one-hot labels and the softmax output.
# NOTE(review): tf.log(prediction) yields -inf/NaN if a probability
# underflows to 0 - consider a fused softmax cross-entropy op if this is
# ever trained on more than a toy corpus.
cross_entropy_loss = tf.reduce_mean(-tf.reduce_sum(y_label * tf.log(prediction), axis=1))

# define the training step:
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(cross_entropy_loss)

# Create the session with the configuration built above (it was previously
# constructed but never passed in) and initialise the variables only after
# the whole graph, including the optimizer, has been defined.
sess = tf.Session(config=config)
init = tf.global_variables_initializer()
sess.run(init) #make sure you do this!

n_iters = 10000
# train for n_iters iterations on the full data set, reporting the loss
# every 500 steps
for i in range(n_iters):
    _, _loss = sess.run([train_step, cross_entropy_loss], feed_dict={x: x_train, y_label: y_train})
    if i % 500 == 0:
        print('loss is : ', _loss)

# The learned word vectors: row i of W1 (plus the shared bias) is what the
# hidden layer outputs for the one-hot input of word i.
vectors = sess.run(W1 + b1)
# vectors = sess.run(W1)

In [None]:
from sklearn.manifold import TSNE
from sklearn import preprocessing
import matplotlib.pyplot as plt
def euclidean_dist(vec1, vec2):
    """Return the Euclidean (L2) distance between two equally shaped NumPy arrays."""
    return np.sqrt(np.sum((vec1 - vec2) ** 2))


def find_closest(word_index, vectors):
    """Return the index of the vector nearest to ``vectors[word_index]``.

    Args:
        word_index: Position of the query vector inside ``vectors``
            (negative positions count from the end).
        vectors: Sequence or 2-D array holding one vector per row.

    Returns:
        The index of the closest *other* row by Euclidean distance (the first
        one on ties), or -1 when no other row is available.

    Raises:
        IndexError: If ``word_index`` is out of range.
    """
    query_vector = vectors[word_index]
    if word_index < 0:
        # Normalise so the position comparison below recognises the query row.
        word_index += len(vectors)

    # Start from a real infinity: a finite sentinel such as 10000 makes the
    # search fail whenever every distance happens to be larger than it.
    min_dist = float('inf')
    min_index = -1
    for index, vector in enumerate(vectors):
        # Skip the query by position rather than by value, so that a different
        # word with an identical vector is still reported as the closest one.
        if index == word_index:
            continue
        dist = euclidean_dist(vector, query_vector)
        if dist < min_dist:
            min_dist = dist
            min_index = index
    return min_index

In [None]:
# Project the learned word vectors to 2-D with t-SNE (fixed seed for
# repeatable layouts).
# NOTE(review): recent scikit-learn releases appear to require
# perplexity < number of samples; with this tiny vocabulary the default
# perplexity may need lowering - confirm against the installed version.
model = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)  # print plain decimals instead of scientific notation
tsne_vectors = model.fit_transform(vectors)

# Rescale every 2-D point to unit L2 norm. The norm is configured on the
# Normalizer itself: the second positional argument of fit_transform is the
# (ignored) target `y`, so passing 'l2' there never selected the norm.
normalizer = preprocessing.Normalizer(norm='l2')
tsne_vectors = normalizer.fit_transform(tsne_vectors)

print(tsne_vectors)

In [None]:
fig, ax = plt.subplots()
print(words)

# Draw the points themselves: text annotations do not take part in axis
# autoscaling, so without plotted data the axes keep their default [0, 1]
# range and every word with a negative coordinate ends up outside the view.
ax.scatter(tsne_vectors[:, 0], tsne_vectors[:, 1])

# Label each point with its word.
for word in words:
    x_coord, y_coord = tsne_vectors[word2int[word]][:2]
    print(word, x_coord, y_coord)
    ax.annotate(word, (x_coord, y_coord))
plt.show()

In [None]:
# Find the word whose learned vector lies nearest to the vector of 'king'
# and print it.
closest_index = find_closest(word2int['king'], vectors)
print(int2word[closest_index])

#### Using existing word2vec word embeddings
For the following part, we will look at how we can load pretrained word2vec word embeddings. Some interesting properties of these word vectors will also be exhibited.

First step: download the embedding file from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit, and then unzip the .gz file. Put the extracted file in the same folder as this Jupyter notebook.

In [None]:
import gensim

# Load Google's pre-trained Word2Vec vectors from the binary file expected
# next to this notebook.
# (The earlier form of this call, kept for reference, went through
# gensim.models.Word2Vec.load_word2vec_format with the same arguments.)
word2vec_path = './GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [None]:
# Fetch the embedding of a single word, then show its shape followed by its
# values (each on its own line).
vector = model['computer']
print(vector.shape, vector, sep='\n')

In [None]:
# Word-analogy query ("king - man + woman"): 'woman' and 'king' contribute
# positively, 'man' negatively, and only the single best match is printed.
print(model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1))

In [None]:
# Pick the odd one out: print the word from the list that fits least with
# the others.
print(model.doesnt_match("breakfast cereal dinner lunch".split()))

In [None]:
# Print the similarity score between the vectors of 'woman' and 'man'.
print(model.similarity('woman', 'man'))

In [None]:
# Find the five words most similar to `w`; the result is left as the last
# expression so the notebook displays it.
w = 'usa'
model.most_similar(positive=w, topn=5)

In [None]:
# Find the five words most similar to `w`; the result is left as the last
# expression so the notebook displays it.
w = 'dirty'
model.most_similar(positive=w, topn=5)

In [None]:
# Find the five words most similar to `w`; the result is left as the last
# expression so the notebook displays it.
w = 'obama'
model.most_similar(positive=w, topn=5)

### 2. Language Modeling

We will show how to train a recurrent neural network on the challenging task of language modeling. The goal is to fit a probabilistic model that assigns a probability to each word in a sentence, given the words that precede it.

The cornerstone of this NLP problem is the Recurrent Neural Network (RNN). To improve an RNN's ability to capture long-term dependencies, LSTMs are typically employed.

#### Recurrent Neural Networks (RNNs)

In the context of deep learning, natural language is commonly modeled with Recurrent Neural Networks (RNNs).
RNNs pass the output of a neuron back to the input of the next time step of the same neuron.
These directed cycles in the RNN architecture give it the ability to model temporal dynamics, making RNNs particularly suited for modeling sequences (e.g. text).
We can visualize an RNN layer as follows:

<img src="Figures/basic_RNN.PNG" alt="basic_RNN" style="width: 80px;"/>
<center>Figure from *Understanding LSTMs*. https://colah.github.io/posts/2015-08-Understanding-LSTMs/</center>

We can unroll an RNN through time, making the sequence aspect of them more obvious:

<img src="Figures/unrolled_RNN.PNG" alt="basic_RNN" style="width: 400px;"/>
<center>Figure from *Understanding LSTMs*. https://colah.github.io/posts/2015-08-Understanding-LSTMs/</center>

#### RNNs in TensorFlow
How would we implement an RNN in TensorFlow? Given the different forms of RNNs, there are quite a few ways, but we'll stick to a simple one. 

#### Preparing the text data
#1. Download the training corpus (e.g., Yelp reviews, news articles);
#2. Truncate each sentence to a maximum length (typically 20 or 25 words);
#3. Build a vocabulary by choosing the most frequent words (typically within the range of 10k ~ 100k);
#4. Build the two dictionaries: ixtoword, wordtoix;
#5. Convert each sentence to a list of indexes.

In [None]:
import tensorflow as tf

# Hyper-parameters of the language model.
batch_size = 64   # number of samples within a batch
num_steps = 25  # the maximum length of sentences
vocab_size = 10000   # number of words in the vocabulary
embedding_size = 300   # word embedding dimension
hidden_size = 512  # number of hidden units in each recurrent cell
keep_prob = 0.8   # probability of keeping each embedding value in the dropout layer
num_layers = 3 # number of LSTM layers

# placeholders for data: word ids of the input sentences and of the target
# sentences, one row per sentence
train_inputs = tf.placeholder(tf.int32, shape=[batch_size, num_steps])
train_targets = tf.placeholder(tf.int32, shape=[batch_size, num_steps])
    
# create the word embeddings, look up one vector per input word and apply
# dropout to the looked-up vectors
# NOTE(review): dropout is applied unconditionally here - confirm it is
# disabled (keep_prob = 1.0) when the graph is used for evaluation/sampling.
embedding = tf.Variable(tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0))
inputs = tf.nn.embedding_lookup(embedding, train_inputs)
inputs = tf.nn.dropout(inputs, keep_prob)
print('inputs shape: %s'%(inputs.get_shape().as_list()))  # batch_size * num_steps * embedding_size

In [None]:
# RNN cell: a single basic recurrent layer with hidden_size units, started
# from an all-zero state for every sentence in the batch.
rnn_cell = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
initial_state = rnn_cell.zero_state(batch_size, dtype=tf.float32)
# Unroll the cell over the time axis of `inputs`; `outputs` holds the cell
# output at every step and `state` the final state.
outputs, state = tf.nn.dynamic_rnn(rnn_cell, inputs,
                                   initial_state=initial_state,
                                   dtype=tf.float32)

In [None]:
# Or multilayer LSTM: one LSTM cell of hidden_size units per layer. The layer
# count comes from `num_layers` instead of a hard-coded three-element list,
# so changing the hyper-parameter actually changes the network depth.
rnn_layers = [tf.nn.rnn_cell.LSTMCell(hidden_size) for _ in range(num_layers)]

# create a RNN cell composed sequentially of a number of RNNCells
multi_rnn_cell = tf.nn.rnn_cell.MultiRNNCell(rnn_layers)

# 'outputs' is a tensor of shape [batch_size, num_steps, hidden_size]
# 'state' is a N-tuple where N is the number of LSTMCells containing a
# tf.contrib.rnn.LSTMStateTuple for each cell
outputs, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell,
                                   inputs=inputs,
                                   dtype=tf.float32)

In [None]:
# Unroll the multilayer LSTM over the embedded inputs.
# NOTE(review): this repeats the dynamic_rnn call of the previous cell with
# the same cell and inputs; running both adds a second unrolled copy of the
# network to the graph - presumably only one of the two is needed, confirm.
outputs, state = tf.nn.dynamic_rnn(cell=multi_rnn_cell,
                                   inputs=inputs,
                                   dtype=tf.float32)

In [None]:
# calculate the output logits (unnormalised scores over the vocabulary):
# flatten the recurrent outputs to one row per (sentence, time step) and
# apply a linear transformation to each row
output = tf.reshape(outputs, [-1, hidden_size])
softmax_w = tf.Variable(tf.random_uniform([hidden_size, vocab_size]))
softmax_b = tf.Variable(tf.random_uniform([vocab_size]))
logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)

# Reshape logits to be a 3-D tensor for sequence loss
logits = tf.reshape(logits, [batch_size, num_steps, vocab_size])
print('logits shape: %s'%(logits.get_shape().as_list()))
# output shape: batch_size * sentence_length * vocab_size

In [None]:
# Use the contrib sequence loss and average over the batch only.
# The third argument is the weighting of each prediction in the sequence;
# when using weights as masking, set all valid timesteps to 1 and all padded
# timesteps to 0 (here every timestep is weighted 1, i.e. nothing is masked).
loss = tf.contrib.seq2seq.sequence_loss(
            logits,
            train_targets,
            tf.ones([batch_size, num_steps], dtype=tf.float32),
            average_across_timesteps=False,
            average_across_batch=True)

# Collapse the loss into a single scalar cost by summing it
cost = tf.reduce_sum(loss)

# Construct the Adagrad optimizer using a learning rate of 1.0.
optimizer = tf.train.AdagradOptimizer(1.0).minimize(cost)

In [None]:
# Sentences sampled from an LSTM language model that had been trained on the
# EMNLP news dataset for about 10 hours.
generated_samples = [
    "Generated Text: We got to a bus station in the evening , but our connection didn ' t leave until the following morning .",
    "Generated Text: An estimated 80 million people across 20 states are facing a second day of being trapped inside due to heavy snow and dangerous conditions , which are expected to last until Sunday .",
    "Generated Text: The security guard claimed he suffered back pain and shock on his way home later that day and was taken to hospital .   ",
    "Generated Text: But these are all things that save me time and that I ' m happy to share with other iPhone users .  ",
    "Generated Text: We just have to keep putting our hands up , both of us , and put in a good performance .",
]

# Print the samples one after another, separated by blank lines.
for position, sample in enumerate(generated_samples):
    if position > 0:
        print("\n")
    print(sample)

In [None]:
# Sentences sampled from an LSTM language model that had been trained on the
# Yelp Reviews dataset for about 17 hours.
generated_samples = [
    "Generated Text: ate dinner here last night, the portions were small, i had the chicken and waffles, but i was not impressed .",
    "Generated Text: this is my favorite restaurant in the valley, with a great view of the strict, my parents and i have been going to this place for years .",
    "Generated Text: we go there for eakfast, i ve been here 3 times and it s always good, the hot dogs are delicious, and the hot dogs are delicious . ",
    "Generated Text: i was in vegas and was told to see a show in vegas, i was told it would be a great show, but i was told to see the show, i was told it would be the best show in vegas .",
    "Generated Text: this place needs to be shut down, one of the worst experiences i have ever had, the manager was very rude .",
]

# Print the samples one after another, separated by blank lines.
for position, sample in enumerate(generated_samples):
    if position > 0:
        print("\n")
    print(sample)