Collaborative filtering with article embeddings
=============
<span style="color: lightsteelblue;">The resulting embeddings are used to rank the _'read also'_ block.</span>

The goal of this notebook is to train an embedding space over articles from a Russian online newspaper.

### Import dependencies

In [None]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
%matplotlib inline
from __future__ import print_function

import collections
import math
import os
import pickle
import random
import re
import zipfile
from pathlib import Path

import numpy as np
import tensorflow as tf
from IPython.display import clear_output
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
from sklearn.manifold import TSNE

# custom libraries
from nlp.preparer import Corpus

### Read data

In [None]:
# Captures the requested article id and the user id stored in the cookie.
# Log lines are expected to look like:
# "GET /articles/123456 HTTP/1.1 200 Accept: ... \nCookie: user_id=ssdfrfrf34f4r34 ..."
_ACCESS_LOG_PATTERN = re.compile(r'GET /articles/(\d+).*user_id=([a-zA-Z0-9\-_]+)')


def read_data(folder):
  """Group accessed article ids by user from the access logs in `folder`.

  Every regular file directly inside `folder` is read line by line. Lines that
  do not contain both an article request and a `user_id` cookie are skipped.

  Args:
    folder: path of the directory that holds the access-log files.

  Returns:
    A dict mapping user_id (str) to the list of article ids (str, as captured
    from the log) in the order they were encountered.
  """
  articles_by_user = collections.defaultdict(list)
  # Sorted so the reading order does not depend on the arbitrary order
  # returned by os.listdir (presumably file names sort chronologically --
  # TODO confirm).
  for file_name in sorted(os.listdir(folder)):
    file_path = os.path.join(folder, file_name)
    if not os.path.isfile(file_path):
      continue
    with open(file_path) as log_file:
      for line in log_file:
        match = _ACCESS_LOG_PATTERN.search(line)
        if match is None:
          continue
        article_id, user_id = match.groups()
        articles_by_user[user_id].append(article_id)

  # Hand back a plain dict so a lookup of an unknown user fails loudly
  # instead of silently creating an empty entry.
  return dict(articles_by_user)

# Group every accessed article by the user who opened it:
# user_id => [article_ids]
# The result is bound to `articles_by_user` (singular), which is the name the
# feature-building cell reads.
articles_by_user = read_data('./articles/logs/access')

### Build features (tokenize articles)

In [None]:
# Distinct article ids across every user's reading history.
all_articles = set().union(*articles_by_user.values())
num_articles = len(all_articles)

print(f"Total number of accessed articles: {num_articles}")

### Defining batch

In [None]:
data_index = 0  # Cursor into the training sequence; advanced by generate_batch.

def generate_batch(batch_size, window_size, sequence=None):
  """Produce one CBOW-style training batch from a sequence of item ids.

  A window of `2 * window_size + 1` consecutive ids slides over the sequence
  (wrapping around at the end). For each position the ids on both sides of
  the centre form one row of `batch` and the centre id is the label.

  Args:
    batch_size: number of (context, target) examples to produce.
    window_size: number of context ids taken on each side of the target.
    sequence: indexable sequence of integer ids. Defaults to the module-level
      `data`. NOTE(review): `data` is not defined in the visible cells.

  Returns:
    (batch, labels): int32 arrays of shape (batch_size, 2 * window_size) and
    (batch_size, 1).

  Side effects:
    Advances the module-level `data_index` cursor.
  """
  global data_index
  if sequence is None:
    sequence = data
  span = 2 * window_size + 1  # [ window_size target window_size ]

  batch = np.empty((batch_size, 2 * window_size), dtype=np.int32)
  labels = np.empty((batch_size, 1), dtype=np.int32)

  # Prime the sliding window with the first `span` ids.
  buffer = collections.deque(maxlen=span)
  for _ in range(span):
    buffer.append(sequence[data_index])
    data_index = (data_index + 1) % len(sequence)

  for i in range(batch_size):
    window = list(buffer)
    # Context: everything in the window except the central element.
    batch[i, :] = window[:window_size] + window[window_size + 1:]
    labels[i, 0] = window[window_size]

    # Slide the window one step; maxlen drops the oldest id.
    buffer.append(sequence[data_index])
    data_index = (data_index + 1) % len(sequence)

  # Backtrack by one window so the next call resumes right after the last
  # target emitted here. Without this, `span` targets would be skipped
  # between consecutive batches.
  data_index = (data_index + len(sequence) - span) % len(sequence)
  return batch, labels

### Defining neural network graph

In [None]:
# Hyper-parameters and TensorFlow 1.x graph definition for the CBOW-style
# article embedding model.
# NOTE(review): `vocabulary_size` is used below but is not defined in any
# visible cell -- it must exist before this cell runs.
batch_size = 128
embedding_size = 128 # Dimension of the embedding vector.
window_size = 2 # Number of context articles taken on each side of the target.
# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to ids in [15, valid_window). The original note says low
# numeric ids are the most frequent "by construction" -- the id assignment is
# not visible in this notebook, TODO confirm.
# NOTE(review): `random.sample` is not seeded, so the validation ids differ
# from run to run.
valid_size = 16 # Number of validation articles to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
valid_examples = np.array(random.sample(range(15, valid_window), valid_size))
num_sampled = 64 # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default(), tf.device('/cpu:0'):

  # Input data: one row of 2 * window_size context ids per example, and the
  # id of the central (target) article as the label.
  train_dataset = tf.placeholder(tf.int32, shape=[batch_size, 2 * window_size])
  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

  # Variables: the embedding table plus the weights/biases of the sampled
  # softmax output layer.
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0), name='embeddings')
  softmax_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_size],
                         stddev=1.0 / math.sqrt(embedding_size)))
  softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))

  # Model.
  # Look up embeddings for every context id of every example.
  embed = tf.nn.embedding_lookup(embeddings, train_dataset)
  # NOTE(review): debugging output of the lookup shape; consider removing.
  print(embed.shape)
  # Compute the softmax loss, using a sample of the negative labels each time.
  # The context embeddings are summed over axis 1 to form a single input
  # vector per example.
  loss = tf.reduce_mean(
    tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases,
                               inputs=tf.reduce_sum(embed, 1),
                               labels=train_labels, num_sampled=num_sampled, num_classes=vocabulary_size))

  # Optimizer.
  # Note: The optimizer will optimize the softmax_weights AND the embeddings.
  # This is because the embeddings are defined as a variable quantity and the
  # optimizer's `minimize` method will by default modify all variable quantities
  # that contribute to the tensor it is passed.
  # See docs on `tf.train.Optimizer.minimize()` for more details.
  optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

  # Compute the similarity between the validation examples and all embeddings.
  # Rows are L2-normalized first, so the matrix product below is the cosine
  # similarity.
  # NOTE(review): `keep_dims` is a deprecated alias of `keepdims` in later
  # TensorFlow 1.x releases -- confirm the installed version before renaming.
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(
    normalized_embeddings, valid_dataset)
  similarity = tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings))

  # Saver used to store the trained model on disk; only the `embeddings`
  # variable is registered with it.
  saver = tf.train.Saver([embeddings])

### Training

In [None]:
# Train the embeddings and periodically report the running loss together with
# the nearest neighbors of the validation articles.
num_steps = 150001
report_every = 1000 # Steps between two progress reports.
top_k = 8 # Number of nearest neighbors listed per validation article.

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  average_loss = 0
  for step in range(num_steps):
    batch_data, batch_labels = generate_batch(batch_size, window_size)
    feed_dict = {train_dataset : batch_data, train_labels : batch_labels}
    # Only the loss is fetched: pulling the whole embedding matrix out of the
    # session on every step is a large copy that nothing here uses.
    _, step_loss = session.run([optimizer, loss], feed_dict=feed_dict)
    average_loss += step_loss

    if step % report_every != 0:
      continue

    # The average loss is an estimate of the loss over the last
    # `report_every` batches (at step 0 it is just the first batch's loss).
    if step > 0:
      average_loss = average_loss / report_every
    report = ['Initialized', 'Average loss at step %d: %f' % (step, average_loss)]
    average_loss = 0

    # Note that this is expensive (~20% slowdown if computed every 500 steps).
    sim = similarity.eval()
    for i in range(valid_size):
      valid_article = reverse_dictionary[valid_examples[i]]
      # Position 0 of the ranking is skipped: it is expected to be the
      # validation article itself.
      nearest = (-sim[i, :]).argsort()[1:top_k+1]
      neighbors = ' '.join('%s,' % reverse_dictionary[idx] for idx in nearest)
      report.append('Nearest to %s: %s' % (valid_article, neighbors))

    # Replace the previous report instead of appending to the cell output.
    clear_output(wait=True)
    print('\n'.join(report))

  final_embeddings = normalized_embeddings.eval()

  # Save what has been learned.
  # NOTE(review): STORAGE_PATH is not defined in the visible cells.
  saver.save(session, STORAGE_PATH)

### Validation

In [None]:
num_points = 400 # Number of embeddings projected and plotted.

# A fixed random_state makes the 2-D projection reproducible across runs.
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, random_state=42)
# Rows 1..num_points are projected; row 0 is skipped (presumably a reserved
# "unknown" id -- TODO confirm against the vocabulary construction).
two_d_embeddings = tsne.fit_transform(final_embeddings[1:num_points+1, :])

def plot(embeddings, labels):
  """Draw a labelled scatter plot of 2-D points.

  Args:
    embeddings: array with one (x, y) row per point; it must have at least
      as many rows as there are labels.
    labels: sized iterable of annotations; the i-th label is attached to the
      i-th row of `embeddings`.
  """
  assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'
  annotation_style = dict(xytext=(5, 2), textcoords='offset points',
                          ha='right', va='bottom')
  pylab.figure(figsize=(15,15))  # in inches
  for row, caption in enumerate(labels):
    point_x, point_y = embeddings[row,:]
    pylab.scatter(point_x, point_y)
    pylab.annotate(caption, xy=(point_x, point_y), **annotation_style)
  pylab.show()

# Label each projected point with the entry stored for ids 1..num_points.
label_ids = range(1, num_points+1)
words = [reverse_dictionary[label_id] for label_id in label_ids]
plot(two_d_embeddings, words)