In [None]:
# Licensed under the Apache License, Version 2.0 (the "License");


Author: Nikhil Mehta  
Description: Analysis of Item Density Weighting.

In [None]:
# %reset
import collections
import os
import pprint
import tempfile
from typing import Dict, List, Text, Tuple, Union, Optional, Any

from absl import logging
from colabtools import drive
from colabtools import adhoc_import
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.colors import ListedColormap
from matplotlib.colors import to_rgb, to_rgba
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import sklearn
from sklearn.cluster import KMeans
import tensorflow as tf
import yaml
from multiple_user_representations.synthetic_data import util

In [None]:
def load_dataset(root_dir, dataset_path, alpha_str, is_npz = False):
  """Loads one synthetic dataset and unpacks the fields used in this notebook.

  Args:
    root_dir: Root directory under which the datasets are stored.
    dataset_path: Dataset directory, relative to `root_dir`.
    alpha_str: Suffix appended to `synthetic_data_` to pick the dataset variant.
    is_npz: Unused; kept so that existing callers keep working.

  Returns:
    A tuple `(item_clusters, all_items, user_interests, test_cluster,
    user_item_sequences)`. `test_cluster` is `item_clusters` indexed by the
    last item of every user sequence, and `user_interests` is None when the
    loaded data has no such entry.
  """
  del is_npz  # Unused.

  data = util.load_data(
      '{}/{}/synthetic_data_{}'.format(root_dir, dataset_path, alpha_str))

  clusters = data['item_clusters']
  items = data['items']
  sequences = data['user_item_sequences']
  # Cluster of the final item in each sequence.
  last_item_cluster = clusters[sequences[:, -1]]

  return (clusters, items, data.get('user_interests', None),
          last_item_cluster, sequences)

In [None]:
# Root directory; dataset and result paths below are joined onto it.
root_dir = 'root_dir/'
# Synthetic dataset directory, relative to `root_dir`.
dataset_path = 'datasets/sparse_C5_I10_U3_N10000/'
# Dataset variant; appended to `synthetic_data_` when building paths.
alpha_str = 'interest-power1.0_item-power1.0_alpha0.6_gamma0.3/'
# Results directory (relative to `root_dir`) of the run visualized below.
# NOTE: later evaluation cells rebind `results_path` to other directories.
results_path = 'results/d2/density_smoothing/reruns_3'

## Visualize Final Embedding Space

In [None]:
# Load the dataset and the final embeddings of one trained model, then plot the
# item embeddings coloured by their cluster.
dataset = load_dataset(root_dir, dataset_path, alpha_str)
(item_clusters, all_items, user_interests, test_cluster, user_item_sequences) = dataset

seed = 1235
model_str = 'MUR_5'

# Both embedding files live in the same per-seed, per-model directory.
final_model_dir = os.path.join(root_dir, results_path, dataset_path,
                               f'synthetic_data_{alpha_str}',
                               f'seed_{seed}', model_str)
item_embeddings_path = os.path.join(final_model_dir, 'item_embeddings.npy')
user_embeddings_path = os.path.join(final_model_dir, 'user_embeddings.npy')

with tf.io.gfile.GFile(item_embeddings_path, 'rb') as f:
  item_embeddings = np.load(f)

with tf.io.gfile.GFile(user_embeddings_path, 'rb') as f:
  # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
  # result files from a trusted location.
  user_embeddings = np.load(f, allow_pickle=True)

label_set = np.unique(item_clusters).astype(int)
# `plt.cm.get_cmap` was removed in Matplotlib 3.9; `plt.get_cmap` takes the
# same (name, lut) arguments.
cmap = plt.get_cmap('RdYlBu', len(label_set)+1)

for label in label_set:
  label_indices = np.where(item_clusters == label)
  # Marker sizes decrease linearly from 50 to 5 along the cluster's item order.
  marker_sizes = np.linspace(5.0, 50.0, len(label_indices[0]))[::-1]
  # `color=` (not `c=`): a bare RGBA tuple passed as `c` is ambiguous and would
  # be value-mapped whenever a cluster happens to contain exactly 4 items.
  plt.scatter(item_embeddings[label_indices, 0],
              item_embeddings[label_indices, 1],
              color=cmap(label),
              s=marker_sizes,
              label='{}'.format(label))

plt.legend()
plt.title('Output embedding space')
# The x axis shows embedding column 0 and the y axis column 1.
plt.xlabel('X0')
plt.ylabel('X1')
plt.show()

In [None]:
# Draw each user's representation vector(s) as arrows on top of the item
# embeddings, one subplot per user index.
fig, axs = plt.subplots(4, 5, figsize=(20, 14), constrained_layout=True,
                        sharex=True, sharey=True)

# Loop-invariant setup, computed once instead of once per subplot.
label_set = np.unique(item_clusters).astype(int)
# `plt.cm.get_cmap` was removed in Matplotlib 3.9; `plt.get_cmap` takes the
# same (name, lut) arguments.
cmap = plt.get_cmap('RdYlBu', len(label_set)+1)
user_embeddings = tf.squeeze(user_embeddings)

for user_ix, ax in enumerate(axs.flat):

  for label in label_set:
    label_indices = np.where(item_clusters == label)
    # Marker sizes decrease linearly from 50 to 5 along the cluster's item order.
    marker_sizes = np.linspace(5.0, 50.0, len(label_indices[0]))[::-1]
    # `color=` avoids the ambiguity of passing a bare RGBA tuple as `c`.
    ax.scatter(item_embeddings[label_indices, 0],
               item_embeddings[label_indices, 1],
               color=cmap(label),
               s=marker_sizes,
               label='{}'.format(label))

  if len(user_embeddings.shape) > 2:
    # Several vectors per user: draw one arrow from the origin for each.
    num_vectors = user_embeddings.shape[1]
    ax.quiver(np.zeros(num_vectors), np.zeros(num_vectors),
              user_embeddings[user_ix, :, 0], user_embeddings[user_ix, :, 1],
              scale=7.)
  else:
    ax.quiver(0., 0., user_embeddings[user_ix, 0], user_embeddings[user_ix, 1],
              scale=6.)

  ax.set_title("""$Y_u$ = {}""".format(sorted(set(user_interests[user_ix]))))
  ax.legend(loc = 'upper left', bbox_to_anchor=(1.0, 1.0))

# Report the actual number of panels (the grid above has 4 x 5 = 20 of them).
fig.suptitle(f'User representations for {axs.size} different users.', fontsize=16)
plt.show()

In [None]:
# Same overlay as the multi-user grid, for one selected user.
user_ix = 2
fig = plt.figure(figsize=(5, 4), constrained_layout=True)

label_set = np.unique(item_clusters).astype(int)
# `plt.cm.get_cmap` was removed in Matplotlib 3.9; `plt.get_cmap` takes the
# same (name, lut) arguments.
cmap = plt.get_cmap('RdYlBu', len(label_set)+1)
for label in label_set:
  label_indices = np.where(item_clusters == label)
  # Marker sizes decrease linearly from 50 to 5 along the cluster's item order.
  marker_sizes = np.linspace(5.0, 50.0, len(label_indices[0]))[::-1]
  # `color=` avoids the ambiguity of passing a bare RGBA tuple as `c`.
  plt.scatter(item_embeddings[label_indices, 0],
              item_embeddings[label_indices, 1],
              color=cmap(label),
              s=marker_sizes,
              label='{}'.format(label))

user_embeddings = tf.squeeze(user_embeddings)
if len(user_embeddings.shape) > 2:
  # Several vectors per user: draw one arrow from the origin for each.
  num_vectors = user_embeddings.shape[1]
  plt.quiver(np.zeros(num_vectors), np.zeros(num_vectors),
             user_embeddings[user_ix, :, 0],
             user_embeddings[user_ix, :, 1], scale=6.)
else:
  plt.quiver(0., 0., user_embeddings[user_ix, 0],
             user_embeddings[user_ix, 1], scale=6.)

# ax.set_title("""$Y_u$ = {} & Interest Recall: {:.2f}""".format(
#     user_interests[user_ix], interest_acc[user_ix]))
plt.legend(loc = 'upper left', bbox_to_anchor=(1.0, 1.0))

plt.title('MUR (H = 5) for a user having interests: {}.'.format(sorted(set(user_interests[user_ix]))), fontsize=11)
plt.show()

## Visualize Weight Updates 

#### Iteration 0: weights 

In [None]:
# Empirical frequency of every item among the third-from-last entries of the
# user sequences, drawn as a bar chart coloured by item cluster.
# NOTE(review): the names say "train" while the original trailing comment said
# "validation" -- confirm which split column -3 belongs to.
train_next_items = user_item_sequences[:, -3]
train_next_item_counter = collections.Counter(train_next_items)
train_next_item_count = np.array([train_next_item_counter[item]
                                  for item in all_items],
                                 dtype=np.float32)

total_count = np.sum(train_next_item_count)
prior_w = train_next_item_count/total_count

# `plt.cm.get_cmap` was removed in Matplotlib 3.9; `plt.get_cmap` takes the
# same (name, lut) arguments.
cmap = plt.get_cmap('RdYlBu', len(label_set)+1)
# Colour each bar by the item's actual cluster rather than assuming five
# contiguous blocks of ten items; `item_clusters` is indexed by item id.
color = [cmap(int(item_clusters[item])) for item in all_items]

plt.bar(all_items, prior_w, color=color)
plt.xlabel('Item ID')
plt.ylabel('Normalized Frequency')
plt.title('Empirical density of items.')
plt.show()

In [None]:
# Recompute the empirical item frequencies (third-from-last entry of every
# user sequence) and plot them in a single colour.
train_next_items = user_item_sequences[:, -3]
train_next_item_counter = collections.Counter(train_next_items)
train_next_item_count = np.asarray(
    [train_next_item_counter[item_id] for item_id in all_items],
    dtype=np.float32)

# Normalise the counts so that `prior_w` sums to one.
total_count = train_next_item_count.sum()
prior_w = train_next_item_count / total_count

plt.bar(all_items, prior_w)
plt.title('Empirical density of items.')
plt.xlabel('Item ID')
plt.ylabel('Frequency')
plt.show()

## Load iterations

In [None]:
# Collect the artefacts saved under iteration_1, iteration_2, ... and stop at
# the first iteration directory that does not exist.
model_results_path = os.path.join(root_dir, results_path, dataset_path, f'synthetic_data_{alpha_str}', f'seed_{seed}', model_str)

all_embeddings = []
all_queries = []
all_weights = []

iteration = 1
while True:
  iteration_path = os.path.join(model_results_path,
                                'iteration_{}'.format(iteration))
  if not tf.io.gfile.exists(iteration_path):
    break

  embeddings_path = os.path.join(iteration_path, 'embeddings.npy')
  with tf.io.gfile.GFile(embeddings_path, 'rb') as f:
    all_embeddings.append(np.load(f))

  # The queries file is optional, so `all_queries` may end up shorter than
  # `all_embeddings` and `all_weights`.
  queries_path = os.path.join(iteration_path, 'user_queries.npy')
  if tf.io.gfile.exists(queries_path):
    with tf.io.gfile.GFile(queries_path, 'rb') as f:
      all_queries.append(np.load(f))

  weights_path = os.path.join(iteration_path, 'weights.npy')
  with tf.io.gfile.GFile(weights_path, 'rb') as f:
    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
    # result files from a trusted location.
    count_weight_dict = np.load(f, allow_pickle=True).item()
  # Keep element [1] of every per-item entry, ordered like `all_items`
  # (presumably entries are (count, weight) pairs -- confirm with the writer).
  weights = np.array([count_weight_dict[item][1] for item in all_items],
                     dtype=np.float32)
  all_weights.append(weights)

  iteration += 1


## Track $\lVert w^{(t+1)} - w^{(t)} \rVert_2$

In [None]:
def delta_norm(w1, w2):
  """Returns `np.linalg.norm(w1 - w2, ord=2)`."""
  return np.linalg.norm(w1-w2, ord=2)


# Norm of the change between every pair of consecutive weight vectors.
# NOTE: `w` and `prev_w` stay bound at module level after this loop.
w_delta = []

prev_w = all_weights[0]
for w in all_weights[1:]:
  step_change = delta_norm(w, prev_w)
  w_delta.append(step_change)
  prev_w = w

plt.plot(w_delta)
plt.show()

In [None]:
def flatten(d, parent_key='', sep='_'):
  """Flattens a nested mapping into a single-level dict.

  Keys of nested mappings are joined to their parent key with `sep`, e.g.
  `{'a': {'b': 1}}` becomes `{'a_b': 1}`.

  Args:
    d: Mapping to flatten; its values may themselves be mappings.
    parent_key: Prefix prepended (followed by `sep`) to every key of `d`.
      An empty prefix leaves top-level keys unchanged.
    sep: Separator placed between a parent key and a child key.

  Returns:
    A new flat dict; `d` itself is not modified.
  """
  # `collections.MutableMapping` was removed in Python 3.10; the ABC lives in
  # `collections.abc`.
  from collections.abc import MutableMapping

  flat = {}
  for k, v in d.items():
    new_key = parent_key + sep + k if parent_key else k
    if isinstance(v, MutableMapping):
      flat.update(flatten(v, new_key, sep=sep))
    else:
      flat[new_key] = v
  return flat

def get_cluster_wise_performance(
    user_embeddings, item_embeddings, test_cluster, target_next_items):
  """Computes HR@5 and HR@10 separately for each value of `test_cluster`.

  Args:
    user_embeddings: Per-user query embeddings; sliced along axis 0.
    item_embeddings: Item embedding matrix passed to the accuracy helper.
    test_cluster: Per-user cluster id used to slice the users.
    target_next_items: Per-user target item index; sliced like the users.

  Returns:
    A dict with, for every cluster id `c`, the keys `HR@5_Cluster{c}`,
    `HR@10_Cluster{c}` and `N_Cluster{c}` (number of users in the slice).
  """
  metrics_by_cluster = {}
  for cluster_id in np.unique(test_cluster).astype(int):
    in_cluster = test_cluster == cluster_id
    cluster_users = user_embeddings[in_cluster]
    cluster_targets = target_next_items[in_cluster]

    hit_rate = {
        k: compute_top_k_mean_accuracy(
            cluster_users, item_embeddings, cluster_targets, k=k)
        for k in (5, 10)
    }

    metrics_by_cluster[f'HR@5_Cluster{cluster_id}'] = hit_rate[5]
    metrics_by_cluster[f'HR@10_Cluster{cluster_id}'] = hit_rate[10]
    metrics_by_cluster[f'N_Cluster{cluster_id}'] = cluster_users.shape[0]

  return metrics_by_cluster

def load_results(results_path: str, only_eval_result: bool):
  """Loads the saved evaluation metrics and, optionally, the embeddings.

  Args:
    results_path: Directory containing `eval_result.yaml` and, when embeddings
      are requested, `user_embeddings.npy` and `item_embeddings.npy`.
    only_eval_result: If true, only the metrics are read.

  Returns:
    The flattened metrics dict when `only_eval_result` is true; otherwise the
    tuple `(user_embeddings, item_embeddings, flattened_metrics)`.
  """
  eval_path = os.path.join(results_path, 'eval_result.yaml')
  with tf.io.gfile.GFile(eval_path) as f:
    raw_eval = yaml.safe_load(f)
  eval_result = flatten(raw_eval)

  if only_eval_result:
    return eval_result

  embeddings = {}
  for fname in ('user_embeddings', 'item_embeddings'):
    with tf.io.gfile.GFile(os.path.join(results_path, fname+'.npy'), 'rb') as f:
      # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
      # result files from a trusted location.
      embeddings[fname] = np.load(f, allow_pickle=True)

  return embeddings['user_embeddings'], embeddings['item_embeddings'], eval_result

def compute_top_k_mean_accuracy(query_embeddings, item_embeddings,
                                 target_indices, k=5):
  """Returns the mean top-k accuracy of dot-product retrieval.

  Args:
    query_embeddings: Query embeddings. A rank-2 input gets a singleton axis
      inserted at position 1, so that every query is a stack of vectors.
    item_embeddings: Item embedding matrix with one row per item.
    target_indices: Index of the target item for every query.
    k: Number of top-scoring items counted as a hit.

  Returns:
    The value of `tf.keras.metrics.TopKCategoricalAccuracy(k)` as a numpy
    scalar, where an item's score is its maximum dot product over axis 1 of
    the query stack.
  """
  if len(query_embeddings.shape) == 2:
    query_embeddings = np.expand_dims(query_embeddings, axis=1)

  # One-hot rows selecting the target item of every query.
  one_hot_targets = np.eye(item_embeddings.shape[0])[target_indices]

  dot_products = np.matmul(query_embeddings, np.transpose(item_embeddings))
  best_scores = dot_products.max(axis=1)

  metric = tf.keras.metrics.TopKCategoricalAccuracy(k=k)
  metric.update_state(one_hot_targets, best_scores)
  return metric.result().numpy()

def compute_top_k_elementwise_accuracy(query_embeddings, item_embeddings,
                                 target_indices, k=5):
  """Returns the per-query top-k hit indicator of dot-product retrieval.

  Args:
    query_embeddings: Query embeddings. A rank-2 input gets a singleton axis
      inserted at position 1, so that every query is a stack of vectors.
    item_embeddings: Item embedding matrix with one row per item.
    target_indices: Index of the target item for every query.
    k: Number of top-scoring items counted as a hit.

  Returns:
    The numpy result of `tf.keras.metrics.top_k_categorical_accuracy`, where
    an item's score is its maximum dot product over axis 1 of the query stack.
  """
  if len(query_embeddings.shape) == 2:
    query_embeddings = np.expand_dims(query_embeddings, axis=1)

  # One-hot rows selecting the target item of every query.
  one_hot_targets = np.eye(item_embeddings.shape[0])[target_indices]

  dot_products = np.matmul(query_embeddings, np.transpose(item_embeddings))
  best_scores = dot_products.max(axis=1)

  hits = tf.keras.metrics.top_k_categorical_accuracy(
      one_hot_targets, best_scores, k=k)
  return hits.numpy()

def evaluate_results(model_str, results_dir, alpha_str, item_clusters = None,
                     user_interests = None, test_cluster = None,
                     is_npz=False, only_eval_result=False, k_list=None,
                     normalize_embeddings=False):
  """Averages the evaluation metrics of one model over a fixed list of seeds.

  For every seed in (1234, 1235, 1236) whose result directory exists, the
  saved metrics are loaded. Unless `only_eval_result` is set, per-cluster hit
  rates and silhouette scores are added to them. Metrics are then summed over
  the seeds that were found and divided by their number.

  NOTE: the per-cluster metrics read the module-level `user_item_sequences`
  (last column) for the target items; it is not a parameter.

  Args:
    model_str: Model sub-directory name inside each `seed_*` directory.
    results_dir: Directory holding the results.
    alpha_str: If non-empty, `synthetic_data_{alpha_str}` is appended to
      `results_dir` before the seed directory.
    item_clusters: Item cluster labels used for the silhouette score.
    user_interests: Currently unused (only referenced by disabled metrics).
    test_cluster: Per-user cluster id used to slice the hit rates.
    is_npz: Unused.
    only_eval_result: If true, only the saved metrics are averaged.
    k_list: Currently unused (only referenced by disabled metrics).
    normalize_embeddings: If true, embeddings are L2-normalised in place along
      the last axis after the per-cluster metrics and before the silhouette.

  Returns:
    Dict mapping metric name to its mean over the seeds that were found; empty
    if no seed directory exists.
  """
  seed_list = [1234, 1235, 1236]
  seed_found = len(seed_list)
  print(model_str)
  mean_results = dict()
  if k_list is None:
    k_list = range(50, 250, 50)

  # The part of the path in front of the seed directory is seed-independent.
  if alpha_str:
    base_dir = os.path.join(results_dir,
                            'synthetic_data_{}'.format(alpha_str))
  else:
    base_dir = results_dir

  for seed in seed_list:
    file_path = os.path.join(base_dir, 'seed_{}'.format(seed), model_str)

    if not tf.io.gfile.exists(file_path):
      seed_found -= 1
      print("{} does not exist! Ignoring it.".format(file_path))
      continue

    if only_eval_result:
      result_eval = load_results(file_path, only_eval_result)
    else:
      user_embeddings, item_embeddings, result_eval = load_results(
          file_path, only_eval_result)

      # Cluster-wise performance on the last item of every sequence.
      target_next_items = user_item_sequences[:, -1]
      result_eval.update(get_cluster_wise_performance(
          user_embeddings, item_embeddings, test_cluster, target_next_items))

      if normalize_embeddings:
        user_embeddings /= np.linalg.norm(
            user_embeddings, axis=-1, keepdims=True)
        item_embeddings /= np.linalg.norm(
            item_embeddings, axis=-1, keepdims=True)

      # Silhouette score of the item embeddings w.r.t. the cluster labels.
      result_eval.update(
          evaluate_silhouette_score(item_embeddings, item_clusters))

      # The interest-eval metrics (get_interest_eval_result over `k_list`) and
      # the isotropy metric (compute_mean_cosine_similarity) are disabled.

    # First seed found initialises the sums; later seeds are added key-wise.
    if not mean_results:
      mean_results.update(result_eval)
    else:
      for k in mean_results.keys():
        mean_results[k] += result_eval[k]

  for k in mean_results.keys():
    mean_results[k] /= seed_found

  return mean_results

def evaluate_silhouette_score(X_data, Y_data, use_kmeans = False):
  """Computes silhouette scores of `X_data` for given and k-means labels.

  Args:
    X_data: Samples to score.
    Y_data: Label of every sample.
    use_kmeans: If true, also fit k-means with 2, 3, 5, 7 and 10 clusters and
      keep the best silhouette score among those fits.

  Returns:
    Dict with `silhouette_actual` (score under `Y_data`) and
    `silhouette_kmeans` (best k-means score, or 0 when `use_kmeans` is false).
  """
  silhouette_kmeans = 0
  if use_kmeans:
    silhouette_kmeans = max(
        sklearn.metrics.silhouette_score(
            X_data,
            KMeans(n_clusters=n_clusters, random_state=1234).fit(X_data).labels_,
            random_state=1234)
        for n_clusters in [2, 3, 5, 7, 10])

  return {
      'silhouette_kmeans': silhouette_kmeans,
      'silhouette_actual': sklearn.metrics.silhouette_score(
          X_data, Y_data, random_state=1234),
  }

## Plot iterations

In [None]:
# One subplot per saved training iteration, showing the item embeddings
# coloured by cluster and the silhouette score in the title.
num_plots = len(all_embeddings)

cols = 4
rows = num_plots // cols
if (rows * cols) < num_plots:
  rows += 1
# plt.subplots needs at least one row, even when nothing was loaded.
rows = max(rows, 1)

fig, axs = plt.subplots(rows, cols, figsize=(14, (3.0)*rows), constrained_layout=True,
                        sharex=True, sharey=True)
axs = axs.flat

# `all_queries` is deliberately not zipped in: it is not drawn here and may be
# shorter than `all_embeddings`, which would silently drop panels.
last_used_ax = None
for i, (ax, embeddings) in enumerate(zip(axs, all_embeddings)):

  for label in label_set:
    label_indices = np.where(item_clusters == label)
    # Marker sizes decrease linearly from 50 to 5 along the cluster's item order.
    marker_sizes = np.linspace(5.0, 50.0, len(label_indices[0]))[::-1]
    # `color=` avoids the ambiguity of passing a bare RGBA tuple as `c`.
    ax.scatter(embeddings[label_indices, 0],
               embeddings[label_indices, 1],
               color=cmap(label),
               s=marker_sizes,
               label='{}'.format(label))

  # The score depends only on the embeddings, so compute it once per panel.
  silhouette = evaluate_silhouette_score(
      embeddings, item_clusters)['silhouette_actual']

  ax.set_title(f'Iteration {i} (S = {silhouette:.2f})', fontsize=12)
  last_used_ax = ax

# Attach the legend to the last panel that has data; the last axes of the grid
# can be empty when the iteration count is not a multiple of `cols`.
if last_used_ax is not None:
  last_used_ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))
plt.show()

In [None]:
# Sanity check: number of loaded weight vectors vs. number of consecutive deltas.
len(all_weights), len(w_delta)

In [None]:
# Bar chart of the item weights per panel: panel 0 shows the empirical prior,
# panel i > 0 shows all_weights[i-1].
fig, axs = plt.subplots(rows, cols, figsize=(14, 3.0*rows), constrained_layout=True,
                        sharex=True, sharey=True)
axs = axs.flat

for i, ax in enumerate(axs):

  if i >= len(all_weights):
    break

  # Only panel 0 shows the prior; every other panel (including the first one
  # of each later row) shows the weights of its own iteration.
  ax.bar(all_items, prior_w if i == 0 else all_weights[i-1])

  # Label the y axis once per row, on the left-most panel.
  if i % cols == 0:
    ax.set_ylabel('Weight')

  if i != 0:
    # Raw f-string so that `\D` of `\Delta` is not read as an escape sequence.
    ax.text(20.0, 0.1, rf'$||\Delta w|| = {w_delta[i-1]:.4f}$', fontsize=9)

  ax.set_title(f'Iteration {i}', fontsize=12)
  ax.set_xlabel('Item ID')


plt.suptitle('Weights of items in iterative training.', fontsize=14)
plt.show()

## Plot item density in the output embedding space.

In [None]:
# Inspect the item ids returned by load_dataset.
all_items

In [None]:
# Inspect the first loaded weight vector (read from iteration_1).
all_weights[0]

In [None]:
def plot_kernel_estimates(ax, iter_w, iter_embeddings):
  """Draws a Gaussian-KDE contour plot of the weighted item density on `ax`.

  10000 items are sampled from the module-level `all_items` with probability
  proportional to `iter_w`. Their embeddings get Gaussian jitter (std 0.05),
  a `scipy.stats.gaussian_kde` is fitted on them, and the density is evaluated
  on a 100 x 100 grid over [-2, 2] x [-2, 2].

  Args:
    ax: Matplotlib axes to draw on.
    iter_w: Per-item sampling weights, ordered like `all_items`.
    iter_embeddings: Item embeddings indexed by item id. Must have two columns,
      because the density is evaluated on a 2-D grid.
  """
  num_samples = 10000

  # Sample with the weights passed in for this iteration (not a module-level
  # variable), renormalised because np.random.choice needs probabilities that
  # sum to one.
  probs = np.asarray(iter_w, dtype=np.float64)
  probs = probs / probs.sum()
  sampled_items = np.random.choice(all_items, size=num_samples, p=probs)

  # Integer-array indexing returns a copy, so the in-place jitter below does
  # not modify `iter_embeddings`.
  data = iter_embeddings[sampled_items]
  data += np.random.normal(loc=0.0, scale=0.05, size=data.shape)
  kernel = stats.gaussian_kde(data.T)

  xmin, xmax = -2, 2
  ymin, ymax = -2, 2

  # X varies along embedding column 0 and Y along embedding column 1.
  X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
  positions = np.vstack([X.ravel(), Y.ravel()])

  Z = np.reshape(kernel(positions).T, X.shape)

  ax.set_xlim(xmin, xmax)
  ax.set_ylim(ymin, ymax)
  ax.contourf(X, Y, Z, cmap='Blues')
  cset = ax.contour(X, Y, Z, colors='k')
  ax.clabel(cset, inline=1, fontsize=10)


In [None]:
# One kernel-density panel per iteration, using that iteration's weights and
# embeddings.
fig, axs = plt.subplots(rows, cols, figsize=(14, 3.0*rows), constrained_layout=True,
                        sharex=True, sharey=True)
axs = axs.flat

for i, (ax, iter_w, iter_embeddings) in enumerate(zip(axs, all_weights, all_embeddings)):

  plot_kernel_estimates(ax, iter_w, iter_embeddings)

  # plot_kernel_estimates puts embedding column 0 on the x axis and column 1
  # on the y axis.
  ax.set_xlabel('X0', fontsize=12)
  ax.set_ylabel('X1', fontsize=12)
  ax.set_title(f'Iteration {i}', fontsize=12)

plt.suptitle('Density in the output embedding space over iterations.', fontsize=14)
plt.show()

In [None]:
# Accumulators filled by the next three cells; each of them appends one entry
# per cluster. Re-run this cell before re-running those, otherwise rows are
# duplicated.
hr5 = []
hr10 = []
cluster_indices = []
models = []

In [None]:
# Evaluate MUR_5 from the `baseline` results directory and record its
# per-cluster hit rates under the label 'MUR_5_NW'.
model_str = 'MUR_5'
results_path = 'results/d2/density_smoothing/baseline/'

results_dir = os.path.join(root_dir, results_path, dataset_path)

MUR5_results = evaluate_results(
    model_str, results_dir, alpha_str,
    item_clusters=item_clusters,
    user_interests=user_interests,
    test_cluster=test_cluster,
    k_list=[1, 5, 10])

# Appends to the shared accumulators: re-running this cell adds the rows again.
for cluster_ix in range(5):
  hr5.append(MUR5_results[f'HR@5_Cluster{cluster_ix}'])
  hr10.append(MUR5_results[f'HR@10_Cluster{cluster_ix}'])
  cluster_indices.append(cluster_ix)
  models.append('MUR_5_NW')

In [None]:
# Evaluate MUR_5 from the `baseline-sample_weight` results directory and
# record its per-cluster hit rates under the label model_str + '_FW'.
model_str = 'MUR_5'
results_path = 'results/d2/density_smoothing/baseline-sample_weight/'

results_dir = os.path.join(root_dir, results_path, dataset_path)

MUR5_SW_results = evaluate_results(
    model_str, results_dir, alpha_str,
    item_clusters=item_clusters,
    user_interests=user_interests,
    test_cluster=test_cluster,
    k_list=[1, 5, 10])

# Appends to the shared accumulators: re-running this cell adds the rows again.
for cluster_ix in range(5):
  hr5.append(MUR5_SW_results[f'HR@5_Cluster{cluster_ix}'])
  hr10.append(MUR5_SW_results[f'HR@10_Cluster{cluster_ix}'])
  cluster_indices.append(cluster_ix)
  models.append(model_str + '_FW')

In [None]:
# Evaluate MUR_5 from the `reruns_3` results directory and record its
# per-cluster hit rates under the label model_str + '_DW'.
model_str = 'MUR_5'
results_path = 'results/d2/density_smoothing/reruns_3/'

results_dir = os.path.join(root_dir, results_path, dataset_path)

MUR5_DW_results = evaluate_results(
    model_str, results_dir, alpha_str,
    item_clusters=item_clusters,
    user_interests=user_interests,
    test_cluster=test_cluster,
    k_list=[1, 5, 10])

# Appends to the shared accumulators: re-running this cell adds the rows again.
for cluster_ix in range(5):
  hr5.append(MUR5_DW_results[f'HR@5_Cluster{cluster_ix}'])
  hr10.append(MUR5_DW_results[f'HR@10_Cluster{cluster_ix}'])
  cluster_indices.append(cluster_ix)
  models.append(model_str + '_DW')


In [None]:
# Per-cluster HR@5 of the three model variants as grouped bars; the horizontal
# lines mark each variant's un-sliced 'HR@5' entry.
d = {'Model': models, 'HR@10': hr10, 'HR@5': hr5, 'Cluster': cluster_indices}

df = pd.DataFrame(data=d)

ax = plt.gca()
ax.hlines(y=MUR5_results['HR@5'], xmin=-0.5, xmax=4.5,
          colors='teal', alpha=0.6)
ax.hlines(y=MUR5_SW_results['HR@5'], xmin=-0.5, xmax=4.5,
          colors='coral', alpha=0.6)
ax.hlines(y=MUR5_DW_results['HR@5'], xmin=-0.5, xmax=4.5,
          colors='steelblue', alpha=0.6)

# Bars and axis label use HR@5 so that they match the reference lines above
# (the HR@10 version of this figure is drawn in the following cell).
sns.barplot(x="Cluster", y="HR@5", hue="Model", data=df, ax=ax,
            palette = {'MUR_5_NW': 'teal', 'MUR_5_FW': 'coral', 'MUR_5_DW': 'steelblue'})

plt.ylabel('HR@5', fontsize=12)
plt.xlabel('Cluster', fontsize=12)
plt.title('Performance sliced by cluster index.', fontsize=14)
plt.show()

In [None]:
# Per-cluster HR@10 of the three model variants as grouped bars; the
# horizontal lines mark each variant's un-sliced 'HR@10' entry.
d = {'Model': models, 'HR@10': hr10, 'HR@5': hr5, 'Cluster': cluster_indices}
df = pd.DataFrame(data=d)

# One colour per model label, shared by the reference lines and the bars.
model_colors = {'MUR_5_NW': 'teal', 'MUR_5_FW': 'coral', 'MUR_5_DW': 'steelblue'}
overall_results = {
    'MUR_5_NW': MUR5_results,
    'MUR_5_FW': MUR5_SW_results,
    'MUR_5_DW': MUR5_DW_results,
}

ax = plt.gca()
for model_name, model_results in overall_results.items():
  ax.hlines(y=model_results['HR@10'], xmin=-0.5, xmax=4.5,
            colors=model_colors[model_name], alpha=0.6)

sns.barplot(x="Cluster", y="HR@10", hue="Model", data=df, ax=ax,
            palette=model_colors)

ax.set_ylabel('HR@10', fontsize=12)
ax.set_xlabel('Cluster', fontsize=12)
ax.set_title('Performance sliced by cluster index.', fontsize=14)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))
plt.show()