In [None]:
# Licensed under the Apache License, Version 2.0 (the "License");

Author: Nikhil Mehta  
Description: Embedding visualization for synthetic data



In [None]:
# %reset
import collections
import collections.abc
import os
import pprint
import tempfile
from typing import Dict, List, Text, Tuple, Union, Optional, Any

from absl import logging
from colabtools import drive
from colabtools import adhoc_import
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.colors import ListedColormap
from matplotlib.colors import to_rgb, to_rgba
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import sklearn
from sklearn.cluster import KMeans
import tensorflow as tf
import yaml
from multiple_user_representations.synthetic_data import util

In [None]:
def load_dataset(root_dir, dataset_path, alpha_str, is_npz = False):
  """Loads one synthetic dataset and returns the arrays used for evaluation.

  Args:
    root_dir: Root directory prefix of the data path.
    dataset_path: Dataset sub-directory under `root_dir`.
    alpha_str: Suffix of the dataset name (`synthetic_data_{alpha_str}`).
    is_npz: When True the data is read from a `.npz` archive through
      `tf.io.gfile`; otherwise it is loaded with `util.load_data`.

  Returns:
    A tuple `(item_clusters, all_items, user_interests, test_cluster,
    user_item_sequences)`, where `test_cluster` is `item_clusters` indexed by
    the last column of `user_item_sequences`.
  """
  if not is_npz:
    data_path = '{}/{}/synthetic_data_{}'.format(
        root_dir, dataset_path, alpha_str)
    data = util.load_data(data_path)
    item_clusters = data['item_clusters']
    all_items = data['items']
    user_item_sequences = data['user_item_sequences']
    # Only this branch tolerates a missing 'user_interests' entry.
    user_interests = data.get('user_interests', None)
  else:
    data_path = '{}/{}/synthetic_data_{}.npz'.format(
        root_dir, dataset_path, alpha_str)
    with tf.io.gfile.GFile(data_path, 'rb') as f:
      # Arrays are pulled out while the file handle is still open.
      data = np.load(f, allow_pickle=True)
      cluster_key = 'item_clusters' if 'item_clusters' in data else 'item_labels'
      item_clusters = data[cluster_key]
      all_items = data['items']
      user_interests = data['user_interests']
      user_item_sequences = data['user_item_sequences']

  # Cluster of the last item in every user's sequence.
  test_cluster = item_clusters[user_item_sequences[:, -1]]

  return (item_clusters, all_items, user_interests, test_cluster, user_item_sequences)

In [None]:
def evaluate_silhouette_score(X_data, Y_data, use_kmeans = False):
  """Computes silhouette scores of `X_data` under given and k-means labels.

  Args:
    X_data: Samples passed to `sklearn.metrics.silhouette_score`.
    Y_data: Labels used for the 'silhouette_actual' score.
    use_kmeans: When True, k-means is additionally fitted for each of the
      cluster counts 2, 3, 5, 7 and 10 and the best silhouette score among
      them is reported; otherwise that entry is 0.

  Returns:
    Dict with keys 'silhouette_kmeans' and 'silhouette_actual'.
  """
  silhouette_kmeans = 0
  if use_kmeans:
    silhouette_kmeans = max([
        sklearn.metrics.silhouette_score(
            X_data,
            KMeans(n_clusters=n_clusters, random_state=1234).fit(X_data).labels_,
            random_state=1234)
        for n_clusters in (2, 3, 5, 7, 10)
    ])

  silhouette_actual = sklearn.metrics.silhouette_score(
      X_data, Y_data, random_state=1234)

  return {
      'silhouette_kmeans': silhouette_kmeans,
      'silhouette_actual': silhouette_actual,
  }
  
def evaluate_interest_retrieval(user_embeddings, item_embeddings,
                                ground_truth_interests, item_clusters, K=10):
  """Scores how well the top-K retrieved items recover each user's interests.

  For every user the K highest-scoring items are retrieved (dot product, max
  over heads), mapped to their clusters, and the most frequent clusters are
  compared against the user's ground-truth interests.

  Args:
    user_embeddings: Array of shape (users, dim) or (users, heads, dim); a 2-D
      input is treated as a single head.
    item_embeddings: Array of shape (items, dim).
    ground_truth_interests: Per-user sequences of cluster ids. The length of
      the first entry decides how many predicted clusters are kept for every
      user.
    item_clusters: Cluster id of each item, indexable by item index.
    K: Number of top-scoring items retrieved per user.

  Returns:
    List with one recall value per user: the fraction of that user's
    ground-truth clusters found among the predicted clusters.
  """

  def compute_multilabel_recall(predictions, labels):
    return len(np.intersect1d(predictions, labels)) / len(labels)

  # Accept plain sequences as well as arrays; fancy indexing below needs an array.
  item_clusters = np.asarray(item_clusters)
  num_user_interests = len(ground_truth_interests[0])

  if len(user_embeddings.shape) == 2:
    user_embeddings = np.expand_dims(user_embeddings, axis=1)

  # (users, heads, items) -> best head per item -> (users, items).
  result_score = np.matmul(user_embeddings, np.transpose(item_embeddings))
  result_score = np.max(result_score, axis=1)
  top_k_items = np.argsort(-result_score, axis=1)[:, :K]
  top_k_item_clusters = item_clusters[top_k_items]

  interest_acc = []
  for user_k_predictions, user_interest in zip(top_k_item_clusters,
                                               ground_truth_interests):
    predicted_clusters, count = np.unique(user_k_predictions,
                                          return_counts=True)
    # Keep the clusters that appear most often among the retrieved items.
    most_frequent_first = np.argsort(-count)
    predicted_clusters = predicted_clusters[most_frequent_first][:num_user_interests]
    interest_acc.append(
        compute_multilabel_recall(predicted_clusters, user_interest))

  return interest_acc

def get_interest_eval_result(user_embeddings, item_embeddings, 
                             user_interests, test_cluster, item_clusters, K=20):
  """Summarises interest retrieval against the next-item cluster and all interests.

  Args:
    user_embeddings: Forwarded to `evaluate_interest_retrieval`.
    item_embeddings: Forwarded to `evaluate_interest_retrieval`.
    user_interests: Per-user ground-truth interests.
    test_cluster: Cluster of each user's target item; a 1-D input gets a
      trailing axis so that each user has a single-element label list.
    item_clusters: Cluster id of each item.
    K: Number of retrieved items per user.

  Returns:
    Dict with the mean scores under the keys `next_interest_hr@{K}` and
    `interest_recall@{K}`.
  """
  if len(test_cluster.shape) < 2:
    test_cluster = np.expand_dims(test_cluster, axis=-1)

  next_interest_scores = evaluate_interest_retrieval(
      user_embeddings, item_embeddings, test_cluster, item_clusters, K=K)
  interest_recall_scores = evaluate_interest_retrieval(
      user_embeddings, item_embeddings, user_interests, item_clusters, K=K)

  return {
      f'next_interest_hr@{K}': np.mean(next_interest_scores),
      f'interest_recall@{K}': np.mean(interest_recall_scores),
  }

In [None]:
def flatten(d, parent_key='', sep='_'):
  """Flattens a nested mapping into a single-level dict.

  Nested keys are joined with `sep`, e.g. `{'a': {'b': 1}}` becomes
  `{'a_b': 1}`. An empty nested mapping contributes no entries.

  Args:
    d: Mapping to flatten; values that are mutable mappings are recursed into.
    parent_key: Prefix for the keys of `d` (used by the recursion).
    sep: Separator placed between a parent key and a child key.

  Returns:
    A new flat dict.
  """
  items = []
  for k, v in d.items():
    new_key = parent_key + sep + k if parent_key else k
    # The ABC lives in collections.abc; the `collections.MutableMapping` alias
    # was removed in Python 3.10.
    if isinstance(v, collections.abc.MutableMapping):
      items.extend(flatten(v, new_key, sep=sep).items())
    else:
      items.append((new_key, v))
  return dict(items)

def load_results(results_path: str, only_eval_result: bool):
  """Loads the evaluation metrics (and optionally embeddings) of one run.

  Two layouts are supported: a single `.npz` archive holding 'eval_result',
  'item_embeddings' and 'user_embeddings', or a directory containing
  `eval_result.yaml`, `user_embeddings.npy` and `item_embeddings.npy`.

  Args:
    results_path: Path of the `.npz` file or of the results directory.
    only_eval_result: When True only the flattened metrics dict is returned.

  Returns:
    The flattened metrics dict when `only_eval_result` is True, otherwise the
    tuple `(user_embeddings, item_embeddings, flattened_metrics)`.
  """
  if not results_path.endswith('.npz'):
    with tf.io.gfile.GFile(os.path.join(results_path, 'eval_result.yaml')) as f:
      raw_eval = yaml.safe_load(f)
    eval_result = flatten(raw_eval)
    if only_eval_result:
      return eval_result

    embeddings = dict()
    for name in ('user_embeddings', 'item_embeddings'):
      with tf.io.gfile.GFile(os.path.join(results_path, name+'.npy'), 'rb') as f:
        # NOTE(review): allow_pickle=True executes pickled payloads; only use
        # with trusted result files.
        embeddings[name] = np.load(f, allow_pickle=True)

    return (embeddings['user_embeddings'], embeddings['item_embeddings'],
            eval_result)

  with tf.io.gfile.GFile(results_path, 'rb') as f:
    archive = np.load(f, allow_pickle=True)

    # 'eval_result' is indexed with `[()]` to unwrap the stored object.
    eval_result = flatten(archive['eval_result'][()])
    if only_eval_result:
      return eval_result

    item_embeddings = archive['item_embeddings']
    user_embeddings = archive['user_embeddings']

  return user_embeddings, item_embeddings, eval_result
  
def compute_mean_cosine_similarity(item_embeddings):
  """Returns the absolute mean pairwise cosine similarity between rows.

  Rows are L2-normalised, all pairwise dot products are summed, the number of
  rows is subtracted (the self-similarity of each unit row), and the result is
  averaged over the `n * (n - 1)` ordered pairs of distinct rows.

  Args:
    item_embeddings: 2-D array with one embedding per row.

  Returns:
    Absolute value of the mean off-diagonal cosine similarity.
  """
  row_norms = np.linalg.norm(item_embeddings, axis=1, keepdims=True)
  unit_rows = item_embeddings / row_norms
  n = unit_rows.shape[0]

  gram = np.matmul(unit_rows, np.transpose(unit_rows))
  off_diagonal_total = np.sum(gram) - n

  return np.abs(off_diagonal_total / (n*(n-1)))

def compute_top_k_mean_accuracy(query_embeddings, item_embeddings,
                                 target_indices, k=5):
  """Returns the mean top-k accuracy of the target items over all queries.

  Args:
    query_embeddings: Array of shape (queries, dim) or (queries, heads, dim);
      a 2-D input is treated as a single head.
    item_embeddings: Array of shape (items, dim).
    target_indices: Index of the target item of each query.
    k: Cut-off passed to `tf.keras.metrics.TopKCategoricalAccuracy`.

  Returns:
    The metric result as a numpy scalar.
  """
  if len(query_embeddings.shape) == 2:
    query_embeddings = np.expand_dims(query_embeddings, axis=1)

  # One-hot rows selecting the target item of each query.
  one_hot_targets = np.eye(item_embeddings.shape[0])[target_indices]

  # Score every item with each head and keep the best head per item.
  per_head_scores = np.matmul(query_embeddings, np.transpose(item_embeddings))
  best_scores = np.max(per_head_scores, axis=1)

  metric = tf.keras.metrics.TopKCategoricalAccuracy(k=k)
  metric.update_state(one_hot_targets, best_scores)
  return metric.result().numpy()

def compute_top_k_elementwise_accuracy(query_embeddings, item_embeddings,
                                 target_indices, k=5):
  """Returns the per-query top-k accuracy of the target items.

  Args:
    query_embeddings: Array of shape (queries, dim) or (queries, heads, dim);
      a 2-D input is treated as a single head.
    item_embeddings: Array of shape (items, dim).
    target_indices: Index of the target item of each query.
    k: Cut-off passed to `tf.keras.metrics.top_k_categorical_accuracy`.

  Returns:
    Numpy array holding the result of
    `tf.keras.metrics.top_k_categorical_accuracy` for the queries.
  """
  if len(query_embeddings.shape) == 2:
    query_embeddings = np.expand_dims(query_embeddings, axis=1)

  # One-hot rows selecting the target item of each query.
  one_hot_targets = np.eye(item_embeddings.shape[0])[target_indices]

  # Score every item with each head and keep the best head per item.
  per_head_scores = np.matmul(query_embeddings, np.transpose(item_embeddings))
  best_scores = np.max(per_head_scores, axis=1)

  per_query_hits = tf.keras.metrics.top_k_categorical_accuracy(
      one_hot_targets, best_scores, k=k)
  return per_query_hits.numpy()

def get_cluster_wise_performance(
    user_embeddings, item_embeddings, test_cluster, target_next_items):
  """Computes HR@5 and HR@10 separately for the users of each test cluster.

  Args:
    user_embeddings: Per-user embeddings, indexable by a boolean user mask.
    item_embeddings: Item embeddings used for scoring.
    test_cluster: Cluster id of each user's target item.
    target_next_items: Target item index of each user.

  Returns:
    Dict with, for every distinct cluster id `c` in `test_cluster`, the keys
    `HR@5_Cluster{c}`, `HR@10_Cluster{c}` and `N_Cluster{c}` (number of users
    in that slice).
  """
  cluster_results = dict()
  for cluster_id in np.unique(test_cluster).astype(int):
    in_cluster = test_cluster == cluster_id
    slice_users = user_embeddings[in_cluster]
    slice_targets = target_next_items[in_cluster]

    hit_rate_at_5 = compute_top_k_mean_accuracy(
        slice_users, item_embeddings, slice_targets, k=5)
    hit_rate_at_10 = compute_top_k_mean_accuracy(
        slice_users, item_embeddings, slice_targets, k=10)

    cluster_results[f'HR@5_Cluster{cluster_id}'] = hit_rate_at_5
    cluster_results[f'HR@10_Cluster{cluster_id}'] = hit_rate_at_10
    cluster_results[f'N_Cluster{cluster_id}'] = slice_users.shape[0]

  return cluster_results

def evaluate_results(model_str, results_dir, alpha_str, item_clusters = None,
                     user_interests = None, test_cluster = None,
                     is_npz=False, only_eval_result=False, k_list=None,
                     normalize_embeddings=False, user_item_sequences=None):
  """Averages the evaluation metrics of one model over the available seeds.

  For each of the seeds 1234, 1235 and 1236 the stored results are loaded.
  Seeds whose result path does not exist are skipped with a message. Unless
  `only_eval_result` is set, cluster-wise hit rates, the silhouette score,
  interest retrieval scores and the mean cosine similarity of the item
  embeddings are computed from the stored embeddings and added to the metrics.

  Args:
    model_str: Model directory name (e.g. 'SUR'); printed when called.
    results_dir: Directory holding the results.
    alpha_str: Dataset suffix; when empty (and not `is_npz`) the seed
      directories are looked up directly under `results_dir`.
    item_clusters: Cluster id per item (needed unless `only_eval_result`).
    user_interests: Per-user interests (needed unless `only_eval_result`).
    test_cluster: Cluster of each user's target item (needed unless
      `only_eval_result`).
    is_npz: Whether results are stored as `embeddings_data.npz`.
    only_eval_result: When True only the stored metrics are averaged.
    k_list: Cut-offs for the interest evaluation; defaults to
      `range(50, 250, 50)`.
    normalize_embeddings: When True user and item embeddings are L2-normalised
      in place after the cluster-wise hit rates have been computed.
    user_item_sequences: Per-user item sequences whose last column holds the
      target item. When omitted, a notebook-level variable of the same name is
      used, which is what earlier versions of this function relied on.

  Returns:
    Dict mapping each metric name of the first found seed to its mean over the
    found seeds; empty if no seed was found.

  Raises:
    ValueError: If embeddings have to be evaluated but no
      `user_item_sequences` is available.
  """
  seed_list = [1234, 1235, 1236]
  seed_found = len(seed_list)
  print (model_str)
  mean_results = dict()
  if k_list is None:
    k_list = range(50, 250, 50)
  for seed in seed_list:

    if is_npz:
      fname = 'embeddings_data.npz'
      file_path = os.path.join(results_dir,
                              'synthetic_data_{}'.format(alpha_str),
                              'seed_{}'.format(seed),
                              model_str, fname)
    else:
      if alpha_str:
        file_path = os.path.join(results_dir,
                                 'synthetic_data_{}'.format(alpha_str))
      else:
        file_path = results_dir

      file_path = os.path.join(file_path, 'seed_{}'.format(seed), model_str)

    if not tf.io.gfile.exists(file_path):
      seed_found  -= 1
      print ("{} does not exist! Ignoring it.".format(file_path))
      continue

    if only_eval_result:
      result_eval = load_results(file_path, only_eval_result)
    else:
      user_embeddings, item_embeddings, result_eval = load_results(
        file_path, only_eval_result)

      if user_item_sequences is None:
        # Backward compatibility: fall back to the notebook-level variable
        # that this function used to read implicitly.
        user_item_sequences = globals().get('user_item_sequences')
      if user_item_sequences is None:
        raise ValueError(
            'user_item_sequences is required when only_eval_result is False.')

      # Cluster wise performance
      target_next_items = user_item_sequences[:, -1]
      results_slice_cluster = get_cluster_wise_performance(
          user_embeddings, item_embeddings, test_cluster, target_next_items)
      result_eval.update(results_slice_cluster)
      if normalize_embeddings:
        user_embeddings /= np.linalg.norm(
            user_embeddings, axis=-1, keepdims=True)
        item_embeddings /= np.linalg.norm(
            item_embeddings, axis=-1, keepdims=True)

      # Silhouette score
      silhouette_dict = evaluate_silhouette_score(item_embeddings,
                                                  item_clusters)
      result_eval.update(silhouette_dict)

      # Interest Eval scores
      for k in k_list:
        interest_eval = get_interest_eval_result(
            user_embeddings, item_embeddings, user_interests, test_cluster,
            item_clusters, K=k)
        result_eval.update(interest_eval)

      # Anisotropy: mean pairwise cosine similarity of the item embeddings.
      result_eval['isotropy'] = compute_mean_cosine_similarity(item_embeddings)

    # The first found seed initialises the sums; later seeds are accumulated.
    if not mean_results:
      mean_results.update(result_eval)
    else:
      for k in mean_results.keys():
        mean_results[k] += result_eval[k]

  for k in mean_results.keys():
    mean_results[k] /= seed_found
  return mean_results
  

In [None]:
def get_volatility_str(gamma: float):
  """Maps a gamma value to a human-readable interest-volatility label.

  Args:
    gamma: Volatility parameter.

  Returns:
    'No Volatility' for exactly 0.0, 'High Volatility' for values >= 0.4,
    'Low Volatility' for values <= 0.1 and 'Medium Volatility' otherwise.
  """
  if gamma == 0.0:
    return 'No Volatility'
  if gamma >= 0.4:
    return 'High Volatility'
  return 'Low Volatility' if gamma <= 0.1 else 'Medium Volatility'

Plot performance as the number of heads varies, analyzed separately for each user slice, where each slice has a different number of interests.

In [None]:
# Collect per-split hit rates for every head count (1..20) and every
# (alpha, gamma) setting of the C100_I50_U7-5-3-2_T40 dataset. The lists built
# here are turned into a DataFrame by the plotting cell that follows.
root_dir = 'root_dir/'
dataset_path = 'datasets/C100_I50_U7-5-3-2_T40/'
d = 16

model_heads = range(1, 21)
# model_heads.extend(list(range(10, 130, 10)))
results_dir = 'results/reruns/d{}'.format(d)
results_path = os.path.join(root_dir, results_dir, dataset_path)      

plot_alphas = [0.4, 0.6, 0.8]
# Parallel lists: one entry per (alpha, head count, split).
model_arr = []
results_arr_hr10 = []
results_arr_hr50 = []
results_arr_hr100 = []
results_arr_hr200 = []
split_ix_arr = []
gamma_arr = []

for ix, alpha in enumerate(plot_alphas):
  # gamma is tied to alpha so that the two sum to 0.9.
  gamma = 0.9 - alpha
  alpha_str = 'alpha{:0.1f}_gamma{:0.1f}'.format(alpha, gamma)
  # dataset = load_dataset(root_dir, dataset_path, alpha_str, is_npz = False)
  print(alpha_str)
  for h in model_heads:
    # A single head is stored under 'SUR'; h heads under 'MUR_{h}'.
    if h == 1:
      model_str = 'SUR'
    else:
      model_str = f'MUR_{h}'
    result = evaluate_results(
        model_str, results_path, alpha_str, is_npz=False, only_eval_result=True)
    # Four splits are read; presumably one per user slice (7/5/3/2 interests,
    # as labelled in the plotting cell) — TODO confirm.
    for split_ix in range(4):
      hr_10 = result[f'split_{split_ix}_top_10_categorical_accuracy']
      hr_50 = result[f'split_{split_ix}_top_50_categorical_accuracy']
      hr_100 = result[f'split_{split_ix}_top_100_categorical_accuracy']
      hr_200 = result[f'split_{split_ix}_top_200_categorical_accuracy']
      
      split_ix_arr.append(split_ix)
      results_arr_hr10.append(hr_10)
      results_arr_hr50.append(hr_50)
      results_arr_hr100.append(hr_100)
      results_arr_hr200.append(hr_200)
      model_arr.append(h)
      gamma_arr.append(get_volatility_str(gamma))
      # print(hr_100, end = ' ')
    # print('')
  # print('', end='\n\n')

In [None]:
# Plotting results
# initialize fig plot
# 2 x 4 grid: one row per metric (HR@10, HR@50), one column per user split.
fig, axs = plt.subplots(2, 4, figsize=(18, 8), constrained_layout=True, 
                        sharex=True)
axs = axs.flat
# NOTE(review): the title is hard-coded; the collection cell above reads
# 'datasets/C100_I50_U7-5-3-2_T40/' — confirm that "I=100" is intended.
fig.suptitle(f"I=100, |Yu|=7_5_3_2, |U|=50000", fontsize=16)

# Subplot for alpha_str
# NOTE: `d` is rebound here from the embedding size to the DataFrame source.
d = {'Num Heads': model_arr, 'HR@100': results_arr_hr100,
     'HR@200': results_arr_hr200, 'HR@50': results_arr_hr50,
     'HR@10': results_arr_hr10, 'split_ix_arr': split_ix_arr,
     'gamma_arr': gamma_arr}

df = pd.DataFrame(data=d)

for metric_ix, metric_k in enumerate([10, 50]):
  for split_ix, interests in enumerate([7, 5, 3, 2]):

    title = f'Num_Interests: {interests}'
    df_split = df.loc[df['split_ix_arr'] == split_ix]

    # One line per volatility label; the panel index is row-major.
    ax = sns.lineplot(
        x='Num Heads', y=f'HR@{metric_k}', hue="gamma_arr", data=df_split,
        markers=True, style="gamma_arr", ax=axs[metric_ix*4+split_ix])

    ax.legend(title='Interest Volatility')
    ax.set_title(title)

  # ax.set_ylabel(f'HR@{K}')
  # ax.set_xticks(model_heads)

In [None]:
# Same collection as for the C100 dataset, now for C10_I50_U7-5-3-2_T40.
# All result lists are reset, so the plotting cell below only sees this
# dataset's numbers.
root_dir = 'root_dir/'
dataset_path = 'datasets/C10_I50_U7-5-3-2_T40/'
d = 16

model_heads = range(1, 21)
# model_heads.extend(list(range(10, 130, 10)))
results_dir = 'results/reruns/d{}'.format(d)
results_path = os.path.join(root_dir, results_dir, dataset_path)      

plot_alphas = [0.4, 0.6, 0.8]
# Parallel lists: one entry per (alpha, head count, split).
model_arr = []
results_arr_hr10 = []
results_arr_hr50 = []
results_arr_hr100 = []
results_arr_hr200 = []
split_ix_arr = []
gamma_arr = []

for ix, alpha in enumerate(plot_alphas):
  # gamma is tied to alpha so that the two sum to 0.9.
  gamma = 0.9 - alpha
  alpha_str = 'alpha{:0.1f}_gamma{:0.1f}'.format(alpha, gamma)
  # dataset = load_dataset(root_dir, dataset_path, alpha_str, is_npz = False)
  print(alpha_str)
  for h in model_heads:
    # A single head is stored under 'SUR'; h heads under 'MUR_{h}'.
    if h == 1:
      model_str = 'SUR'
    else:
      model_str = f'MUR_{h}'
    result = evaluate_results(
        model_str, results_path, alpha_str, is_npz=False, only_eval_result=True)
    for split_ix in range(4):
      hr_10 = result[f'split_{split_ix}_top_10_categorical_accuracy']
      hr_50 = result[f'split_{split_ix}_top_50_categorical_accuracy']
      hr_100 = result[f'split_{split_ix}_top_100_categorical_accuracy']
      hr_200 = result[f'split_{split_ix}_top_200_categorical_accuracy']
      
      split_ix_arr.append(split_ix)
      results_arr_hr10.append(hr_10)
      results_arr_hr50.append(hr_50)
      results_arr_hr100.append(hr_100)
      results_arr_hr200.append(hr_200)
      model_arr.append(h)
      gamma_arr.append(get_volatility_str(gamma))
      # print(hr_100, end = ' ')
    # print('')
  # print('', end='\n\n')

In [None]:
# Plotting results
# initialize fig plot
# 2 x 4 grid: one row per metric (HR@50, HR@100), one column per user split.
fig, axs = plt.subplots(2, 4, figsize=(18, 8), constrained_layout=True, 
                        sharex=True)
axs = axs.flat
# NOTE(review): the title is hard-coded and identical to the previous figure,
# although the collection cell above reads 'datasets/C10_I50_U7-5-3-2_T40/' —
# confirm the intended title.
fig.suptitle(f"I=100, |Yu|=7_5_3_2, |U|=50000", fontsize=16)

# Subplot for alpha_str
# NOTE: `d` is rebound here from the embedding size to the DataFrame source.
d = {'Num Heads': model_arr, 'HR@100': results_arr_hr100,
     'HR@200': results_arr_hr200, 'HR@50': results_arr_hr50,
     'HR@10': results_arr_hr10, 'split_ix_arr': split_ix_arr,
     'gamma_arr': gamma_arr}

df = pd.DataFrame(data=d)

for metric_ix, metric_k in enumerate([50, 100]):
  for split_ix, interests in enumerate([7, 5, 3, 2]):

    title = f'Num_Interests: {interests}'
    df_split = df.loc[df['split_ix_arr'] == split_ix]

    # One line per volatility label; the panel index is row-major.
    ax = sns.lineplot(
        x='Num Heads', y=f'HR@{metric_k}', hue="gamma_arr", data=df_split,
        markers=True, style="gamma_arr", ax=axs[metric_ix*4+split_ix])

    ax.legend(title='Interest Volatility')
    ax.set_title(title)

  # ax.set_ylabel(f'HR@{K}')
  # ax.set_xticks(model_heads)

# Plotting performance vs. num_heads for different interest-volatility slices of the dataset


In [None]:
# Collect per-split hit rates for every head count (1..20) on the mixture
# dataset. `alpha_str` is empty, so evaluate_results looks for the seed
# directories directly under `results_path`.
root_dir = 'root_dir/'
dataset_path = 'datasets/C20_I50_U5_T40/synthetic_data_mixture_alpha0.9_0.8_0.7_0.6/'
d = 16


model_heads = list(range(1, 21))
# model_heads.extend(list(range(10, 130, 10)))
results_dir = 'results/reruns/d{}'.format(d)
results_path = os.path.join(root_dir, results_dir, dataset_path)      

plot_alphas = [0.4, 0.6, 0.8]
# Parallel lists: one entry per (head count, split).
model_arr = []
split_ix_arr = []
gamma_arr = []

results_arr_hr10 = []
results_arr_hr50 = []
results_arr_hr100 = []
results_arr_hr200 = []

# No gamma is assigned in this cell (the dataset path names a mixture of
# alphas), so the volatility column gets an explicit label instead of a label
# derived from whatever `gamma` an earlier cell left behind.
volatility_label = 'Mixture'

alpha_str = ''
for h in model_heads:
  # A single head is stored under 'SUR'; h heads under 'MUR_{h}'.
  if h == 1:
    model_str = 'SUR'
  else:
    model_str = f'MUR_{h}'
  result = evaluate_results(
      model_str, results_path, alpha_str, is_npz=False, only_eval_result=True)
  for split_ix in range(4):
    hr_10 = result[f'split_{split_ix}_top_10_categorical_accuracy']
    hr_50 = result[f'split_{split_ix}_top_50_categorical_accuracy']
    hr_100 = result[f'split_{split_ix}_top_100_categorical_accuracy']
    hr_200 = result[f'split_{split_ix}_top_200_categorical_accuracy']

    results_arr_hr10.append(hr_10)
    results_arr_hr50.append(hr_50)
    results_arr_hr100.append(hr_100)
    results_arr_hr200.append(hr_200)

    split_ix_arr.append(split_ix)
    model_arr.append(h)
    gamma_arr.append(volatility_label)
#     print(hr_100, end = ' ')
#   print('')
# print('', end='\n\n')

In [None]:
# Single-panel plot of HR@50 vs. number of heads, one line per split of the
# results collected in the previous cell.
# initialize fig plot
# fig, axs = plt.subplots(1, 1, figsize=(4, 4), constrained_layout=True, 
#                         sharex=True, sharey=True)
# axs = axs.flat
# fig.suptitle(f"I=100, |Yu|=5, |U|=50000", fontsize=16)

fig = plt.figure(figsize=(4, 4), constrained_layout=True)

# NOTE: `d` is rebound here from the embedding size to the DataFrame source.
d = {'Num Heads': model_arr, 'HR@100': results_arr_hr100,
     'HR@200': results_arr_hr200, 'HR@50': results_arr_hr50,
     'HR@10': results_arr_hr10, 'split_ix_arr': split_ix_arr,
     'gamma_arr': gamma_arr}

df = pd.DataFrame(data=d)

# 'gamma_arr' is part of the frame but is not used by this plot.
ax = sns.lineplot(
      x='Num Heads', y='HR@50', hue='split_ix_arr', data=df, style='split_ix_arr',
      markers=True)

# for split_ix in range(4):

#   title = f'Split: {split_ix}'
#   df_split = df.loc[df['split_ix_arr'] == split_ix]

#   ax = sns.lineplot(
#       x='Num Heads', y='HR@100', data=df_split,
#       markers=True, ax=axs[split_ix])

#   ax.legend(loc='upper left')
#   ax.set_title(title)
  # ax.set_ylabel(f'HR@{K}')
  # ax.set_xticks(model_heads)

In [None]:

def get_dataset_results(dataset_path: str, alpha_str: str, vocab_size: int = 1000):
  """Collects seed-averaged metrics for each embedding size and model.

  The models 'SUR', 'MUR_3', 'MUR_4' and 'MUR_7' (1, 3, 4 and 7 heads) are
  evaluated for the embedding sizes 4, 8, 16, 32, 64 and 128. Results are read
  from `results/d{size}` under the notebook-level `root_dir`.

  Args:
    dataset_path: Dataset sub-directory under the results directory.
    alpha_str: Dataset suffix forwarded to `evaluate_results`.
    vocab_size: Number of items used in the parameter count
      `size * (vocab_size + heads)`.

  Returns:
    Dict of equally long columns: 'Model', 'Embedding size', 'Parameters' and
    one column per metric returned by `evaluate_results`.
  """
  print (f'Getting results for {dataset_path} with {alpha_str} dataset.')
  models_with_heads = (('SUR', 1), ('MUR_3', 3), ('MUR_4', 4), ('MUR_7', 7))

  table = {'Model': [], 'Embedding size': [], 'Parameters': []}
  metric_columns = dict()
  for embedding_dim in [4, 8, 16, 32, 64, 128]:
    results_path = os.path.join(
        root_dir, 'results/d{}'.format(embedding_dim), dataset_path)
    for model_str, num_heads in models_with_heads:
      result = evaluate_results(model_str, results_path, alpha_str,
                                is_npz=False, only_eval_result=True)

      for metric_name, metric_value in result.items():
        metric_columns.setdefault(metric_name, []).append(metric_value)

      table['Model'].append(model_str)
      table['Embedding size'].append(embedding_dim)
      table['Parameters'].append(embedding_dim * (vocab_size + num_heads))

  # Metric columns are appended after the descriptive columns.
  table.update(metric_columns)
  return table

def get_cluster_results(clusters: List[int],
                        alpha_str: str,
                        only_eval_result: bool = True):
  """Collects seed-averaged metrics for each cluster count and model.

  For every entry `c` of `clusters` the dataset `datasets/C{c}_I50_U5` is
  evaluated for the models 'SUR', 'MUR_3', 'MUR_4' and 'MUR_7' with embedding
  size 16. Paths are resolved under the notebook-level `root_dir`.

  Args:
    clusters: Cluster counts selecting the datasets.
    alpha_str: Dataset suffix forwarded to `load_dataset` / `evaluate_results`.
    only_eval_result: When False the dataset is loaded as well so that
      `evaluate_results` can compute the embedding-based metrics.

  Returns:
    Dict of equally long columns: 'Model', 'Embedding size', 'Clusters' and
    one column per metric returned by `evaluate_results`.
  """
  embedding_dim = 16
  table = {'Model': [], 'Embedding size': [], 'Clusters': []}
  metric_columns = dict()
  for cluster in clusters:
    dataset_path = f'datasets/C{cluster}_I50_U5'

    # Dataset arrays are only needed for the embedding-based metrics.
    item_clusters = user_interests = test_cluster = None
    if not only_eval_result:
      item_clusters, all_items, user_interests, test_cluster, _ = load_dataset(
          root_dir, dataset_path, alpha_str, is_npz=False)

    print (f'Getting results for {dataset_path} with {alpha_str} dataset.')
    results_path = os.path.join(
        root_dir, f'results/d{embedding_dim}', dataset_path)
    for model_str in ('SUR', 'MUR_3', 'MUR_4', 'MUR_7'):
      result = evaluate_results(model_str, results_path, alpha_str,
                                item_clusters=item_clusters,
                                user_interests=user_interests,
                                test_cluster=test_cluster,
                                is_npz=False, only_eval_result=only_eval_result)

      for metric_name, metric_value in result.items():
        metric_columns.setdefault(metric_name, []).append(metric_value)

      table['Embedding size'].append(embedding_dim)
      table['Clusters'].append(cluster)
      table['Model'].append(model_str)

  # Metric columns are appended after the descriptive columns.
  table.update(metric_columns)
  return table

In [None]:
# Grid of HR@K vs. embedding size: one panel per alpha in linspace(0.2, 0.9, 8).
# NOTE(review): this cell reads `user_interests` and `dataset_path` from
# notebook state. No cell visible above assigns `user_interests` at notebook
# level, and `dataset_path` is whatever the last executed cell left behind —
# confirm both are set as intended before running.
d_arr = [4, 8, 16, 32, 64, 128]
K = 100

fig, axs = plt.subplots(2, 4, figsize=(18, 6), constrained_layout=True, 
                        sharex=True, sharey=True)
axs = axs.flat
fig.suptitle(f"C=20, I=50, |Yu|=5, |U|={user_interests.shape[0]}", fontsize=16)

plot_metric = f'top_{K}_categorical_accuracy'

for ix, alpha in enumerate(np.linspace(0.2, 0.9, 8)):
  # gamma is tied to alpha so that the two sum to 0.9.
  gamma = 0.9 - alpha

  alpha_str = 'alpha{:0.1f}_gamma{:0.1f}'.format(alpha, gamma)
  title = alpha_str
  d = get_dataset_results(dataset_path, alpha_str)
  df = pd.DataFrame(data=d)
  ax = sns.lineplot(
      x='Embedding size', y=plot_metric, hue="Model", data=df,
      markers=True, style="Model", ax=axs[ix])
  ax.legend(loc='upper right')
  ax.set_title(title)
  ax.set_ylabel(f'HR@{K}')
  ax.set_xticks(d_arr)

plt.show()

In [None]:
# Collect results over embedding sizes for a single (alpha, gamma) setting of
# the C20_I50_U5 dataset; `df` and `plot_metric` feed the plots that follow.
d_arr = [4, 8, 16, 32, 64, 128]
K = 100
plot_metric = f'top_{K}_categorical_accuracy'

alphas = [0.4, 0.6, 0.8]

# Only the first alpha (0.4, i.e. gamma = 0.5) is evaluated here.
alpha = alphas[0]
gamma = 0.9 - alpha

alpha_str = 'alpha{:0.1f}_gamma{:0.1f}'.format(alpha, gamma)
dataset_path = 'datasets/C20_I50_U5/'
d = get_dataset_results(dataset_path, alpha_str, vocab_size=1000)
df = pd.DataFrame(data=d)

In [None]:
# Display the collected results table as the cell output.
df

In [None]:
# HR@K as a function of the parameter count, one line per model.
plt.figure()
ax = sns.lineplot(x='Parameters', y = plot_metric, hue='Model', data=df, markers=True, style="Model")
plt.legend(loc='upper right')
# plt.xticks(d_arr)
plt.ylabel(f'HR@{K}')
# plt.title('Low interest volatility')

plt.show()

In [None]:
# HR@K as a function of the embedding size, one line per model.
fig = plt.figure(figsize=(5, 3.5), constrained_layout=True)

sns.lineplot(x='Embedding size', y=plot_metric, hue="Model", data=df,
      markers=True, style="Model")
plt.legend(loc='upper right')
plt.xticks(d_arr)
plt.ylabel(f'HR@{K}')
# NOTE(review): the title is hard-coded. The setup cell above selects
# alpha = 0.4 (gamma = 0.5), which get_volatility_str labels
# 'High Volatility' — confirm the title matches the plotted data.
plt.title('Low interest volatility')

plt.show()

In [None]:
# d_arr = np.array([4, 8, 16, 32, 64, 128])
# d = np.tile(d_arr, (6))

# num_items = 1000

# MUR_3 = d_arr*(3+num_items)
# MUR_4 = d_arr*(4+num_items)
# MUR_7 = d_arr*(7+num_items)
# SUR = d_arr*(1+num_items)

# # params = SUR + MUR_3 + MUR_4 + MUR_7
# params = np.concatenate([SUR, MUR_3, MUR_4, MUR_7])
# model_str = ['SUR'] * 6 + ['MUR_3'] * 6 + ['MUR_4'] * 6 + ['MUR_7'] * 6

# d_arr = np.tile(d_arr, (4))

# d = {
#       'Model': model_str, 
#       'Embedding Size': d_arr,
#       'Parameters': params}
# df = pd.DataFrame(data = d)

# plt.figure()
# ax = sns.lineplot(x='Parameters', y = '', hue='Model', data=df)

# plt.figure()
# ax = sns.barplot(
#       x='Embedding Size', y='Parameters', hue="Model", data=df)
# # ax.legend(loc='upper right')
# # ax.set_title('Model Parameters')
# # ax.set_xticks(d_arr)
# plt.show()

In [None]:
# Collect results over the number of clusters for alpha = 0.8
# (gamma = 0.9 - 0.8); `df` and `plot_metric` feed the plot that follows.
clusters = [20,30,40,50,60]
K = 100
alpha = 0.8
gamma = 0.9 - alpha

plot_metric = f'top_{K}_categorical_accuracy'
alpha_str = 'alpha{:0.1f}_gamma{:0.1f}'.format(alpha, gamma)

d = get_cluster_results(clusters, alpha_str)
df = pd.DataFrame(data=d)

In [None]:
# HR@K as a function of the number of clusters, one line per model.
fig = plt.figure(figsize=(6, 3.5), constrained_layout=True)

sns.lineplot(x='Clusters', y=plot_metric, hue="Model", data=df, markers=True,
             style="Model")
# The legend is anchored outside the axes, to the right of the plot.
plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))
plt.ylabel(f'HR@{K}')
plt.title('Low interest volatility')

plt.show()

In [None]:
# HR@K vs. number of clusters, one panel per alpha in [0.4, 0.6, 0.8]; each
# panel is titled with the volatility label of its gamma.
# NOTE(review): `user_interests` is read from notebook state for the figure
# title; no cell visible above assigns it at notebook level — confirm.
clusters = [10,20,30,40,50,60]
K = 100

fig, axs = plt.subplots(1, 3, figsize=(9, 6), constrained_layout=True, 
                        sharex=True, sharey=True)
axs = axs.flat
fig.suptitle(f"I=50, |Yu|=5, |U|={user_interests.shape[0]}", fontsize=16)

plot_metric = f'top_{K}_categorical_accuracy'
alphas = [0.4, 0.6, 0.8]
# for ix, alpha in enumerate(np.linspace(0.2, 0.9, 8)):
for ix, alpha in enumerate(alphas):
  # gamma is tied to alpha so that the two sum to 0.9.
  gamma = 0.9 - alpha

  alpha_str = 'alpha{:0.1f}_gamma{:0.1f}'.format(alpha, gamma)
  title = get_volatility_str(0.9 - alpha)
  d = get_cluster_results(clusters, alpha_str)
  df = pd.DataFrame(data=d)
  ax = sns.lineplot(
      x='Clusters', y=plot_metric, hue="Model", data=df,
      markers=True, style="Model", ax=axs[ix])
  ax.legend(loc='upper right', bbox_to_anchor=(1.0, 1.0))
  ax.set_title(title)
  ax.set_ylabel(f'HR@{K}')
  ax.set_xticks(clusters)

plt.show()

In [None]:
# Same figure as the previous cell; the only difference is that the legend is
# placed without an explicit bbox_to_anchor.
# NOTE(review): `user_interests` is read from notebook state for the figure
# title; no cell visible above assigns it at notebook level — confirm.
clusters = [10,20,30,40,50,60]
K = 100

fig, axs = plt.subplots(1, 3, figsize=(9, 6), constrained_layout=True, 
                        sharex=True, sharey=True)
axs = axs.flat
fig.suptitle(f"I=50, |Yu|=5, |U|={user_interests.shape[0]}", fontsize=16)

plot_metric = f'top_{K}_categorical_accuracy'
alphas = [0.4, 0.6, 0.8]
# for ix, alpha in enumerate(np.linspace(0.2, 0.9, 8)):
for ix, alpha in enumerate(alphas):
  # gamma is tied to alpha so that the two sum to 0.9.
  gamma = 0.9 - alpha

  alpha_str = 'alpha{:0.1f}_gamma{:0.1f}'.format(alpha, gamma)
  title = get_volatility_str(0.9 - alpha)
  d = get_cluster_results(clusters, alpha_str)
  df = pd.DataFrame(data=d)
  ax = sns.lineplot(
      x='Clusters', y=plot_metric, hue="Model", data=df,
      markers=True, style="Model", ax=axs[ix])
  ax.legend(loc='upper right', )
  ax.set_title(title)
  ax.set_ylabel(f'HR@{K}')
  ax.set_xticks(clusters)

plt.show()

In [None]:
# Collect the full (embedding-based) evaluation for every cluster count and
# every alpha in linspace(0.2, 0.9, 8); the per-alpha dicts end up in `all_d`.
root_dir = 'root_dir/'

# dataset_path = 'datasets/C5_I10_U3/'
# (item_clusters, all_items, user_interests, test_cluster) = load_dataset(
#     root_dir, dataset_path, alpha_str, is_npz=True)
# dataset_path = 'datasets/C20_I50_U5/'
# (item_clusters, all_items, title = 'Dataset: {}, C=20, I=50, |Yu|=5'.format(alpha_str), test_cluster, user_item_sequences) = load_dataset(
#     root_dir, dataset_path, alpha_str, is_npz=False)

clusters = [10, 20, 30, 40, 50, 60, 70, 80, 90]
K = 100

# fig, axs = plt.subplots(2, 4, figsize=(18, 6), constrained_layout=True, 
#                         sharex=True, sharey=True)
# axs = axs.flat
# fig.suptitle(f"I=50, |Yu|=5, |U|={user_interests.shape[0]}", fontsize=16)
# plot_metric = f'top_{K}_categorical_accuracy'

all_d = []
for ix, alpha in enumerate(np.linspace(0.2, 0.9, 8)):
  # gamma is tied to alpha so that the two sum to 0.9.
  gamma = 0.9 - alpha

  alpha_str = 'alpha{:0.1f}_gamma{:0.1f}'.format(alpha, gamma)
  # NOTE: get_cluster_results builds its own dataset path per cluster, so this
  # assignment does not influence the call below.
  dataset_path = 'datasets/C20_I50_U5/'
  
  # title = alpha_str
  # NOTE(review): with only_eval_result=False, evaluate_results reads
  # `user_item_sequences` from notebook state; no cell visible above assigns
  # it at notebook level — confirm it is defined (and matches each dataset).
  d = get_cluster_results(clusters, alpha_str, only_eval_result=False)
  print (d)
  all_d.append(d)
  # df = pd.DataFrame(data=d)
  # ax = sns.lineplot(
  #     x='Clusters', y=plot_metric, hue="Model", data=df,
  #     markers=True, style="Model", ax=axs[ix])
  # ax.legend(loc='upper right')
  # ax.set_title(title)
  # ax.set_ylabel(f'HR@{K}')
  # ax.set_xticks(clusters)

# plt.show()

In [None]:
def get_cluster_anisotropy(clusters: List[int], 
                           alpha_str: str, 
                           only_eval_result: bool = True):
  """Collects seed-averaged anisotropy and silhouette scores of item embeddings.

  For every cluster count in `clusters` and every model in a fixed model list,
  the stored `item_embeddings.npy` of three seeds is loaded and two metrics are
  averaged over the seeds: `compute_mean_cosine_similarity(item_embeddings)`
  and the 'silhouette_actual' entry of `evaluate_silhouette_score`.

  The function reads the module-level `root_dir` to locate datasets and
  results.

  Args:
    clusters: Cluster counts C; the dataset 'datasets/C{C}_I50_U5' is used for
      each of them.
    alpha_str: Dataset identifier used in the dataset and result paths.
    only_eval_result: Accepted for signature compatibility; not used here.

  Returns:
    A dict of equally long lists, keyed by 'Model', 'Embedding size',
    'Clusters', 'silhoutte' and 'anisotropy'. The misspelled 'silhoutte' key is
    kept on purpose because plotting cells select the column by that name. The
    two metric keys are absent when `clusters` is empty.
  """
  embedding_dim = 16
  model_names = ['SUR', 'MUR_3', 'MUR_4', 'MUR_7']
  seeds = [1234, 1235, 1236]
  # Order fixes the column order of the returned dict.
  metric_names = ('silhoutte', 'anisotropy')

  model_column = []
  embedding_dim_column = []
  cluster_column = []
  metric_columns = dict()

  for cluster in clusters:
    dataset_path = f'datasets/C{cluster}_I50_U5'
    print (f'Getting results for {dataset_path} with {alpha_str} dataset.')
    # Only the item -> cluster assignment of the dataset is needed.
    item_clusters, *_ = load_dataset(
        root_dir, dataset_path, alpha_str, is_npz=False)
    results_dir = os.path.join(
        root_dir, f'results/reruns/d{embedding_dim}', dataset_path)

    for model_str in model_names:
      totals = {name: 0 for name in metric_names}

      for seed in seeds:
        fpath = os.path.join(results_dir,
                             f'synthetic_data_{alpha_str}',
                             f'seed_{seed}',
                             model_str, 'item_embeddings.npy')
        with tf.io.gfile.GFile(fpath, 'rb') as f:
          item_embeddings = np.load(f, allow_pickle=True)

        totals['anisotropy'] += compute_mean_cosine_similarity(item_embeddings)
        totals['silhoutte'] += evaluate_silhouette_score(
            item_embeddings, item_clusters)['silhouette_actual']

      # Turn the per-seed sums into means and append one row per model.
      for name in metric_names:
        metric_columns.setdefault(name, []).append(totals[name] / len(seeds))

      embedding_dim_column.append(embedding_dim)
      cluster_column.append(cluster)
      model_column.append(model_str)

  columns = {
      'Model': model_column,
      'Embedding size': embedding_dim_column,
      'Clusters': cluster_column
  }
  columns.update(metric_columns)

  return columns

# Compute the embedding statistics for every (alpha, gamma) dataset and draw
# one anisotropy-vs-clusters panel per dataset. The per-dataset dictionaries
# are kept in `d_embeddings` so that later cells can re-plot them.
clusters = [10 * step for step in range(1, 11)]
fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(18, 6),
                        constrained_layout=True, sharex=True, sharey=True)
axs = axs.flat
fig.suptitle("I=50, |Yu|=5, |U|=50000", fontsize=16)

d_embeddings = []
for ix, alpha in enumerate(np.linspace(0.2, 0.9, 8)):
  gamma = 0.9 - alpha
  alpha_str = f'alpha{alpha:0.1f}_gamma{gamma:0.1f}'
  title = alpha_str

  d_anisotropy = get_cluster_anisotropy(clusters, alpha_str)
  d_embeddings.append(d_anisotropy)

  df = pd.DataFrame(data=d_anisotropy)
  ax = sns.lineplot(
      data=df, x='Clusters', y='anisotropy',
      hue="Model", style="Model", markers=True, ax=axs[ix])

  ax.legend(loc='upper right')
  ax.set_title(title)
  ax.set_ylabel('Anisotropy')
  ax.set_xticks(clusters)

plt.show()


In [None]:
# Re-draw the anisotropy panels from the cached `d_embeddings` list (one entry
# per alpha, in the same order as the loop below) without recomputing them.
clusters = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
fig, axs = plt.subplots(2, 4, figsize=(18, 6), constrained_layout=True, 
                        sharex=True, sharey=True)
axs = axs.flat
fig.suptitle("I=50, |Yu|=5, |U|=50000", fontsize=16)
for ix, alpha in enumerate(np.linspace(0.2, 0.9, 8)):
  gamma = 0.9 - alpha
  alpha_str = 'alpha{:0.1f}_gamma{:0.1f}'.format(alpha, gamma)

  d_anisotropy = d_embeddings[ix]
  df = pd.DataFrame(data=d_anisotropy)
  ax = sns.lineplot(
      x='Clusters', y='anisotropy', hue="Model", data=df,
      markers=True, style="Model", ax=axs[ix])

  ax.legend(loc='upper left')
  title = alpha_str
  ax.set_title(title)
  # Fixed the misspelled axis label ('Anisottropy').
  ax.set_ylabel('Anisotropy')
  ax.set_xticks(clusters)

plt.show()


In [None]:
# Draw the silhouette-score panels from the cached `d_embeddings` list (one
# entry per alpha, in the same order as the loop below).
clusters = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
fig, axs = plt.subplots(2, 4, figsize=(18, 6), constrained_layout=True, 
                        sharex=True, sharey=True)
axs = axs.flat
fig.suptitle("I=50, |Yu|=5, |U|=50000", fontsize=16)
for ix, alpha in enumerate(np.linspace(0.2, 0.9, 8)):
  gamma = 0.9 - alpha
  alpha_str = 'alpha{:0.1f}_gamma{:0.1f}'.format(alpha, gamma)

  d_anisotropy = d_embeddings[ix]
  df = pd.DataFrame(data=d_anisotropy)
  # The column really is spelled 'silhoutte': that is the key produced by
  # get_cluster_anisotropy, so it must not be "corrected" here.
  ax = sns.lineplot(
      x='Clusters', y='silhoutte', hue="Model", data=df,
      markers=True, style="Model", ax=axs[ix])

  ax.legend(loc='upper center')
  title = alpha_str
  ax.set_title(title)
  # Fixed the misspelled axis label ('Sillhoutte').
  ax.set_ylabel('Silhouette')
  ax.set_xticks(clusters)

plt.show()


In [None]:
# Directory with the stored results of the currently selected dataset.
# `root_dir` and `dataset_path` are module-level names set by earlier cells.
# Alternative location: results_dir = 'results/train-test-split'
# (The former `.format(root_dir)` call was a no-op: the string has no
# placeholder, so it has been dropped.)
results_dir = 'results/'
results_path = os.path.join(root_dir, results_dir, dataset_path)

In [None]:
# Evaluate the stored results of the 'SUR' model found under `results_path`
# and display the value returned by evaluate_results.
model_str = 'SUR'

# Show which results directory is being evaluated.
print(results_path)

mean_results = evaluate_results(model_str, results_path, is_npz=False)

# Blank line separating anything printed above from the cell output.
print()

mean_results

In [None]:
# Evaluate the stored results of the 'MUR_5' model found under `results_path`
# and display the value returned by evaluate_results.
model_str = 'MUR_5'
mean_results = evaluate_results(model_str, results_path, is_npz=False)
print()
mean_results

In [None]:
# Evaluate the stored results of the 'MUR_3' model found under `results_path`
# and display the value returned by evaluate_results.
model_str = 'MUR_3'

mean_results = evaluate_results(model_str, results_path, is_npz=False)
print()
mean_results

In [None]:
# Load the embeddings of one SUR run and scatter the first two item-embedding
# dimensions, one colour per item cluster.
# `root_dir`, `dataset_path` and `item_clusters` come from earlier cells.
seed = 1234
model_str = 'SUR'
alpha_str = 'alpha0.6_gamma0.3'
results_path = '{}/results/train-test-split/{}/seed_{}/{}/{}/embeddings_data.npz'.format(
        root_dir, dataset_path, seed, model_str, alpha_str)

# NOTE: allow_pickle=True unpickles objects stored in the archive; only use it
# on result files that come from a trusted source.
with tf.io.gfile.GFile(results_path, 'rb') as f:
    data = np.load(f, allow_pickle=True)
    # 'eval_result' is a 0-d array; `[()]` extracts the object it wraps.
    result_eval = data['eval_result'][()]
    item_embeddings = data['item_embeddings']
    user_embeddings = data['user_embeddings']

label_set = np.unique(item_clusters).astype(int)
# plt.get_cmap replaces plt.cm.get_cmap, which newer matplotlib releases no
# longer provide.
cmap = plt.get_cmap('RdYlBu', len(label_set)+1)
for label in label_set:
  label_indices = np.where(item_clusters == label)
  # `color=` instead of `c=`: a single RGBA tuple passed as `c` is ambiguous
  # and would be value-mapped if a cluster happened to contain 4 items.
  plt.scatter(item_embeddings[label_indices,0], 
              item_embeddings[label_indices,1], 
              color=cmap(label),
              label='{}'.format(label))

plt.title('Medium interest volatility.')
plt.show()

In [None]:
# Switch to the C5_I10_U3 dataset (stored as .npz) and keep the item-cluster
# assignment, the user interests and the cluster of each user's last item.
# `alpha_str` is whatever an earlier cell left in the module namespace.
dataset_path = 'datasets/C5_I10_U3/'
item_clusters, _, user_interests, test_cluster, _ = load_dataset(root_dir, dataset_path, alpha_str, is_npz=True)

In [None]:
# Display the dataset identifier currently held in `alpha_str`.
alpha_str

In [None]:
# Load the embeddings of one MUR_3 run and measure interest retrieval for the
# first 20 users; the cell displays the mean over those users.
seed = 1234
model_str = 'MUR_3'
results_path = (
    f'{root_dir}/results/train-test-split/{dataset_path}/seed_{seed}/'
    f'{model_str}/{alpha_str}/embeddings_data.npz')

with tf.io.gfile.GFile(results_path, 'rb') as f:
    data = np.load(f, allow_pickle=True)
    # 'eval_result' is a 0-d array; `[()]` extracts the object it wraps.
    result_eval = data['eval_result'][()]
    item_embeddings = data['item_embeddings']
    user_embeddings = data['user_embeddings']

interest_acc = evaluate_interest_retrieval(
    user_embeddings[0:20],
    item_embeddings,
    user_interests[0:20],
    item_clusters,
    K=20)

np.mean(interest_acc)

In [None]:
# One panel per user (4 x 5 = 20 users): item embeddings coloured by cluster,
# with the user's representation vector(s) drawn from the origin.
fig, axs = plt.subplots(4, 5, figsize=(20, 14), constrained_layout=True,
                        sharex=True, sharey=True)

# Loop-invariant set-up, hoisted out of the per-user loop.
label_set = np.unique(item_clusters).astype(int)
# plt.get_cmap replaces plt.cm.get_cmap, which newer matplotlib releases no
# longer provide.
cmap = plt.get_cmap('RdYlBu', len(label_set)+1)
user_embeddings = tf.squeeze(user_embeddings)

for user_ix, ax in enumerate(axs.flat):

  for label in label_set:
    label_indices = np.where(item_clusters == label)
    # `color=` instead of `c=`: a single RGBA tuple passed as `c` is ambiguous
    # and would be value-mapped if a cluster happened to contain 4 items.
    ax.scatter(item_embeddings[label_indices,0], 
                item_embeddings[label_indices,1], 
                color=cmap(label),
                label='{}'.format(label))

  if len(user_embeddings.shape) > 2:
    # More than two axes remain after squeezing: draw one arrow per entry
    # along axis 1.
    H = user_embeddings.shape[1]
    ax.quiver(np.zeros(H), np.zeros(H), user_embeddings[user_ix,:, 0], user_embeddings[user_ix,:, 1], scale=7.)
  else:
    ax.quiver(0., 0., user_embeddings[user_ix, 0], user_embeddings[user_ix, 1], scale=6.)

  ax.set_title("""$Y_u$ = {} & InterestRecall@20: {:.2f}""".format(
      sorted(user_interests[user_ix]), interest_acc[user_ix]))
  ax.legend(loc = 'upper left', bbox_to_anchor=(1.0, 1.0))

# The title used to claim 10 users although 20 panels are drawn; derive the
# count from the grid instead.
fig.suptitle(f'User representations for {axs.size} different users.', fontsize=16)
plt.show()

In [None]:
# One panel per user (4 x 5 = 20 users): item embeddings coloured by cluster,
# with the user's representation vector(s) drawn from the origin.
fig, axs = plt.subplots(4, 5, figsize=(20, 14), constrained_layout=True,
                        sharex=True, sharey=True)

# Loop-invariant set-up, hoisted out of the per-user loop.
label_set = np.unique(item_clusters).astype(int)
# plt.get_cmap replaces plt.cm.get_cmap, which newer matplotlib releases no
# longer provide.
cmap = plt.get_cmap('RdYlBu', len(label_set)+1)
user_embeddings = tf.squeeze(user_embeddings)

for user_ix, ax in enumerate(axs.flat):

  for label in label_set:
    label_indices = np.where(item_clusters == label)
    # `color=` instead of `c=`: a single RGBA tuple passed as `c` is ambiguous
    # and would be value-mapped if a cluster happened to contain 4 items.
    ax.scatter(item_embeddings[label_indices,0], 
                item_embeddings[label_indices,1], 
                color=cmap(label),
                label='{}'.format(label))

  if len(user_embeddings.shape) > 2:
    # More than two axes remain after squeezing: draw one arrow per entry
    # along axis 1.
    H = user_embeddings.shape[1]
    ax.quiver(np.zeros(H), np.zeros(H), user_embeddings[user_ix,:, 0], user_embeddings[user_ix,:, 1], scale=7.)
  else:
    ax.quiver(0., 0., user_embeddings[user_ix, 0], user_embeddings[user_ix, 1], scale=6.)

  ax.set_title("""$Y_u$ = {} & InterestRecall@20: {:.2f}""".format(
      sorted(user_interests[user_ix]), interest_acc[user_ix]))
  ax.legend(loc = 'upper left', bbox_to_anchor=(1.0, 1.0))

# The title used to claim 10 users although 20 panels are drawn; derive the
# count from the grid instead.
fig.suptitle(f'User representations for {axs.size} different users.', fontsize=16)
plt.show()

In [None]:
# One panel per user (2 x 5 = 10 users): item embeddings coloured by cluster,
# with the user's representation vector(s) drawn from the origin.
fig, axs = plt.subplots(2, 5, figsize=(18, 6), constrained_layout=True,
                        sharex=True, sharey=True)

# Loop-invariant set-up, hoisted out of the per-user loop.
label_set = np.unique(item_clusters).astype(int)
# plt.get_cmap replaces plt.cm.get_cmap, which newer matplotlib releases no
# longer provide.
cmap = plt.get_cmap('RdYlBu', len(label_set)+1)
user_embeddings = tf.squeeze(user_embeddings)

for user_ix, ax in enumerate(axs.flat):

  for label in label_set:
    label_indices = np.where(item_clusters == label)
    # `color=` instead of `c=`: a single RGBA tuple passed as `c` is ambiguous
    # and would be value-mapped if a cluster happened to contain 4 items.
    ax.scatter(item_embeddings[label_indices,0], 
                item_embeddings[label_indices,1], 
                color=cmap(label),
                label='{}'.format(label))

  if len(user_embeddings.shape) > 2:
    # More than two axes remain after squeezing: draw one arrow per entry
    # along axis 1.
    H = user_embeddings.shape[1]
    ax.quiver(np.zeros(H), np.zeros(H), user_embeddings[user_ix,:, 0], user_embeddings[user_ix,:, 1], scale=7.)
  else:
    ax.quiver(0., 0., user_embeddings[user_ix, 0], user_embeddings[user_ix, 1], scale=6.)

  ax.set_title("""$Y_u$ = {} & InterestRecall@20: {:.2f}""".format(
      user_interests[user_ix], interest_acc[user_ix]))
  ax.legend(loc = 'upper left')

fig.suptitle('User representations for 10 different users.', fontsize=16)
plt.show()

In [None]:
# Item embeddings coloured by cluster plus the representation vector(s) of a
# single user.
# NOTE(review): the title hard-codes 'SUR'; confirm it matches the model whose
# embeddings are currently loaded.
user_ix = 15
fig = plt.figure(figsize=(5, 4), constrained_layout=True)

label_set = np.unique(item_clusters).astype(int)
# plt.get_cmap replaces plt.cm.get_cmap, which newer matplotlib releases no
# longer provide.
cmap = plt.get_cmap('RdYlBu', len(label_set)+1)
for label in label_set:
  label_indices = np.where(item_clusters == label)
  # `color=` instead of `c=`: a single RGBA tuple passed as `c` is ambiguous
  # and would be value-mapped if a cluster happened to contain 4 items.
  plt.scatter(item_embeddings[label_indices,0], 
              item_embeddings[label_indices,1], 
              color=cmap(label),
              label='{}'.format(label))

user_embeddings = tf.squeeze(user_embeddings)
if len(user_embeddings.shape) > 2:
  # More than two axes remain after squeezing: draw one arrow per entry along
  # axis 1.
  H = user_embeddings.shape[1]
  plt.quiver(np.zeros(H), np.zeros(H), user_embeddings[user_ix,:, 0],
            user_embeddings[user_ix,:, 1], scale=7.)
else:
  plt.quiver(0., 0., user_embeddings[user_ix, 0],
            user_embeddings[user_ix, 1], scale=6.)

plt.legend(loc = 'upper left', bbox_to_anchor=(1.0, 1.0))

plt.title('SUR for a user having interests: {}.'.format(sorted(user_interests[user_ix])), fontsize=11)
plt.show()



In [None]:
# Item embeddings coloured by cluster plus the representation vector(s) of a
# single user.
# NOTE(review): the title hard-codes 'SUR'; confirm it matches the model whose
# embeddings are currently loaded.
user_ix = 15
fig = plt.figure(figsize=(5, 4), constrained_layout=True)

label_set = np.unique(item_clusters).astype(int)
# plt.get_cmap replaces plt.cm.get_cmap, which newer matplotlib releases no
# longer provide.
cmap = plt.get_cmap('RdYlBu', len(label_set)+1)
for label in label_set:
  label_indices = np.where(item_clusters == label)
  # `color=` instead of `c=`: a single RGBA tuple passed as `c` is ambiguous
  # and would be value-mapped if a cluster happened to contain 4 items.
  plt.scatter(item_embeddings[label_indices,0], 
              item_embeddings[label_indices,1], 
              color=cmap(label),
              label='{}'.format(label))

user_embeddings = tf.squeeze(user_embeddings)
if len(user_embeddings.shape) > 2:
  # More than two axes remain after squeezing: draw one arrow per entry along
  # axis 1.
  H = user_embeddings.shape[1]
  plt.quiver(np.zeros(H), np.zeros(H), user_embeddings[user_ix,:, 0],
            user_embeddings[user_ix,:, 1], scale=4.)
else:
  plt.quiver(0., 0., user_embeddings[user_ix, 0],
            user_embeddings[user_ix, 1], scale=6.)

plt.legend(loc = 'upper left', bbox_to_anchor=(1.0, 1.0))

plt.title('SUR for a user having interests: {}.'.format(sorted(user_interests[user_ix])), fontsize=11)
plt.show()

In [None]:
# Item embeddings coloured by cluster plus the representation vector(s) of a
# single user.
# NOTE(review): the title hard-codes 'MUR (H=3)'; confirm it matches the model
# whose embeddings are currently loaded.
user_ix = 9
fig = plt.figure(figsize=(5, 4), constrained_layout=True)

label_set = np.unique(item_clusters).astype(int)
# plt.get_cmap replaces plt.cm.get_cmap, which newer matplotlib releases no
# longer provide.
cmap = plt.get_cmap('RdYlBu', len(label_set)+1)
for label in label_set:
  label_indices = np.where(item_clusters == label)
  # `color=` instead of `c=`: a single RGBA tuple passed as `c` is ambiguous
  # and would be value-mapped if a cluster happened to contain 4 items.
  plt.scatter(item_embeddings[label_indices,0], 
              item_embeddings[label_indices,1], 
              color=cmap(label),
              label='{}'.format(label))

user_embeddings = tf.squeeze(user_embeddings)
if len(user_embeddings.shape) > 2:
  # More than two axes remain after squeezing: draw one arrow per entry along
  # axis 1.
  H = user_embeddings.shape[1]
  plt.quiver(np.zeros(H), np.zeros(H), user_embeddings[user_ix,:, 0],
            user_embeddings[user_ix,:, 1], scale=4.)
else:
  plt.quiver(0., 0., user_embeddings[user_ix, 0],
            user_embeddings[user_ix, 1], scale=6.)

plt.legend(loc = 'upper left', bbox_to_anchor=(1.0, 1.0))

plt.title('MUR (H=3) for a user having interests: {}.'.format(sorted(user_interests[user_ix])), fontsize=11)
plt.show()

In [None]:
# Item embeddings coloured by cluster plus the representation vector(s) of a
# single user.
# NOTE(review): the title hard-codes 'SUR'; confirm it matches the model whose
# embeddings are currently loaded.
user_ix = 9
fig = plt.figure(figsize=(5, 4), constrained_layout=True)

label_set = np.unique(item_clusters).astype(int)
# plt.get_cmap replaces plt.cm.get_cmap, which newer matplotlib releases no
# longer provide.
cmap = plt.get_cmap('RdYlBu', len(label_set)+1)
for label in label_set:
  label_indices = np.where(item_clusters == label)
  # `color=` instead of `c=`: a single RGBA tuple passed as `c` is ambiguous
  # and would be value-mapped if a cluster happened to contain 4 items.
  plt.scatter(item_embeddings[label_indices,0], 
              item_embeddings[label_indices,1], 
              color=cmap(label),
              label='{}'.format(label))

user_embeddings = tf.squeeze(user_embeddings)
if len(user_embeddings.shape) > 2:
  # More than two axes remain after squeezing: draw one arrow per entry along
  # axis 1.
  H = user_embeddings.shape[1]
  plt.quiver(np.zeros(H), np.zeros(H), user_embeddings[user_ix,:, 0],
            user_embeddings[user_ix,:, 1], scale=4.)
else:
  plt.quiver(0., 0., user_embeddings[user_ix, 0],
            user_embeddings[user_ix, 1], scale=6.)

plt.legend(loc = 'upper left', bbox_to_anchor=(1.0, 1.0))

plt.title('SUR for a user having interests: {}.'.format(sorted(user_interests[user_ix])), fontsize=11)
plt.show()

In [None]:
# Fit k-means on the currently loaded item embeddings for several cluster
# counts and record the silhouette score of each clustering, in the same order
# as `candidate_clusters_size`.
candidate_clusters_size = [2, 3, 5, 7, 10]
silhouette_scores = []
for cluster_size in candidate_clusters_size:
  # Fixed random_state so that repeated runs give the same clustering.
  kmeans = KMeans(n_clusters = cluster_size, random_state=1234).fit(item_embeddings)
  silhouette_scores.append(sklearn.metrics.silhouette_score(item_embeddings, kmeans.labels_, random_state=1234))

In [None]:
# Load the synthetic test dataset stored directly under the test data folder.
# NOTE: this re-points the module-level `root_dir`; cells that expect the
# regular root directory must reset it first.
root_dir = 'root_dir/Test/data'
# The template has a single placeholder; the surplus `dataset_path` argument
# that used to be passed to format() was silently ignored and has been dropped.
data_path = '{}/synthetic_data.npz'.format(root_dir)

# NOTE: allow_pickle=True unpickles objects stored in the archive; only use it
# on files that come from a trusted source.
with tf.io.gfile.GFile(data_path, 'rb') as f:
    data = np.load(f, allow_pickle=True)
    # The cluster assignment is stored under either of two keys.
    if 'item_clusters' in data:
      item_clusters = data['item_clusters']
    else:
      item_clusters = data['item_labels']
    all_items = data['items']
    seq = data['user_item_sequences']


## Visualize learned embeddings for sparse datasets.

In [None]:
# For three alpha values, load the sparse dataset and the embeddings of one
# run, then scatter the first two item-embedding dimensions per cluster.
# Fixed: the assignment below was missing its opening quote, which made the
# whole cell a syntax error.
root_dir = 'root_dir/'
dataset_path = 'datasets/sparse_C5_I10_U3_N10000/'
results_path = 'results/d2/validation/logQ-separate_embedding/lr0.1/'
seed = 1236
model_str = 'MUR_5'
# NOTE(review): the midpoint of this range (0.75) is formatted with a single
# decimal below, so the resulting alpha/gamma directory name depends on
# floating-point rounding — verify that such a dataset exists.
alphas = np.linspace(0.6, 0.9, 3)

fig, axs = plt.subplots(1, 3, figsize=(12, 4), constrained_layout=True,
                        sharex=True, sharey=True)

for ax, alpha in zip(axs.flat, alphas):

  alpha_str = f'interest-power1.0_item-power1.0_alpha{alpha:.1f}_gamma{0.9-alpha:.1f}/'
  dataset = load_dataset(root_dir, dataset_path, alpha_str)
  (item_clusters, all_items, user_interests, test_cluster, user_item_sequences) = dataset

  print(alpha_str)
  item_embeddings_path = os.path.join(root_dir, results_path, dataset_path,
                                      f'synthetic_data_{alpha_str}',
                                      f'seed_{seed}', model_str,
                                      'item_embeddings.npy')

  user_embeddings_path = os.path.join(root_dir, results_path, dataset_path,
                                      f'synthetic_data_{alpha_str}',
                                      f'seed_{seed}', model_str,
                                      'user_embeddings.npy')

  with tf.io.gfile.GFile(item_embeddings_path, 'rb') as f:
    item_embeddings = np.load(f)

  # Not plotted here, but kept in the module namespace for later cells.
  with tf.io.gfile.GFile(user_embeddings_path, 'rb') as f:
    user_embeddings = np.load(f, allow_pickle=True)

  label_set = np.unique(item_clusters).astype(int)
  # plt.get_cmap replaces plt.cm.get_cmap, which newer matplotlib releases no
  # longer provide.
  cmap = plt.get_cmap('RdYlBu', len(label_set)+1)

  for label in label_set:
    label_indices = np.where(item_clusters == label)
    # Marker sizes shrink from 50 to 5 along the items of a cluster.
    rgb_alphas = list(reversed(np.linspace(5.0, 50.0, len(label_indices[0]))))
    # `color=` instead of `c=`: a single RGBA tuple passed as `c` is ambiguous
    # and would be value-mapped if a cluster happened to contain 4 items.
    ax.scatter(item_embeddings[label_indices,0], 
                item_embeddings[label_indices,1], 
                color=cmap(label),
                s=rgb_alphas,
                label='{}'.format(label))

  ax.set_title(f'{get_volatility_str(0.9-alpha)}')

plt.legend()
fig.suptitle(f'Item embedding space using {model_str} for sparse data.',
             fontsize=16)
plt.show()

In [None]:
# For three alpha values (largest first), load the sparse dataset and the
# embeddings of one run, then scatter the first two item-embedding dimensions
# per cluster.
root_dir = 'root_dir/'
dataset_path = 'datasets/sparse_C5_I10_U3_N10000/'
results_path = 'results/d2/validation/logQ-separate_embedding/lr0.1/'
seed = 1234
model_str = 'SUR'
# `reversed` yields a one-shot iterator: it is exhausted after the loop below.
alphas = reversed(np.linspace(0.5, 0.8, 3))

fig, axs = plt.subplots(1, 3, figsize=(12, 4), constrained_layout=True,
                        sharex=True, sharey=True)

for ax, alpha in zip(axs.flat, alphas):

  alpha_str = f'interest-power1.0_item-power1.0_alpha{alpha:.1f}_gamma{0.9-alpha:.1f}/'
  dataset = load_dataset(root_dir, dataset_path, alpha_str)
  (item_clusters, all_items, user_interests, test_cluster, user_item_sequences) = dataset

  print(alpha_str)
  item_embeddings_path = os.path.join(root_dir, results_path, dataset_path,
                                      f'synthetic_data_{alpha_str}',
                                      f'seed_{seed}', model_str,
                                      'item_embeddings.npy')

  user_embeddings_path = os.path.join(root_dir, results_path, dataset_path,
                                      f'synthetic_data_{alpha_str}',
                                      f'seed_{seed}', model_str,
                                      'user_embeddings.npy')

  with tf.io.gfile.GFile(item_embeddings_path, 'rb') as f:
    item_embeddings = np.load(f)

  # Not plotted here, but kept in the module namespace for later cells.
  with tf.io.gfile.GFile(user_embeddings_path, 'rb') as f:
    user_embeddings = np.load(f, allow_pickle=True)

  label_set = np.unique(item_clusters).astype(int)
  # plt.get_cmap replaces plt.cm.get_cmap, which newer matplotlib releases no
  # longer provide.
  cmap = plt.get_cmap('RdYlBu', len(label_set)+1)

  for label in label_set:
    label_indices = np.where(item_clusters == label)
    # Marker sizes shrink from 50 to 5 along the items of a cluster.
    rgb_alphas = list(reversed(np.linspace(5.0, 50.0, len(label_indices[0]))))
    # `color=` instead of `c=`: a single RGBA tuple passed as `c` is ambiguous
    # and would be value-mapped if a cluster happened to contain 4 items.
    ax.scatter(item_embeddings[label_indices,0], 
                item_embeddings[label_indices,1], 
                color=cmap(label),
                s=rgb_alphas,
                label='{}'.format(label))

  ax.set_title(f'{get_volatility_str(0.9-alpha)}')

plt.legend()
fig.suptitle(f'Item embedding space using {model_str} for sparse data.',
             fontsize=16)
plt.show()

In [None]:
# Select the alpha=0.7 / gamma=0.2 sparse dataset and load the matching
# embeddings. `root_dir`, `dataset_path`, `results_path`, `seed` and
# `model_str` are taken from the module namespace.
alpha_str = 'interest-power1.0_item-power1.0_alpha0.7_gamma0.2/'
dataset = load_dataset(root_dir, dataset_path, alpha_str)
(item_clusters, all_items, user_interests, test_cluster, user_item_sequences) = dataset

item_embeddings_path = os.path.join(root_dir, results_path, dataset_path,
                                    f'synthetic_data_{alpha_str}',
                                    f'seed_{seed}', model_str,
                                    'item_embeddings.npy')

user_embeddings_path = os.path.join(root_dir, results_path, dataset_path,
                                    f'synthetic_data_{alpha_str}',
                                    f'seed_{seed}', model_str,
                                    'user_embeddings.npy')

# Previously the two paths were built but never read, so the following plots
# paired this dataset's interests with embeddings left over from whichever
# run was loaded last. Load the embeddings that belong to this dataset.
with tf.io.gfile.GFile(item_embeddings_path, 'rb') as f:
  item_embeddings = np.load(f)

with tf.io.gfile.GFile(user_embeddings_path, 'rb') as f:
  user_embeddings = np.load(f, allow_pickle=True)

In [None]:
# One panel per user (4 x 5 = 20 users): item embeddings coloured by cluster,
# with the user's representation vector(s) drawn from the origin.
fig, axs = plt.subplots(4, 5, figsize=(20, 14), constrained_layout=True,
                        sharex=True, sharey=True)

# Loop-invariant set-up, hoisted out of the per-user loop.
label_set = np.unique(item_clusters).astype(int)
# plt.get_cmap replaces plt.cm.get_cmap, which newer matplotlib releases no
# longer provide.
cmap = plt.get_cmap('RdYlBu', len(label_set)+1)
user_embeddings = tf.squeeze(user_embeddings)

for user_ix, ax in enumerate(axs.flat):

  for label in label_set:
    label_indices = np.where(item_clusters == label)
    # Marker sizes shrink from 50 to 5 along the items of a cluster.
    rgb_alphas = list(reversed(np.linspace(5.0, 50.0, len(label_indices[0]))))
    # `color=` instead of `c=`: a single RGBA tuple passed as `c` is ambiguous
    # and would be value-mapped if a cluster happened to contain 4 items.
    ax.scatter(item_embeddings[label_indices,0], 
                item_embeddings[label_indices,1], 
                color=cmap(label),
                s=rgb_alphas,
                label='{}'.format(label))

  if len(user_embeddings.shape) > 2:
    # More than two axes remain after squeezing: draw one arrow per entry
    # along axis 1.
    H = user_embeddings.shape[1]
    ax.quiver(np.zeros(H), np.zeros(H), user_embeddings[user_ix,:, 0], user_embeddings[user_ix,:, 1], scale=7.)
  else:
    ax.quiver(0., 0., user_embeddings[user_ix, 0], user_embeddings[user_ix, 1], scale=6.)

  ax.set_title("""$Y_u$ = {}""".format(sorted(set(user_interests[user_ix]))))
  ax.legend(loc = 'upper left', bbox_to_anchor=(1.0, 1.0))

# The title used to claim 10 users although 20 panels are drawn; derive the
# count from the grid instead.
fig.suptitle(f'User representations for {axs.size} different users.', fontsize=16)
plt.show()

In [None]:
# Item embeddings coloured by cluster plus the representation vector(s) of a
# single user.
# NOTE(review): the title hard-codes 'SUR'; confirm it matches the model whose
# embeddings are currently loaded.
user_ix = 0
fig = plt.figure(figsize=(5, 4), constrained_layout=True)

label_set = np.unique(item_clusters).astype(int)
# plt.get_cmap replaces plt.cm.get_cmap, which newer matplotlib releases no
# longer provide.
cmap = plt.get_cmap('RdYlBu', len(label_set)+1)
for label in label_set:
  label_indices = np.where(item_clusters == label)
  # Marker sizes shrink from 50 to 5 along the items of a cluster.
  rgb_alphas = list(reversed(np.linspace(5.0, 50.0, len(label_indices[0]))))
  # `color=` instead of `c=`: a single RGBA tuple passed as `c` is ambiguous
  # and would be value-mapped if a cluster happened to contain 4 items.
  plt.scatter(item_embeddings[label_indices,0], 
              item_embeddings[label_indices,1], 
              color=cmap(label),
              s=rgb_alphas,
              label='{}'.format(label))

user_embeddings = tf.squeeze(user_embeddings)
if len(user_embeddings.shape) > 2:
  # More than two axes remain after squeezing: draw one arrow per entry along
  # axis 1.
  H = user_embeddings.shape[1]
  plt.quiver(np.zeros(H), np.zeros(H), user_embeddings[user_ix,:, 0],
            user_embeddings[user_ix,:, 1], scale=4.)
else:
  plt.quiver(0., 0., user_embeddings[user_ix, 0],
            user_embeddings[user_ix, 1], scale=6.)

plt.legend(loc = 'upper left', bbox_to_anchor=(1.0, 1.0))

plt.title('SUR for a user having interests: {}.'.format(sorted(set(user_interests[user_ix]))), fontsize=11)
plt.show()

In [None]:
# For three alpha values (largest first), load the sparse dataset and the
# embeddings of one run, then scatter the first two item-embedding dimensions
# per cluster.
root_dir = 'root_dir/'
dataset_path = 'datasets/sparse_C5_I10_U3_N10000/'
results_path = 'results/d2/validation/logQ-separate_embedding/lr0.1/'
seed = 1235
model_str = 'MUR_5'
# `reversed` yields a one-shot iterator: it is exhausted after the loop below.
alphas = reversed(np.linspace(0.5, 0.8, 3))

fig, axs = plt.subplots(1, 3, figsize=(12, 4), constrained_layout=True,
                        sharex=True, sharey=True)

for ax, alpha in zip(axs.flat, alphas):

  alpha_str = f'interest-power1.0_item-power1.0_alpha{alpha:.1f}_gamma{0.9-alpha:.1f}/'
  dataset = load_dataset(root_dir, dataset_path, alpha_str)
  (item_clusters, all_items, user_interests, test_cluster, user_item_sequences) = dataset

  print(alpha_str)
  item_embeddings_path = os.path.join(root_dir, results_path, dataset_path,
                                      f'synthetic_data_{alpha_str}',
                                      f'seed_{seed}', model_str,
                                      'item_embeddings.npy')

  user_embeddings_path = os.path.join(root_dir, results_path, dataset_path,
                                      f'synthetic_data_{alpha_str}',
                                      f'seed_{seed}', model_str,
                                      'user_embeddings.npy')

  with tf.io.gfile.GFile(item_embeddings_path, 'rb') as f:
    item_embeddings = np.load(f)

  # Not plotted here, but kept in the module namespace for later cells.
  with tf.io.gfile.GFile(user_embeddings_path, 'rb') as f:
    user_embeddings = np.load(f, allow_pickle=True)

  label_set = np.unique(item_clusters).astype(int)
  # plt.get_cmap replaces plt.cm.get_cmap, which newer matplotlib releases no
  # longer provide.
  cmap = plt.get_cmap('RdYlBu', len(label_set)+1)

  for label in label_set:
    label_indices = np.where(item_clusters == label)
    # Marker sizes shrink from 50 to 5 along the items of a cluster.
    rgb_alphas = list(reversed(np.linspace(5.0, 50.0, len(label_indices[0]))))
    # `color=` instead of `c=`: a single RGBA tuple passed as `c` is ambiguous
    # and would be value-mapped if a cluster happened to contain 4 items.
    ax.scatter(item_embeddings[label_indices,0], 
                item_embeddings[label_indices,1], 
                color=cmap(label),
                s=rgb_alphas,
                label='{}'.format(label))

  ax.set_title(f'{get_volatility_str(0.9-alpha)}')

plt.legend()
fig.suptitle(f'Item embedding space using {model_str} for sparse data.',
             fontsize=16)
plt.show()

In [None]:
# Select the alpha=0.8 / gamma=0.1 sparse dataset and load the matching
# embeddings. `root_dir`, `dataset_path`, `results_path`, `seed` and
# `model_str` are taken from the module namespace.
alpha_str = 'interest-power1.0_item-power1.0_alpha0.8_gamma0.1/'
dataset = load_dataset(root_dir, dataset_path, alpha_str)
(item_clusters, all_items, user_interests, test_cluster, user_item_sequences) = dataset

item_embeddings_path = os.path.join(root_dir, results_path, dataset_path,
                                    f'synthetic_data_{alpha_str}',
                                    f'seed_{seed}', model_str,
                                    'item_embeddings.npy')

user_embeddings_path = os.path.join(root_dir, results_path, dataset_path,
                                    f'synthetic_data_{alpha_str}',
                                    f'seed_{seed}', model_str,
                                    'user_embeddings.npy')

# Previously the two paths were built but never read, so the following plots
# paired this dataset's interests with embeddings left over from whichever
# run was loaded last. Load the embeddings that belong to this dataset.
with tf.io.gfile.GFile(item_embeddings_path, 'rb') as f:
  item_embeddings = np.load(f)

with tf.io.gfile.GFile(user_embeddings_path, 'rb') as f:
  user_embeddings = np.load(f, allow_pickle=True)

In [None]:
# One panel per user (4 x 5 = 20 users): item embeddings coloured by cluster,
# with the user's representation vector(s) drawn from the origin.
fig, axs = plt.subplots(4, 5, figsize=(20, 14), constrained_layout=True,
                        sharex=True, sharey=True)

# Loop-invariant set-up, hoisted out of the per-user loop.
label_set = np.unique(item_clusters).astype(int)
# plt.get_cmap replaces plt.cm.get_cmap, which newer matplotlib releases no
# longer provide.
cmap = plt.get_cmap('RdYlBu', len(label_set)+1)
user_embeddings = tf.squeeze(user_embeddings)

for user_ix, ax in enumerate(axs.flat):

  for label in label_set:
    label_indices = np.where(item_clusters == label)
    # Marker sizes shrink from 50 to 5 along the items of a cluster.
    rgb_alphas = list(reversed(np.linspace(5.0, 50.0, len(label_indices[0]))))
    # `color=` instead of `c=`: a single RGBA tuple passed as `c` is ambiguous
    # and would be value-mapped if a cluster happened to contain 4 items.
    ax.scatter(item_embeddings[label_indices,0], 
                item_embeddings[label_indices,1], 
                color=cmap(label),
                s=rgb_alphas,
                label='{}'.format(label))

  if len(user_embeddings.shape) > 2:
    # More than two axes remain after squeezing: draw one arrow per entry
    # along axis 1.
    H = user_embeddings.shape[1]
    ax.quiver(np.zeros(H), np.zeros(H), user_embeddings[user_ix,:, 0], user_embeddings[user_ix,:, 1], scale=7.)
  else:
    ax.quiver(0., 0., user_embeddings[user_ix, 0], user_embeddings[user_ix, 1], scale=6.)

  ax.set_title("""$Y_u$ = {}""".format(sorted(set(user_interests[user_ix]))))
  ax.legend(loc = 'upper left', bbox_to_anchor=(1.0, 1.0))

# The title used to claim 10 users although 20 panels are drawn; derive the
# count from the grid instead.
fig.suptitle(f'User representations for {axs.size} different users.', fontsize=16)
plt.show()

In [None]:
# Load one sparse dataset together with the item embeddings of one MUR_5 run
# and scatter the first two embedding dimensions, one colour per item cluster.
root_dir = 'root_dir/'
dataset_path = 'datasets/sparse_C5_I10_U3_N10000/'
alpha_str = 'interest-power1.0_item-power1.0_alpha0.8_gamma0.1/'
results_path = 'results/d2/validation/logQ-separate_embedding/lr0.1/'

dataset = load_dataset(root_dir, dataset_path, alpha_str)
(item_clusters, all_items, user_interests, _, user_item_sequences) = dataset

seed = 1235
model_str = 'MUR_5'

item_embeddings_path = os.path.join(root_dir, results_path, dataset_path,
                                    f'synthetic_data_{alpha_str}',
                                    f'seed_{seed}', model_str,
                                    'item_embeddings.npy')

# NOTE(review): only the item embeddings are reloaded here; `user_embeddings`
# keeps whatever an earlier cell stored — confirm that is the matching run
# before using both together.
with tf.io.gfile.GFile(item_embeddings_path, 'rb') as f:
  item_embeddings = np.load(f)

label_set = np.unique(item_clusters).astype(int)
# plt.get_cmap replaces plt.cm.get_cmap, which newer matplotlib releases no
# longer provide.
cmap = plt.get_cmap('RdYlBu', len(label_set)+1)

for label in label_set:
  label_indices = np.where(item_clusters == label)
  # Marker sizes shrink from 50 to 5 along the items of a cluster.
  rgb_alphas = list(reversed(np.linspace(5.0, 50.0, len(label_indices[0]))))
  # `color=` instead of `c=`: a single RGBA tuple passed as `c` is ambiguous
  # and would be value-mapped if a cluster happened to contain 4 items.
  plt.scatter(item_embeddings[label_indices,0], 
              item_embeddings[label_indices,1], 
              color=cmap(label),
              s=rgb_alphas,
              label='{}'.format(label))


plt.legend()
plt.title('Output embedding space')
# The labels were swapped: embedding column 0 is on the x-axis and column 1 on
# the y-axis.
plt.xlabel('X0')
plt.ylabel('X1')
plt.show()

In [None]:
# Number of prediction entries whose test item is item 3.
# NOTE(review): within this part of the notebook `predictions` is only
# assigned in the following cell and `test_items` is not assigned at all
# (the following cell uses `test_next_items`) — confirm this cell still runs
# on a fresh kernel.
len(predictions[test_items == 3])

In [None]:
# Per-item HR@20: average the element-wise top-20 accuracy over the users
# whose held-out last item is that item, then show one bar per item.
test_next_items = user_item_sequences[:, -1]
# Third-from-last item of every sequence (marked "validation" by the author).
train_next_items = user_item_sequences[:, -3]

predictions = compute_top_k_elementwise_accuracy(
    user_embeddings, item_embeddings, test_next_items, k=20)

item_predictions = []
for item_id in all_items:
  is_target_item = test_next_items == item_id
  item_predictions.append(np.mean(predictions[is_target_item]))

plt.bar(all_items, item_predictions)
plt.title('Average performance for each item.')
plt.xlabel('Item ID')
plt.ylabel('HR@20')
plt.show()

In [None]:
# Empirical frequency of every item among `train_next_items`, normalised so
# that the bars sum to one.
train_next_item_counter = collections.Counter(train_next_items)
train_next_item_count = np.array(
    [train_next_item_counter[item] for item in all_items],
    dtype=np.float32)

total_count = train_next_item_count.sum()
train_next_item_count /= total_count

plt.bar(all_items, train_next_item_count)
plt.title('Empirical density of items.')
plt.xlabel('Item ID')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Empirical frequency of every item among `train_next_items`, normalised so
# that the bars sum to one.
train_next_item_counter = collections.Counter(train_next_items)
train_next_item_count = np.array(
    [train_next_item_counter[item] for item in all_items],
    dtype=np.float32)

total_count = train_next_item_count.sum()
train_next_item_count /= total_count

plt.bar(all_items, train_next_item_count)
plt.title('Empirical density of items.')
plt.xlabel('Item ID')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Pearson correlation between the per-item HR@20 values and the empirical
# item frequencies (both are ordered like `all_items`).
stats.pearsonr(item_predictions, train_next_item_count)

In [None]:
# Estimate the density of the training targets in the embedding space with a
# Gaussian KDE and evaluate it on a regular 100 x 100 grid over [-2, 2]^2.
num_samples = 100000

# Seeded generator: the sampling and jitter used to be unseeded, so the figure
# changed on every run.
rng = np.random.default_rng(1234)

# Uniformly sample (with replacement) positions into `train_next_items`.
sample_indices = rng.choice(len(train_next_items), size=num_samples)

# Embeddings of the sampled target items. Fancy indexing returns a copy, so
# the in-place jitter below leaves `item_embeddings` untouched.
data = item_embeddings[train_next_items[sample_indices]]

# Small Gaussian jitter added to the sampled embeddings before fitting.
data += rng.normal(loc=0.0, scale=0.1, size=data.shape)
# gaussian_kde expects one column per sample, hence the transpose.
kernel = stats.gaussian_kde(data.T)

xmin = -2
ymin = -2
xmax = 2
ymax = 2

# A complex step (100j) makes mgrid return 100 points including the end point.
X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack([X.ravel(), Y.ravel()])

Z = np.reshape(kernel(positions).T, X.shape)

In [None]:
# Filled and labelled contour plot of the KDE evaluated on the (X, Y) grid.
fig = plt.figure()
ax = fig.gca()
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)

# Filled contours of the density.
cfset = ax.contourf(X, Y, Z, cmap='Blues')
# Alternative: show the density as an image instead of filled contours.
# ax.imshow(np.rot90(Z), cmap='Blues', extent=[xmin, xmax, ymin, ymax])

# Contour lines on top of the filled contours.
cset = ax.contour(X, Y, Z, colors='k')

# Label the contour lines and the axes.
ax.clabel(cset, inline=1, fontsize=10)
# The labels were swapped: X spans the first embedding dimension (x-axis) and
# Y the second (y-axis).
ax.set_xlabel('X0')
ax.set_ylabel('X1')
ax.set_title('Density in the Output Embedding Space using KDE.')
plt.show()

In [None]:
# Evaluate the fitted KDE at the embedding of every item in `all_items`.
density = kernel(item_embeddings[all_items].T)
# Display the shape as a quick sanity check.
density.shape

In [None]:
# Pearson correlation between the per-item mean prediction score and the KDE
# density at the item's embedding (displayed as the cell output).
stats.pearsonr(item_predictions, density)

In [None]:
## Plot item and user embeddings

In [None]:
# Index of the user whose embedding(s) the user-level plots refer to.
user_ix = 1
fig = plt.figure(figsize=(5, 4), constrained_layout=True)

# for user_ix, ax in enumerate(axs.flat):
# One scatter series per item cluster, coloured from a discrete colormap.
label_set = np.unique(item_clusters).astype(int)
# plt.get_cmap is used because plt.cm.get_cmap was removed from recent
# matplotlib releases.
cmap = plt.get_cmap('RdYlBu', len(label_set)+1)
for label in label_set:
  label_indices = np.where(item_clusters == label)
  # Marker sizes shrink from 50 to 5 along the cluster's items (in index
  # order).
  marker_sizes = np.linspace(5.0, 50.0, len(label_indices[0]))[::-1]
  # `color=` (not `c=`): a bare RGBA tuple passed as `c` is interpreted as
  # values to colormap whenever the cluster happens to contain 3 or 4 points.
  plt.scatter(item_embeddings[label_indices,0],
              item_embeddings[label_indices,1],
              color=cmap(label),
              s=marker_sizes,
              label='{}'.format(label))

# user_embeddings = np.squeeze(user_embeddings)
# if len(user_embeddings.shape) > 2:
#   H = user_embeddings.shape[1]
#   plt.quiver(np.zeros(H), np.zeros(H), user_embeddings[user_ix,:, 0],
#             user_embeddings[user_ix,:, 1], scale=4.)
# else:
#   plt.quiver(0., 0., user_embeddings[user_ix, 0],
#             user_embeddings[user_ix, 1], scale=1.)

# ax.set_title("""$Y_u$ = {} & Interest Recall: {:.2f}""".format(
#     user_interests[user_ix], interest_acc[user_ix]))
# Legend is placed outside the axes, to the right.
plt.legend(loc = 'upper left', bbox_to_anchor=(1.0, 1.0))

# plt.title('MUR (H=3) for a user having interests: {}.'.format(sorted(user_interests[user_ix])), fontsize=11)
plt.show()

In [None]:
# Dataset / results selection for this plot.
root_dir = 'root_dir/'
dataset_path = 'datasets/sparse_C5_I10_U3_N10000/'
alpha_str = 'interest-power1.0_item-power1.0_alpha0.6_gamma0.3/'
# results_path = 'results/d2/validation/logQ-separate-embedding/lr0.1/'
results_path = 'results/d2/validation/logQ-separate_embedding-sample_weight/lr0.1/'

# dataset_path = 'datasets/New_C5_I10_U3/'
# alpha_str = 'alpha0.6_gamma0.3/'

dataset = load_dataset(root_dir, dataset_path, alpha_str)
(item_clusters, all_items, user_interests, _, user_item_sequences) = dataset

seed = 1236
model_str = 'SUR'

# Directory holding the saved artefacts of this dataset / seed / model.
embeddings_dir = os.path.join(root_dir, results_path, dataset_path,
                              f'synthetic_data_{alpha_str}',
                              f'seed_{seed}', model_str)
item_embeddings_path = os.path.join(embeddings_dir, 'item_embeddings.npy')
user_embeddings_path = os.path.join(embeddings_dir, 'user_embeddings.npy')
# eval_results_path = os.path.join(embeddings_dir, 'eval_result.yaml')

with tf.io.gfile.GFile(item_embeddings_path, 'rb') as f:
  item_embeddings = np.load(f)

# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
# embedding files that come from a trusted training run.
with tf.io.gfile.GFile(user_embeddings_path, 'rb') as f:
  user_embeddings = np.load(f, allow_pickle=True)

# with tf.io.gfile.GFile(eval_results_path) as f:
#   result_eval = yaml.safe_load(f)

# One scatter series per item cluster, coloured from a discrete colormap.
label_set = np.unique(item_clusters).astype(int)
# plt.get_cmap is used because plt.cm.get_cmap was removed from recent
# matplotlib releases.
cmap = plt.get_cmap('RdYlBu', len(label_set)+1)
for label in label_set:
  label_indices = np.where(item_clusters == label)
  # `color=` (not `c=`): a bare RGBA tuple passed as `c` is interpreted as
  # values to colormap whenever the cluster happens to contain 3 or 4 points.
  plt.scatter(item_embeddings[label_indices,0],
              item_embeddings[label_indices,1],
              color=cmap(label),
              label='{}'.format(label))

# Draw the embedding(s) of one user as arrows from the origin.
# NOTE(review): `user_ix` is not set in this cell; it must already exist in the
# kernel (an earlier cell assigns it).
user_embeddings = np.squeeze(user_embeddings)
if len(user_embeddings.shape) > 2:
  # Several embeddings per user (axis 1): one arrow per embedding.
  H = user_embeddings.shape[1]
  plt.quiver(np.zeros(H), np.zeros(H), user_embeddings[user_ix,:, 0],
            user_embeddings[user_ix,:, 1], scale=4.)
else:
  # Single embedding per user: one arrow.
  plt.quiver(0., 0., user_embeddings[user_ix, 0],
            user_embeddings[user_ix, 1], scale=5.)

plt.legend()
plt.title('Low interest volatility.')
plt.show()

In [None]:
# Fresh, independent accumulators for the per-cluster comparison table:
# cluster index, HR@5, HR@10 and model name of every (model, cluster) row.
cluster_indices, hr5, hr10, models = [], [], [], []

In [None]:
# Dataset and results locations used by the per-cluster comparison below.
root_dir = 'root_dir/'
dataset_path = 'datasets/sparse_C5_I10_U3_N10000/'
alpha_str = 'interest-power1.0_item-power1.0_alpha0.8_gamma0.1/'
# Without sample weighting:
# results_dir = 'results/d2/reruns/logQ-separate-embedding/lr0.1/'

# With sample weighting:
results_dir = 'results/d2/reruns/logQ-separate_embedding-sample_weight/lr0.1/'

results_path = os.path.join(root_dir, results_dir, dataset_path)    

dataset = load_dataset(root_dir, dataset_path, alpha_str)
(item_clusters, all_items, user_interests, test_cluster, user_item_sequences) = dataset

# Evaluate the SUR model at k = 1, 5 and 10.
model_str = 'SUR'
sur_results = evaluate_results(model_str, results_path, alpha_str, item_clusters, 
                               user_interests, test_cluster, k_list=[1, 5, 10])

In [None]:
# Append one row per cluster (0-4) with the SUR per-cluster HR@5 / HR@10.
# NOTE(review): re-running this cell appends the same rows again; re-run the
# cell that resets the accumulator lists first.
for cluster_ix in range(5):
  hr5.append(sur_results[f'HR@5_Cluster{cluster_ix}'])
  hr10.append(sur_results[f'HR@10_Cluster{cluster_ix}'])

  cluster_indices.append(cluster_ix)
  models.append('SUR')

In [None]:
# Evaluate the MUR_3 model on the same dataset at k = 1, 5 and 10.
model_str = 'MUR_3'
mur3_results = evaluate_results(model_str, results_path, alpha_str, item_clusters, 
                 user_interests, test_cluster, k_list=[1, 5, 10])

In [None]:
# Append one row per cluster (0-4) with the MUR_3 per-cluster HR@5 / HR@10.
# NOTE(review): not idempotent -- re-running appends duplicate rows.
for cluster_ix in range(5):
  hr5.append(mur3_results[f'HR@5_Cluster{cluster_ix}'])
  hr10.append(mur3_results[f'HR@10_Cluster{cluster_ix}'])
  cluster_indices.append(cluster_ix)
  models.append('MUR_3')

In [None]:
# Evaluate the MUR_5 model on the same dataset at k = 1, 5 and 10.
model_str = 'MUR_5'
mur5_results = evaluate_results(model_str, results_path, alpha_str,
                                item_clusters, user_interests, test_cluster,
                                k_list=[1, 5, 10])

In [None]:
# Append one row per cluster (0-4) with the MUR_5 per-cluster HR@5 / HR@10.
# NOTE(review): not idempotent -- re-running appends duplicate rows.
for cluster_ix in range(5):
  hr5.append(mur5_results[f'HR@5_Cluster{cluster_ix}'])
  hr10.append(mur5_results[f'HR@10_Cluster{cluster_ix}'])
  cluster_indices.append(cluster_ix)
  models.append('MUR_5')

In [None]:
# Model names with an '_FW' suffix, used as the legend labels / palette keys
# of the HR@5 bar plot.
models_FW = [f'{model_name}_FW' for model_name in models]

In [None]:
# Plotting cluster slice performance: HR@5 per cluster, one bar per model.

# Long-format table with one row per (model, cluster); this plot uses the
# '_FW'-suffixed model names.
d = {'Model': models_FW, 'HR@10': hr10, 'HR@5': hr5, 'Cluster': cluster_indices}

df = pd.DataFrame(data=d)

# Horizontal reference lines at each model's overall top-5 categorical
# accuracy, coloured like the model's bars. x from -0.5 to 4.5 spans the five
# cluster groups.
ax = plt.gca()
ax.hlines(y=sur_results['top_5_categorical_accuracy'], xmin=-0.5, xmax=4.5,
          colors='teal', alpha=0.6)
ax.hlines(y=mur3_results['top_5_categorical_accuracy'], xmin=-0.5, xmax=4.5,
          colors='coral', alpha=0.6)
ax.hlines(y=mur5_results['top_5_categorical_accuracy'], xmin=-0.5, xmax=4.5,
          colors='steelblue', alpha=0.6)

sns.barplot(x="Cluster", y="HR@5", hue="Model", data=df, ax=ax,
            palette = {'SUR_FW': 'teal', 'MUR_3_FW': 'coral', 'MUR_5_FW': 'steelblue'})
plt.legend(loc='lower left')

In [None]:
# Plotting cluster slice performance: HR@10 per cluster, one bar per model.

# Long-format table with one row per (model, cluster), using the plain model
# names.
d = {'Model': models, 'HR@10': hr10, 'HR@5': hr5, 'Cluster': cluster_indices}

df = pd.DataFrame(data=d)

# Horizontal reference lines at each model's overall top-10 categorical
# accuracy, coloured like the model's bars. x from -0.5 to 4.5 spans the five
# cluster groups.
ax = plt.gca()
ax.hlines(y=sur_results['top_10_categorical_accuracy'], xmin=-0.5, xmax=4.5,
          colors='teal', alpha=0.6)
ax.hlines(y=mur3_results['top_10_categorical_accuracy'], xmin=-0.5, xmax=4.5,
          colors='coral', alpha=0.6)
ax.hlines(y=mur5_results['top_10_categorical_accuracy'], xmin=-0.5, xmax=4.5,
          colors='steelblue', alpha=0.6)

sns.barplot(x="Cluster", y="HR@10", hue="Model", data=df, ax=ax,
            palette = {'SUR': 'teal', 'MUR_3': 'coral', 'MUR_5': 'steelblue'})
plt.legend(loc='lower left')

In [None]:
def get_cluster_results(clusters: List[int],
                        alpha_str: str,
                        only_eval_result: bool = True,
                        models: Optional[List[str]] = None,
                        embedding_dim: int = 16) -> Dict[str, List[Any]]:
  """Collects evaluation metrics for several models over several datasets.

  For every cluster count in `clusters` the dataset
  `datasets/sparse_C{cluster}_I50_U3_N10000` is selected and
  `evaluate_results` is called once per model on the results stored under
  `results/d{embedding_dim}`. Paths are resolved against the notebook-level
  `root_dir`.

  Args:
    clusters: Cluster counts identifying the datasets to evaluate.
    alpha_str: Dataset configuration string passed on to `load_dataset` and
      `evaluate_results`.
    only_eval_result: When True the dataset is not loaded and
      `evaluate_results` receives None for the dataset arguments. When False
      the dataset is loaded first and passed along.
    models: Model names to evaluate. Defaults to ['SUR', 'MUR_5', 'MUR_7'].
    embedding_dim: Embedding size used to pick the results directory and
      reported in the 'Embedding size' column.

  Returns:
    A column-oriented dict suitable for `pd.DataFrame`: 'Model',
    'Embedding size' and 'Clusters', followed by one column per metric key
    returned by `evaluate_results` (in first-seen order). Every column has one
    entry per (cluster, model) pair; a metric that a particular evaluation did
    not report is filled with NaN so that rows stay aligned.
  """
  if models is None:
    # models = ['SUR', 'MUR_3', 'MUR_5', 'MUR_7']
    models = ['SUR', 'MUR_5', 'MUR_7']

  # One (model, cluster, metrics) entry per evaluated model.
  rows = []
  for cluster in clusters:
    dataset_path = f'datasets/sparse_C{cluster}_I50_U3_N10000'

    if only_eval_result:
      item_clusters = None
      user_interests = None
      test_cluster = None
    else:
      item_clusters, _, user_interests, test_cluster, _ = load_dataset(
          root_dir, dataset_path, alpha_str, is_npz=False)

    print (f'Getting results for {dataset_path} with {alpha_str} dataset.')
    results_dir = f'results/d{embedding_dim}'
    results_path = os.path.join(root_dir, results_dir, dataset_path)
    for model_str in models:
      result = evaluate_results(model_str, results_path, alpha_str,
                                item_clusters=item_clusters,
                                user_interests=user_interests,
                                test_cluster=test_cluster,
                                is_npz=False, only_eval_result=only_eval_result)
      rows.append((model_str, cluster, result))

  table = {
      'Model': [model_str for model_str, _, _ in rows],
      'Embedding size': [embedding_dim] * len(rows),
      'Clusters': [cluster for _, cluster, _ in rows],
  }

  # Union of all metric keys, in the order they were first reported.
  metric_keys = []
  seen_keys = set()
  for _, _, result in rows:
    for key in result:
      if key not in seen_keys:
        seen_keys.add(key)
        metric_keys.append(key)

  # Build every metric column row by row. Appending per key as results arrive
  # would give shorter, misaligned columns for metrics that only some
  # evaluations report (e.g. per-cluster metrics on datasets with different
  # cluster counts).
  missing = float('nan')
  for key in metric_keys:
    table[key] = [result[key] if key in result else missing
                  for _, _, result in rows]

  return table

# Sweep over four alpha values; for each, plot `metric_name` against the
# number of clusters with one line per model, one panel per alpha.
clusters = [20, 30, 40, 50, 60]
fig, axs = plt.subplots(1, 4, figsize=(18, 6), constrained_layout=True, 
                        sharex=True, sharey=True)
axs = axs.flat
# NOTE(review): the figure title is hard-coded and not derived from the
# dataset path used by get_cluster_results -- confirm it matches.
fig.suptitle(f"I=50, |Yu|=5, |U|=50000", fontsize=16)
# Results of every alpha, in sweep order (position i <-> i-th alpha below).
all_results = []
metric_name = 'top_100_categorical_accuracy'

for ix, alpha in enumerate(np.linspace(0.5, 0.8, 4)):
  # gamma is chosen so that alpha + gamma = 0.9.
  gamma = 0.9 - alpha
  alpha_str = 'interest-power2.0_item-power2.0_alpha{:0.1f}_gamma{:0.1f}'.format(alpha, gamma)
  results = get_cluster_results(clusters, alpha_str)
  # d_embeddings.append(d_anisotropy)
  all_results.append(results)
  
  df = pd.DataFrame(data=results)
  ax = sns.lineplot(
      x='Clusters', y=metric_name, hue="Model", data=df,
      markers=True, style="Model", ax=axs[ix])
  
  ax.legend(loc='upper center')
  # Each panel is titled with the dataset configuration string it shows.
  title = alpha_str
  ax.set_title(title)
  ax.set_ylabel(metric_name)
  ax.set_xticks(clusters)

plt.show()

In [None]:
# Two-panel version of the alpha sweep plot, re-using the results cached in
# `all_results` instead of re-evaluating.
clusters = [20, 30, 40, 50, 60]
fig, axs = plt.subplots(1, 2, figsize=(9, 4), constrained_layout=True, 
                        sharex=True, sharey=True)
axs = axs.flat
fig.suptitle("I=50, |Yu|=5, |U|=50000", fontsize=16)
metric_name = 'top_100_categorical_accuracy'
alphas = [0.5, 0.8]

# `all_results` is ordered like the alpha sweep that filled it
# (np.linspace(0.5, 0.8, 4)), so each alpha has to be mapped to its position
# in that sweep. Indexing with the panel index would show the alpha=0.6
# results under the alpha=0.8 title.
swept_alphas = np.linspace(0.5, 0.8, 4)

for ix, alpha in enumerate(alphas):
  matches = np.flatnonzero(np.isclose(swept_alphas, alpha))
  if matches.size != 1:
    raise ValueError(f'alpha={alpha} is not part of the cached alpha sweep.')
  results = all_results[int(matches[0])]

  df = pd.DataFrame(data=results)
  ax = sns.lineplot(
      x='Clusters', y=metric_name, hue="Model", data=df,
      markers=True, style="Model", ax=axs[ix])

  # Keep a single legend, on the right-hand panel.
  if ix == 1:
    ax.legend(loc='upper right')
  else:
    ax.get_legend().remove()

  ax.set_title(get_volatility_str(1-alpha))
  ax.set_ylabel('HR@100')
  ax.set_xticks(clusters)

plt.show()