***Copyright 2020 Google LLC.***

Licensed under the Apache License, Version 2.0 (the "License");

Author: Furkan Kocayusufoglu  
Term: Summer 2020 Research Internship with Mixel/Brain  
Purpose: This notebook analyzes user interaction histories and their temporal resolution, i.e. how the interactions within each history are spread over time.

In [None]:
import tensorflow.compat.v1 as tf
from collections import defaultdict
from datetime import datetime
import copy
import random
import json
import string
import os
import numpy as np
import itertools
import matplotlib. pyplot as plt 

In [None]:
""" Read data. For each label, we store a dictionary of users where 
key:user_id and value: list of <item id, timestamp> tuples."""
# Directory that holds the processed '<category>_user_item_query_time_mapped.txt' files.
base_dir = 'path/to/your/processed/data'
dataset_categories = ['Kindle_Store', 'CDs_and_Vinyl', 'Grocery_and_Gourmet_Food', 'Movies_and_TV', 'Video_Games', 'Pet_Supplies']
labels=['Kindle', 'CDs', 'Food', 'Movies', 'Games', 'Pets']
data = {label: []  for label in labels}
# user_lists[label][user_id] is that user's list of [item_id, timestamp] pairs,
# kept in the order the lines appear in the file.
user_lists = {label: defaultdict(list) for label in labels}
for dataset_category, label in zip(dataset_categories, labels):
  # os.path.join supplies the path separator when base_dir has no trailing
  # slash; plain string concatenation would glue the directory name and the
  # file name together.
  dataset_path = os.path.join(
      base_dir, '{}_user_item_query_time_mapped.txt'.format(dataset_category))
  # Read user,item,query,time
  with tf.gfile.Open(dataset_path, "r") as f:
    for line in f:
      if not line.strip():
        continue  # Skip blank lines (e.g. a trailing newline at end of file).
      u, i, _, t = [int(x) for x in line.rstrip().split(" ")]  # Ignores query.
      user_lists[label][u].append([i, t])

  print("Category {} is done. Number of users = {}".format(dataset_category, len(user_lists[label])))

In [None]:
""" Helper functions for timestamp."""
def convert_to_date(timestamp):
  """Format a POSIX timestamp as an 'MM/DD/YYYY' string.

  The timestamp is interpreted by `datetime.fromtimestamp`, i.e. in the local
  timezone of the machine running the notebook. The time of day is dropped.
  """
  return datetime.fromtimestamp(timestamp).strftime("%m/%d/%Y")

def time_delta(ts1, ts2):
  """Return the number of whole days between two POSIX timestamps.

  The arguments may be given in either order; the later one is always used as
  the minuend, so the result is never negative.
  """
  earlier, later = sorted((ts1, ts2))
  elapsed = datetime.fromtimestamp(later) - datetime.fromtimestamp(earlier)
  return elapsed.days

In [None]:
""" Sampling random users from Kindle category for visual inspection."""
# Print the history of 10 random user ids drawn from 1..999 as
# (item id, formatted date, raw timestamp) triples.
for idx in random.sample(range(1, 1000), 10):
  # user_lists['Kindle'] is a defaultdict: indexing it with an id that does not
  # exist would silently insert an empty history for that id. .get() only reads,
  # so an unknown id prints [] without adding a phantom user.
  print([(i, convert_to_date(t), t) for i, t in user_lists['Kindle'].get(idx, [])])

In [None]:
""" Plotting temporal resolution of the user history. """
num_users_sample = 100
user_len_sample = 20
# Candidate histories are those with exactly user_len_sample interactions.
# Iterating over .values() (rather than indexing the defaultdict with random
# ids) never inserts empty histories for ids that do not exist.
candidate_users = [items for items in user_lists['Kindle'].values()
                   if len(items) == user_len_sample]
# Sample without replacement, capped at the number of candidates, so the cell
# always terminates and no user is picked twice.
sampled_users = random.sample(
    candidate_users, min(num_users_sample, len(candidate_users)))

num_users_plot = 5
plot_users = random.sample(
    sampled_users, min(num_users_plot, len(sampled_users)))
# One y-position per interaction other than the user's last one.
bins = list(range(1, user_len_sample))

plt.figure(figsize=(10, 8))
plt.xlabel('Time period (days)', fontsize=20)
plt.xticks(fontsize=20)
plt.ylabel('User interactions', fontsize=20)
plt.yticks(bins, fontsize=20)

for idx, u in enumerate(plot_users):
  # Measure each earlier interaction against this user's own last interaction
  # (not against the last interaction of the last plotted user).
  last_t = u[-1][1]
  time_deltas = sorted(time_delta(last_t, t) for _, t in u[:-1])
  plt.scatter(time_deltas, bins, label = 'user {}'.format(idx+1))

plt.legend()

In [None]:
""" Compute the average time spent between two consecutive items. """
# days_list[label] collects, over all users of that category, the gap in days
# between every pair of consecutive items in a user's history.
days_list = {label: [] for label in labels}
for label in labels:
  gaps = days_list[label]
  for history in user_lists[label].values():
    following = history[-1]
    # Walk the history from the second-to-last item back to the first, pairing
    # each item with the one that comes right after it.
    for current in history[-2::-1]:
      gaps.append(time_delta(following[1], current[1]))
      following = current

  print("Mean time spent between two consecutive items {} for {} (in days)".format(np.mean(days_list[label]), label))
  print("Median time spent between two consecutive items {} for {} (in days)".format(np.median(days_list[label]), label))

In [None]:
""" Compute the cumulative ratio of items vs time window (in days) with respect
 to last item per user. """
max_day = 365*2  # Consider up to 2 years
# days_dict[label][d] holds one value per user: the fraction of that user's
# earlier items that lie at most d days before the user's last item.
days_dict  = {label: defaultdict(list) for label in labels}
for label in labels:
  per_day_values = days_dict[label]
  for history in user_lists[label].values():
    anchor_ts = history[-1][1]
    num_earlier = len(history) - 1
    # fraction_at[d]: share of the earlier items exactly d days before the last.
    fraction_at = [0] * max_day
    for entry in history[-2::-1]:
      offset = time_delta(anchor_ts, entry[1])
      # Stop at the first item outside the [0, max_day) window; presumably
      # histories are time-ordered so all remaining items are older -- TODO confirm.
      if not 0 <= offset < max_day:
        break
      fraction_at[offset] += 1/num_earlier

    # Running sum over days turns the per-day shares into a cumulative ratio.
    for day, cumulative in enumerate(itertools.accumulate(fraction_at)):
      per_day_values[day].append(cumulative)

In [None]:
# Sanity check: every user appends exactly one value to each day bucket, so the
# buckets for day 0, 100 and 500 should all have the same length.
len(days_dict['Pets'][0]) == len(days_dict['Pets'][100]) == len(days_dict['Pets'][500])

In [None]:
""" Compute the (non-overlapping) segment densities with respect to different 
    segment allocations. 
"""

def _segment_boundary_exp(b, i):
  """ Computes the end of segment i wrt to exponential with base b. The formula simply is: 
      t = b^i
  """
  return b**i

def _segment_boundary_pow(b, i):
  """ Computes the end of segment i wrt to power law with base b. The formula simply is: 
      t = i**b
  """
  return i**b

def _segment_boundary_lin(b, i):
  """ Computes the end of segment i wrt to linear with base b. The formula simply is: 
      t = b*i
  """
  return b*i

In [None]:
bases_exp = [3, 4, 5, 8]
num_segments = 10
max_day = 365*2  # Consider up to 2 years.
# segment_analysis_dict_exp[label][base] is an array of length num_segments:
# the per-user average share of earlier items that falls in each segment when
# segment ends are spaced exponentially with the given base.
segment_analysis_dict_exp = {label:{} for label in labels}
for label in labels:
  histories = user_lists[label]
  for base in bases_exp:
    segment_boundaries = [_segment_boundary_exp(base, k) for k in range(num_segments)]
    segment_densities = [0] * num_segments
    final_segment = num_segments - 1
    for history in histories.values():
      anchor_ts = history[-1][1]
      num_earlier = len(history) - 1
      # The segment index only moves forward while scanning one user's history
      # from the most recent earlier item back to the first.
      segment = 0
      for entry in history[-2::-1]:
        gap = time_delta(anchor_ts, entry[1])
        if gap > max_day:
          break
        # Advance to the first segment whose end covers this gap; anything
        # past the last boundary is counted in the last segment.
        while segment < final_segment and gap > segment_boundaries[segment]:
          segment += 1

        segment_densities[segment] += 1/num_earlier

    segment_analysis_dict_exp[label][base] = np.array(segment_densities) /len(histories)