***Copyright 2020 Google LLC.***

Licensed under the Apache License, Version 2.0 (the "License");

**Author:** Furkan Kocayusufoglu \\
**Term:** Summer 2020 Research Internship with Mixel/Brain \\
**Purpose:** This notebook processes raw Amazon review data into <user, item, query, time> quadruplets for a personalized retrieval task.  
**Notes:** Datasets are downloaded from http://deepyeti.ucsd.edu/jianmo/amazon/index.html  

In [None]:
import tensorflow.compat.v1 as tf
from collections import defaultdict
from datetime import datetime
import json
import string
import os

In [None]:
# *** Helper functions for raw data stream.
# Reads a json file with multiple objects line by line in a streaming fashion.
# Since some files are very large, reading stops after `max_lines` lines
# (defaults to 100M lines).
def parse_stream(path, max_lines=100000000):
  """Yields one decoded JSON object per line of the file at `path`.

  Args:
    path: Location of a file holding one JSON object per line; opened through
      tf.gfile.
    max_lines: Upper bound on the number of lines that are read and yielded.

  Yields:
    The object decoded from each line, in file order.
  """
  with tf.gfile.Open(path, 'r') as f:
    for line_num, line in enumerate(f):
      # line_num is zero-based, so `>=` stops after exactly max_lines lines
      # (a `>` comparison would let one extra line through).
      if line_num >= max_lines:
        break
      yield json.loads(line)
  print("Finished reading.")

# Loads a whole json-lines file into memory: every line is decoded as one JSON
# object and the objects are returned as a list, in file order.
def parse(path):
  with tf.gfile.Open(path, 'r') as f:
    records = [json.loads(raw_line) for raw_line in f.readlines()]
  print("Finished reading.")
  return records

# *** Helper function for text (query) processing.
# Removes punctuation and duplicate words. When a word occurs more than once,
# only its LAST occurrence is kept (i.e. earlier duplicates are dropped while
# scanning left to right). E.g., "Beverages Coffee, Tea & Cocoa, Tea Samplers"
# becomes "Beverages Coffee Cocoa Tea Samplers".
def remove_duplicate_words(text):
  """Returns `text` without punctuation and without repeated words.

  Args:
    text: Input string; words are separated by whitespace.

  Returns:
    The surviving words joined by single spaces, in their original relative
    order, keeping the last occurrence of every repeated word.
  """
  tokens = text.translate(str.maketrans('', '', string.punctuation)).split()
  # Walking the tokens right-to-left through an insertion-ordered dict keeps
  # the right-most occurrence of each word, in O(n) instead of the O(n^2) of a
  # list membership test per token.
  last_occurrences = list(dict.fromkeys(reversed(tokens)))
  return ' '.join(reversed(last_occurrences))


# *** Helper function for joining file paths. Currently not used.
# Defined at module level so that it is actually available; nested after the
# `return` of another function it would never be defined.
def join_paths(path1, path2):
  """Returns `path1` and `path2` joined with the OS path separator."""
  return os.path.join(path1, path2)

In [None]:
# ---- Global configuration ----

# Location of the raw Amazon files. The per-category input and output file
# names used by the processing cell are appended to this value.
amazon_data_path = 'path/to/your/data'

# Amazon product categories to process, one pass of the pipeline per entry.
categories = [
  'Sports_and_Outdoors', 'Kindle_Store', 'CDs_and_Vinyl',
  'Grocery_and_Gourmet_Food', 'Movies_and_TV', 'Video_Games',
  'Pet_Supplies', 'Luxury_Beauty', 'Electronics',
]

# Minimum number of interactions a user / an item must have to be kept.
min_interaction_threshold = 5

# Reviews with a unix timestamp earlier than this are dropped.
# 1356998400 == Tue 01 Jan 2013 00:00:00 UTC.
time_threshold = 1356998400

In [None]:
# For each category:
# 1. Read the reviews data and dump <user item time> triplets to a scratch file.
# 2. Filter users and items with less than `min_interaction_threshold` interactions.
# 3. Extract synthetic queries for each item using the category information from metadata.
# 4. Map users, items and queries to integer identifiers, starting from 1.
# 5. Store <user_id, item_id, query_id, time> quadruplets in file (each user's
#    interactions sorted by time).
for category_name in categories:

  # os.path.join supplies the path separator when amazon_data_path has no
  # trailing slash; plain string concatenation would glue the directory name
  # and the file name together.
  reviews_path = os.path.join(amazon_data_path, '{}.json'.format(category_name))
  metadata_path = os.path.join(amazon_data_path, 'meta_{}.json'.format(category_name))
  triplets_path = os.path.join(
      amazon_data_path, '{}_user_item_time.txt'.format(category_name))
  quadruplets_path = os.path.join(
      amazon_data_path, '{}_user_item_query_time_mapped.txt'.format(category_name))
  item_map_path = os.path.join(amazon_data_path, '{}_itemmap.json'.format(category_name))
  user_map_path = os.path.join(amazon_data_path, '{}_usermap.json'.format(category_name))
  query_map_path = os.path.join(amazon_data_path, '{}_querymap.json'.format(category_name))

  user_interaction_count = defaultdict(int)
  item_interaction_count = defaultdict(int)
  num_written = 0
  largest_time = 0
  print("*** Category: ", category_name)

  # Pass 1: stream the reviews, count interactions per user/item and write the
  # time-filtered <user item time> triplets to a scratch file. The `with` block
  # guarantees the file is closed even if a record fails to parse.
  with tf.gfile.Open(triplets_path, 'w') as f:
    for obj in parse_stream(reviews_path):
      time = obj['unixReviewTime']
      if time < time_threshold:
        continue
      user = obj['reviewerID']
      item = obj['asin']
      largest_time = max(time, largest_time)
      user_interaction_count[user] += 1
      item_interaction_count[item] += 1

      # Writing <user item time> triplets per line.
      f.write(" ".join([user, item, str(time)]) + ' \n')
      num_written += 1
      if num_written == 1:
        print("Started writing...")
      if num_written % 100000 == 0:
        print("At line: ", num_written, " with largest time so far: ", largest_time)

  user_map = dict()
  user_num = 0
  item_map = dict()
  item_num = 0
  user_item_seq = dict()
  count_lt5 = 0

  # Pass 2: re-read the triplets, drop interactions whose user or item is below
  # the interaction threshold, and assign 1-based integer ids on first sight.
  with tf.gfile.Open(triplets_path, 'r') as f:
    for line in f:
      user, item, time = line.rstrip().split(' ')
      if min(user_interaction_count[user], item_interaction_count[item]) < min_interaction_threshold:
        count_lt5 += 1
        continue

      if user not in user_map:
        user_num += 1
        user_map[user] = user_num
        user_item_seq[user_num] = []
      user_id = user_map[user]

      if item not in item_map:
        item_num += 1
        item_map[item] = item_num
      item_id = item_map[item]

      # Keep the timestamp as an int so the sort below is numeric; sorting the
      # raw strings is only correct while all timestamps have the same number
      # of digits. Assumes timestamps were written as plain integers in pass 1.
      user_item_seq[user_id].append([item_id, int(time)])

  print("Total number of users with >= 5 interactions: ", user_num)
  print("Total number of items with >= 5 interactions: ", item_num)
  print("Number of times we skipped an item (or user): ", count_lt5)

  # Sort reviews in user_item_seq according to time
  for interactions in user_item_seq.values():
    interactions.sort(key=lambda x: x[1])

  # Extract queries for each item.
  item_query_id_map = dict()
  query_num = 0
  query_id_map = dict()
  id_query_map = dict()

  # Keep a special query for items without category information.
  no_category_query = 'NOC'
  query_num += 1
  query_id_map[no_category_query] = query_num
  id_query_map[query_num] = no_category_query

  num_matched = 0
  for obj in parse_stream(metadata_path):
    item = obj['asin']
    # Item needs to be in the selected group. Checked first so that no query
    # string is built for metadata rows that are discarded anyway.
    if item not in item_map:
      continue

    if 'category' not in obj:
      query = no_category_query
    else:
      # NOTE(review): an empty category list produces an empty-string query
      # rather than the NOC query -- confirm this is intended.
      query = remove_duplicate_words(" ".join(obj['category']))

    num_matched += 1
    if num_matched == 1:
      print("Started reading metadata...")
    if num_matched % 100000 == 0:
      print("At line: ", num_matched, len(query_id_map), len(id_query_map)) # Last two should match

    if query in query_id_map:
      query_id = query_id_map[query]
    else:
      query_num += 1
      query_id = query_num
      query_id_map[query] = query_id
      id_query_map[query_id] = query

    item_query_id_map[item_map[item]] = query_id

  # Generate an output file with <user_id item_id query_id time> per line.
  with tf.gfile.Open(quadruplets_path, 'w') as f:
    for user_id, interactions in user_item_seq.items():
      for item_id, time in interactions:
        # A safety check in case any of items aren't associated with a query.
        # This could happen if metadata is missing an item.
        if item_id not in item_query_id_map:
          continue
        f.write('%d %d %d %s\n' % (user_id, item_id, item_query_id_map[item_id], str(time)))

  # Store item, user and query mapping dicts for interpretability/debug purposes.
  # Maps have the following formats:
  # User -> user: user_id
  # Item -> item: item_id
  # Query -> query_id: query
  with tf.gfile.Open(item_map_path, 'w') as f:
    json.dump(item_map, f)
  with tf.gfile.Open(user_map_path, 'w') as f:
    json.dump(user_map, f)
  with tf.gfile.Open(query_map_path, 'w') as f:
    json.dump(id_query_map, f)

  print("Finished category: ", category_name)