In [None]:
# Licensed under the Apache License, Version 2.0 (the "License");


Author: Nikhil Mehta  
Description: Preprocessing Amazon Dataset for Training.

In [None]:
from collections import defaultdict
from collections import deque
from datetime import datetime
import gzip
import html
import json
import os
import string
from typing import Dict, List, Tuple

import bs4
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from colabtools import adhoc_import

In [None]:
# Root directory; each category's files live in a sub-directory named after it.
amazon_data_path = 'data_path/AmazonDataset/category_data/'

# Amazon review categories handled by this notebook.
categories = [
    'Kindle_Store',
    'Grocery_and_Gourmet_Food',
    'CDs_and_Vinyl',
    'Sports_and_Outdoors',
]

# File extension of the raw per-category review file.
extension = '.json'

In [None]:
# Echo the configured category list as this cell's output.
categories

In [None]:
# Helper functions
# def parse(path):
#   """Reads a json file with multiple objects, stores them as a list and returns.
  
#   Args:
#     path: Path to the json file.

#   Returns:
#     data: List of objects in the json.
#   """

#   data = []
#   with tf.io.gfile.GFile(path, 'r') as f:
#     for line in f:
#       print_function(line)
#       break
#       data.append(json.loads(line))
  
#   return data

def parse_stream(path, max_lines=100000000):
  """Lazily reads a file that holds one JSON object per line.

    Args:
      path (str): Path to the JSON-lines file; opened with tf.io.gfile.GFile.
      max_lines (int): reads a maximum of max_lines (default:100M). Used when
        files are very large. Blank lines count towards this limit.

    Yields:
      obj (JsonObject):  The object decoded from the next non-blank line.
  """

  with tf.io.gfile.GFile(path, 'r') as f:
    for line_num, line in enumerate(f):
      # line_num is 0-based, so `>=` stops after exactly max_lines lines.
      if line_num >= max_lines:
        print ('max line reached in parse_stream.')
        break

      # Tolerate blank lines (e.g. a trailing empty line), which json.loads
      # would otherwise reject.
      if not line.strip():
        continue

      yield json.loads(line)
  

In [None]:
# Reviews whose `unixReviewTime` is older than this Unix timestamp are ignored.
# The active value, 1026900400, is Wed 17 Jul 2002 10:06:40 UTC.
# A stricter alternative (Tue 01 Jan 2013 12:00:00 AM UTC) is kept for reference:
# TIME_THRESHOLD = 1356998400
TIME_THRESHOLD = 1026900400

# Users and items with total number of interactions less than this will be ignored.
MIN_INTERACTION_THRESHOLD = 5

def extract_user_item_time(raw_dataset_path: str, output_file_path: str):
  """Extracts triplets (user, item, time) and stores them in a text file.

    The raw file is streamed twice with parse_stream: the first pass counts
    interactions per user and per item, the second pass writes only the
    triplets whose user AND item both reach MIN_INTERACTION_THRESHOLD.
    Reviews older than TIME_THRESHOLD are skipped in both passes.

    Args:
      raw_dataset_path: Path to Amazon Review data (one JSON object per line
        with the keys 'reviewerID', 'asin' and 'unixReviewTime').
      output_file_path: Path of the output text file. Each line is written as
        '<user> <item> <time> ' (note the trailing space before the newline).

    Returns:
      user_interaction_count: Maps reviewerID to its number of reviews that
        are not older than TIME_THRESHOLD.
      item_interaction_count: Maps asin to its number of reviews that are not
        older than TIME_THRESHOLD.
  """

  user_interaction_count = defaultdict(int)
  item_interaction_count = defaultdict(int)
  line = 0
  largest_time = 0
  count_ignored = 0

  with tf.io.gfile.GFile(output_file_path, 'w') as fout:

    # Pass 1: get the total interaction count for items and users.
    # The input is the function argument, not a notebook-level variable.
    for obj in parse_stream(raw_dataset_path):
      if obj['unixReviewTime'] < TIME_THRESHOLD:
        continue

      user_interaction_count[obj['reviewerID']] += 1
      item_interaction_count[obj['asin']] += 1

    # Pass 2: write the triplets that survive the interaction filter.
    for obj in parse_stream(raw_dataset_path):
      timestamp = obj['unixReviewTime']
      if timestamp < TIME_THRESHOLD:
        continue

      user = obj['reviewerID']
      item = obj['asin']

      min_interaction = min(
          user_interaction_count[user], item_interaction_count[item])
      if min_interaction < MIN_INTERACTION_THRESHOLD:
        count_ignored += 1
        continue

      largest_time = max(timestamp, largest_time)

      # Writing <user item time> triplets per line.
      fout.write(" ".join([user, item, str(timestamp)]) + ' \n')
      line += 1
      if line == 1:
        print("Started writing...")
      if line % 100000 == 0:
        print("At line: ", line, " with largest time so far: ", largest_time)

    # The `with` block closes fout on exit; no explicit close() is needed.

    print(f'Users : {len(user_interaction_count)}, Items: {len(item_interaction_count)}')
    print('Total triplets ignored: {}/{}'.format(count_ignored, line+count_ignored))

  return user_interaction_count, item_interaction_count

In [None]:
# Run the (user, item, time) extraction for every configured category.
# The loop variables are plain notebook-level names, so after the loop
# `file_path`, `output_path` and both count dictionaries hold the values of the
# LAST category only; `output_path` is displayed in the next cell.
for category in categories:
  print (f'\n Category: {category}')
  # Raw reviews live at <amazon_data_path>/<category>/<category><extension>.
  file_path = os.path.join(amazon_data_path, category, category + extension)
  output_path = os.path.join(amazon_data_path, category, category + '_user_item_time.txt')
  user_interaction_count, item_interaction_count = extract_user_item_time(file_path, output_path)

In [None]:
# Show the triplet-file path computed for the last category of the loop above.
output_path

In [None]:
def generate_user_item_sequence(
    path: str
) -> Tuple[Dict[int, List[List]], Dict[str, int], Dict[str, List[int]]]:
  """Generates user item sequence from triplets (user, item, time) saved in a file.

    Users and items are assigned consecutive integer ids starting at 1, in
    order of first appearance in the file.

    Args:
      path(str): Txt File path with one whitespace-separated
        '<user> <item> <time>' triplet per line. Blank lines are skipped.

    Returns:
      user_item_seq(Dict): A dictionary that maps from user-id to a list of
        [item_id, time] pairs, sorted by time in ascending numeric order.
        `time` is kept as the string read from the file.
      user_map: A dictionary that maps user to an integer id.
      item_map: A dictionary that maps item to the list [id, popularity],
        where popularity is the number of triplets containing the item.
  """

  user_map = dict()
  item_map = dict()
  user_item_seq = dict()

  with tf.io.gfile.GFile(path, 'r') as fin:
    for line in fin:
      fields = line.split()
      if not fields:
        continue
      user, item, time = fields

      user_id = user_map.get(user)
      if user_id is None:
        user_id = len(user_map) + 1
        user_map[user] = user_id
        user_item_seq[user_id] = []

      if item in item_map:
        item_map[item][1] += 1
      else:
        item_map[item] = [len(item_map) + 1, 1]
      item_id = item_map[item][0]

      user_item_seq[user_id].append([item_id, time])

  print("Total number of users with >= {} interactions: {}".format(
      MIN_INTERACTION_THRESHOLD, len(user_map)))
  print("Total number of items with >= {} interactions: {}".format(
      MIN_INTERACTION_THRESHOLD, len(item_map)))

  # Sort reviews in user_item_seq according to time. The timestamps are
  # strings, so compare them numerically: a plain string sort would place
  # '1000000000' before '999999999'.
  for interactions in user_item_seq.values():
    interactions.sort(key=lambda entry: int(entry[1]))

  return user_item_seq, user_map, item_map


In [None]:
def sample_item_negatives(
    start, end, num_negative_samples, item_counts, skip_items=None) -> List[int]:
  """Samples negatives for each item for the standard evaluation protocol.

  Following the standard evaluation protocol followed in the literature, only
  the negative samples are used while computing metrics HR@K and NDCG@K. For
  negative sampling, only those items are considered which the user has not
  interacted with. Items are drawn without replacement through the global
  `np.random` state, with probability proportional to their engagement count.

  Args:
    start: Smallest item id to consider while sampling (inclusive).
    end: Upper bound of the item ids to consider (exclusive): candidates are
      the ids in range(start, end).
    num_negative_samples: Number of negative items to sample.
    item_counts: A dictionary mapping from item to their engagement count.
      Must contain every non-skipped id in range(start, end).
    skip_items: Iterable of items that are to be skipped while sampling. This
      is used to skip items that the user has already interacted with.

  Returns:
    negative_samples: A list consisting of negative sample items, as Python
      ints.

  Raises:
    ValueError: If fewer than num_negative_samples candidates have a non-zero
      engagement count after removing skip_items.
  """

  if num_negative_samples == 0:
    return []

  # A set makes the membership test below O(1) per candidate item.
  skipped = set(skip_items) if skip_items is not None else set()

  candidate_ids = np.arange(start, end)
  adjusted_item_counts = np.array(
      [0.0 if item_ix in skipped else item_counts[item_ix]
       for item_ix in range(start, end)], dtype=float)

  # Sampling without replacement needs at least as many items with non-zero
  # weight as requested samples; fail early with an explicit message.
  num_available = int(np.count_nonzero(adjusted_item_counts))
  if num_available < num_negative_samples:
    raise ValueError(
        'Cannot sample %d negatives: only %d candidate items have a non-zero '
        'count after skipping.' % (num_negative_samples, num_available))

  weights = adjusted_item_counts / adjusted_item_counts.sum()
  negative_samples = np.random.choice(
      candidate_ids, size=num_negative_samples, p=weights, replace=False)
  return negative_samples.tolist()


In [None]:
# Number of negative items sampled for every user.
NUM_NEGATIVE_SAMPLES = 99

# NOTE(review): `categories[1:]` skips the first configured category; confirm
# this is intentional and not a leftover from a partial re-run.
# NOTE(review): no random seed is set before sampling, so user_neg_items.txt
# differs between runs.
for category in categories[1:]:

  print(f'Processing category: {category}')
  path = os.path.join(amazon_data_path, category, category + '_user_item_time.txt')
  user_item_seq, user_map, item_map = generate_user_item_sequence(path)

  # Generate an output file with <user_id item_id time> per line.
  # `with` guarantees the handle is closed even if a write fails.
  path = os.path.join(amazon_data_path, category, 'user_item_mapped.txt')
  with tf.io.gfile.GFile(path, 'w') as f:
    for user in user_item_seq.keys():
      for item_time in user_item_seq[user]:
        # item_time: [item_id, time]
        f.write('%d %d %s\n' % (user, item_time[0], str(item_time[1])))

  path = os.path.join(amazon_data_path, category, 'item_map.json')
  with tf.io.gfile.GFile(path, 'w') as f:
    json.dump(item_map, f)

  path = os.path.join(amazon_data_path, category, 'user_map.json')
  with tf.io.gfile.GFile(path, 'w') as f:
    json.dump(user_map, f)

  # Get item counts: item_map values are [item_id, count].
  item_counts = {item_val[0]: item_val[1] for item_val in item_map.values()}

  # Store neg samples for each user, one '<user_id>: <ids...>' line per user.
  neg_path = os.path.join(amazon_data_path, category, 'user_neg_items.txt')
  with tf.io.gfile.GFile(neg_path, 'w') as neg_sample_f:
    for user in user_item_seq.keys():
      user_items = [item_time[0] for item_time in user_item_seq[user]]
      # Item ids run from 1 to len(item_map); the upper bound is exclusive.
      negative_samples = sample_item_negatives(
          1, len(item_map)+1, NUM_NEGATIVE_SAMPLES, item_counts,
          skip_items=user_items)
      neg_sample_f.write(
          '%d: %s\n' % (user, ' '.join(map(str, negative_samples))))

In [None]:
def get_item_categories(
    amazon_dataset_category: str
) -> Tuple[Dict[str, Tuple[int, int]], Dict[int, int]]:
  """Returns the category_id_map and item_category_map for analysis.

  Streams `meta_<amazon_dataset_category>.json` from the category's directory
  under `amazon_data_path` and uses the third entry (index 2) of each
  record's 'category' list as the item's category. Records without a
  'category' key are counted under the placeholder category 'NOC' (id 0);
  records whose 'category' list has fewer than three entries are skipped.

  Reads the notebook-level `item_map` to translate an asin into its integer
  item id, and calls `preprocess_category_text`, which must be defined
  elsewhere in the notebook.

  Args:
    amazon_dataset_category: Name of the Amazon dataset category; selects both
      the sub-directory and the metadata file name.

  Returns:
    category_id_map: Maps category name to (category id, number of metadata
      records assigned to it).
    item_category_map: defaultdict mapping integer item id to category id for
      every record whose asin is in `item_map`; missing ids default to 0.
  """

  meta_path = os.path.join(
      amazon_data_path, amazon_dataset_category,
      f'meta_{amazon_dataset_category}.json')

  # Ordinal tokens handed to preprocess_category_text for removal.
  remove_words = ['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th']

  category_num = 0
  no_category_query = 'NOC'

  # Key on the VALUE of no_category_query ('NOC'); dict(no_category_query=...)
  # would create the literal key 'no_category_query' instead.
  category_id_map = {no_category_query: (category_num, 0)}
  item_category_map = defaultdict(lambda: 0)
  items_without_category = 0

  for obj in parse_stream(meta_path):

    if 'category' not in obj:
      category = no_category_query
      items_without_category += 1
    else:
      if len(obj['category']) < 3:
        items_without_category += 1
        continue

      category = obj['category'][2]
      # Strip HTML markup; name the parser explicitly so the result does not
      # depend on which parsers happen to be installed.
      category = bs4.BeautifulSoup(category, 'html.parser').get_text()
      category = preprocess_category_text(category, remove_words)

    if category not in category_id_map:
      category_num += 1
      category_id_map[category] = (category_num, 1)
    else:
      category_id, category_count = category_id_map[category]
      category_id_map[category] = (category_id, category_count + 1)

    item = obj['asin']
    if item in item_map:
      # item_map values are [item_id, popularity] lists, which are unhashable;
      # key by the integer id. A bare id value is accepted as well.
      item_entry = item_map[item]
      if isinstance(item_entry, (list, tuple)):
        item_id = item_entry[0]
      else:
        item_id = item_entry
      item_category_map[item_id] = category_id_map[category][0]

  print(f'Items without category: {items_without_category}')
  return category_id_map, item_category_map

In [None]:
# NOTE(review): the only visible assignment of `items_without_category` is a
# local variable inside get_item_categories, so this cell relies on a
# notebook-level variable defined elsewhere — confirm it exists on a fresh run.
items_without_category

In [None]:
# Print one row per category (id, name, record count), ordered by category name.
sorted_category_names = sorted(category_id_map)
for k in sorted_category_names:
  print (f"{category_id_map[k][0]:<{10}} {k:<{40}}: {category_id_map[k][1]:>{20}}")

# Total of the per-category record counts stored in category_id_map.
items_with_category = sum(category_id_map[k][1] for k in sorted_category_names)

print (f"\nTotal items with category information {items_with_category}.")