In [None]:
# Licensed under the Apache License, Version 2.0 (the "License");

Author: Nikhil Mehta  
Description: Preprocessing MovieLens Dataset for Training.


In [None]:
import bs4
import collections
from datetime import datetime
import gzip
import html
import json
import logging
import os
import pprint
import string
from typing import Any, Dict, List, Text, Tuple

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

from colabtools import adhoc_import

In [None]:
# TFDS MovieLens variant to preprocess. The same string is used below to build
# the TFDS config names and the output sub-directory.
dataset_name = 'movielens/100k'

In [None]:
# Load the "train" split of the two TFDS configs derived from `dataset_name`.
# Per-rating records.
ratings = tfds.load(name=dataset_name + "-ratings", split="train")
# Per-movie feature records for all available movies.
movies = tfds.load(name=dataset_name + "-movies", split="train")

In [None]:
# Materialize both datasets in memory as lists of NumPy-valued records so they
# can be indexed and traversed repeatedly.
movies_np_iterator = [
    movie_record for movie_record in movies.as_numpy_iterator()]
ratings_np_iterator = [
    rating_record for rating_record in ratings.as_numpy_iterator()]

In [None]:
# Number of rating records and number of movie records that were loaded.
len(ratings_np_iterator), len(movies_np_iterator)

In [None]:
# Peek at the first rating record to see which features are available.
ratings_np_iterator[0]

In [None]:
# Build lookup tables from the movie feature records:
#   movie_to_genre: movie id -> list of genre ids
#   genre_to_movie: genre id -> list of movie ids
#   movie_titles:   movie id -> title
movie_to_genre = dict()
genre_to_movie = collections.defaultdict(list)
genre_set = []
movie_titles = dict()

for movie in movies_np_iterator:
  movie_id = int(movie['movie_id'])
  movie_genres = list(map(int, movie['movie_genres']))

  # The original line here was the incomplete statement `movie_titles[]`
  # (a syntax error). Filling the title map is the presumed intent.
  # NOTE(review): assumes the record exposes a bytes-valued 'movie_title'
  # feature, as documented for the TFDS MovieLens "-movies" configs; confirm.
  movie_titles[movie_id] = movie['movie_title'].decode('utf-8')

  movie_to_genre[movie_id] = movie_genres
  genre_set.extend(movie_genres)

  for genre in movie_genres:
    genre_to_movie[genre].append(movie_id)

# Distinct genre ids, sorted so the displayed order is deterministic.
genre_set = sorted(set(genre_set))

In [None]:
# Distinct genre ids found across all movies.
genre_set

In [None]:
# Number of movies tagged with genre id 0.
len(genre_to_movie[0])

In [None]:
# Count how many ratings each user gave and how many each movie received.
# Missing keys default to 0.
user_interaction_count = collections.defaultdict(int)
item_interaction_count = collections.defaultdict(int)

for rating_record in ratings.as_numpy_iterator():
  rated_movie = int(rating_record['movie_id'])
  rating_user = int(rating_record['user_id'])
  user_interaction_count[rating_user] += 1
  item_interaction_count[rated_movie] += 1

In [None]:
# Prepare (user, movie, timestamp) triplets.
#
# A rating is kept only when both its user and its movie have at least
# MIN_INTERACTION_THRESHOLD interactions in the full ratings data. Dropped
# ratings are counted in `count_ignored`.
MIN_INTERACTION_THRESHOLD = 5

movie_arr = []
user_arr = []
timestamp_arr = []
genre_arr = []
user_interests = collections.defaultdict(list)

count_ignored = 0

# Reuse the ratings already materialized in `ratings_np_iterator` instead of
# iterating the TF dataset a second time.
for rating in ratings_np_iterator:

  movie_id = int(rating['movie_id'])
  user_id = int(rating['user_id'])
  min_interaction = min(user_interaction_count[user_id],
                        item_interaction_count[movie_id])

  if min_interaction < MIN_INTERACTION_THRESHOLD:
    # Only count the dropped rating; a single summary is printed after the
    # loop instead of two lines per dropped rating.
    count_ignored += 1
    continue

  user_arr.append(user_id)
  movie_arr.append(movie_id)
  timestamp_arr.append(int(rating['timestamp']))

  # Get genres and user_interests for demonstrating sparsity in data.
  # Genre ids are converted to plain ints, matching `movie_to_genre`.
  rating_genres = list(map(int, rating['movie_genres']))
  genre_arr.extend(rating_genres)
  user_interests[user_id].extend(rating_genres)

print(f'Ignored {count_ignored} of {len(ratings_np_iterator)} ratings '
      f'(user or movie below {MIN_INTERACTION_THRESHOLD} interactions).')

In [None]:
# Tally how often each movie, user and genre occurs among the kept ratings.
movie_counter, user_counter, genre_counter = (
    collections.Counter(values)
    for values in (movie_arr, user_arr, genre_arr))

## Item Frequency

In [None]:
# Normalized rating frequency per movie, plotted from most to least frequent.
movie_count = np.array(list(movie_counter.values()))
movie_count = movie_count / np.sum(movie_count)
# The bars are sorted by frequency, so the x position is a popularity rank and
# not a movie id; the axis label says so.
plt.bar(range(len(movie_counter)), sorted(movie_count, reverse=True))
plt.title('MovieLens: Movie frequency distribution.', fontsize=14)
plt.xlabel('Movie rank (most to least frequent)', fontsize=12)
plt.ylabel('Normalized frequency', fontsize=12)
plt.show()

## Genre Frequency

In [None]:
# Normalized frequency per genre, plotted from most to least frequent.
# most_common() yields (genre id, count) pairs in descending count order, which
# keeps each bar paired with the genre id it belongs to.
genre_ranking = genre_counter.most_common()
genre_ids = [int(genre) for genre, _ in genre_ranking]
genre_count = np.array([count for _, count in genre_ranking])
genre_count = genre_count / np.sum(genre_count)
genres = np.arange(len(genre_counter), dtype=int)
plt.bar(genres, genre_count)
plt.title('MovieLens: Genre frequency distribution.', fontsize=14)
# Label each bar with the real genre id; labelling the sorted bars 0..n-1 would
# attribute frequencies to the wrong genres.
plt.xticks(genres, [str(genre_id) for genre_id in genre_ids])
plt.xlabel('Genre ID', fontsize=12)
plt.ylabel('Normalized frequency', fontsize=12)
plt.show()

## Check whether top items belong to top genres

In [None]:
N = 100

genre_set = list(set(genre_arr))
num_genres = 20
top_N_movies = movie_counter.most_common(N)

# Binary (genre x movie) matrix: entry [g, j] is 1 when the j-th most frequent
# movie is tagged with genre id g.
ordered_movie_genre_matrix = np.zeros(shape=(num_genres, N))

for column, (top_movie_id, _) in enumerate(top_N_movies):
  ordered_movie_genre_matrix[movie_to_genre[top_movie_id], column] = 1

fig = plt.figure(figsize=(12, 5))
plt.imshow(ordered_movie_genre_matrix)
plt.show()

## Distribution of user_interests

In [None]:
INTEREST_THRESHOLD = 12

# Process user_interests (user id -> genres of the movies the user rated).
# For every user, keep only the genres that occurred at least
# INTEREST_THRESHOLD times, then count across users how often each genre was
# kept.
all_interests = []
for watched_genres in user_interests.values():
  genre_frequencies = collections.Counter(watched_genres).most_common()
  # most_common() is sorted by descending count, so filtering on the threshold
  # keeps exactly the leading run of sufficiently frequent genres.
  all_interests.extend(
      genre for genre, frequency in genre_frequencies
      if frequency >= INTEREST_THRESHOLD)

user_interest_counter = collections.Counter(all_interests)

In [None]:
# For each genre, the number of users for whom it met INTEREST_THRESHOLD.
user_interest_counter

In [None]:
# Per-genre user counts, drawn from the largest to the smallest.
interest_frequencies = sorted(user_interest_counter.values(), reverse=True)
plt.bar(range(len(interest_frequencies)), interest_frequencies)
plt.show()

## Check what the distribution of movies looks like within a genre.

In [None]:
# Number of kept ratings per movie id.
movie_counter

In [None]:
# The original cell held the incomplete expression `movie_`, which raises
# NameError on Run All. NOTE(review): the intent is not recoverable from the
# notebook; previewing the most frequently rated movies is a guess.
movie_counter.most_common(10)

In [None]:
genre_ix = 0
# Rating count of every movie tagged with the chosen genre; movies with no
# kept ratings contribute 0.
movies_conditioned_on_genre = [
    movie_counter.get(genre_movie_id, 0)
    for genre_movie_id in genre_to_movie[genre_ix]]

plt.title(f'MovieLens: Movies in genre {genre_ix+1}', fontsize=14)
plt.xlabel('Movies', fontsize=12)
plt.ylabel('Normalized frequency', fontsize=12)

# Normalize the counts so they sum to 1, then draw them in descending order.
genre_total = np.sum(movies_conditioned_on_genre)
movies_conditioned_on_genre = np.asarray(movies_conditioned_on_genre) / genre_total
plt.bar(range(len(movies_conditioned_on_genre)),
        sorted(movies_conditioned_on_genre, reverse=True))

plt.show()

In [None]:
# Reference curve: x ** -0.5 for x = 1..10, drawn at positions 0..9.
plt.bar(np.arange(10), np.power(np.arange(1, 11), -0.5))
plt.show()

In [None]:
interests = np.arange(10)
interest_power = 2.0

# Power-law weights (i + 1) ** -interest_power, normalized to sum to 1.
prob = np.power(interests + 1, -1.0 * interest_power)
prob = prob / np.sum(prob)

plt.bar(interests, prob)
plt.show()

# Save triplets in txt file

In [None]:
# Output directory for every file written below.
root_data = f'/data_path/{dataset_name}'
# The tf.io.gfile function is `makedirs` (all lowercase); `makeDirs` does not
# exist and raised AttributeError.
tf.io.gfile.makedirs(root_data)

In [None]:
# Zip the parallel arrays into (user, movie, timestamp) triplets and show how
# many ratings were dropped next to how many were kept.
user_movie_time = list(zip(user_arr, movie_arr, timestamp_arr))
count_ignored, len(user_movie_time)

In [None]:
def generate_user_item_sequence(
    user_item_time_triplets
) -> Tuple[Dict[int, List[List[int]]], Dict[Any, int], Dict[Any, List[int]]]:
  """Builds time-sorted item sequences per user from (user, item, time) triplets.

  Users and items are re-indexed with consecutive integer ids starting at 1,
  assigned in order of first appearance in `user_item_time_triplets`.

  Args:
    user_item_time_triplets: Iterable of (user, item, time) triplets.

  Returns:
    A tuple (user_item_seq, user_map, item_map):
      user_item_seq: Maps the new user id to a list of [item_id, time] pairs,
        sorted by time. Pairs with equal time keep their input order.
      user_map: Maps the original user to its new integer id.
      item_map: Maps the original item to a two-element list
        [item_id, count], where count is the number of triplets that contain
        the item.
  """

  user_map = dict()
  item_map = dict()
  user_item_seq = dict()

  for user, item, time in user_item_time_triplets:
    if user not in user_map:
      # First time this user is seen: hand out the next id.
      user_map[user] = len(user_map) + 1
      user_item_seq[user_map[user]] = []
    user_id = user_map[user]

    if item in item_map:
      item_map[item][1] += 1
    else:
      # First time this item is seen: next id, count starts at 1.
      item_map[item] = [len(item_map) + 1, 1]
    item_id = item_map[item][0]

    user_item_seq[user_id].append([item_id, time])

  # MIN_INTERACTION_THRESHOLD is the notebook-level constant that was used to
  # filter the triplets; it is only reported here, not applied.
  print("Total number of users with >= {} interactions: {}".format(
      MIN_INTERACTION_THRESHOLD, len(user_map)))
  print("Total number of items with >= {} interactions: {}".format(
      MIN_INTERACTION_THRESHOLD, len(item_map)))

  # Sort each user's interactions according to time.
  for interactions in user_item_seq.values():
    interactions.sort(key=lambda pair: pair[1])

  return user_item_seq, user_map, item_map

In [None]:
# Re-index users and movies and build each user's time-sorted movie sequence.
user_movie_sequences, user_map, movie_map = generate_user_item_sequence(user_movie_time)

In [None]:
# Show every user's time-sorted list of [item_id, timestamp] pairs.
user_movie_sequences.values()


In [None]:
# Average number of kept interactions per user.
np.mean([len(seq) for seq in user_movie_sequences.values()])

In [None]:
def sample_item_negatives(
    start, end, num_negative_samples, item_counts, skip_items=None) -> List[int]:
  """Samples negative items for the standard evaluation protocol.

  Following the standard evaluation protocol followed in the literature, only
  the negative samples are used while computing metrics HR@K and NDCG@K. For
  negative sampling, only those items are considered which the user has not
  interacted with.

  Items are drawn without replacement from the ids in [start, end), with
  probability proportional to their engagement count. Items in `skip_items`
  get probability zero.

  Args:
    start: Smallest item id to consider while sampling (inclusive).
    end: Upper bound of the item ids to consider (exclusive), i.e. the largest
      candidate id is `end - 1`.
    num_negative_samples: Number of negative items to sample.
    item_counts: A mapping from item id to its engagement count. It must
      contain every id in [start, end) that is not skipped.
    skip_items: Optional collection of item ids that are to be skipped while
      sampling. This is used to skip items that the user has already
      interacted with.

  Returns:
    negative_samples: A list consisting of negative sample items.

  Raises:
    ValueError: If no candidate has a positive count, or (raised by
      `np.random.choice`) if fewer candidates have a positive count than
      `num_negative_samples`.
  """
  # A set makes each membership test O(1); the original list lookup was
  # O(len(skip_items)) per candidate item.
  skipped = set(skip_items) if skip_items is not None else set()

  adjusted_item_counts = np.array(
      [0.0 if item_ix in skipped else item_counts[item_ix]
       for item_ix in range(start, end)], dtype=float)

  total_count = adjusted_item_counts.sum()
  if total_count <= 0:
    # Dividing by zero would hand NaN probabilities to np.random.choice, which
    # fails with a far less helpful message.
    raise ValueError(
        'No item in [%d, %d) has a positive count after skipping.'
        % (start, end))

  weights = adjusted_item_counts / total_count
  return list(np.random.choice(
      np.arange(start, end), size=num_negative_samples, p=weights, replace=False))

In [None]:
# Write one "<user_id> <item_id> <timestamp>" line per interaction, using the
# re-mapped ids and each user's time-sorted order.
path = os.path.join(root_data, 'user_item_mapped.txt')
# `with` closes the file even if a write fails; the original explicit close()
# was skipped on error.
with tf.io.gfile.GFile(path, 'w') as f:
  for user, item_times in user_movie_sequences.items():
    for item_id, timestamp in item_times:
      f.write('%d %d %s\n' % (user, item_id, str(timestamp)))

# Original movie id -> [mapped id, interaction count].
path = os.path.join(root_data, 'item_map.json')
with tf.io.gfile.GFile(path, 'w') as f:
  json.dump(movie_map, f)

# Original user id -> mapped id.
path = os.path.join(root_data, 'user_map.json')
with tf.io.gfile.GFile(path, 'w') as f:
  json.dump(user_map, f)

In [None]:
# Number of negative items sampled for every user.
NUM_NEGATIVE_SAMPLES = 99

# Get item counts: mapped item id -> interaction count.
item_counts = {mapped_id: count for mapped_id, count in movie_map.values()}

# Store neg samples for each user, one "<user_id>: <item_id_1> ... <item_id_N>"
# line per user.
# NOTE(review): sampling draws from NumPy's global RNG and no seed is set in
# this notebook, so the file differs between runs.
neg_path = os.path.join(root_data, 'user_neg_items.txt')
# `with` closes the file even if sampling or writing fails part-way through.
with tf.io.gfile.GFile(neg_path, 'w') as neg_sample_f:
  for user, item_times in user_movie_sequences.items():
    # Items the user already interacted with must not be drawn as negatives.
    user_items = [item_time[0] for item_time in item_times]
    negative_samples = sample_item_negatives(
        1, len(movie_map)+1, NUM_NEGATIVE_SAMPLES, item_counts,
        skip_items=user_items)
    neg_sample_f.write(
        '%d: %s\n' % (user, ' '.join(map(str, negative_samples))))

In [None]:
def load_amazon_category_data(dataset_dir: str,
                              max_seq_size: int = 30,
                              stride: int = 5) -> Dict[Text, Any]:
  """Loads a preprocessed user-item interaction dataset from `dataset_dir`.

  Args:
    dataset_dir: Path to preprocessed data directory. The dir should have the
      user_item_mapped.txt file and user_neg_items.txt. The
      user_item_mapped.txt file should contain <user_id> <item_id> <timestamp>
      in each line, and the user_neg_items.txt file should contain negative
      items for each user, such that each line contains:
      <user_id>: <item_id_1> <item_id_2>...<item_id_N>, where N is the total
      negative items sampled.
    max_seq_size: Max sequence size. Sequence of length less than max_seq_size,
      is padded with zeros from left. Longer sequences are split into
      overlapping windows of exactly max_seq_size items.
    stride: Offset between the starts of consecutive windows when a long
      sequence is split. Must be positive.

  Returns:
    dataset: A dictionary with the keys
      'user_negative_items': one list of negative item ids per user listed in
        user_neg_items.txt.
      'user_item_sequences': item sequences of the same users, after padding
        or windowing.
      'num_items' / 'num_users': largest item / user id seen in
        user_item_mapped.txt.
      'items': range(1, num_items + 1).
      'max_seq_size': the `max_seq_size` argument.

  Raises:
    ValueError: If `stride` is not positive, or if a non-blank line of either
      file does not have the expected format.
  """
  if stride < 1:
    raise ValueError('stride must be a positive integer, got %r.' % (stride,))

  logging.info('Loading data from %s.', dataset_dir)
  num_users = 0
  num_items = 0
  user_list = collections.defaultdict(list)
  user_neg_item_dict = collections.defaultdict(list)

  dataset_path = os.path.join(dataset_dir, 'user_item_mapped.txt')
  negative_items_path = os.path.join(dataset_dir, 'user_neg_items.txt')

  with tf.io.gfile.GFile(dataset_path, 'r') as fin:
    for line in fin:
      fields = line.split()
      if not fields:
        # Tolerate blank lines, e.g. a trailing newline at the end of the file.
        continue
      user_id, item_id, _ = fields
      user_id = int(user_id)
      item_id = int(item_id)
      num_users = max(user_id, num_users)
      num_items = max(item_id, num_items)
      user_list[user_id].append(item_id)

  logging.info('Num users: %d, Num items: %d', num_users, num_items)

  # Left-pad short sequences with the padding id 0 up to max_seq_size.
  for user, item_seq in user_list.items():
    missing = max_seq_size - len(item_seq)
    if missing > 0:
      user_list[user] = [0] * missing + item_seq

  with tf.io.gfile.GFile(negative_items_path, 'r') as fin:
    for line in fin:
      if not line.strip():
        continue
      user_id, items = line.rstrip().split(':')
      # split() without a separator yields [] for an empty item list, where
      # split(' ') yielded [''] and made int('') fail.
      user_neg_item_dict[int(user_id)] = list(map(int, items.split()))

  user_neg_items = []
  user_item_sequences = []
  # NOTE(review): a long sequence contributes several windows to
  # user_item_sequences but only one entry to user_neg_items, so the two lists
  # are not index-aligned in that case. Confirm that consumers expect this.
  for user_id, negative_items in user_neg_item_dict.items():
    user_neg_items.append(negative_items)
    item_seq = user_list[user_id]

    if len(item_seq) > max_seq_size:
      # Split the sequence into multiple overlapping windows...
      last_start = len(item_seq) - max_seq_size
      for start_ix in range(0, last_start, stride):
        user_item_sequences.append(item_seq[start_ix:start_ix+max_seq_size])
      # ...and always include the window that ends at the most recent item.
      user_item_sequences.append(item_seq[last_start:])
    else:
      user_item_sequences.append(item_seq)

  dataset = {
      'user_negative_items': user_neg_items,
      'user_item_sequences': user_item_sequences,
      'num_items': num_items,
      'num_users': num_users,
      'items': range(1, num_items + 1),
      'max_seq_size': max_seq_size,
  }
  logging.info('Data loaded from %s.', dataset_dir)

  return dataset

In [None]:
# Read back the files written above, using the default max_seq_size.
data = load_amazon_category_data(dataset_dir = root_data)

In [None]:
# Convert the loaded sequences to a NumPy array and show how many there are.
user_item_sequences = np.array(data['user_item_sequences'])
user_item_sequences.shape[0]

In [None]:
# Shape of the sequence array.
user_item_sequences.shape

In [None]:
# Take the last item of every sequence as the "test" next item and the
# second-to-last item as the "train" next item.
train_next_item = [window[-2] for window in user_item_sequences]
test_next_item = [window[-1] for window in user_item_sequences]

In [None]:
# Frequency of each item among the train and the test next items.
train_counter, test_counter = (
    collections.Counter(next_items)
    for next_items in (train_next_item, test_next_item))

In [None]:
# Number of distinct items among the train next items and the test next items.
len(train_counter), len(test_counter)

In [None]:
# `train_counter` and `test_counter` are keyed by the re-mapped item ids stored
# in user_item_mapped.txt, i.e. the first element of every `movie_map` value.
# The keys of `movie_map` are the original MovieLens ids, so looking the
# counters up with them matched the wrong movies.
movie_id_list = sorted(mapped_id for mapped_id, _ in movie_map.values())

# Share of train / test next items per mapped item id (0 for unseen items).
train_count = np.array([train_counter.get(movie_id, 0) for movie_id in movie_id_list])
train_count = train_count / np.sum(train_count)

test_count = np.array([test_counter.get(movie_id, 0) for movie_id in movie_id_list])
test_count = test_count / np.sum(test_count)

In [None]:
# Order items from most to least frequent in train and draw the test
# frequencies in that same order, so both bars at a position describe the same
# item.
sorted_indices = np.argsort(-1*train_count)
# The heights are re-ordered by frequency, so they are placed at consecutive
# rank positions; placing them at the item ids would pair each height with an
# unrelated id.
bar_positions = np.arange(len(sorted_indices))
plt.bar(bar_positions, train_count[sorted_indices], color='coral', alpha=1.0, label='Train')
plt.bar(bar_positions, test_count[sorted_indices], color='teal', alpha=0.8, label='Test')
plt.ylabel('Normalized frequency', fontsize=12)
plt.xlabel('Movies (sorted by train frequency)', fontsize=12)
plt.legend()
plt.title('MovieLens: Train vs. Test Item Distribution.', fontsize=14)
plt.show()