## Imports

First let's get our dependencies and imports out of the way.

In [None]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

In [None]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In [None]:
!wget -nc https://files.grouplens.org/datasets/movielens/ml-1m.zip

File ‘ml-1m.zip’ already there; not retrieving.



In [None]:
!unzip -o "ml-1m.zip"  -d  "/content"

Archive:  ml-1m.zip
  inflating: /content/ml-1m/movies.dat  
  inflating: /content/ml-1m/ratings.dat  
  inflating: /content/ml-1m/README   
  inflating: /content/ml-1m/users.dat  


In [None]:
import pandas as pd

# The MovieLens 1M .dat files have no header row, so header=None is required:
# with header=0 the first rating / first movie is consumed as column names and
# silently dropped from the data. Column names are assigned in the next cell.
# A multi-character separator ('::') is only supported by the python engine;
# requesting it explicitly avoids the fallback ParserWarning.
ratings_df = pd.read_csv(
    'ml-1m/ratings.dat', sep='::', header=None, engine='python',
    skipinitialspace=True, encoding="unicode_escape").dropna()
movies_df = pd.read_csv(
    'ml-1m/movies.dat', sep='::', header=None, engine='python',
    skipinitialspace=True, encoding="unicode_escape").dropna()


  return func(*args, **kwargs)


In [None]:

# Give both frames readable column names.
movies_df.columns = ['itemID', 'name', 'category']
ratings_df.columns = ['userID', 'itemID', 'rating', 'timestamp']

# The catalogue of item ids is kept as strings; `items_df` is the Series of
# those ids that the preprocessing functions below receive.
movies_df['itemID'] = movies_df['itemID'].astype(str)
items_df = movies_df['itemID']

# MovieLens

## Preprocessing

In [None]:
#   Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
"""Prepare TF.Examples for on-device recommendation model.

Following functions are included: 1) downloading raw data 2) processing to user
activity sequence and splitting to train/test data 3) convert to TF.Examples
and write in output location.

More information about the MovieLens dataset can be found here:
https://grouplens.org/datasets/movielens/
"""

import collections
import json
import os
import random
import re

from absl import app
from absl import flags
from absl import logging
import pandas as pd
import tensorflow as tf

FLAGS = flags.FLAGS

# Column layouts of the ratings frame and of the item catalogue.
RATINGS_DATA_COLUMNS = ["userID",'itemID', 'rating','timestamp']
itemS_DATA_COLUMNS = ["itemID"]

# Default output file names, joined onto the output directory.
OUTPUT_TRAINING_DATA_FILENAME = "train.tfrecord"
OUTPUT_TESTING_DATA_FILENAME = "test.tfrecord"
OUTPUT_ITEM_VOCAB_FILENAME = "item_vocab.json"

# Padding / placeholder values.
PAD_ITEM_ID = 0
PAD_RATING = 0.0
UNKNOWN_STR = "UNK"

# Positions inside one vocabulary row, which has the form [item_id, count].
# The count lives at index 1; it previously shared index 0 with the item id.
VOCAB_ITEM_ID_INDEX = 0
VOCAB_COUNT_INDEX = 1


class ItemInfo(
    collections.namedtuple("ItemInfo", "item_id timestamp rating")):
  """Immutable record of one item interaction: (item_id, timestamp, rating).

  Every field defaults to 0, so `ItemInfo()` yields the all-zero record that
  the example generator uses as padding.
  """
  __slots__ = ()

  def __new__(cls, item_id=0, timestamp=0, rating=0):
    return super().__new__(cls, item_id, timestamp, rating)


def convert_to_timelines(ratings_df):
  """Group rating rows into per-user histories ordered by timestamp.

  Args:
    ratings_df: Dataframe whose rows unpack, in order, to
      (user_id, item_id, rating, timestamp).

  Returns:
    timelines: defaultdict mapping user_id to a list of ItemInfo sorted by
      ascending timestamp.
    item_counts: Counter mapping item_id to the number of rows that rate it.
  """
  per_user = collections.defaultdict(list)
  times_rated = collections.Counter()
  for row in ratings_df.values:
    user_id, item_id, rating, timestamp = row
    per_user[user_id].append(
        ItemInfo(item_id=item_id, timestamp=int(timestamp), rating=rating))
    times_rated[item_id] += 1
  # The lists are sorted in place, so the mapping needs no re-assignment.
  for history in per_user.values():
    history.sort(key=lambda entry: entry.timestamp)
  return per_user, times_rated


def generate_items_dict(items_df):
  """Build an item_id -> ItemInfo lookup from the item catalogue.

  Args:
    items_df: Object whose `.values` iterates over the item ids.

  Returns:
    Dict with one ItemInfo per catalogue id, plus key 0 mapped to the
    all-default ItemInfo.
  """
  lookup = {}
  for item_id in items_df.values:
    lookup[item_id] = ItemInfo(item_id=item_id)
  lookup[0] = ItemInfo()
  return lookup




def generate_examples_from_single_timeline(timeline,
                                           items_dict,
                                           max_context_len=100,
                                           max_context_item_genre_len=320):
  """Turn one user timeline into next-item prediction TF examples.

  Every position after the first becomes a label; the up to
  `max_context_len` entries that precede it become the context. Contexts
  shorter than `max_context_len` are right-padded with default ItemInfo
  records (item id 0).

  Args:
    timeline: List of ItemInfo for one user, in the order to be replayed.
    items_dict: Dictionary of all ItemInfos. Not read by this function.
    max_context_len: Fixed length of the emitted context feature.
    max_context_item_genre_len: Not read by this function.

  Returns:
    examples: List of tf.train.Example, one per label position, each with
      the int64 features "context_item_id" and "label_item_id".
  """
  examples = []
  for label_idx, label in enumerate(timeline[1:], start=1):
    history = timeline[max(0, label_idx - max_context_len):label_idx]
    # A non-positive multiplier yields an empty list, i.e. no padding.
    padding = [ItemInfo()] * (max_context_len - len(history))
    label_item_id = int(label.item_id)
    context_item_id = [int(entry.item_id) for entry in history + padding]

    features = tf.train.Features(feature={
        "context_item_id": tf.train.Feature(
            int64_list=tf.train.Int64List(value=context_item_id)),
        "label_item_id": tf.train.Feature(
            int64_list=tf.train.Int64List(value=[label_item_id])),
    })
    examples.append(tf.train.Example(features=features))

  return examples


def generate_examples_from_timelines(timelines,
                                     items_df,
                                     min_timeline_len=3,
                                     max_context_len=100,
                                     max_context_item_genre_len=320,
                                     train_data_fraction=0.9,
                                     random_seed=None,
                                     shuffle=True):
  """Build the pooled train/test TF examples from all user timelines.

  Each sufficiently long timeline contributes every context/label pair it
  contains; the pooled examples are optionally shuffled and then cut into a
  leading training part and a trailing test part.

  Args:
    timelines: Mapping of user id to that user's timeline.
    items_df: The dataframe of all items.
    min_timeline_len: Timelines shorter than this contribute no examples.
    max_context_len: Fixed context length forwarded to the per-timeline
      generator.
    max_context_item_genre_len: Forwarded to the per-timeline generator.
    train_data_fraction: Share of the pooled examples used for training.
    random_seed: Seed passed to `random.seed` before shuffling.
    shuffle: Whether to shuffle the pooled examples before splitting.

  Returns:
    train_examples: TF example list for training.
    test_examples: TF example list for testing.
  """
  items_dict = generate_items_dict(items_df)
  progress_bar = tf.keras.utils.Progbar(len(timelines))

  examples = []
  for timeline in timelines.values():
    if len(timeline) >= min_timeline_len:
      examples += generate_examples_from_single_timeline(
          timeline=timeline,
          items_dict=items_dict,
          max_context_len=max_context_len,
          max_context_item_genre_len=max_context_item_genre_len)
    # Skipped timelines advance the bar as well.
    progress_bar.add(1)

  if shuffle:
    random.seed(random_seed)
    random.shuffle(examples)

  split_at = round(len(examples) * train_data_fraction)
  return examples[:split_at], examples[split_at:]


def _vocab_key(item_id):
  """Canonical string form of an item id, so 1193, 1193.0 and '1193' match."""
  try:
    return str(int(item_id))
  except (TypeError, ValueError, OverflowError):
    return str(item_id)


def generate_item_feature_vocabs(items_df, item_counts):
  """Generate the item-id vocabulary, most frequently rated items first.

  Catalogue ids and counter keys may use different types (for example string
  ids in `items_df` and integer ids in `item_counts`), so both sides are
  compared through a canonical string key. Without that, every lookup misses
  and all counts come out as 0.

  Args:
    items_df: Object whose `.values` iterates over the catalogue item ids.
    item_counts: Mapping of item id to the number of times it was rated.

  Returns:
    item_vocab: List of [item_id, count] rows sorted by descending count.
      Items with equal counts keep their catalogue order; items that were
      never rated get count 0.
  """
  counts_by_key = collections.Counter()
  for rated_id, count in item_counts.items():
    counts_by_key[_vocab_key(rated_id)] += count

  item_vocab = [[item_id, counts_by_key.get(_vocab_key(item_id), 0)]
                for item_id in items_df.values]

  # Sort on the count column (index 1), not on the id column.
  item_vocab.sort(key=lambda entry: entry[1], reverse=True)

  return item_vocab


def write_tfrecords(tf_examples, filename):
  """Serialize every example into a TFRecord file and return how many.

  Args:
    tf_examples: Sized collection of objects exposing `SerializeToString()`.
    filename: Path of the TFRecord file to write.

  Returns:
    The number of examples in `tf_examples`.
  """
  with tf.io.TFRecordWriter(filename) as writer:
    total = len(tf_examples)
    bar = tf.keras.utils.Progbar(total)
    for record in tf_examples:
      serialized = record.SerializeToString()
      writer.write(serialized)
      bar.add(1)
  return total


def write_vocab_json(vocab, filename):
  """Dump `vocab` as UTF-8 JSON, indented by two spaces, into `filename`."""
  with open(filename, "w", encoding="utf-8") as handle:
    json.dump(vocab, fp=handle, indent=2)


def write_vocab_txt(vocab, filename):
  """Write the `str()` of each vocabulary entry on its own line of `filename`."""
  with open(filename, "w", encoding="utf-8") as handle:
    handle.writelines(str(entry) + "\n" for entry in vocab)


def generate_datasets(extracted_data_dir,
                      output_dir,
                      min_timeline_length,
                      max_context_length,
                      max_context_item_genre_length,
                      min_rating=None,
                      build_vocabs=True,
                      train_data_fraction=0.9,
                      train_filename=OUTPUT_TRAINING_DATA_FILENAME,
                      test_filename=OUTPUT_TESTING_DATA_FILENAME,
                      vocab_filename=OUTPUT_ITEM_VOCAB_FILENAME,
                      ):
  """Generates train and test datasets as TFRecord, and returns stats.

  The ratings and the item catalogue are NOT read from disk here: they come
  from the notebook-level `ratings_df` and `items_df` frames prepared in the
  earlier cells.

  Args:
    extracted_data_dir: Unused; kept for signature compatibility.
    output_dir: Directory for the TFRecord and vocabulary files. Created if
      missing.
    min_timeline_length: Users with fewer ratings than this yield no examples.
    max_context_length: Fixed length of the context feature of each example.
    max_context_item_genre_length: Forwarded to the example generator.
    min_rating: If not None, only ratings with `rating >= min_rating` are
      used to build the timelines and the item counts.
    build_vocabs: Whether to also write the item vocabulary JSON file.
    train_data_fraction: Share of the examples written to the training file.
    train_filename: File name of the training TFRecord inside `output_dir`.
    test_filename: File name of the testing TFRecord inside `output_dir`.
    vocab_filename: File name of the vocabulary JSON inside `output_dir`.

  Returns:
    Dict with "train_size", "test_size", "train_file" and "test_file"; when
    `build_vocabs` is true also "vocab_size", "vocab_file" and "vocab_max_id"
    (the numerically largest item id, as an int).
  """
  # Apply the rating threshold; this parameter used to be accepted but ignored.
  ratings = ratings_df
  if min_rating is not None:
    ratings = ratings[ratings["rating"] >= min_rating]

  logging.info("Generating item rating user timelines.")
  timelines, item_counts = convert_to_timelines(ratings)
  logging.info("Generating train and test examples.")
  train_examples, test_examples = generate_examples_from_timelines(
      timelines=timelines,
      items_df=items_df,
      min_timeline_len=min_timeline_length,
      max_context_len=max_context_length,
      max_context_item_genre_len=max_context_item_genre_length,
      train_data_fraction=train_data_fraction)

  if not tf.io.gfile.exists(output_dir):
    tf.io.gfile.makedirs(output_dir)
  logging.info("Writing generated training examples.")
  train_file = os.path.join(output_dir, train_filename)
  train_size = write_tfrecords(tf_examples=train_examples, filename=train_file)
  logging.info("Writing generated testing examples.")
  test_file = os.path.join(output_dir, test_filename)
  test_size = write_tfrecords(tf_examples=test_examples, filename=test_file)
  stats = {
      "train_size": train_size,
      "test_size": test_size,
      "train_file": train_file,
      "test_file": test_file,
  }

  if build_vocabs:
    item_vocab = (
        generate_item_feature_vocabs(
            items_df=items_df, item_counts=item_counts))
    vocab_file = os.path.join(output_dir, vocab_filename)
    write_vocab_json(item_vocab, filename=vocab_file)
    stats.update({
        "vocab_size": len(item_vocab),
        "vocab_file": vocab_file,
        # Catalogue ids are strings, so compare them numerically; a plain
        # max() over the strings is lexicographic and reports '999'.
        "vocab_max_id": max(
            int(row[VOCAB_ITEM_ID_INDEX]) for row in item_vocab)
    })

  return stats



# Settings for this run. max_context_length must match the context length
# declared in the parsing feature_description further down (10).
dataset_options = dict(
    extracted_data_dir="ml-1m",
    output_dir="data/processing",
    min_timeline_length=3,
    max_context_length=10,
    max_context_item_genre_length=4,
    min_rating=0,
    build_vocabs=True,
    train_data_fraction=.8,
)
stats = generate_datasets(**dataset_options)
print(stats)


{'train_size': 795334, 'test_size': 198834, 'train_file': 'data/processing/train.tfrecord', 'test_file': 'data/processing/test.tfrecord', 'vocab_size': 3882, 'vocab_file': 'data/processing/item_vocab.json', 'vocab_max_id': '999'}


In [None]:
# Re-open the TFRecord files written by the preprocessing step.
train_filename, test_filename = (
    "./data/processing/train.tfrecord",
    "./data/processing/test.tfrecord",
)
train, test = (
    tf.data.TFRecordDataset(path) for path in (train_filename, test_filename))


In [None]:

# Parsing schema for the serialized examples: a length-10 context of item ids
# (the context length used when the datasets were generated) and one label id.
feature_description = dict(
    context_item_id=tf.io.FixedLenFeature(
        [10], tf.int64, default_value=np.repeat(0, 10)),
    label_item_id=tf.io.FixedLenFeature([1], tf.int64, default_value=0),
)


In [None]:


def _parse_function(example_proto):
  """Decode one serialized tf.train.Example using `feature_description`."""
  return tf.io.parse_single_example(example_proto, feature_description)


def _ids_as_strings(parsed):
  """Convert both int64 id features to strings (the models look up string ids)."""
  context_ids = tf.strings.as_string(parsed["context_item_id"])
  label_id = tf.strings.as_string(parsed["label_item_id"])
  return {"context_item_id": context_ids, "label_item_id": label_id}


train_ds = train.map(_parse_function).map(_ids_as_strings)



In [None]:

# Peek at a single parsed training example to confirm the feature layout.
first_only = train_ds.take(1)
for sample in first_only.as_numpy_iterator():
  pprint.pprint(sample)

{'context_item_id': array([b'2003', b'1339', b'2657', b'2746', b'2120', b'2617', b'2004',
       b'2367', b'2717', b'1690'], dtype=object),
 'label_item_id': array([b'1388'], dtype=object)}


In [None]:


# Same decoding as the training split: parse, then turn both id features into
# strings.
test_ds = test.map(_parse_function).map(
    lambda parsed: {
        name: tf.strings.as_string(parsed[name])
        for name in ("context_item_id", "label_item_id")
    })


In [None]:
# movies = tfds.load("movielens/1m-movies", split='train')
# movies = movies.map(lambda x: x["movie_id"])
# movie_ids = movies.batch(1_000)
# unique_movie_ids = np.unique(np.concatenate(list(movie_ids)))

In [None]:
# Distinct rated item ids as strings. The cast is applied to a copy of the
# column so `ratings_df` keeps its original dtypes and the preprocessing
# cells behave the same when they are re-run after this one.
items_dict = ratings_df[['itemID']].astype(str).drop_duplicates()

# Dataset of candidate item ids, reused below to build the retrieval metrics.
movies = tf.data.Dataset.from_tensor_slices(dict(items_dict))
movies = movies.map(lambda x: x['itemID'])
items_list = movies.batch(1_000)
unique_movie_ids = unique_items = np.unique(
    np.concatenate(list(items_list), axis=0))


## Implementing a sequential model


In [None]:
embedding_dimension = 32

# Query tower: string ids -> integer indices -> embeddings -> GRU summary of
# the whole watch history.
query_model = tf.keras.Sequential()
query_model.add(
    tf.keras.layers.StringLookup(vocabulary=unique_movie_ids, mask_token=None))
query_model.add(
    tf.keras.layers.Embedding(len(unique_movie_ids) + 1, embedding_dimension))
query_model.add(tf.keras.layers.GRU(embedding_dimension))

# Candidate tower: a plain embedding lookup for a single item id, built over
# the same vocabulary as the query tower.
candidate_model = tf.keras.Sequential()
candidate_model.add(
    tf.keras.layers.StringLookup(vocabulary=unique_movie_ids, mask_token=None))
candidate_model.add(
    tf.keras.layers.Embedding(len(unique_movie_ids) + 1, embedding_dimension))

The metrics, task, and full model are defined in the same way as in the basic retrieval model.

In [None]:
# Embed every candidate item id once; the top-K metrics rank against these.
candidate_embeddings = movies.batch(128).map(candidate_model)
metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)

task = tfrs.tasks.Retrieval(metrics=metrics)

class Model(tfrs.Model):
    """Two-tower sequential retrieval model.

    The query tower embeds the watch history, the candidate tower embeds the
    next item, and the loss comes from the notebook-level retrieval `task`.
    """

    def __init__(self, query_model, candidate_model):
        super().__init__()
        self._query_model = query_model
        self._candidate_model = candidate_model

        # Read from the enclosing notebook scope, not passed in.
        self._task = task

    def compute_loss(self, features, training=False):
        """Return the retrieval loss for one batch of parsed features.

        Metrics are only computed when `training` is False.
        """
        history_embedding = self._query_model(features["context_item_id"])
        next_item_embedding = self._candidate_model(features["label_item_id"])

        return self._task(
            history_embedding,
            next_item_embedding,
            compute_metrics=not training,
        )

## Fitting and evaluating

We can now compile, train and evaluate our sequential retrieval model.

In [None]:
# Assemble the model from the two towers and train it with Adagrad.
optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model = Model(query_model, candidate_model)
model.compile(optimizer=optimizer)

In [None]:
# Cache the parsed examples first and shuffle afterwards: a cache placed after
# shuffle() replays the very same order (and the very same batches) in every
# epoch, which defeats the shuffle.
cached_train = train_ds.cache().shuffle(10_000).batch(12800)
# Evaluation order does not matter, so the batched test set is cached as is.
cached_test = test_ds.batch(2560).cache()

In [None]:
# Train for three passes over the prepared training batches.
model.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f5b0cc22a00>

In [None]:
# Score the held-out batches; return_dict=True yields the metrics as a
# name -> value mapping (top-K accuracies and losses).
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.01490187831223011,
 'factorized_top_k/top_5_categorical_accuracy': 0.07811541110277176,
 'factorized_top_k/top_10_categorical_accuracy': 0.13544967770576477,
 'factorized_top_k/top_50_categorical_accuracy': 0.37050503492355347,
 'factorized_top_k/top_100_categorical_accuracy': 0.4994870126247406,
 'loss': 9968.255859375,
 'regularization_loss': 0,
 'total_loss': 9968.255859375}

Reference:
TensorFlow Recommenders (https://www.tensorflow.org/recommenders)