## Imports

First let's get our dependencies and imports out of the way.

In [None]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets

In [None]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In [None]:
!pip install gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!gdown https://drive.google.com/uc?id=1kl65YOvoSAMSgszQQbua2q4Zwe1HgPct

Downloading...
From: https://drive.google.com/uc?id=1kl65YOvoSAMSgszQQbua2q4Zwe1HgPct
To: /content/data.zip
  0% 0.00/4.16M [00:00<?, ?B/s]100% 4.16M/4.16M [00:00<00:00, 278MB/s]


In [None]:
!unzip -o "data.zip"  -d  "/content"

Archive:  data.zip
  inflating: /content/amazon_min.csv  
  inflating: /content/book_min.csv   
  inflating: /content/ciao_min.csv   
  inflating: /content/ecom_min.csv   
  inflating: /content/food_min.csv   
  inflating: /content/movies_min.csv  


In [None]:
import pandas as pd
# Load amazon_min.csv, keep the four columns used downstream and expose the
# event time under the name `timestamp`.
ratings_df = (
    pd.read_csv('amazon_min.csv')[['userID', 'itemID', 'rating', 'time']]
    .rename(columns={'time': 'timestamp'})
)

# One row per distinct raw itemID, ordered from the highest id to the lowest.
items_df = (
    ratings_df[['itemID']]
    .sort_values(by='itemID', ascending=False)
    .drop_duplicates(subset='itemID', keep='first')
)

In [None]:
# Dense integer encodings for the raw user / item identifiers. Ids are
# enumerated in sorted order so the mapping is the same on every run.
user_ids = sorted(ratings_df.userID.unique())
item_ids = sorted(ratings_df.itemID.unique())

# Users are numbered from 0.
dict_users = {raw_id: index for index, raw_id in enumerate(user_ids)}

# Items are numbered from 1: id 0 is what ItemInfo() defaults to and what the
# example generator uses to pad short contexts ("out-of-vocab item id 0"), so
# no real item may be encoded as 0.
dict_items = {raw_id: index for index, raw_id in enumerate(item_ids, start=1)}


In [None]:
# Replace the raw identifiers in ratings_df with their integer encodings.
# NOTE: the columns are overwritten in place, so running this cell a second
# time would push the already-encoded integers through the dictionaries again.
ratings_df['userID'] = ratings_df.userID.map(dict_users)
ratings_df['itemID'] = ratings_df.itemID.map(dict_items)
# items_df keeps the raw itemID and stores its integer encoding next to it.
items_df['order_id'] = items_df.itemID.map(dict_items)

In [None]:
# Store the raw item identifiers in items_df as strings.
items_df['itemID']=items_df['itemID'].astype(str)

## Preprocessing

In [None]:
"""Ref:- https://www.tensorflow.org/recommenders/"""

import collections
import json
import os
import random
import re

from absl import app
from absl import flags
from absl import logging
import pandas as pd
import tensorflow as tf

# Handle to absl's flag registry. No flag is defined or read in the code shown
# in this part of the notebook.
FLAGS = flags.FLAGS

# Column layouts, output file names and padding defaults used by the
# preprocessing helpers.

RATINGS_DATA_COLUMNS = ["userID",'itemID', 'rating','timestamp']
itemS_DATA_COLUMNS = ["itemID"]
OUTPUT_TRAINING_DATA_FILENAME = "train.tfrecord"
OUTPUT_TESTING_DATA_FILENAME = "test.tfrecord"
OUTPUT_ITEM_VOCAB_FILENAME = "item_vocab.json"
PAD_ITEM_ID = 0
PAD_RATING = 0.0
UNKNOWN_STR = "UNK"
# Positions inside one vocabulary entry, which has the form [item_id, count].
VOCAB_ITEM_ID_INDEX = 0
VOCAB_COUNT_INDEX = 1


class ItemInfo(
    collections.namedtuple("ItemInfo", ("item_id", "timestamp", "rating"))):
  """Immutable record describing one item interaction.

  Fields are `item_id`, `timestamp` and `rating`, in that order. Every field
  defaults to 0, so `ItemInfo()` yields the all-zero record that the example
  generator uses as padding.
  """
  __slots__ = ()

  def __new__(cls, item_id=0, timestamp=0, rating=0):
    return super().__new__(cls, item_id, timestamp, rating)


def convert_to_timelines(ratings_df):
  """Group rating rows into per-user, time-ordered interaction lists.

  Args:
    ratings_df: DataFrame whose rows unpack, in order, into
      (user_id, item_id, rating, timestamp).

  Returns:
    A `(timelines, item_counts)` pair: `timelines` maps each user id to its
    list of `ItemInfo` records sorted by ascending timestamp, and
    `item_counts` is a Counter of how many rows mention each item id.
  """
  timelines = collections.defaultdict(list)
  item_counts = collections.Counter()

  for row in ratings_df.values:
    user_id, item_id, rating, timestamp = row
    record = ItemInfo(item_id=item_id, timestamp=int(timestamp), rating=rating)
    timelines[user_id].append(record)
    item_counts[item_id] += 1

  # Order every user's history chronologically; the lists are sorted in place.
  for history in timelines.values():
    history.sort(key=lambda info: info.timestamp)

  return timelines, item_counts


def generate_items_dict(items_df):
  """Map every item id in `items_df` to an `ItemInfo` carrying that id.

  Only the first column of `items_df` is read. Key 0 is always (re)set to the
  default `ItemInfo()` afterwards.
  """
  item_dict = {}
  for row in items_df.values:
    key = row[0]
    item_dict[key] = ItemInfo(item_id=key)
  item_dict[0] = ItemInfo()
  return item_dict




def generate_examples_from_single_timeline(timeline,
                                           items_dict,
                                           max_context_len=100,
                                           max_context_item_genre_len=320):
  """Turn one user's timeline into (context, next item) TF examples.

  Every position after the first becomes a label; the items immediately
  before it (at most `max_context_len` of them) form the context. Contexts
  shorter than `max_context_len` are right-padded with default `ItemInfo()`
  records, i.e. item id 0. A timeline with fewer than two entries yields no
  examples.

  Args:
    timeline: Sequence of ItemInfo records for a single user.
    items_dict: Dictionary of all ItemInfos. Accepted but not read here.
    max_context_len: Number of item ids stored in every context feature.
    max_context_item_genre_len: Accepted but not read here.

  Returns:
    A list of `tf.train.Example`, each holding the int64 features
    `context_item_id` and `label_item_id`.
  """

  def _int64_feature(values):
    # Wrap a list of ints as an int64 tf.train.Feature.
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

  examples = []
  for label_idx in range(1, len(timeline)):
    window_start = max(0, label_idx - max_context_len)
    # Slicing copies, so padding below never touches the caller's timeline.
    history = timeline[window_start:label_idx]
    missing = max_context_len - len(history)
    if missing > 0:
      history.extend(ItemInfo() for _ in range(missing))

    label_item_id = int(timeline[label_idx].item_id)
    context_item_id = [int(entry.item_id) for entry in history]

    feature = {
        "context_item_id": _int64_feature(context_item_id),
        "label_item_id": _int64_feature([label_item_id]),
    }
    examples.append(
        tf.train.Example(features=tf.train.Features(feature=feature)))

  return examples


def generate_examples_from_timelines(timelines,
                                     items_df,
                                     min_timeline_len=3,
                                     max_context_len=100,
                                     max_context_item_genre_len=320,
                                     train_data_fraction=0.9,
                                     random_seed=None,
                                     shuffle=True):
  """Build the pooled train/test example lists from all user timelines.

  Timelines shorter than `min_timeline_len` contribute nothing. Examples from
  the remaining timelines are pooled, optionally shuffled with the `random`
  module (seeded with `random_seed`), and cut at
  `round(len(examples) * train_data_fraction)`.

  Returns:
    A `(train_examples, test_examples)` pair of lists.
  """
  items_dict = generate_items_dict(items_df)
  progress_bar = tf.keras.utils.Progbar(len(timelines))

  examples = []
  for timeline in timelines.values():
    if len(timeline) >= min_timeline_len:
      examples.extend(
          generate_examples_from_single_timeline(
              timeline=timeline,
              items_dict=items_dict,
              max_context_len=max_context_len,
              max_context_item_genre_len=max_context_item_genre_len))
    # One tick per timeline, whether it was used or skipped.
    progress_bar.add(1)

  if shuffle:
    random.seed(random_seed)
    random.shuffle(examples)

  split_at = round(len(examples) * train_data_fraction)
  return examples[:split_at], examples[split_at:]


def generate_item_feature_vocabs(items_df, item_counts):
  """Build the item vocabulary, most frequently used items first.

  Args:
    items_df: DataFrame of items; only the first column (the item id) is read.
    item_counts: Mapping from item id to its usage count. Ids that are missing
      from the mapping get a count of 0.
      NOTE(review): the lookup uses the first column of `items_df` as the key;
      confirm it holds the same ids that were counted.

  Returns:
    A list of `[item_id, count]` entries sorted by descending count. Entries
    with equal counts keep the order they have in `items_df`.
  """
  item_vocab = [
      [row[0], item_counts.get(row[0]) or 0] for row in items_df.values
  ]

  # The count is the second element of each entry; sorting on the first
  # element would order the vocabulary by item id instead of by usage.
  item_vocab.sort(key=lambda entry: entry[1], reverse=True)

  return item_vocab


def write_tfrecords(tf_examples, filename):
  """Serialize `tf_examples` into the TFRecord file `filename`.

  Returns:
    The number of examples written.
  """
  with tf.io.TFRecordWriter(filename) as file_writer:
    total = len(tf_examples)
    progress_bar = tf.keras.utils.Progbar(total)
    for record in tf_examples:
      file_writer.write(record.SerializeToString())
      progress_bar.add(1)
  return total


def write_vocab_json(vocab, filename):
  """Dump `vocab` as UTF-8 JSON (two-space indent) into `filename`."""
  with open(filename, mode="w", encoding="utf-8") as handle:
    json.dump(vocab, handle, indent=2)


def write_vocab_txt(vocab, filename):
  """Write `str()` of every vocabulary entry to `filename`, one per line."""
  with open(filename, mode="w", encoding="utf-8") as handle:
    handle.writelines(str(entry) + "\n" for entry in vocab)


def generate_datasets(extracted_data_dir,
                      output_dir,
                      min_timeline_length,
                      max_context_length,
                      max_context_item_genre_length,
                      min_rating=None,
                      build_vocabs=True,
                      train_data_fraction=0.9,
                      train_filename=OUTPUT_TRAINING_DATA_FILENAME,
                      test_filename=OUTPUT_TESTING_DATA_FILENAME,
                      vocab_filename=OUTPUT_ITEM_VOCAB_FILENAME,
                      ):
  """Generates train and test datasets as TFRecord, and returns stats.

  The ratings and items are NOT loaded from disk here: the function reads the
  notebook-level `ratings_df` and `items_df` variables, which must already be
  defined when it is called.

  Args:
    extracted_data_dir: Unused; kept so existing calls keep working.
    output_dir: Directory that receives the TFRecord and vocabulary files. It
      is created when missing.
    min_timeline_length: Timelines with fewer entries are skipped.
    max_context_length: Number of context items stored per example.
    max_context_item_genre_length: Forwarded to the example generator.
    min_rating: When not None, only ratings with `rating >= min_rating` are
      used. None keeps every rating.
    build_vocabs: Whether to also write the item vocabulary JSON file.
    train_data_fraction: Fraction of the examples put into the training file.
    train_filename: File name of the training TFRecord inside `output_dir`.
    test_filename: File name of the testing TFRecord inside `output_dir`.
    vocab_filename: File name of the vocabulary JSON inside `output_dir`.

  Returns:
    A dict with `train_size`, `test_size`, `train_file` and `test_file`, plus
    `vocab_size`, `vocab_file` and `vocab_max_id` when `build_vocabs` is true.
  """
  # Work on a local view so the notebook-level frame is never rebound or
  # modified by the optional rating filter.
  ratings = ratings_df
  if min_rating is not None:
    ratings = ratings[ratings["rating"] >= min_rating]

  logging.info("Generating item rating user timelines.")
  timelines, item_counts = convert_to_timelines(ratings)
  logging.info("Generating train and test examples.")
  train_examples, test_examples = generate_examples_from_timelines(
      timelines=timelines,
      items_df=items_df,
      min_timeline_len=min_timeline_length,
      max_context_len=max_context_length,
      max_context_item_genre_len=max_context_item_genre_length,
      train_data_fraction=train_data_fraction)

  if not tf.io.gfile.exists(output_dir):
    tf.io.gfile.makedirs(output_dir)
  logging.info("Writing generated training examples.")
  train_file = os.path.join(output_dir, train_filename)
  train_size = write_tfrecords(tf_examples=train_examples, filename=train_file)
  logging.info("Writing generated testing examples.")
  test_file = os.path.join(output_dir, test_filename)
  test_size = write_tfrecords(tf_examples=test_examples, filename=test_file)
  stats = {
      "train_size": train_size,
      "test_size": test_size,
      "train_file": train_file,
      "test_file": test_file,
  }

  if build_vocabs:
    item_vocab = (
        generate_item_feature_vocabs(
            items_df=items_df, item_counts=item_counts))
    vocab_file = os.path.join(output_dir, vocab_filename)
    write_vocab_json(item_vocab, filename=vocab_file)
    stats.update({
        "vocab_size": len(item_vocab),
        "vocab_file": vocab_file,
        "vocab_max_id": max([arr[VOCAB_ITEM_ID_INDEX] for arr in item_vocab])
    })

  return stats



# Write the TFRecord and vocabulary files under data/processing, using
# contexts of 10 items and an 80/20 train/test split.
# NOTE(review): "ml-1m" looks like a leftover from the MovieLens example;
# generate_datasets never reads extracted_data_dir.
stats = generate_datasets(
      extracted_data_dir="ml-1m",
      output_dir="data/processing",
      min_timeline_length=3,
      max_context_length=10,
      max_context_item_genre_length=4,
      min_rating=0,
      build_vocabs=True,
      train_data_fraction=.8,
  )
print(stats)


{'train_size': 50539, 'test_size': 12635, 'train_file': 'data/processing/train.tfrecord', 'test_file': 'data/processing/test.tfrecord', 'vocab_size': 2771, 'vocab_file': 'data/processing/item_vocab.json', 'vocab_max_id': 'B00L3YHF6O'}


In [None]:
# Locations of the TFRecord files produced by the preprocessing step.
train_filename = "./data/processing/train.tfrecord"
test_filename = "./data/processing/test.tfrecord"

# Datasets of serialized examples; the records are decoded in a later cell.
train = tf.data.TFRecordDataset(train_filename)
test = tf.data.TFRecordDataset(test_filename)


In [None]:

# Parsing schema for the serialized examples. The context width must equal the
# max_context_length used when the TFRecords were generated.
_CONTEXT_LEN = 10

feature_description = {
    'context_item_id': tf.io.FixedLenFeature(
        [_CONTEXT_LEN], tf.int64, default_value=np.repeat(0, _CONTEXT_LEN)),
    'label_item_id': tf.io.FixedLenFeature(
        [1], tf.int64, default_value=0),
}


In [None]:


def _parse_function(example_proto):
  """Decode one serialized example using `feature_description`."""
  parsed = tf.io.parse_single_example(example_proto, feature_description)
  return parsed

def _stringify_ids(record):
  """Convert both int64 id features of a parsed record to string tensors."""
  return {
      "context_item_id": tf.strings.as_string(record["context_item_id"]),
      "label_item_id": tf.strings.as_string(record["label_item_id"]),
  }


# Decode the training records, then expose the ids as strings.
train_ds = train.map(_parse_function).map(_stringify_ids)



In [None]:

# Print one parsed training example to inspect the feature layout.
for x in train_ds.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'context_item_id': array([b'1513', b'1648', b'971', b'1671', b'1452', b'2023', b'2103',
       b'1810', b'1744', b'2266'], dtype=object),
 'label_item_id': array([b'2401'], dtype=object)}


In [None]:


# Decode the test records, then expose both id features as strings.
parsed_test = test.map(_parse_function)
test_ds = parsed_test.map(
    lambda record: {
        "context_item_id": tf.strings.as_string(record["context_item_id"]),
        "label_item_id": tf.strings.as_string(record["label_item_id"]),
    })


In [None]:
# movies = tfds.load("movielens/1m-movies", split='train')
# movies = movies.map(lambda x: x["movie_id"])
# movie_ids = movies.batch(1_000)
# unique_movie_ids = np.unique(np.concatenate(list(movie_ids)))

In [None]:
# Item ids are handled as strings from here on.
ratings_df['itemID'] = ratings_df['itemID'].astype(str)
items_dict = ratings_df[['itemID']].drop_duplicates()

# Dataset of the distinct item ids; it is reused later as the candidate set.
items = (
    tf.data.Dataset.from_tensor_slices(dict(items_dict))
    .map(lambda row: row['itemID'])
)
items_list = items.batch(1_000)

# Sorted, de-duplicated array of every item id.
unique_items = np.unique(np.concatenate(list(items_list), axis=0))
unique_item_ids = unique_items


## Implementing a sequential model


In [None]:
embedding_dimension = 32

# Embedding table size: one row per known item id plus one extra row
# (presumably for StringLookup's out-of-vocabulary index — confirm).
_num_embedding_rows = len(unique_item_ids) + 1

# Query tower: look up the ids of the context sequence, embed them, and
# summarize the sequence with a GRU.
query_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=unique_item_ids, mask_token=None),
    tf.keras.layers.Embedding(_num_embedding_rows, embedding_dimension),
    tf.keras.layers.GRU(embedding_dimension),
])

# Candidate tower: look up and embed a single item id.
candidate_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(vocabulary=unique_item_ids, mask_token=None),
    tf.keras.layers.Embedding(_num_embedding_rows, embedding_dimension),
])

The metrics, task, and full model are defined similarly to those of the basic retrieval model.

In [None]:
# Candidate embeddings against which the top-K metrics are computed.
candidate_embeddings = items.batch(128).map(candidate_model)
metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)

# Retrieval task that carries the loss and the metrics above.
task = tfrs.tasks.Retrieval(metrics=metrics)

class Model(tfrs.Model):
    """Two-tower retrieval model over item-id sequences.

    The query tower receives the `context_item_id` feature and the candidate
    tower receives `label_item_id`. The loss comes from the notebook-level
    `task` object, which must exist before an instance is created.
    """

    def __init__(self, query_model, candidate_model):
        super().__init__()
        self._query_model = query_model
        self._candidate_model = candidate_model

        # Shared retrieval task defined at notebook level.
        self._task = task

    def compute_loss(self, features, training=False):
        """Return the task loss for one batch of features."""
        history_ids = features["context_item_id"]
        next_item_id = features["label_item_id"]

        query_embedding = self._query_model(history_ids)
        candidate_embedding = self._candidate_model(next_item_id)

        # Metrics are only computed outside of training.
        compute_metrics = not training
        return self._task(
            query_embedding, candidate_embedding, compute_metrics=compute_metrics)

## Fitting and evaluating

We can now compile, train and evaluate our sequential retrieval model.

In [None]:
# Combine the two towers into the retrieval model and attach the optimizer.
# NOTE(review): learning_rate=0.1 is far above Adam's usual default of 1e-3 —
# confirm this is intentional.
model = Model(query_model, candidate_model)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.1))

In [None]:
# Shuffle (training only), batch and cache the two splits.
# NOTE(review): cache() comes after shuffle(), so the order produced on the
# first pass is presumably what later epochs replay — confirm against the
# tf.data documentation.
cached_train = train_ds.shuffle(10_000).batch(12800).cache()
cached_test = test_ds.batch(2560).cache()

In [None]:
# Train for 30 epochs on the cached training batches.
model.fit(cached_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7efff6e0f580>

In [None]:
# Evaluate on the cached test batches; return_dict=True requests the results
# as a dict keyed by metric name.
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.004194697365164757,
 'factorized_top_k/top_5_categorical_accuracy': 0.021923229098320007,
 'factorized_top_k/top_10_categorical_accuracy': 0.038385435938835144,
 'factorized_top_k/top_50_categorical_accuracy': 0.13430945575237274,
 'factorized_top_k/top_100_categorical_accuracy': 0.21733281016349792,
 'loss': 24549.404296875,
 'regularization_loss': 0,
 'total_loss': 24549.404296875}