In [None]:
!pip install -q tensorflow-recommenders
!pip install -q --upgrade tensorflow-datasets
!pip install -q scann

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/96.2 KB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 KB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from typing import Dict, Text

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
from google.colab import drive
from google.colab import files

In [None]:
# Mount Google Drive into the Colab filesystem (prompts for authorisation).
# `drive` is imported in the notebook's import cell at the top.
drive.mount('/content/drive')

In [None]:
# Load the transaction log and the article catalogue, in that order.
# NOTE(review): both paths point at the filesystem root rather than the
# mounted Drive folder — confirm the files are really placed there.
# NOTE(review): `article` is rebound in the data-preparation cell before it
# is read anywhere visible in this notebook.
transaction, article = (
    pd.read_csv(csv_path) for csv_path in ('/df_cleaned.csv', '/articles.csv')
)

## Data Preparation

In [None]:
# Candidate set: the 1000 most frequently bought article ids among
# transactions dated 2020-09-07 or later (counted before de-duplication).
# NOTE(review): re-running this cell counts on the already de-duplicated
# `transaction`, so the top-1000 may differ from a fresh run.
recent_mask = transaction['t_dat'] >= '2020-09-07'
top_article_ids = (
    transaction.loc[recent_mask, 'article_id']
    .value_counts()
    .head(1000)
    .index
)
article = pd.DataFrame(top_article_ids, columns=['article_id'])

# Keep a single row per (customer, article) pair.
transaction = transaction.drop_duplicates(subset=['customer_id', 'article_id'])

# Store article ids as 10-character, zero-padded strings in both frames.
for frame in (article, transaction):
    frame['article_id'] = frame['article_id'].astype(str).str.zfill(10)

In [None]:
# Candidate dataset: each element is the `article_id` string of one row of
# the `article` frame.
article_features = dict(article[['article_id']])
articles = tf.data.Dataset.from_tensor_slices(article_features).map(
    lambda row: row['article_id']
)

# Vocabularies consumed by the StringLookup layers of the two towers.
article_unique = article['article_id'].unique()
customer_unique = transaction['customer_id'].unique()

### Train data

In [None]:
# Training rows: every transaction dated 2019-08-01 or later.
# NOTE(review): article ids here are not restricted to the `article`
# candidate frame — confirm that out-of-vocabulary positives are intended.
trainset = transaction.loc[transaction['t_dat'] >= '2019-08-01'].copy()

# Feed (customer_id, article_id) pairs as a dict-structured dataset.
train_features = dict(trainset[['customer_id', 'article_id']])
train = tf.data.Dataset.from_tensor_slices(train_features)
# NOTE(review): cache() is applied last, so later epochs presumably replay
# the batches produced on the first pass — confirm that is acceptable.
train = train.shuffle(100_000).batch(128).cache()

In [None]:
embedding_dimension = 500


def _build_id_tower(vocabulary):
    """Return a Sequential model: StringLookup over `vocabulary`, then an Embedding.

    The embedding table has `len(vocabulary) + 1` rows and
    `embedding_dimension` columns.
    """
    lookup = tf.keras.layers.StringLookup(vocabulary=vocabulary, mask_token=None)
    embedding = tf.keras.layers.Embedding(len(vocabulary) + 1, embedding_dimension)
    return tf.keras.Sequential([lookup, embedding])


#The query tower
customer_model = _build_id_tower(customer_unique)

#The candidate tower
article_model = _build_id_tower(article_unique)

#Metrics
# Top-K metrics are evaluated against the candidate articles passed through
# the article tower in batches of 128.
candidate_embeddings = articles.batch(128).map(article_model)
metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)

#Loss
task = tfrs.tasks.Retrieval(metrics=metrics)

## Model

In [None]:
class HnmModel(tfrs.Model):
    """Two-tower retrieval model pairing a customer tower with an article tower.

    Args:
        customer_model: Query tower; called on the `customer_id` feature.
        article_model: Candidate tower; called on the `article_id` feature.
        retrieval_task: Optional task object used to turn the two embeddings
            into a loss. When omitted, the notebook-level `task` is used,
            which is what this class always did before the parameter existed.
    """

    def __init__(self, customer_model, article_model, retrieval_task=None):
        super().__init__()
        self.article_model: tf.keras.Model = article_model
        self.customer_model: tf.keras.Model = customer_model
        # Allow the task to be injected explicitly so the model does not have
        # to depend on a global; the global is only read when nothing is given.
        self.task = task if retrieval_task is None else retrieval_task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        """Compute the task loss for one batch.

        Args:
            features: Mapping that must contain `customer_id` and `article_id`.
            training: Whether the call happens during training.

        Returns:
            Whatever `self.task` returns for the two embedding batches.
        """
        customer_embeddings = self.customer_model(features['customer_id'])
        positive_article_embeddings = self.article_model(features['article_id'])

        # Metrics are requested only when not training.
        return self.task(customer_embeddings, positive_article_embeddings, compute_metrics=not training)

In [None]:
# Instantiate the two-tower model and attach an Adagrad optimizer (lr = 0.1).
model = HnmModel(customer_model, article_model)
adagrad = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model.compile(optimizer=adagrad)

## Model training

In [None]:
# Train for three passes over the `train` dataset with progress output.
model.fit(x=train, epochs=3, verbose=1)

## Predictions

In [None]:
# Build a ScaNN retrieval index that maps a customer id to its top-5 articles:
# queries go through the customer tower, candidates are (article_id,
# article-tower embedding) pairs built from the `articles` dataset.
index = tfrs.layers.factorized_top_k.ScaNN(model.customer_model, k=5)
index.index_from_dataset(tf.data.Dataset.zip((
    articles.batch(100), articles.batch(100).map(model.article_model)
)))

#Example
# The result is bound to its own name: rebinding `articles` here would
# replace the candidate dataset with a tensor and break a re-run of this cell.
_, example_recommendations = index(tf.constant(['00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a']))
print(f'Recommendations: {example_recommendations[0, :3]}')

In [None]:
df = pd.DataFrame(transaction['customer_id'].unique(), columns=['customer_id'])
_, articles = index(df['customer_id'].values)

In [None]:
prediction = pd.Series(map(' '.join, articles.numpy().astype(str)))
df['prediction'] = prediction