In [1]:
from typing import Dict, Text

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In [2]:
# Ask TensorFlow for the physical GPU devices it can see and display the list.
devices = tf.config.list_physical_devices(device_type='GPU')
devices

2022-05-29 21:14:52.712892: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-29 21:14:52.718082: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-29 21:14:52.718331: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [169]:
# Load the transactions and clean them in a single chain:
#   1. read the CSV with both ID columns as strings and InvoiceDate as datetimes,
#   2. drop rows with any missing value,
#   3. keep rows whose stock code is exactly five characters long,
#   4. add `timestamp`: seconds since 1970-01-01 as a float,
#   5. for now, work on a fixed-seed sample of 20,000 rows so it does not take as long.
data = (
    pd.read_csv(
        'data/Online-Retail.csv',
        dtype={'CustomerID': str, 'StockCode': str},
        parse_dates=['InvoiceDate'],
    )
    .dropna()
    .loc[lambda frame: frame['StockCode'].str.len() == 5]
    .assign(
        timestamp=lambda frame: (
            (frame['InvoiceDate'] - pd.Timestamp('1970-01-01')) / pd.Timedelta('1s')
        )
    )
    .sample(n=20_000, random_state=42)
)

# Remove customers with few interactions: keep only those with more than
# two rows in the sample.
purchase_counts = data['CustomerID'].value_counts()
customers = {
    customer_id: count
    for customer_id, count in purchase_counts.items()
    if count > 2
}
data = data[data['CustomerID'].isin(customers)]

# Lookup table item_id -> description (the last row seen for a code wins).
item_to_description = dict(zip(data['StockCode'], data['Description']))

print(f'Data shape {data.shape}')
data.head()

Data shape (18067, 9)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,timestamp
438319,574311,23382,BOX OF 6 CHRISTMAS CAKE DECORATIONS,6,2011-11-03 16:56:00,3.75,15640,United Kingdom,1320339000.0
28901,538662,79321,CHILLI LIGHTS,24,2010-12-13 15:44:00,4.25,15159,United Kingdom,1292255000.0
466055,576301,22470,HEART OF WICKER LARGE,4,2011-11-14 14:40:00,2.95,14667,United Kingdom,1321282000.0
194702,553663,21080,SET/20 RED RETROSPOT PAPER NAPKINS,3,2011-05-18 12:13:00,0.85,14527,United Kingdom,1305721000.0
55819,540999,21633,SUNFLOWER DECORATIVE PARASOL,30,2011-01-13 10:08:00,3.95,13694,United Kingdom,1294913000.0


In [64]:
# Feature name used by the model -> DataFrame column it is read from.
feature_columns = {
    'user_id': 'CustomerID',
    'item_id': 'StockCode',
    'item_description': 'Description',
    'timestamp': 'timestamp',
    'country': 'Country',
}

# One array per feature, all taken from the cleaned `data` frame.
features_dict = {
    feature: data[column].values
    for feature, column in feature_columns.items()
}

# Slice the arrays row-wise: each dataset element is a dict holding one purchase.
purchases = tf.data.Dataset.from_tensor_slices(features_dict)

In [143]:
# Inspect the concrete dataset class that `from_tensor_slices` returned.
type(purchases)

tensorflow.python.data.ops.dataset_ops.TensorSliceDataset

In [153]:
# NOTE(review): exploratory look at the raw op named like the dataset class;
# the value is only displayed and is not used by the other visible cells.
tf.raw_ops.TensorSliceDataset

<function tensorflow.python.ops.gen_dataset_ops.tensor_slice_dataset(components, output_shapes, is_files=False, metadata='', name=None)>

In [72]:
# Training examples: each purchase reduced to its user ID and item ID.
interactions = purchases.map(
    lambda row: {key: row[key] for key in ('user_id', 'item_id')}
)

# Flat datasets with one raw ID per purchase row (IDs repeat).
users = purchases.map(lambda row: row['user_id'])
items = purchases.map(lambda row: row['item_id'])

In [141]:
# Print the first ten interactions as plain Python/NumPy values.
first_interactions = interactions.take(10)
for interaction in first_interactions.as_numpy_iterator():
    print(interaction)

{'user_id': b'15640', 'item_id': b'23382'}
{'user_id': b'15159', 'item_id': b'79321'}
{'user_id': b'14667', 'item_id': b'22470'}
{'user_id': b'14527', 'item_id': b'21080'}
{'user_id': b'13694', 'item_id': b'21633'}
{'user_id': b'15194', 'item_id': b'22867'}
{'user_id': b'13408', 'item_id': b'23205'}
{'user_id': b'14715', 'item_id': b'82552'}
{'user_id': b'17523', 'item_id': b'22751'}
{'user_id': b'12748', 'item_id': b'22170'}


In [159]:
def _fit_string_lookup(id_dataset):
    """Create a `StringLookup` layer without a mask token and adapt it to `id_dataset`."""
    lookup = tf.keras.layers.StringLookup(mask_token=None)
    lookup.adapt(id_dataset)
    return lookup


# String ID -> integer index, one vocabulary per ID type.
user_ids_vocabulary = _fit_string_lookup(users)
item_ids_vocabulary = _fit_string_lookup(items)

In [162]:
# Sanity-check the user lookup with two customer IDs and one string that is
# not a customer ID.
# The batch gets its own name: binding it to `data` would overwrite the
# DataFrame loaded at the top of the notebook, which other cells still index
# by column (e.g. data['StockCode'], data['CustomerID']).
sample_user_ids = tf.constant([["12748", "Hola amigos", "15640"]])
user_ids_vocabulary(sample_user_ids)

<tf.Tensor: shape=(1, 3), dtype=int64, numpy=array([[ 4,  0, 85]])>

In [165]:
# Reverse lookup: fetch the vocabulary string stored at integer index 85.
user_ids_vocabulary.get_vocabulary()[85]

'15640'

In [75]:
class ECommerceModel(tfrs.Model):
    """Retrieval model that combines a user model, an item model and a task.

    Deriving from ``tfrs.Model`` means only the sub-models and the loss
    computation have to be defined here.
    """

    def __init__(
        self,
        user_model: tf.keras.Model,
        item_model: tf.keras.Model,
        task: tfrs.tasks.Retrieval,
    ):
        """Keep references to the two sub-models and the retrieval task.

        Args:
            user_model: Model applied to ``features['user_id']``.
            item_model: Model applied to ``features['item_id']``.
            task: Retrieval task called with both embeddings to obtain the loss.
        """
        super().__init__()

        # User and item representations.
        self.user_model = user_model
        self.item_model = item_model

        # Retrieval objective.
        self.task = task

    def compute_loss(
        self, features: Dict[Text, tf.Tensor], training=False
    ) -> tf.Tensor:
        """Return the task's output for one batch of interactions.

        Args:
            features: Mapping that must contain the keys ``'user_id'`` and
                ``'item_id'``.
            training: Accepted for interface compatibility; this
                implementation does not read it.

        Returns:
            Whatever ``self.task`` returns for the user and item embeddings.
        """
        return self.task(
            self.user_model(features['user_id']),
            self.item_model(features['item_id']),
        )

In [76]:
# Width of the user and item embedding vectors (shared by both models).
EMBEDDING_DIM = 64

# Define the user and item models: string ID -> integer index -> embedding.
user_model = tf.keras.Sequential([
    user_ids_vocabulary,
    tf.keras.layers.Embedding(user_ids_vocabulary.vocabulary_size(), EMBEDDING_DIM)
])

item_model = tf.keras.Sequential([
    item_ids_vocabulary,
    tf.keras.layers.Embedding(item_ids_vocabulary.vocabulary_size(), EMBEDDING_DIM)
])

# Define the objective.
# `items` holds one entry per purchase row, so the same item ID appears many
# times. De-duplicate it first so each item is embedded once and appears once
# in the candidate set used by the top-K metrics.
candidate_embeddings = items.unique().batch(128).map(item_model)
task = tfrs.tasks.Retrieval(
    metrics=tfrs.metrics.FactorizedTopK(candidate_embeddings)
)

In [77]:
# Assemble the retrieval model from the two sub-models and the task.
model = ECommerceModel(user_model=user_model, item_model=item_model, task=task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.5))

# Three epochs over the interactions, 4096 rows per batch.
training_batches = interactions.batch(4096)
model.fit(training_batches, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f231e471790>

In [93]:
# Show vocabulary entries 10 through 19 of the user lookup
# (presumably to pick a known customer ID for the recommendation cells).
user_ids_vocabulary.get_vocabulary()[10:20]

['14159',
 '14156',
 '14796',
 '14298',
 '14056',
 '15039',
 '18118',
 '17511',
 '16033',
 '16931']

In [178]:
# Use brute-force search to set up retrieval using the trained representations.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)

# Build the candidates from `items` (the per-purchase item-ID dataset),
# de-duplicated so every item is indexed once. The previous version read
# `unique_items`, which is only created in a later cell and is therefore
# undefined when the notebook is run top to bottom.
candidate_item_ids = items.unique().batch(128)
index.index_from_dataset(
    candidate_item_ids.map(lambda item_id: (item_id, model.item_model(item_id)))
)

# Get some recommendations. The message is derived from the actual query so
# it cannot drift from what is printed.
query_user_id = "16931"
top_k = 10
scores, titles = index(np.array([query_user_id]))
print(f"Top {top_k} recommendations for user {query_user_id}: {titles[0, :top_k]}")

Top 3 recommendations for user 42: [b'20773' b'20854' b'23140' b'21879' b'23004' b'85144' b'22175' b'21098'
 b'21616' b'37413']


In [174]:
# Distinct stock codes, in order of first appearance in `data`.
unique_item_ids = data['StockCode'].unique()

# Dataset of {'item_id': <code>} elements, one per distinct item.
unique_items = tf.data.Dataset.from_tensor_slices({'item_id': unique_item_ids})

# Display the first element as a quick check.
list(unique_items.take(1))

[{'item_id': <tf.Tensor: shape=(), dtype=string, numpy=b'23382'>}]

In [175]:
user_id = '16931'

# Purchase history of the user, oldest first; show the ten most recent rows.
past_purchases = data[data['CustomerID'] == user_id].sort_values(by='InvoiceDate')
print(f"History :\n {past_purchases[['StockCode', 'Description']].tail(10)}\n")

# Use brute-force search to set up retrieval using the trained representations.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
index.index_from_dataset(
    unique_items
    .map(lambda x: x['item_id'])
    .batch(128)
    .map(lambda item_id: (item_id, model.item_model(item_id)))
)

# Get some recommendations.
# The returned identifiers are bound to `recommended_item_ids`, not `items`:
# `items` is the tf.data dataset of item IDs that the vocabulary and task
# cells read, and overwriting it with a tensor breaks those cells on re-run.
scores, recommended_item_ids = index(np.array([user_id]))

recommendations = pd.DataFrame()
# Identifiers come back as bytes; they are decoded only for the description lookup.
recommendations['Item_id'] = recommended_item_ids.numpy().flatten()
recommendations['Description'] = (
    recommendations['Item_id']
    .apply(lambda raw_id: raw_id.decode('utf-8'))
    .map(item_to_description)
)
recommendations['Scores'] = scores.numpy().flatten()

print(f'Recommendations {recommendations}')

History :
        StockCode                         Description
412544     23319         BOX OF 6 MINI 50'S CRACKERS
428797     22306  SILVER MUG BONE CHINA TREE OF LIFE
484926     22110         BIRD HOUSE HOT WATER BOTTLE
484909     23210    WHITE ROCKING HORSE HAND PAINTED
484940     23439          HAND WARMER RED LOVE HEART
523185     21616     4 PEAR BOTANICAL DINNER CANDLES
523190     23202              JUMBO BAG VINTAGE LEAF
523184     23265    SET OF 3 WOODEN TREE DECORATIONS
523172     22422                 TOOTHPASTE TUBE PEN
523137     21098               CHRISTMAS TOILET ROLL

Recommendations     Item_id                      Description    Scores
0  b'20773'            BLUE PAISLEY NOTEBOOK  8.819242
1  b'20854'      BLUE PATCH PURSE PINK HEART  8.212093
2  b'23140'     TRIPLE WIRE HOOK IVORY HEART  7.386757
3  b'21879'                 HEARTS GIFT TAPE  6.392712
4  b'23004'        TRAVEL CARD WALLET PANTRY  6.013500
5  b'85144'  JARDIN ETCHED GLASS CHEESE DISH  5.903848
6  b