In [None]:
from datetime import datetime
import os
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

from scipy import sparse
from scipy.sparse import csc_matrix
from sklearn.decomposition import TruncatedSVD

np.random.seed(0)

###   Dataset



In [None]:
# Upgrade gdown, the Google Drive downloader invoked by the next cell.
!pip install --upgrade --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.6.0-py3-none-any.whl (14 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.4.0
    Uninstalling gdown-4.4.0:
      Successfully uninstalled gdown-4.4.0
Successfully installed gdown-4.6.0


In [None]:
# Download the dataset archive from Google Drive by file id; the recorded
# output shows it being saved as /content/ecommerse.zip.
!gdown https://drive.google.com/uc?id=14ToX4hgP27VE7RIevQMbA-n6KTkhzQP2


Downloading...
From: https://drive.google.com/uc?id=14ToX4hgP27VE7RIevQMbA-n6KTkhzQP2
To: /content/ecommerse.zip
100% 44.7M/44.7M [00:05<00:00, 7.80MB/s]


In [None]:
# Unpack the archive into /content, overwriting existing files (-o).
!unzip  -o  "ecommerse.zip"  -d  "/content"

Archive:  ecommerse.zip
  inflating: /content/olist_customers_dataset.csv  
  inflating: /content/olist_geolocation_dataset.csv  
  inflating: /content/olist_order_items_dataset.csv  
  inflating: /content/olist_order_payments_dataset.csv  
  inflating: /content/olist_order_reviews_dataset.csv  
  inflating: /content/olist_orders_dataset.csv  
  inflating: /content/olist_products_dataset.csv  
  inflating: /content/olist_sellers_dataset.csv  
  inflating: /content/product_category_name_translation.csv  


In [None]:
# Load every raw table from the working directory into its own DataFrame.
# The files are read in the order listed; names and files pair up by position.
(
    orders_df,
    payment_df,
    customers_df,
    review_df,
    product_df,
    orderitems_df,
    catagory_df,
    seller_df,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        'olist_orders_dataset.csv',
        'olist_order_payments_dataset.csv',
        'olist_customers_dataset.csv',
        'olist_order_reviews_dataset.csv',
        'olist_products_dataset.csv',
        'olist_order_items_dataset.csv',
        'product_category_name_translation.csv',
        'olist_sellers_dataset.csv',
    )
)

In [None]:
# Enrich every order with the attributes of the customer who placed it.
order_customer = pd.merge(orders_df, customers_df, on='customer_id')
order_customer.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,7c396fd4830fd04220f754e42b4e5bff,3149,sao paulo,SP
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00,af07308b275d755c9edb36a90c618231,47813,barreiras,BA
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00,3a653a41f6f9fc3d2a113cf8398680e8,75265,vianopolis,GO
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00,7c142cf63193a1473d2e66489a9ae977,59296,sao goncalo do amarante,RN
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00,72632f0f9dd73dfee390c9b22eb56dd6,9195,santo andre,SP


In [None]:
# One row per purchased item, carrying the columns of its parent order.
order_item = pd.merge(orders_df, orderitems_df, on='order_id')
order_item.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,1,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,2017-10-06 11:07:15,29.99,8.72
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00,1,595fac2a385ac33a80bd5114aec74eb8,289cdb325fb7e7f891c38608bf9e0962,2018-07-30 03:24:27,118.7,22.76
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00,1,aa4383b373c6aca5d8797843e5594415,4869f7a5dfa277a7dca6462dcf3b52b2,2018-08-13 08:55:23,159.9,19.22
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15 00:00:00,1,d0b61bfb1de832b15ba9d266ca96e5b0,66922902710d126a0e7d26b0e3805106,2017-11-23 19:45:59,45.0,27.2
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26 00:00:00,1,65266b2da20d04dbe00c5c2d3bb7859e,2c9e548be18521d1c43cde1c582c6de8,2018-02-19 20:31:37,19.9,8.72


In [None]:
# Add the review columns to the order/customer table, joined on order_id.
# NOTE: this overwrites its own input, so re-running the cell on its own joins
# the reviews a second time.
order_customer = pd.merge(order_customer, review_df, on='order_id')

In [None]:
# Join payments onto the reviewed orders. An order with several payment rows
# is repeated once per payment (see the first three rows of the preview).
df_full = pd.merge(order_customer, payment_df, on='order_id')
df_full.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,customer_unique_id,customer_zip_code_prefix,...,review_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp,payment_sequential,payment_type,payment_installments,payment_value
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,7c396fd4830fd04220f754e42b4e5bff,3149,...,a54f0611adc9ed256b57ede6b6eb5114,4,,"Não testei o produto ainda, mas ele veio corre...",2017-10-11 00:00:00,2017-10-12 03:43:48,1,credit_card,1,18.12
1,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,7c396fd4830fd04220f754e42b4e5bff,3149,...,a54f0611adc9ed256b57ede6b6eb5114,4,,"Não testei o produto ainda, mas ele veio corre...",2017-10-11 00:00:00,2017-10-12 03:43:48,3,voucher,1,2.0
2,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,7c396fd4830fd04220f754e42b4e5bff,3149,...,a54f0611adc9ed256b57ede6b6eb5114,4,,"Não testei o produto ainda, mas ele veio corre...",2017-10-11 00:00:00,2017-10-12 03:43:48,2,voucher,1,18.59
3,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00,af07308b275d755c9edb36a90c618231,47813,...,8d5266042046a06655c8db133d120ba5,4,Muito boa a loja,Muito bom o produto.,2018-08-08 00:00:00,2018-08-08 18:37:50,1,boleto,1,141.46
4,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00,3a653a41f6f9fc3d2a113cf8398680e8,75265,...,e73b67b67587f7644d5bd1a52deb1b01,5,,,2018-08-18 00:00:00,2018-08-22 19:07:58,1,credit_card,3,179.12


In [None]:
# Attach the purchased items. order_item already contains the order columns,
# so the overlapping names come back with _x/_y suffixes.
df_full = pd.merge(df_full, order_item, on='order_id')

In [None]:
# Look up the product attributes (category, dimensions, ...) of each item row.
df_full = pd.merge(df_full, product_df, on='product_id')

In [None]:
#Removing cancelled orders
#df_full = df_full[df_full['order_status_y'] == 'delivered']
#df_full['order_status_y'].value_counts()

In [None]:
# Preview the fully joined table.
df_full.head(2)

Unnamed: 0,order_id,customer_id_x,order_status_x,order_purchase_timestamp_x,order_approved_at_x,order_delivered_carrier_date_x,order_delivered_customer_date_x,order_estimated_delivery_date_x,customer_unique_id,customer_zip_code_prefix,...,price,freight_value,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,7c396fd4830fd04220f754e42b4e5bff,3149,...,29.99,8.72,utilidades_domesticas,40.0,268.0,4.0,500.0,19.0,8.0,13.0
1,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,7c396fd4830fd04220f754e42b4e5bff,3149,...,29.99,8.72,utilidades_domesticas,40.0,268.0,4.0,500.0,19.0,8.0,13.0


In [None]:
# Lift pandas' display limits: None means "no cap" for both columns and rows.
for option_name in ("display.max_columns", "display.max_rows"):
    pd.set_option(option_name, None)

In [None]:
# Keep only the columns the recommenders use. The joins above fan rows out
# (for example once per payment row of an order), and the columns that told
# those copies apart are dropped by this projection, leaving exact duplicates.
# Remove them: otherwise the very same (user, item, rating) observation can end
# up in both the training and the test split and inflate the metrics.
df_full = df_full[[
    'order_id',
    'customer_unique_id',
    'product_id',
    'product_category_name',
    'customer_zip_code_prefix',
    'customer_city',
    'review_score',
    'review_creation_date',
]].drop_duplicates()

In [None]:
# Shorten the column names used by the rest of the notebook. rename() returns
# a new frame, so the result is rebound instead of mutating the column slice in
# place; the former 'order_id' -> 'order_id' entry was a no-op and is omitted.
df_full = df_full.rename(columns={
    'product_category_name': 'category',
    'customer_unique_id': 'user_id',
    'review_creation_date': 'timestamp',
    'customer_zip_code_prefix': 'zip',
    'customer_city': 'city',
})

In [None]:
# Products are referred to as "itemID" from here on.
df_full.rename(columns=dict(product_id='itemID'), inplace=True)

In [None]:
# Show the first two rows with the renamed columns in reading order.
df_full.head(2)[[
    'order_id', 'timestamp', 'user_id', 'city', 'category', 'itemID', 'review_score',
]]

Unnamed: 0,order_id,timestamp,user_id,city,category,itemID,review_score
0,e481f51cbdc54678b7cc49136f2d6af7,2017-10-11 00:00:00,7c396fd4830fd04220f754e42b4e5bff,sao paulo,utilidades_domesticas,87285b34884572647811a353c7ac498a,4
1,e481f51cbdc54678b7cc49136f2d6af7,2017-10-11 00:00:00,7c396fd4830fd04220f754e42b4e5bff,sao paulo,utilidades_domesticas,87285b34884572647811a353c7ac498a,4


In [None]:
# Keep only the reviews scored 3 or higher.
df_full = df_full.loc[df_full['review_score'] >= 3]

In [None]:

# Store both identifiers as strings. astype() on the whole frame returns a new
# DataFrame, which avoids writing into the filtered slice and the
# SettingWithCopyWarning that the column-wise assignment produced.
df_full = df_full.astype({'user_id': str, 'itemID': str})



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [None]:
# Preview the filtered frame after the identifier columns were cast to str.
df_full.head(2)

Unnamed: 0,order_id,user_id,itemID,category,zip,city,review_score,timestamp
0,e481f51cbdc54678b7cc49136f2d6af7,7c396fd4830fd04220f754e42b4e5bff,87285b34884572647811a353c7ac498a,utilidades_domesticas,3149,sao paulo,4,2017-10-11 00:00:00
1,e481f51cbdc54678b7cc49136f2d6af7,7c396fd4830fd04220f754e42b4e5bff,87285b34884572647811a353c7ac498a,utilidades_domesticas,3149,sao paulo,4,2017-10-11 00:00:00


In [None]:
# rating_df is a second name for the same DataFrame object (no copy is made),
# so the column assignments on rating_df below also change df_full.
rating_df=df_full

In [None]:
# Preview the ratings table before the identifiers are re-indexed.
rating_df.head(2)

Unnamed: 0,order_id,user_id,itemID,category,zip,city,review_score,timestamp
0,e481f51cbdc54678b7cc49136f2d6af7,7c396fd4830fd04220f754e42b4e5bff,87285b34884572647811a353c7ac498a,utilidades_domesticas,3149,sao paulo,4,2017-10-11 00:00:00
1,e481f51cbdc54678b7cc49136f2d6af7,7c396fd4830fd04220f754e42b4e5bff,87285b34884572647811a353c7ac498a,utilidades_domesticas,3149,sao paulo,4,2017-10-11 00:00:00


In [None]:
# Distinct identifiers present in the ratings table. Series.unique() already
# removes repeats, so the extra list(set(list(...))) round trip is dropped.
item_ids = rating_df['itemID'].unique().tolist()
user_ids = rating_df['user_id'].unique().tolist()
order_ids = rating_df['order_id'].unique().tolist()

In [None]:
# Give every identifier a dense integer index, numbered in sorted order
# starting from 0.
dict_users = {raw_id: position for position, raw_id in enumerate(sorted(user_ids))}
dict_items = {raw_id: position for position, raw_id in enumerate(sorted(item_ids))}
dict_orders = {raw_id: position for position, raw_id in enumerate(sorted(order_ids))}

In [None]:
# Replace the raw hash identifiers with their integer indices.
# NOTE: not idempotent - running this cell twice maps the already-mapped
# integers through the dictionaries again and yields NaN.
for column_name, id_lookup in (
    ('user_id', dict_users),
    ('itemID', dict_items),
    ('order_id', dict_orders),
):
    rating_df[column_name] = rating_df[column_name].map(id_lookup)

In [None]:
# Check the integer item indices produced by the mapping above.
rating_df['itemID'].head(2)

0    15555
1    15555
Name: itemID, dtype: int64

In [None]:
# Drop rows with a missing value in any column. The explicit copy() makes
# rating_df an independent frame, so the renames and column assignments that
# follow no longer raise SettingWithCopyWarning.
rating_df = rating_df.dropna().copy()

In [None]:
# Use the column name the models expect. Rebinding the result of rename()
# (instead of inplace=True) avoids the SettingWithCopyWarning that the in-place
# call raised on this frame.
rating_df = rating_df.rename(columns={'user_id': 'userID'})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [None]:
# The review score is the label the ranking models predict.
rating_df.rename(columns=dict(review_score='rating'), inplace=True)

In [None]:

# Cast every categorical feature to str. The lookup layers of the models below
# are string lookups, and the final top-k query passes "zip" as a string, but
# "zip" was previously left as an integer column. Building a new frame with
# astype() also avoids the SettingWithCopyWarning raised by the former
# column-wise assignments.
rating_df = rating_df.astype({"itemID": str, "userID": str, "zip": str})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating_df["itemID"] = rating_df["itemID"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rating_df["userID"] = rating_df["userID"].astype(str)


In [None]:
# unique_users = rating_df.userID.unique()
# user_ids = dict(zip(unique_users, np.arange(unique_users.shape[0], dtype=np.int32)))

# unique_items = rating_df.itemID.unique()
# item_ids = dict(zip(unique_items, np.arange(unique_items.shape[0], dtype=np.int32)))

In [None]:
# Distinct item and user identifiers after re-indexing. Series.unique() already
# de-duplicates, so no set()/list() round trip is needed.
product_ids = rating_df['itemID'].unique().tolist()
user_ids = rating_df['userID'].unique().tolist()

### Tensorflow-recommenders

In [None]:
!pip install tensorflow-recommenders==0.6.0

import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_recommenders as tfrs

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-recommenders==0.6.0
  Downloading tensorflow_recommenders-0.6.0-py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.8/85.8 KB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow-recommenders
Successfully installed tensorflow-recommenders-0.6.0


### Prepare dataset for tensorflow recommenders

In [None]:


## item features
# One row per distinct item id (a single-column DataFrame, despite the name).
items_dict = rating_df.loc[:, ['itemID']].drop_duplicates()
# Dataset of {"itemID": ...} elements, one per distinct item.
items = tf.data.Dataset.from_tensor_slices(
    {column_name: column for column_name, column in items_dict.items()}
)

## map the features in interactions and items


In [None]:
# ratings = tf.data.Dataset.from_tensor_slices(dict(rating_df)).map(lambda x: {
#     "userID": x["userID"],
#     "itemID": x["itemID"],
#     "rating": x["rating"]
# })
# Columns fed to the models: the two ids, the zip feature and the label.
interactions_dict = rating_df.loc[:, ['userID', 'itemID', 'zip', 'rating']]


In [None]:
# Dataset of per-interaction feature dictionaries, keyed by column name.
ratings = tf.data.Dataset.from_tensor_slices(
    {column_name: column for column_name, column in interactions_dict.items()}
)


In [None]:
def _select_item_id(item_features):
    """Return the bare item id of an {"itemID": ...} element."""
    return item_features['itemID']


# Dataset of plain item ids; used as the retrieval candidate set.
itemlists = items.map(_select_item_id)

In [None]:
def _unique_feature(dataset, feature, batch_size):
    """Return the distinct values of one feature of a dict-structured dataset."""
    per_batch = dataset.batch(batch_size).map(lambda row: row[feature])
    return np.unique(np.concatenate(list(per_batch)))


# Vocabularies for the lookup layers of the models defined below.
unique_zip = _unique_feature(ratings, "zip", 1_000)
unique_userIds = _unique_feature(ratings, "userID", 1_000)
unique_productIds = _unique_feature(items, "itemID", 1000)

In [None]:

#items =ratings.batch(rating_df.shape[0]).map(lambda x: x["itemID"])
# user_ids = ratings.batch(rating_df.shape[0]).map(lambda x: x["userID"])
# unique_user_ids = np.unique(np.concatenate(list(user_ids)))
# unique_item_ids = np.unique(np.concatenate(list(items)))
# products_dict = rating_df[['itemID']].drop_duplicates()
# products_dict = {name: np.array(value) for name, value in products_dict.items()}
# products = tf.data.Dataset.from_tensor_slices(products_dict)
# products = products.map(lambda x: x['itemID'])


In [None]:
# userIds    = rating_df.userID.unique()
# productIds = rating_df.itemID.unique()
# total_ratings= len(rating_df.index)

In [None]:
# ratings = tf.data.Dataset.from_tensor_slices( {"userID":tf.cast( rating_df.userID.values  ,tf.string),
#                                 "itemID":tf.cast( rating_df.itemID.values,tf.string),
#                                 "rating":tf.cast( rating_df.rating.values  ,tf.int8,) } )

In [None]:
# 80/20 split of the interactions after a single, seeded shuffle that is not
# repeated between iterations (so train and test never overlap).
tf.random.set_seed(42)
total_ratings = len(rating_df.index)
train_count = int(total_ratings * 0.8)
test_count = int(total_ratings * 0.2)
shuffled = ratings.shuffle(total_ratings, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(train_count)
test = shuffled.skip(train_count).take(test_count)

# Batched and cached pipelines passed to fit() / evaluate().
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

### Tensorflow Ranking 

In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs
# Build a model.
class RankingModel(tf.keras.Model):
    """Predict a rating from a (user id, item id) pair.

    Each id goes through its own string lookup + embedding stack; the two
    embeddings are concatenated and passed through a small dense network that
    outputs one value per example.
    """

    def __init__(self):
        super().__init__()
        embedding_dimension = 32

        # NOTE: the user vocabulary is `unique_userIds`; `unique_user_ids` is
        # never assigned by live code in this notebook, so referencing it
        # raised NameError on a fresh kernel.
        self.user_embeddings = tf.keras.Sequential([
                                    tf.keras.layers.experimental.preprocessing.StringLookup(
                                        vocabulary=unique_userIds, mask_token=None),
                                    # One additional embedding row accounts for unknown tokens.
                                    tf.keras.layers.Embedding(len(unique_userIds)+1, embedding_dimension)
                                    ])

        self.product_embeddings = tf.keras.Sequential([
                                    tf.keras.layers.experimental.preprocessing.StringLookup(
                                        vocabulary=unique_productIds, mask_token=None),
                                    # One additional embedding row accounts for unknown tokens.
                                    tf.keras.layers.Embedding(len(unique_productIds)+1, embedding_dimension)
                                    ])
        # Dense stack that maps the concatenated embeddings to a single
        # predicted rating.
        self.ratings = tf.keras.Sequential([
                            tf.keras.layers.Dense(256, activation="relu"),
                            tf.keras.layers.Dense(128,  activation="relu"),
                            tf.keras.layers.Dense(1)
                              ])

    def call(self, userId, productId):
        """Return the predicted rating for each (userId, productId) pair."""
        user_embeddings  = self.user_embeddings (userId)
        product_embeddings = self.product_embeddings(productId)
        return self.ratings(tf.concat([user_embeddings,product_embeddings], axis=1))

# Build a model.
class BuildModelRanking(tfrs.models.Model):
    """TFRS wrapper that trains RankingModel with a mean-squared-error loss."""

    def __init__(self):
        super().__init__()
        # Network that produces the rating predictions.
        self.ranking_model: tf.keras.Model = RankingModel()
        # Ranking task: MSE as the loss, RMSE as the reported metric.
        rmse_metric = tf.keras.metrics.RootMeanSquaredError()
        self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[rmse_metric],
        )

    def compute_loss(self, features, training=False):
        """Return the ranking loss for one batch of feature dictionaries.

        Reads the "userID", "itemID" and "rating" entries of ``features``.
        """
        predicted = self.ranking_model(features["userID"], features["itemID"])
        return self.task(labels=features["rating"], predictions=predicted)

In [None]:
# Train the ranking model for 20 epochs with RMSprop.
modelranking = BuildModelRanking()
ranking_optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.01)
modelranking.compile(optimizer=ranking_optimizer)

ranking_history = modelranking.fit(cached_train, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Evaluate the ranking model on the held-out batches; return_dict=True asks
# for the metrics as a name -> value mapping.
metrics = modelranking.evaluate(cached_test, return_dict=True)




### Multi-task: both retrieval and ranking

In [None]:
class BuildModelMultitask(tfrs.models.Model):
  """Joint model trained on a rating-prediction and a retrieval objective.

  User and item ids are embedded by separate lookup + embedding stacks. The
  embeddings feed the retrieval task directly and, concatenated, a dense stack
  that predicts the rating. The two losses are combined with fixed weights.
  """

  def __init__(self, rating_weight: float, retrieval_weight: float) -> None:
    """Build the sub-models and tasks.

    Args:
      rating_weight: multiplier applied to the rating (MSE) loss.
      retrieval_weight: multiplier applied to the retrieval loss.
    """
    # We take the loss weights in the constructor: this allows us to instantiate
    # several model objects with different loss weights.

    super().__init__()

    embedding_dimension = 32

    # User and item models. Each embedding table has one extra row for ids
    # outside the vocabulary.
    self.item_model: tf.keras.layers.Layer = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_productIds, mask_token=None),
      tf.keras.layers.Embedding(len(unique_productIds) + 1, embedding_dimension)
    ])
    # NOTE: the user vocabulary is `unique_userIds`; `unique_user_ids` is never
    # assigned by live code in this notebook, so referencing it raised
    # NameError on a fresh kernel.
    self.user_model: tf.keras.layers.Layer = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_userIds, mask_token=None),
      tf.keras.layers.Embedding(len(unique_userIds) + 1, embedding_dimension)
    ])

    # Model to take in user and item embeddings and predict ratings.
    self.rating_model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, activation="relu"),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(1),
    ])

    # The tasks.
    self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )
    # Top-k metrics are computed against the embeddings of every item id.
    self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=itemlists.batch(128).map(self.item_model)
        )
    )

    # The loss weights.
    self.rating_weight = rating_weight
    self.retrieval_weight = retrieval_weight

  def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
    """Return (user embeddings, item embeddings, rating predictions).

    Only the "userID" and "itemID" entries of ``features`` are read.
    """
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features["userID"])
    # And pick out the item features and pass them into the item model.
    item_embeddings = self.item_model(features["itemID"])

    return (
        user_embeddings,
        item_embeddings,
        # We apply the multi-layered rating model to a concatenation of
        # user and item embeddings.
        self.rating_model(
            tf.concat([user_embeddings, item_embeddings], axis=1)
        ),
    )

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Return the weighted sum of the rating loss and the retrieval loss."""
    # Read the label by key instead of pop(): the caller's feature dictionary
    # is left untouched.
    ratings = features["rating"]

    user_embeddings, item_embeddings, rating_predictions = self({
        "userID": features["userID"],
        "itemID": features["itemID"],
    })

    # We compute the loss for each task.
    rating_loss = self.rating_task(
        labels=ratings,
        predictions=rating_predictions,
    )
    retrieval_loss = self.retrieval_task(user_embeddings, item_embeddings)

    # And combine them using the loss weights.
    return (self.rating_weight * rating_loss
            + self.retrieval_weight * retrieval_loss)

In [None]:
# The rating loss is weighted ten times more heavily than the retrieval loss.
modelmultitask = BuildModelMultitask(rating_weight=5.0, retrieval_weight=0.5)
multitask_optimizer = tf.keras.optimizers.RMSprop(0.01)
modelmultitask.compile(optimizer=multitask_optimizer)

In [None]:
# Fit for three epochs, then report both objectives on the held-out batches.
modelhist_multitask = modelmultitask.fit(cached_train, epochs=3)
metrics = modelmultitask.evaluate(cached_test, return_dict=True)

top_100_accuracy = metrics['factorized_top_k/top_100_categorical_accuracy']
ranking_rmse = metrics['root_mean_squared_error']
print(f"Retrieval top-100 accuracy: {top_100_accuracy:.3f}.")
print(f"Ranking RMSE: {ranking_rmse:.3f}.")

Epoch 1/3
Epoch 2/3
Epoch 3/3
Retrieval top-100 accuracy: 0.192.
Ranking RMSE: 1.228.


## Deep & Cross Network

In [None]:
# Preview the ratings table that feeds the model in this section.
rating_df.head(5)

Unnamed: 0,order_id,userID,itemID,category,zip,city,rating,timestamp
0,75205,39668,15555,utilidades_domesticas,3149,sao paulo,4,2017-10-11 00:00:00
1,75205,39668,15555,utilidades_domesticas,3149,sao paulo,4,2017-10-11 00:00:00
2,75205,39668,15555,utilidades_domesticas,3149,sao paulo,4,2017-10-11 00:00:00
3,6126,18623,15555,utilidades_domesticas,3366,sao paulo,4,2017-08-19 00:00:00
4,4786,76179,15555,utilidades_domesticas,2290,sao paulo,5,2017-08-08 00:00:00


In [None]:
# Training hyper-parameters.
# NOTE(review): no later cell visible in this notebook reads these two names;
# the fit()/optimizer calls below pass their own literals - confirm whether
# they are still needed.
epochs = 20
learning_rate = 0.01

In [None]:
# rating_df['userID']=rating_df['userID'].astype(str)
# rating_df['itemID']=rating_df['itemID'].astype(str)
# rating_df['zip']=rating_df['zip'].astype(str)

In [None]:
# interactions_dict = rating_df.groupby(['userID', 
#                                       'itemID',
#                                       'category'
#                                       ,'zip'
#                                       ])[ 'rating'].sum().reset_index()


In [None]:
# #interactions_dict = {name: np.array(value) for name, value in interactions_dict.items()}
# interactions = tf.data.Dataset.from_tensor_slices(dict(interactions_dict))

In [None]:


# ## item features
# items_dict = rating_df[['itemID']].drop_duplicates()
# #items_dict = {name: np.array(value) for name, value in items_dict.items()}
# items = tf.data.Dataset.from_tensor_slices(dict(items_dict))

# ## map the features in interactions and items

# # Select the basic features.
# interactions = interactions.map(lambda x: {
#     'userID' : str(x['userID']), 
#     'zip' : str(x['zip']),
#     'category' : str(x['category']),
#     'itemID' : str(x['itemID']), 
#     'rating' : int(x['rating']),
    
# })



In [None]:
# items = items.map(lambda x: str(x['itemID']))
# category = interactions.map(lambda x: str(x['category']))
# zip = interactions.map(lambda x: str(x['zip']))

In [None]:
# unique_items = np.unique(np.concatenate(list(items.batch(1000))))
# unique_user_ids = np.unique(np.concatenate(list(interactions.batch(1_000).map(lambda x: x["userID"]))))
# unique_zip = np.unique(np.concatenate(list(interactions.batch(1_000).map(lambda x: x["zip"]))))

In [None]:
# Number of interactions and the 80% / 20% split sizes, truncated to whole
# examples.
total = ratings.cardinality().numpy()
train_size = int(total * .8)
test_size = int(total * .2)

In [None]:
# Re-create the seeded, non-repeating shuffle and carve out the two splits.
tf.random.set_seed(42)
shuffled = ratings.shuffle(buffer_size=total, seed=42, reshuffle_each_iteration=False)

train, test = shuffled.take(train_size), shuffled.skip(train_size).take(test_size)

In [None]:
class UserModel(tf.keras.Model):
    """Embed the query-side features of an interaction: user id and zip code."""

    def __init__(self):
        super().__init__()

        self.embedding_dimension = 32

        # User id: string lookup followed by an embedding table with one extra
        # row for ids outside the vocabulary. The width now comes from
        # self.embedding_dimension instead of a second hard-coded 32.
        self.user_embedding = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(
                vocabulary=unique_userIds, mask_token=None),
            tf.keras.layers.Embedding(len(unique_userIds) + 1, self.embedding_dimension),
        ])

        # Zip code, embedded the same way.
        self.zip_embedding = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(
                vocabulary=unique_zip, mask_token=None),
            tf.keras.layers.Embedding(len(unique_zip) + 1, self.embedding_dimension),
        ])

    def call(self, inputs):
        """Return the user and zip embeddings concatenated along axis 1.

        ``inputs`` is a mapping with "userID" and "zip" entries.
        """
        # Take the input dictionary, pass it through each input layer,
        # and concatenate the result.
        return tf.concat([
            self.user_embedding(inputs["userID"]),
            self.zip_embedding(inputs["zip"]),
        ], axis=1)

In [None]:
class QueryModel(tf.keras.Model):
    """Query tower: user feature embeddings -> cross layer -> dense stack."""

    def __init__(self, layer_sizes, projection_dim=None):
        """Assemble the tower.

        layer_sizes: widths of the dense layers that follow the cross layer;
            all but the last use ReLU, the last one has no activation.
        projection_dim: forwarded to the cross layer.
        """
        super().__init__()

        # Produces the concatenated user feature embeddings.
        self.embedding_model = UserModel()

        # Cross layer first, then the hidden (ReLU) layers, then the linear
        # output layer.
        tower_layers = [
            tfrs.layers.dcn.Cross(projection_dim=projection_dim,
                                  kernel_initializer="glorot_uniform")
        ]
        tower_layers.extend(
            tf.keras.layers.Dense(width, activation="relu")
            for width in layer_sizes[:-1]
        )
        tower_layers.extend(
            tf.keras.layers.Dense(width) for width in layer_sizes[-1:]
        )
        self.dense_layers = tf.keras.Sequential(tower_layers)

    def call(self, inputs):
        """Embed the input features and run them through the tower."""
        return self.dense_layers(self.embedding_model(inputs))

In [None]:
class ItemModel(tf.keras.Model):
    """Embed item ids for the candidate side of the retrieval model."""

    def __init__(self):
        super().__init__()

        self.embedding_dimension = 32

        # String lookup followed by an embedding table with one extra row for
        # ids outside the vocabulary. (The unused local `max_tokens` was
        # removed.)
        self.item_embedding = tf.keras.Sequential([
          tf.keras.layers.experimental.preprocessing.StringLookup(
              vocabulary=unique_productIds,mask_token=None),
          tf.keras.layers.Embedding(len(unique_productIds) + 1, self.embedding_dimension)
        ])

    def call(self, items):
        """Return the embedding of each item id in ``items``."""
        # Concatenating a single tensor is a no-op, so the embedding is
        # returned directly.
        return self.item_embedding(items)

In [None]:
class CandidateModel(tf.keras.Model):
    """Candidate tower: item embeddings -> cross layer -> dense stack."""

    def __init__(self, layer_sizes, projection_dim=None):
        """Assemble the tower.

        layer_sizes: widths of the dense layers that follow the cross layer;
            all but the last use ReLU, the last one has no activation.
        projection_dim: forwarded to the cross layer.
        """
        super().__init__()

        # Produces the item embeddings.
        self.embedding_model = ItemModel()

        # Cross layer first, then the hidden (ReLU) layers, then the linear
        # output layer.
        tower_layers = [
            tfrs.layers.dcn.Cross(projection_dim=projection_dim,
                                  kernel_initializer="glorot_uniform")
        ]
        tower_layers.extend(
            tf.keras.layers.Dense(width, activation="relu")
            for width in layer_sizes[:-1]
        )
        tower_layers.extend(
            tf.keras.layers.Dense(width) for width in layer_sizes[-1:]
        )
        self.dense_layers = tf.keras.Sequential(tower_layers)

    def call(self, inputs):
        """Embed the item ids and run them through the tower."""
        return self.dense_layers(self.embedding_model(inputs))

In [None]:
class CrossDNNModel(tfrs.models.Model):
    """Retrieval model built from a QueryModel and a CandidateModel tower."""

    def __init__(self, layer_sizes, projection_dim=None ):
        """Build both towers and the tasks.

        layer_sizes: dense-layer widths shared by both towers.
        projection_dim: forwarded to the cross layer of both towers
            (it used to be accepted here but never passed on).
        """
        super().__init__()

        self.query_model : tf.keras.Model = QueryModel(
            layer_sizes, projection_dim=projection_dim)
        self.candidate_model : tf.keras.Model = CandidateModel(
            layer_sizes, projection_dim=projection_dim)

        ## rating and retrieval task.

        # Kept for backward compatibility; compute_loss() below only uses the
        # retrieval task.
        self.rating_task = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()],
        )

        # Top-k metrics are computed against the candidate-tower embedding of
        # every item id.
        self.retrieval_task : tf.keras.layers.Layer = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=itemlists.batch(128).map(self.candidate_model)
            )
        )

    def compute_loss(self, features, training=False):
        """Return the retrieval loss for one batch of feature dictionaries.

        Reads the "userID", "zip" and "itemID" entries of ``features``; the
        dictionary itself is not modified.
        """
        # We only pass the user id and zip features into the query model. This
        # is to ensure that the training inputs would have the same keys as the
        # query inputs. Otherwise the discrepancy in input structure would cause an
        # error when loading the query model after saving it.
        query_embeddings = self.query_model({
            "userID": features["userID"],
            "zip": features["zip"],
        })

        item_embeddings = self.candidate_model(features["itemID"])

        # Run the retrieval task exactly once: the earlier duplicate call
        # repeated the loss and top-k metric computation for every batch.
        return self.retrieval_task(query_embeddings, item_embeddings)

In [None]:
# Batched and cached pipelines for the cross-network model.
cached_train = train.shuffle(100_000).batch(512).cache()
cached_test = test.batch(200).cache()

# Two dense layers per tower: 128 units (ReLU), then 32 units (linear).
dcn_layer_sizes = [128, 32]
model = CrossDNNModel(dcn_layer_sizes, projection_dim=None)



In [None]:
# Train for two epochs with Adam, then report top-k retrieval accuracy on the
# held-out batches.
model.compile(optimizer=tf.keras.optimizers.Adam(0.01))

modelhist_dcn = model.fit(cached_train, epochs=2)

metrics = model.evaluate(cached_test, return_dict=True)

for top_k in (100, 50, 10, 5, 1):
    top_k_accuracy = metrics[f'factorized_top_k/top_{top_k}_categorical_accuracy']
    print(f"Retrieval top-{top_k} accuracy: {top_k_accuracy:.3f}.")

Epoch 1/2
Epoch 2/2
Retrieval top-100 accuracy: 0.088.
Retrieval top-50 accuracy: 0.075.
Retrieval top-10 accuracy: 0.045.
Retrieval top-5 accuracy: 0.034.
Retrieval top-1 accuracy: 0.007.


In [None]:

# Top-k index: queries are encoded by the query tower and matched against the
# candidate-tower embedding of every item id.
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)
candidate_id_batches = itemlists.batch(100)
index.index_from_dataset(
    tf.data.Dataset.zip(
        (candidate_id_batches, candidate_id_batches.map(model.candidate_model))
    )
)

# Ask for ten recommendations for one example user / zip pair.
example_query = {
    "userID": np.array(['39668']),
    "zip": np.array(['3149']),
}
_, titles = index(example_query, k=10)
titles[0]

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b'15555', b'14537', b'13279', b'6671', b'27411', b'4687', b'12495',
       b'13171', b'9794', b'10898'], dtype=object)>