In [1]:
from datetime import datetime
import os
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

from scipy import sparse
from scipy.sparse import csc_matrix
from sklearn.decomposition import TruncatedSVD

np.random.seed(0)

### Setup and Load dataset

In [2]:

import pandas as pd
import os
from os.path import exists
import zipfile
import numpy as np

In [3]:
!pip install gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
!gdown https://drive.google.com/uc?id=1gfGS5iL8-8HxdWVtTHXLGWGfPZRvMj6x


Downloading...
From: https://drive.google.com/uc?id=1gfGS5iL8-8HxdWVtTHXLGWGfPZRvMj6x
To: /content/Music_InCarMusic.zip
  0% 0.00/152k [00:00<?, ?B/s]100% 152k/152k [00:00<00:00, 92.1MB/s]


In [5]:
!unzip "Music_InCarMusic.zip"  -d  "/content"

Archive:  Music_InCarMusic.zip
replace /content/Music_InCarMusic/Data_InCarMusic.xlsx? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: /content/Music_InCarMusic/Data_InCarMusic.xlsx  
  inflating: /content/Music_InCarMusic/ReadMe.txt  


In [6]:
!pip install pandas openpyxl

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:

import pandas as pd

xls = pd.ExcelFile(r"Music_InCarMusic/Data_InCarMusic.xlsx") 

rating_df = xls.parse(0) #2 is the sheet number+1 thus if the file has only 1 sheet write 0 in paranthesis
music_df = xls.parse(2)
context_df = xls.parse(1)
cat_df = xls.parse(3)

In [8]:

rating_df.rename(columns = {'UserID':'userID'}, inplace = True)
rating_df.rename(columns = {'ItemID':'itemID'}, inplace = True)
rating_df.rename(columns = {' Rating':'rating'}, inplace = True)


## Name of the file where user item interaction data avaiable?
- Column name should be in following name and order
- [ **itemID, userID , rating , [other features]** ]


### Dense columns

In [9]:
dense_col=None

### Sparse columns

In [10]:
rating_df.columns

Index(['userID', 'itemID', 'rating', 'DrivingStyle', 'landscape', 'mood',
       'naturalphenomena ', 'RoadType', 'sleepiness', 'trafficConditions',
       'weather'],
      dtype='object')

In [11]:
col_sparse= ['DrivingStyle', 'landscape', 'mood',
       'naturalphenomena ', 'RoadType', 'sleepiness', 'trafficConditions',
       'weather']


### User contexual features other than "user" column

In [12]:

user_col = ['DrivingStyle', 'landscape', 'mood',
       'naturalphenomena ', 'RoadType', 'sleepiness', 'trafficConditions',
       'weather']


In [13]:
for i in user_col:
  rating_df[i] = rating_df[i].fillna('NA')

In [14]:
rating_df=rating_df.reset_index(drop=True)

# Tensorflow recommenders

In [15]:
!pip install tensorflow-recommenders==0.6.0



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [16]:
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_recommenders as tfrs
from typing import Dict, Text

In [17]:
allcols=['userID','itemID','rating']
for i in user_col:
  allcols.append(i)


In [18]:
for i in allcols:
  rating_df[i]= rating_df[i].astype(str)

In [19]:
interactions_dict = rating_df[allcols]


In [20]:
ratings = tf.data.Dataset.from_tensor_slices(dict(interactions_dict))

In [21]:
## item features
items_dict = rating_df[['itemID']].drop_duplicates()
#items_dict = {name: np.array(value) for name, value in items_dict.items()}
items = tf.data.Dataset.from_tensor_slices(dict(items_dict))

In [22]:
itemlists = items.map(lambda x: x['itemID'])

In [23]:
userIds    = rating_df.userID.unique()
productIds = rating_df.itemID.unique()


unique column values

In [24]:
unique=dict()
for i in allcols:
 unique[i]= np.unique(np.concatenate(list(ratings.batch(1_000).map(lambda x: x[i]))))

In [25]:
unique_items = np.unique(np.concatenate(list(items.batch(1000).map(lambda x: x["itemID"]))))

In [26]:
total=ratings.__len__().numpy()
train_size=(int) (total * .8 )
test_size=(int) (total * .2 )

In [27]:
total_ratings= len(rating_df.index)
tf.random.set_seed(42)
shuffled = ratings.shuffle(int(total_ratings), seed=42, reshuffle_each_iteration=False)

train = shuffled.take( int(total_ratings*0.8) )
test = shuffled.skip(int(total_ratings*0.8)).take(int(total_ratings*0.2))
# unique_productIds = unique_items
# unique_userIds    = unique_user_ids
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()


# Deep cross Network

In [28]:
class UserModel(tf.keras.Model):

    def __init__(self):
        super().__init__()
        
        self.embedding_dimension = 64
        self.embedding=dict()
        max_tokens = 10_000

        ## user id
        for col in allcols:
          if col=='itemID' or col=='rating':
            continue
          else:
            
            self.embedding[col] = tf.keras.Sequential([
                                                    tf.keras.layers.experimental.preprocessing.StringLookup(
                                                    vocabulary=unique[i], mask_token=None),
                                                    tf.keras.layers.Embedding(len(unique[col]) + 1, 32),
                                                    ])



    def call(self, inputs):
        cols=allcols.copy()
        cols.remove('itemID')
        cols.remove('rating')
        # Take the input dictionary, pass it through each input layer,
        # and concatenate the result.
     
        
        res=tf.concat([
          self.embedding[i](inputs[i])
           for i in cols]
       
        , axis=1)
      
        return res

In [29]:
class QueryModel(tf.keras.Model):
    """Model for encoding user queries."""

    def __init__(self, layer_sizes, projection_dim=None):
        """Model for encoding user queries
        """
        super().__init__()

        # We first use the user model for generating embeddings.
        self.embedding_model = UserModel()
            

        # Then construct the layers.
        self.dense_layers = tf.keras.Sequential(tfrs.layers.dcn.Cross(projection_dim=projection_dim,
                                        kernel_initializer="glorot_uniform"))

        # Use the ReLU activation for all but the last layer.
        for layer_size in layer_sizes[:-1]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu"))

        # No activation for the last layer.
        for layer_size in layer_sizes[-1:]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size))

    def call(self, inputs):
        feature_embedding = self.embedding_model(inputs)
        return self.dense_layers(feature_embedding)

In [30]:
class ItemModel(tf.keras.Model):

    def __init__(self):
        super().__init__()
        
        self.embedding_dimension = 64

        #max_tokens = 10_000

        self.item_embedding = tf.keras.Sequential([
          tf.keras.layers.experimental.preprocessing.StringLookup(
              vocabulary=unique_items,mask_token=None),
          tf.keras.layers.Embedding(len(unique_items) + 1, self.embedding_dimension)
        ])

        # self.item_vectorizer = tf.keras.layers.experimental.preprocessing.TextVectorization(
        #     max_tokens=max_tokens)

        # self.item_vectorizer.adapt(items)
      
    def call(self, it):
        return tf.concat([
            self.item_embedding(it)
         
        ], axis=1)

In [31]:
class CandidateModel(tf.keras.Model):
    """Model for encoding items."""

    def __init__(self, layer_sizes, projection_dim=None):
        """Model for encoding items.

        
        """
        super().__init__()

        self.embedding_model = ItemModel()

         # Then construct the layers.
        self.dense_layers = tf.keras.Sequential(tfrs.layers.dcn.Cross(projection_dim=projection_dim,
                                                kernel_initializer="glorot_uniform"))

        # Use the ReLU activation for all but the last layer.
        for layer_size in layer_sizes[:-1]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu"))

        # No activation for the last layer.
        for layer_size in layer_sizes[-1:]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size))

    def call(self, inputs):
        feature_embedding = self.embedding_model(inputs)
        return self.dense_layers(feature_embedding)

In [32]:
class CrossDNNModel(tfrs.models.Model):

    def __init__(self, layer_sizes,projection_dim=None ):
        super().__init__()
        
        self.query_model : tf.keras.Model = QueryModel(layer_sizes)
        self.candidate_model : tf.keras.Model = CandidateModel(layer_sizes)
        
        ## rating and retrieval task.
        
        self.rating_task = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()],
        )
                 
        self.retrieval_task : tf.keras.layers.Layer = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=itemlists.batch(128).map(self.candidate_model)
            )
        )

      

    def compute_loss(self, features, training=False):
        
        # We only pass the user id and timestamp features into the query model. This
        # is to ensure that the training inputs would have the same keys as the
        # query inputs. Otherwise the discrepancy in input structure would cause an
        # error when loading the query model after saving it.
        ratings = features.pop("rating")
        cols=allcols.copy()
       
        cols.remove('itemID')
        cols.remove('rating')
        query_embeddings = self.query_model({
           i: features[i]
           for i in cols
           
        })
      
        item_embeddings = self.candidate_model(features["itemID"])       
        retrieval_loss = self.retrieval_task(query_embeddings, item_embeddings)
    
    
        return self.retrieval_task(query_embeddings, item_embeddings)

In [33]:
cached_train = train.shuffle(train_size).batch(32).cache()
cached_test = test.batch(32).cache()

model = CrossDNNModel([128,32], 
                      projection_dim=None)



In [34]:
model.compile(optimizer=tf.keras.optimizers.RMSprop(0.01))

modelhist_dcn=model.fit(cached_train, 
         epochs=3)

metrics = model.evaluate(cached_test, return_dict=True)

print(f"Retrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}.")
print(f"Retrieval top-50 accuracy: {metrics['factorized_top_k/top_50_categorical_accuracy']:.3f}.")
print(f"Retrieval top-10 accuracy: {metrics['factorized_top_k/top_10_categorical_accuracy']:.3f}.")
print(f"Retrieval top-5 accuracy: {metrics['factorized_top_k/top_5_categorical_accuracy']:.3f}.")
print(f"Retrieval top-1 accuracy: {metrics['factorized_top_k/top_1_categorical_accuracy']:.3f}.")

Epoch 1/3
Epoch 2/3
Epoch 3/3
Retrieval top-100 accuracy: 0.741.
Retrieval top-50 accuracy: 0.363.
Retrieval top-10 accuracy: 0.086.
Retrieval top-5 accuracy: 0.045.
Retrieval top-1 accuracy: 0.001.


In [36]:
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)
index.index_from_dataset(
  tf.data.Dataset.zip((itemlists.batch(100), itemlists.batch(100).map(model.candidate_model)))
)
 

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7fb23cb90850>

recommendation

In [37]:
_, titles = index({"userID": np.array(['25']),
     "DrivingStyle": np.array(['NA']),
            "landscape":np.array(['NA']),
            "naturalphenomena " : np.array(['NA']),
            "RoadType": np.array(['NA']),
            "sleepiness": np.array(['NA']),
            "trafficConditions": np.array(['NA']),
             "weather": np.array(['rainy']),
            "mood": np.array(['NA']
    )},
    k=10
)
titles[0]

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b'762', b'751', b'703', b'695', b'249', b'287', b'747', b'755',
       b'732', b'710'], dtype=object)>