In [None]:
!pip install tensorflow==2.15.1
!pip install tensorflow-recommenders==0.7.3

# Summary

This is a simple recommendation algorithm that I applied to a toy dataset. The idea is to train a neural network model on the available data (for example, the transaction history of customers) and use it to make recommendations for the future. Additionally, when a new customer arrives, the algorithm finds the closest customer in the training data and makes the same recommendations.

# Models used

Recommendation algorithm $\Rightarrow$ `tfrs.Model`

Finding the closest customer $\Rightarrow$ `NearestNeighbors` (scikit-learn nearest-neighbor search with `n_neighbors = 1`)

# Considerations and Limitations

1. **Updating $\mu$ and $\sigma$:** The algorithm assumes that new customers and the customers in the training data come from the same distribution. This may not hold over time, so $\mu$ and $\sigma$ (used in the z-score normalization) should be re-estimated periodically.


2. **Customer data complexity:** The customer information contains a few features that are convenient for data preprocessing. In a real life scenario, more complex data preprocessing steps may be required.

3. **Customer job category:** It is assumed that job categories are selected from a predefined list (not entered manually). Therefore, the cardinality is assumed to be low, which makes the feature suitable for `One Hot Encoding`.

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler,OneHotEncoder

# 1. Data Generation

In [None]:
# Build a synthetic purchase history: every user makes `num_iter` purchases,
# but each user draws product categories with different probabilities.
np.random.seed(42)
users = ["Tom", "John", "Michael"]
product_categories = ["Kitchen_Tools", "Accesories", "Sports"]
products = {
    "Kitchen_Tools": ["Knife", "Cutting_Board", "Peeler", "Spatula", "Measuring_Cup"],
    "Accesories": ["Watch", "Wallet", "Sunglasses", "Belt", "Hat"],
    "Sports": ["Football", "Basketball", "Tennis_Racket", "Yoga_Mat", "Running_Shoes"],
}

num_iter = 1000

# Category probabilities per user, listed in the same order as `product_categories`.
category_probabilities = {
    "Tom": [0.333, 0.333, 0.334],    # uniform user
    "John": [0.7, 0.2, 0.1],         # interested in cooking
    "Michael": [0.05, 0.35, 0.6],    # interested mostly in sports
}

train_data = []
for user in users:
    p = category_probabilities[user]
    for _ in range(num_iter):
        # Draw the category first, then a product uniformly from that category.
        product_category = np.random.choice(product_categories, size=1, replace=True, p=p)[0]
        product = np.random.choice(products[product_category], size=1, replace=True)[0]
        train_data.append([user, product_category, product])

# Shuffle the rows so the users are interleaved instead of grouped in blocks.
train_dataset = (
    pd.DataFrame(train_data, columns=["user", "product_category", "product"])
    .sample(frac=1, replace=False, random_state=42)
    .reset_index(drop=True)
)

# Only the (user, product) pairs are fed to the model, in batches of 128.
train_dataset_tf = tf.data.Dataset.from_tensor_slices(
    dict(train_dataset[["user", "product"]])
).batch(128)
train_dataset.head()

# 2. The Retrieval Algorithm

In [None]:
# Only NumPy is seeded elsewhere in this notebook; seed TensorFlow as well so the
# randomly initialised embedding tables are reproducible on "Restart & Run All".
tf.random.set_seed(42)

# Width of the embedding space shared by the user and product towers.
EMBEDDING_DIM = 3

# Every product name across all categories, flattened into one vocabulary.
flat_products = [item for sublist in products.values() for item in sublist]

# User tower: user name -> integer id -> embedding.
# The "+ 1" row accounts for the single out-of-vocabulary bucket of StringLookup.
user_model = tf.keras.Sequential(
    [
        tf.keras.layers.StringLookup(vocabulary=users, mask_token=None, num_oov_indices=1),
        tf.keras.layers.Embedding(len(users) + 1, output_dim=EMBEDDING_DIM, name="user_embeddings"),
    ],
    name="user_model",
)

# Product tower: product name -> integer id -> embedding.
# The lookup settings are spelled out to mirror the user tower (they equal the defaults).
product_model = tf.keras.Sequential(
    [
        tf.keras.layers.StringLookup(vocabulary=flat_products, mask_token=None, num_oov_indices=1),
        tf.keras.layers.Embedding(len(flat_products) + 1, output_dim=EMBEDDING_DIM, name="product_embeddings"),
    ],
    # Previously named "user_model" (copy-paste slip), which gave both towers the same name.
    name="product_model",
)

# Candidate set as (product name, product embedding) pairs. The map is lazy, so the
# embeddings are computed with whatever weights product_model has when iterated.
candidates = (
    tf.data.Dataset.from_tensor_slices(flat_products)
    .batch(32)
    .map(lambda x: (x, product_model(x)))
)

# The task is to find the most relevant products given the user.
task = tfrs.tasks.Retrieval(metrics=tfrs.metrics.FactorizedTopK(candidates=candidates))

class RetrievalModel(tfrs.Model):
  """Two-tower retrieval model pairing a user tower with a product tower."""

  def __init__(self,user_model,product_model,retrieval_task = None):
    """Store the two towers and the task used to compute the loss.

    Args:
      user_model: Callable applied to ``features["user"]`` to get user embeddings.
      product_model: Callable applied to ``features["product"]`` to get product embeddings.
      retrieval_task: Optional task object called with the two embedding batches.
        When omitted, the notebook-level ``task`` is used, which keeps existing
        two-argument calls working unchanged.
    """
    super().__init__()
    self.user_model = user_model
    self.product_model = product_model
    # The global `task` is only looked up when no task was injected explicitly.
    self.task = task if retrieval_task is None else retrieval_task

  def compute_loss(self, features,training = False):
    """Embed a batch of (user, product) pairs and pass both embeddings to the task.

    Args:
      features: Mapping with the keys ``"user"`` and ``"product"``.
      training: Accepted for interface compatibility; not used here.

    Returns:
      Whatever ``self.task`` returns for the two embedding batches.
    """
    user_embeddings = self.user_model(features["user"])
    product_embeddings = self.product_model(features["product"])
    return self.task(user_embeddings,product_embeddings)


In [None]:
# Wire the two towers into the retrieval model, then train it for three passes
# over the batched (user, product) pairs using Adagrad.
model = RetrievalModel(user_model, product_model)
model.compile(
    optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1),
)
model.fit(train_dataset_tf, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7ac053ef0d90>

In [None]:
# Brute-force top-k index: built around the user tower and filled with the
# (product name, product embedding) pairs from `candidates`.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
index.index_from_dataset(candidates)


def show_recommendations(user, k = 3):
  """Print the top-``k`` products and scores for ``user`` and return both tensors."""
  top_scores, top_items = index(tf.constant([user]), k = k)
  print(f"User: {user}\n")
  print(f"Recommended products: {top_items.numpy()[0].astype(str)}")
  print(f"Recommendation scores: {top_scores.numpy()[0].astype(str)}\n")
  return top_scores, top_items


# `scores` is overwritten on each call; the per-user item tensors are kept
# because the next cell builds the recommendation table from them.
scores,items_tom = show_recommendations("Tom")
scores,items_john = show_recommendations("John")
scores,items_michael = show_recommendations("Michael")


User: Tom

Recommended products: ['Wallet' 'Football' 'Hat']
Recommendation scores: ['0.06319289' '0.0622658' '0.060824823']

User: John

Recommended products: ['Knife' 'Spatula' 'Cutting_Board']
Recommendation scores: ['0.96940625' '0.93734336' '0.9244829']

User: Michael

Recommended products: ['Running_Shoes' 'Tennis_Racket' 'Yoga_Mat']
Recommendation scores: ['1.3429782' '1.1107031' '1.0839791']



In [None]:
# Lookup table: user name -> array of that user's top recommended product names.
recommendations = {
    name: top_items.numpy()[0].astype(str)
    for name, top_items in (
        ("Tom", items_tom),
        ("John", items_john),
        ("Michael", items_michael),
    )
}
recommendations

{'Tom': array(['Wallet', 'Football', 'Hat'], dtype='<U8'),
 'John': array(['Knife', 'Spatula', 'Cutting_Board'], dtype='<U13'),
 'Michael': array(['Running_Shoes', 'Tennis_Racket', 'Yoga_Mat'], dtype='<U13')}

# 3. Making Recommendations for a New Customer

In [None]:
# Profile attributes of the customers that appear in the training data.
user_information = pd.DataFrame({
    "user": ["Tom", "John", "Michael"],
    "age": [26, 35, 29],
    "profession_category": ["Writer", "Cook", "Sports"],
    "gender": ["Male", "Male", "Male"],
})

# A customer with no transaction history, described by the same attributes.
new_customer = pd.DataFrame({
    "user": ["George"],
    "age": [34],
    "profession_category": ["Cook"],
    "gender": ["Male"],
})
user_information

Unnamed: 0,user,age,profession_category,gender
0,Tom,26,Writer,Male
1,John,35,Cook,Male
2,Michael,29,Sports,Male


In [None]:
def preprocess_data(df,scaler = None,encoder = None):
  """Turn raw customer rows into a numeric feature table.

  The input frame is not modified. ``profession_category`` is one-hot encoded,
  ``gender`` is mapped to 1 ("Male") / 0 ("Female"), and ``age`` is z-score scaled.
  Gender values outside that mapping become NaN.

  Args:
    df: Frame with the columns ``profession_category``, ``gender`` and ``age``;
      any other columns are passed through unchanged.
    scaler: Already fitted scaler for ``age``. If ``None``, a new
      ``StandardScaler`` is fitted on ``df``.
    encoder: Already fitted encoder for ``profession_category``. If ``None``,
      a new ``OneHotEncoder`` is fitted on ``df``.

  Returns:
    ``(features, scaler, encoder)`` when ``scaler`` is ``None`` (fit mode),
    otherwise only ``features``. The shape of the return value therefore
    depends on ``scaler`` alone, not on ``encoder``.
  """
  df = df.copy()

  #OHE for profession category
  if encoder is None:
    # handle_unknown="ignore": a profession that was not present when fitting is
    # encoded as all zeros later on instead of raising during transform.
    encoder = OneHotEncoder(sparse_output=False,handle_unknown="ignore")
    encoded_profession = encoder.fit_transform(df[["profession_category"]])

  else:
    encoded_profession = encoder.transform(df[["profession_category"]])

  profession_names = encoder.get_feature_names_out(["profession_category"])
  # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh 0..n-1
  # index would produce NaN-padded rows whenever df is not indexed 0..n-1.
  encoded_df = pd.DataFrame(encoded_profession, columns=profession_names, index=df.index)
  df = pd.concat([df.drop(columns=['profession_category']), encoded_df], axis=1)

  #binary mapping for gender
  df["gender"] = df["gender"].map({"Male":1,"Female":0})

  #z-score normalization for age
  if scaler is None:
    ss = StandardScaler()
    df["age"] = ss.fit_transform(df[["age"]])
    return df,ss,encoder

  df["age"] = scaler.transform(df[["age"]])
  return df

def find_closest_customer(user_information,new_customer):
  """Return the name of the known customer whose features are nearest to the new one.

  Args:
    user_information: Frame of known customers, including a ``user`` column.
    new_customer: Frame with the same columns describing the new customer.

  Returns:
    The ``user`` value of the nearest known customer for the first row of
    ``new_customer``.
  """
  # Fit the scaler/encoder on the known customers, then reuse both for the newcomer
  # so the two feature tables share the same columns and scaling.
  known_features,fitted_scaler,fitted_encoder = preprocess_data(user_information)
  newcomer_features = preprocess_data(new_customer,scaler = fitted_scaler,encoder = fitted_encoder)

  known_names = known_features["user"].values.copy()

  # The name column is an identifier, not a feature, so it is dropped before the search.
  neighbour_search = NearestNeighbors(n_neighbors = 1).fit(known_features.drop(columns = ["user"]))
  _,neighbour_idx = neighbour_search.kneighbors(newcomer_features.drop(columns = ["user"]))

  # Only the first query row's match is returned.
  first_row_matches = known_names[neighbour_idx[0]]
  return first_row_matches[0]

def recommend_products(user):
  """Look up the stored recommendations for ``user``.

  Reads the notebook-level ``recommendations`` mapping and returns the entry
  stored under ``user``.
  """
  products_for_user = recommendations[user]
  return products_for_user

In [None]:
# Match the new customer to the most similar known customer and reuse that
# customer's recommendations.
closest_user = find_closest_customer(user_information, new_customer)
print(
    f"The closest user to the {new_customer.user.values[0]}: {closest_user}",
    f"Recommendadtions: {recommend_products(closest_user)}",
    sep="\n",
)

The closest user to the George: John
Recommendadtions: ['Knife' 'Spatula' 'Cutting_Board']
