In [1]:
%%time
import numpy as np
import pandas as pd
import gc
import random
from collections import Counter
from datetime import timedelta
from tqdm import tqdm
from typing import Dict, Text


import tensorflow as tf
import tensorflow_recommenders as tfrs


# Read data into memory
article_df = pd.read_csv("hnmdata/articles.csv")
customer_df = pd.read_csv("hnmdata/customers.csv")
transaction_df = pd.read_csv('hnmdata/transactions_train.csv')

# Impute missing data
customer_df.fillna({"FN":0}, inplace=True)
customer_df.fillna({"Active":0}, inplace=True)
customer_df.fillna({"club_member_status":"UNKNOWN"}, inplace=True)
customer_df["fashion_news_frequency"] = customer_df["fashion_news_frequency"].replace({"None":"NONE"})
customer_df.fillna({"fashion_news_frequency":"UNKNOWN"}, inplace=True)
customer_df.fillna({"age": customer_df["age"].median()}, inplace=True)

# Bucket age groups
def create_age_interval(x):
    if x <= 25:
        return [16, 25]
    elif x <= 35:
        return [26, 35]
    elif x <= 45:
        return [36, 45]
    elif x <= 55:
        return [46, 55]
    elif x <= 65:
        return [56, 65]
    else:
        return [66, 99]
    
customer_df["age_interval"] = customer_df["age"].apply(lambda x: create_age_interval(x))

# Make sure all nulls are filled in customer_df
assert customer_df.isnull().sum().sum() == 0

# Impute missing data
article_df.fillna(value="No Description", inplace=True)

# Change article_id datatype to string
article_df['article_id'] = article_df['article_id'].astype(str)

# Add a zero to the left of the article_id string
article_df['article_id'] = article_df['article_id'].apply(lambda x: x.zfill(10))

transaction_df['article_id'] = transaction_df['article_id'].astype(str)
transaction_df['article_id'] = transaction_df['article_id'].apply(lambda x: x.zfill(10))

# Intersection helper function
def intersection(lst1, lst2):
    return list(set(lst1) & set(lst2))

# Recall@k = (# of recommended items @k that are relevant) / (total # of relevant items)
def estimate_recall(cg, purchase):
    return len(intersection(cg,purchase))/len(purchase)


# Test the functions with toy example.
# How many items from purchases did each CG retrieve?
purchases = ["item45","item97","item71","item125","item5"]
cg1 = ["item1","item97","item12","item105","item5","item17","item197","item122","item85","item15"]
cg2 = ["item13","item94","item14","item15","item5","item18","item197","item132","item86","item65"]

print("recall for CG1: ", estimate_recall(cg1, purchases))
print("recall for CG1: ", estimate_recall(cg2, purchases))

# Split transactions into train and test
N_DAYS_TRAIN = 90
N_DAYS_TEST = 7

max_date = transaction_df['t_dat'].max()
train = transaction_df[(transaction_df['t_dat']>=((pd.to_datetime(max_date) - timedelta(days=N_DAYS_TRAIN+N_DAYS_TEST)).date().strftime('%Y-%m-%d')))
                        & (transaction_df['t_dat']<((pd.to_datetime(max_date) - timedelta(days=N_DAYS_TEST)).date().strftime('%Y-%m-%d')))]
test = transaction_df[(transaction_df['t_dat']>=((pd.to_datetime(max_date) - timedelta(days=N_DAYS_TEST)).date().strftime('%Y-%m-%d')))]


# Delete transaction_df from the namespace to free up some memory
transaction_df = None
del transaction_df
gc.collect()

print(f"train and test shape : {(train.shape, test.shape)}")

c1 = train['customer_id'].to_list()
c2 = test['customer_id'].to_list()

# Note: Sorting common_users so evaluation is deterministic
common_users = sorted(intersection(c1, c2))

print(len(train), len(test), len(c1), len(c2), len(common_users))

# Pre-calculate unique items so they can be re-used during every invocation
train_unique_items = sorted(train['article_id'].unique().tolist())

def get_k_candidates_random(u, k):
    """
    Generate k random candidates from the training set.

    Args:
        u (str): user ID for which to generate the candidates.
        k (int): Number of candidates to generate.

    Returns:
        candidates (list): Random k candidates.
    """

    # Set the seed to the user ID to make this function deterministic
    random.seed(u)
    candidates = random.sample(train_unique_items, k)
    return candidates

# Pre-calculate item counts so they can be re-used during every invocation
train_item_counts = Counter(train['article_id'].to_list()).most_common()

def get_top_k_candidates_popular(u, k):
    """
    Generate k most popular (number of times purchased) candidates from the training set.

    Args:
        u (str): user ID for which to generate the candidates. Not necessary for this function, but is an assumed input during evaluation.
        k (int): Number of candidates to generate.

    Returns:
        candidates (list): The most popular k candidates.
    """
    candidates = [article for article, count in train_item_counts[:k]]
    return candidates

def run_candidate_generation(method, k_values=[100,1000], user_set_size=1000):
    """
    Evaluate a given candidate generator in terms of recall on the held-out test set.

    Args:
        method (function): Candidate generation function. User ID (u) and number of candidates (k) arguments.
        k_values (list): List of number of candidates to generate and evaluate.
        user_set_size (int): Number of users to evaluate.

    Returns:
        None. Prints results.
    """

    # Initialise evaluation variables
    k_values = sorted(k_values)
    recall_dict = {k:0 for k in k_values}
    user_set = common_users[:user_set_size]

    # Loop over users
    for u in tqdm(user_set):
        # Get list of purchased items for user u
        purchase_list = test[test['customer_id']==u]['article_id'].to_list()
        # Run candidate generation
        cg = method(u, max(k_values))
        for k in k_values:
            # Estimate recall for candidate generator
            recall = estimate_recall(cg[:k], purchase_list)
            # Add this to overall recall (to be averaged at end)
            recall_dict[k] += recall

    print(f"\nRecall evaluation for {user_set_size:,} users:")
    for k in k_values:
        # Average recall by dividing sum of recalls by user_set length
        overall_recall = recall_dict[k] / len(user_set)
        print(f"Recall @ {k} Candidates: {overall_recall:.5f}")
# Evaluate CG 1 (random)
run_candidate_generation(get_k_candidates_random)

# Evaluate CG 2 (popularity)
run_candidate_generation(get_top_k_candidates_popular)

# Create lists of unique ids
unique_customer_ids = train.customer_id.unique()
unique_article_ids = train.article_id.unique()

embedding_dimension = 128

tf.keras.utils.set_random_seed(42)

customer_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_customer_ids, mask_token=None),
  tf.keras.layers.Embedding(len(unique_customer_ids) + 1, embedding_dimension),
  tf.keras.layers.Dense(64, activation='relu')
])

article_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_article_ids, mask_token=None),
  tf.keras.layers.Embedding(len(unique_article_ids) + 1, embedding_dimension),
  tf.keras.layers.Dense(64, activation='relu')
])

article_ds = tf.data.Dataset.from_tensor_slices(dict(article_df[['article_id']]))
articles = article_ds.map(lambda x: x['article_id'])

class HandMModel(tfrs.Model):

    def __init__(self, customer_model, article_model):
        super().__init__()
        self.article_model: tf.keras.Model = article_model
        self.customer_model: tf.keras.Model = customer_model
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
            candidates=articles.batch(128).map(self.article_model), # Batching articles into size 128 and passing through the article model
            ),
        )

    def compute_loss(self, features: Dict[str, tf.Tensor], training=False) -> tf.Tensor:

        customer_embeddings = self.customer_model(features["customer_id"])
        article_embeddings = self.article_model(features["article_id"])

        # The task computes the loss and the metrics.
        # Note that by default compute_metrics is set to not_training as running during training is VERY expensive
        return self.task(customer_embeddings, article_embeddings, compute_metrics=not training)
# Instantiate model
model = HandMModel(customer_model, article_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

# The `from_tensor_slices` method creates a dataset with a separate element for each row of the input tensor
# Shuffling randomly shuffles the dataset and batching sets the batch size to 256
# Caching keeps the dataset in memory (or a specified file). For larger datasets sometimes we cannot fit the entire dataset in memory and thus use a file on disk.
train_ds = tf.data.Dataset.from_tensor_slices(dict(train[['customer_id','article_id']])).shuffle(100_000, seed=42).batch(256).cache()


# Fit the model
# FactorizedTopK will show as 0, but these are the expensive metrics and as training=True they will not be calculated
num_epochs = 8
history = model.fit(train_ds, epochs=num_epochs, verbose=1)

# Take a sample of 2000 elements from the test set and run inference on them
test_ds = tf.data.Dataset.from_tensor_slices(dict(test.sample(2000, random_state=42)[['customer_id','article_id']])).batch(500).cache()

# We now see the metrics are not in fact zero (train=False here, so they are computed)
model.evaluate(test_ds, return_dict=True)



index = tfrs.layers.factorized_top_k.BruteForce(model.customer_model)
index.index_from_dataset(
  tf.data.Dataset.zip((articles.batch(1000), articles.batch(1000).map(model.article_model)))
)

def get_top_k_candidates_2_tower(u, k):
    """
    Generate k candidates from the training set using the two-tower model.

    Args:
        u (str): user ID for which to generate the candidates.
        k (int): Number of candidates to generate.

    Returns:
        candidates (list): The top-k candidates.
    """
    _, candidates = index(tf.constant([u]), k=k)
    candidates = candidates.numpy().flatten()
    candidates = [c.decode("utf-8") for c in candidates]
    return candidates

run_candidate_generation(get_top_k_candidates_2_tower)

2024-12-05 15:09:06.451057: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-12-05 15:09:07.527779: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64:/usr/lib/x86_64-linux-gnu/:/opt/conda/lib
2024-12-05 15:09:07.527910: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local

recall for CG1:  0.4
recall for CG1:  0.2
train and test shape : ((4187470, 5), (266364, 5))
4187470 266364 4187470 266364 55385


100%|██████████| 1000/1000 [00:19<00:00, 50.38it/s]



Recall evaluation for 1,000 users:
Recall @ 100 Candidates: 0.00308
Recall @ 1000 Candidates: 0.02244


100%|██████████| 1000/1000 [00:19<00:00, 52.24it/s]



Recall evaluation for 1,000 users:
Recall @ 100 Candidates: 0.05095
Recall @ 1000 Candidates: 0.20833


2024-12-05 15:10:54.505436: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-12-05 15:10:54.539570: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-12-05 15:10:54.543084: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-12-05 15:10:54.547041: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

Cause: could not parse the source code of <function <lambda> at 0x7fb4253d8550>: no matching AST found among candidates:

Cause: could not parse the source code of <function <lambda> at 0x7fb4253d8550>: no matching AST found among candidates:

Epoch 1/8


2024-12-05 15:10:57.127251: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x7fb2f645f1f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-12-05 15:10:57.127291: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2024-12-05 15:10:57.132130: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-12-05 15:10:57.244404: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


100%|██████████| 1000/1000 [00:22<00:00, 44.43it/s]


Recall evaluation for 1,000 users:
Recall @ 100 Candidates: 0.06227
Recall @ 1000 Candidates: 0.18906
CPU times: user 14min 12s, sys: 2min 1s, total: 16min 13s
Wall time: 9min 42s



