# Hybrid Approach with Gradient Boosting

* Goal: build a recommender by combining user and item features
* Engineer user and product features
* Split Data with a global temporal split
* Encode product text features
* Train a gradient boosting model (CatBoostRanker) with a PairLogit loss so that it can rank items by their relevance to users

In [1]:
import os
from collections import defaultdict

import numpy as np
import pandas as pd
from catboost import CatBoostRanker, Pool
from datasets import load_dataset
from sklearn.metrics import ndcg_score
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

os.chdir("../")
from aux.feature_engineering import calculate_rolling_stats
from aux.text_pre_processing import combine_text_features, pre_process_text
from aux.train_test_split import (
    global_temporal_split,
    temporal_split_users_in_both_sets,
    temporal_split_users_with_cold_start,
)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
# pd.set_option('display.max_colwidth', None)

2025-01-16 16:49:35.593000: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-16 16:49:35.729607: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737046175.786759  431584 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737046175.803477  431584 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-16 16:49:35.941513: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

# Load Data

In [2]:
# Load the All_Beauty reviews and the matching item metadata via `datasets.load_dataset`
# and convert both to pandas DataFrames.
dataset_reviews = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_All_Beauty", trust_remote_code=True)
df_reviews = dataset_reviews["full"].to_pandas()

dataset_items = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_meta_All_Beauty", split="full", trust_remote_code=True)
df_items = dataset_items.to_pandas()

# Filter out users with a low amount of reviews for now - look at the cold start problem later.
min_ammount_reviews = 5
# Number of reviews per user.
user_review_counts = df_reviews.groupby('user_id').size()
users_with_min_reviews = user_review_counts[user_review_counts >= min_ammount_reviews].index
df_reviews_filtered = df_reviews[df_reviews['user_id'].isin(users_with_min_reviews)]

# Left join keeps every filtered review. Column names present in both frames get the
# `_review` / `_item` suffixes (e.g. `title_item`, which is read in the pre-processing cell).
df = pd.merge(df_reviews_filtered, df_items, on='parent_asin', how='left', suffixes=('_review', '_item'))

KeyboardInterrupt: 

In [None]:
df.shape

In [None]:
df.columns

# Pre-processing

In [None]:
# item features
# Combine the item title, feature list and description of each row into one column,
# then run the project's text pre-processing on it. The exact cleaning steps live in
# aux.text_pre_processing and are not visible from this notebook.
df['title_description_features'] = df.apply(
    lambda row: combine_text_features(row, col1='title_item', col2='features', col3='description'),
    axis=1
)
df = pre_process_text(df, input_col='title_description_features', output_col='product_title_description_feature_pre_processed')


In [None]:
df.head()

In [None]:
def analyze_nulls(df, column):
    """Summarise how many values of one column are missing.

    Args:
        df: DataFrame to inspect.
        column: Name of the column whose nulls are counted.

    Returns:
        A dict with the keys ``column``, ``total_rows``, ``null_count`` and
        ``null_percentage`` (share of nulls in percent, rounded to 2 decimals).
    """
    n_rows = len(df)
    n_missing = df[column].isnull().sum()
    pct_missing = round((n_missing / n_rows) * 100, 2)

    summary = {}
    summary['column'] = column
    summary['total_rows'] = n_rows
    summary['null_count'] = n_missing
    summary['null_percentage'] = pct_missing
    return summary


In [None]:
analyze_nulls(df, 'price')

In [None]:
# too many missing values, we won't use it for now
df[df.price == 'None'].shape[0] / df.shape[0]

In [None]:
analyze_nulls(df, 'verified_purchase')

In [None]:
df['verified_purchase'] = df.verified_purchase.astype('int')

In [None]:
analyze_nulls(df, 'store')

In [None]:
df['store'] = df['store'].fillna('UNKNOWN')

In [None]:
def pre_process_categories(row, col):
    """Turn the category value stored in ``row[col]`` into a single string.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) holding the category value.
        col: Key/column name of the category value.

    Returns:
        The value itself when it is a non-blank string, the space-joined
        elements when it is a list or NumPy array, and ``''`` otherwise
        (blank strings, ``None``, other types).
    """
    value = row[col]

    if isinstance(value, str):
        # Whitespace-only strings count as "no categories".
        return value if value.strip() else ''

    if isinstance(value, (list, np.ndarray)):
        return ' '.join(map(str, value))

    return ''

In [None]:
df['categories_processed'] = df.apply(
    lambda row: pre_process_categories(row, col='categories'),
    axis=1
)

In [None]:
df[df.categories_processed == ''].shape

In [None]:
# maybe preferences change over the years
# `timestamp` is parsed as milliseconds since the Unix epoch (unit='ms').
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
df['year'] = df['timestamp'].dt.year

# Feature Engineering

In [None]:
# Add rolling rating statistics, first per user and then per product. The cells below
# read columns such as `rolling_avg_rating_user` and `rolling_review_count_product`.
# NOTE(review): presumably each row's statistics only use earlier interactions; if the
# current rating is included, these features leak the target - confirm in
# aux.feature_engineering.
df = calculate_rolling_stats(df, 'user_id')  
df = calculate_rolling_stats(df, 'parent_asin')  
# Order rows chronologically; the row index is not reset here.
df = df.sort_values(by='timestamp')

In [None]:
df[df.user_id == 'AHV6QCNBJNSGLATP56JAWJ3C4G2A'][['timestamp', 'rating', 'rolling_avg_rating_user', 'rolling_review_count_user']].head()

In [None]:
df[df.parent_asin == 'B0B5XFVSXY'][['timestamp', 'rating', 'rolling_avg_rating_product', 'rolling_review_count_product']].head()

# Select Features

In [None]:
# numerical features
# NOTE(review): `helpful_vote` and `verified_purchase` are attributes of the review itself
# and are presumably not known when a recommendation is made; `average_rating` and
# `rating_number` come from the item metadata and presumably aggregate over all time,
# including the test period - confirm whether these leak information.
user_review_features = ['rolling_avg_rating_user', 'rolling_review_count_user', 'helpful_vote', 'verified_purchase', 'year']
product_features = ['average_rating', 'rolling_avg_rating_product', 'rolling_review_count_product', 'rating_number']
categoricals = ['main_category', 'store']
text_features = ['product_title_description_feature_pre_processed']
# The user's own rating is the relevance label the ranker learns from.
target = 'rating'

# Only needed to perform / inspect the split; dropped again before training.
train_test_split_features =  ['timestamp', 'parent_asin']
# Rows are grouped per user when the CatBoost pools are built.
group_features = ['user_id']

In [None]:
columns = train_test_split_features + group_features + user_review_features + product_features + categoricals + text_features + [target]

In [None]:
df = df[columns]

In [None]:
df.head()

# Train Test Split

In [None]:
train_df, test_df = global_temporal_split(df, split_ratio=0.8, exclude_cold_start_users=False)

In [None]:
train_df.shape

In [None]:
test_df.shape

In [None]:
train_df.user_id.unique().shape

In [None]:
test_df.user_id.unique().shape

In [None]:
# 141 users only have interactions in the test set
# Build the set of train users once: testing membership against the array returned by
# `unique()` inside the comprehension recomputes and scans it for every test user.
train_user_ids = set(train_df.user_id.unique())
cold_start_users = [i for i in test_df.user_id.unique() if i not in train_user_ids]
len(cold_start_users)

In [None]:
# 83% of products in the test set not in the train set
# we will try to work with this by adding content-based features
# Compute the unique ids once and use a set for the membership test, instead of
# recomputing and scanning `train_df.parent_asin.unique()` for every test product.
train_product_ids = set(train_df.parent_asin.unique())
test_product_ids = test_df.parent_asin.unique()
test_only_products = [i for i in test_product_ids if i not in train_product_ids]
len(test_only_products) / len(test_product_ids)

In [None]:
len(train_df.parent_asin.unique())

In [None]:
train_df = train_df.drop(train_test_split_features, axis=1)
test_df = test_df.drop(train_test_split_features, axis=1)

# Generate Embeddings For Text Features

In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
train_embeddings = model.encode(train_df['product_title_description_feature_pre_processed'].tolist(), batch_size=64, show_progress_bar=True)
test_embeddings = model.encode(test_df['product_title_description_feature_pre_processed'].tolist(), batch_size=64, show_progress_bar=True)

In [None]:
np.save('train_embeddings.npy', train_embeddings)
np.save('test_embeddings.npy', test_embeddings)

In [None]:
train_embeddings = np.load('train_embeddings.npy')
test_embeddings = np.load('test_embeddings.npy')

In [None]:
embeding_columns = [f'product_embeddings_{i}' for i in range(train_embeddings.shape[1])]

In [None]:
train_embeddings_df = pd.DataFrame(train_embeddings, columns=embeding_columns)
test_embeddings_df = pd.DataFrame(test_embeddings, columns=embeding_columns)

In [None]:
# `pd.concat(axis=1)` aligns rows on index labels, not on position. The embedding frames
# carry a fresh 0..n-1 index, while train_df/test_df may still carry the row labels left
# over from the chronological sort and the split. Reset the index first so that row i of
# the features is paired with embedding i (the embeddings were encoded in row order).
train_df = pd.concat([train_df.reset_index(drop=True), train_embeddings_df], axis=1)
test_df = pd.concat([test_df.reset_index(drop=True), test_embeddings_df], axis=1)

In [None]:
train_df.head()

In [None]:
train_df = train_df.drop(text_features, axis=1)
test_df = test_df.drop(text_features, axis=1)

# Train Model

In [None]:
features = user_review_features + product_features + categoricals + embeding_columns

In [None]:
y_train = train_df[target]
X_train = train_df.drop([target], axis=1)

y_test = test_df[target]
X_test = test_df.drop([target], axis=1)

In [None]:
categorical_indices = [X_train[features].columns.get_loc(col) for col in categoricals]
categorical_indices

In [None]:
# Sort by user so that each user's rows are contiguous when the pools are built
# with `group_id` in the next cell.
train_df = train_df.sort_values(by='user_id')
test_df = test_df.sort_values(by='user_id')

In [None]:
# Build the CatBoost pools for ranking. `group_id` tells CatBoost which rows belong to the
# same user (one ranking group per user); the frames were sorted by `user_id` beforehand
# so that each group's rows are adjacent.
train_pool = Pool(
    data=train_df[features],
    label=train_df[target],
    group_id=train_df['user_id'].tolist(), 
    # positions of the categorical columns within `features`
    cat_features=categorical_indices
)

# Same layout for the held-out interactions.
test_pool = Pool(
    data=test_df[features],
    label=test_df[target],
    group_id=test_df['user_id'].tolist(),
    cat_features=categorical_indices 
)


In [None]:
# Pairwise ranking model: PairLogit learns from pairs of items within the same group.
# Note that this rebinds `model`, which previously held the SentenceTransformer encoder.
model = CatBoostRanker(
    loss_function='PairLogit',
    iterations=1000,
    #learning_rate=0.1,
    #depth=6,
    cat_features=categorical_indices
)

# NOTE(review): the test pool doubles as `eval_set`, and `use_best_model=True` selects the
# number of trees on it, so the metrics computed on test_df afterwards are optimistic -
# consider holding out a separate validation split for this.
model.fit(train_pool,
        eval_set=test_pool,
        metric_period=30,
        use_best_model=True 
          
         )

In [None]:
# Print every feature with its importance on the training pool, highest score first.
feature_importances = model.get_feature_importance(train_pool)
ranked_importances = sorted(zip(feature_importances, features), reverse=True)
for importance, feature_name in ranked_importances:
    line = '{}: {}'.format(feature_name, importance)
    print(line)

# Evaluate

In [None]:
test_df["predicted_score"] = model.predict(test_pool)

In [None]:
# Per-user NDCG@10: compare each user's true ratings with the predicted ranking scores.
# Grouping once avoids filtering the whole frame twice for every user.
ndcg_scores = []
for _, user_rows in test_df.groupby('user_id', sort=False):
    # A ranking of a single item carries no information, so only users with
    # more than one test interaction are scored.
    if len(user_rows) > 1:
        true_relevance = user_rows[target].tolist()
        predicted_scores = user_rows['predicted_score'].tolist()
        ndcg_scores.append(ndcg_score([true_relevance], [predicted_scores], k=10))

num_users = len(ndcg_scores)
total_users = test_df['user_id'].nunique()
share_of_users = num_users / total_users if total_users else 0.0

# Guard the empty case explicitly instead of letting np.mean([]) warn and return nan.
average_ndcg = np.mean(ndcg_scores) if ndcg_scores else float('nan')
# `:.2%` renders the share as a real percentage (the raw fraction was printed before).
print(f"Average NDCG@10 across all users with more than 1 rating ({num_users}, {share_of_users:.2%} of the test set): {average_ndcg:.4f}")


In [None]:
def precision_recall_at_k(group, k=10):
    """Compute precision@k and recall@k for one user's rows.

    Args:
        group: DataFrame with a ``predicted_score`` column (ranking score) and a
            ``relevant`` column (1 for relevant rows, 0 otherwise).
        k: Cut-off of the ranking.

    Returns:
        A pandas Series with the entries ``precision@<k>`` and ``recall@<k>``.
        Precision always divides by ``k``, even when the group has fewer than
        ``k`` rows; recall is 0.0 when the group has no relevant rows.
    """
    ranked = group.sort_values("predicted_score", ascending=False)

    # Relevant rows among the k best-scored ones vs. relevant rows overall.
    hits = ranked.iloc[:k]["relevant"].sum()
    n_relevant = group["relevant"].sum()

    if n_relevant > 0:
        recall = hits / n_relevant
    else:
        recall = 0.0

    metrics = {}
    metrics["precision@{}".format(k)] = hits / k
    metrics["recall@{}".format(k)] = recall
    return pd.Series(metrics)

def compute_precision_recall_at_k(df, user_col="user_id", k=10):
    """Average precision@K and recall@K over all users in ``df``.

    Args:
        df: DataFrame holding ``user_col`` plus the columns read by
            ``precision_recall_at_k``.
        user_col: Column that identifies the user of each row.
        k: Cut-off of the ranking.

    Returns:
        A dict mapping each metric name to its mean across users.
    """
    def _metrics_for_user(user_rows):
        return precision_recall_at_k(user_rows, k)

    # All columns (grouping column included) are selected explicitly before `apply`.
    per_user_groups = df.groupby(user_col)[df.columns]
    per_user_metrics = per_user_groups.apply(_metrics_for_user)

    return per_user_metrics.mean().to_dict()



In [None]:
# An interaction is relevant when the user's own rating is at least 4.
# The label has to come from the `rating` target; `rating_number` is an item-level
# product feature (see `product_features`), not the rating given by the user.
test_df["relevant"] = (test_df[target] >= 4).astype(int)

# Keep only the columns the metrics need, ordered by score within each user.
test_df_sorted = (
    test_df
    .groupby("user_id", group_keys=False)
    [['user_id', 'relevant', 'predicted_score']] 
    .apply(lambda df: df.sort_values("predicted_score", ascending=False))
)

# Mean precision@10 / recall@10 across all test users.
metrics_k10 = compute_precision_recall_at_k(test_df_sorted, user_col="user_id", k=10)
print(metrics_k10)