In [1]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs
# import tensorflow_datasets as tfds

In [2]:
# !pip install tensorflow_ranking

In [3]:
# !pip install tensorflow_recommenders

In [5]:
from catboost.datasets import msrank_10k
train_df, test_df = msrank_10k()
X_train = train_df.drop([0, 1], axis=1).values
y_train = train_df[0].values
queries_train = train_df[1].values

X_test = test_df.drop([0, 1], axis=1).values
y_test = test_df[0].values
queries_test = test_df[1].values

In [5]:
from sklearn.preprocessing import StandardScaler

In [7]:
feature_scaler = StandardScaler()
target_scaler = StandardScaler()

# Scale features
X_train_scaled = feature_scaler.fit_transform(X_train)
X_test_scaled = feature_scaler.transform(X_test)

# Scale targets
y_train_scaled = target_scaler.fit_transform(y_train.reshape(-1, 1)).flatten()
y_test_scaled = target_scaler.transform(y_test.reshape(-1, 1)).flatten()

{
    "X_train_scaled_shape": X_train_scaled.shape,
    "X_test_scaled_shape": X_test_scaled.shape,
    "y_train_scaled_shape": y_train_scaled.shape,
    "y_test_scaled_shape": y_test_scaled.shape
}


{'X_train_scaled_shape': (10000, 136),
 'X_test_scaled_shape': (10000, 136),
 'y_train_scaled_shape': (10000,),
 'y_test_scaled_shape': (10000,)}

In [8]:
class MSRankModel(tfrs.Model):

    def __init__(self, loss):
        super().__init__()
        self.score_model = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation="tanh"),
            tf.keras.layers.Dense(64, activation="tanh"),
            # Ensure the final layer has an output shape of [batch_size, 1]
            tf.keras.layers.Dense(1, activation=None)
        ])
        
        self.task = tfrs.tasks.Ranking(
            loss=loss,
            metrics=[
                # Specify NDCG with topn=10
                tfr.keras.metrics.NDCGMetric(name="ndcg_metric", topn=10),
                tf.keras.metrics.RootMeanSquaredError()
            ]
        )
#         self.task = tfrs.tasks.Ranking(
#             loss=loss,
#             metrics=[
#                 tfr.keras.metrics.NDCGMetric(name="ndcg_metric"),
#                 tf.keras.metrics.RootMeanSquaredError()
#             ]
#         )

    def call(self, features):
        # features is a [batch_size, num_features] tensor.
        # Ensure that the output is a 2D tensor with shape [batch_size, 1]
        return self.score_model(features)

    def compute_loss(self, data, training=False):
        features, labels = data
        scores = self(features)

        # The labels might also need to be reshaped to [batch_size, 1]
        labels = tf.reshape(labels, (-1, 1))

        return self.task(
            labels=labels,
            predictions=scores,
        )


In [9]:
# Convert training data to TensorFlow dataset
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
# Optionally, you can add batch size and caching for efficiency
train_dataset = train_dataset.batch(2048).cache()

# Convert testing data to TensorFlow dataset
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
# Optionally, you can add batch size for efficiency
test_dataset = test_dataset.batch(2048).cache()

In [10]:
# Example of training with mean squared error
mse_model = MSRankModel(tf.keras.losses.MeanSquaredError())
mse_model.compile(optimizer=tf.keras.optimizers.legacy.Adagrad(0.1))

In [11]:
mse_model.fit(train_dataset, epochs=20, verbose=False)
mse_model_result = mse_model.evaluate(test_dataset, return_dict=True)
print("NDCG of the MSE Model: {:.4f}".format(mse_model_result["ndcg_metric"]))

NDCG of the MSE Model: 0.4245


In [10]:
# Pairwise Hinge Loss Model
hinge_model = MSRankModel(tfr.keras.losses.PairwiseHingeLoss())
hinge_model.compile(optimizer=tf.keras.optimizers.legacy.Adam(0.05))

# Train the model
hinge_model.fit(train_dataset, epochs=20, verbose=False)

# Evaluate the model
hinge_model_result = hinge_model.evaluate(test_dataset, return_dict=True)
print("NDCG of the Pairwise Hinge Loss Model: {:.4f}".format(hinge_model_result["ndcg_metric"]))

NDCG of the Pairwise Hinge Loss Model: 0.4245


In [13]:
# ListMLE Loss Model
listwise_model = MSRankModel(tfr.keras.losses.ListMLELoss())
listwise_model.compile(optimizer=tf.keras.optimizers.legacy.Adagrad(0.1))

# Train the model
listwise_model.fit(train_dataset, epochs=5, verbose=False)

# Evaluate the model
listwise_model_result = listwise_model.evaluate(test_dataset, return_dict=True)
print("NDCG of the ListMLE Loss Model: {:.4f}".format(listwise_model_result["ndcg_metric"]))

NDCG of the ListMLE Loss Model: 0.4245


In [14]:
import numpy as np
import tensorflow as tf
import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs

import numpy as np

In [15]:
def generate_complex_random_data(num_samples=10000, num_features=136, num_queries=1000):
    """
    Generate a complex synthetic dataset with random features and labels.

    Args:
    - num_samples (int): Number of samples to generate.
    - num_features (int): Number of features in each sample.
    - num_queries (int): Number of unique queries.

    Returns:
    Tuple of Numpy arrays: (features, labels, query_ids)
    """
    # Random features
    X = np.random.randn(num_samples, num_features).astype(np.float32)

    # Random labels, but more complex than a simple uniform distribution
    # Using a combination of different distributions to add complexity
    y = np.random.choice([np.random.normal(), np.random.exponential(), np.random.uniform()], size=num_samples)
    y = (y - y.min()) / (y.max() - y.min()) * 4  # Normalize and scale labels

    # Generate query IDs to group features into queries
    query_ids = np.random.randint(0, num_queries, size=num_samples).astype(np.int32)

    return X, y, query_ids

# Generate the complex random dataset
X_train_complex, y_train_complex, queries_train_complex = generate_complex_random_data()
X_test_complex, y_test_complex, queries_test_complex = generate_complex_random_data()

# Output shapes as a basic validation of the data generation
(X_train_complex.shape, y_train_complex.shape, queries_train_complex.shape), (X_test_complex.shape, y_test_complex.shape, queries_test_complex.shape)


(((10000, 136), (10000,), (10000,)), ((10000, 136), (10000,), (10000,)))

In [14]:
def generate_query_involved_data(num_queries=1000, num_samples_per_query=10, num_features=136):
    X = []
    y = []
    query_ids = []

    for q_id in range(num_queries):
        num_samples = np.random.randint(5, num_samples_per_query + 1)  # Random number of samples per query

        # Generate features for each sample in the query
        X_query = np.random.randn(num_samples, num_features).astype(np.float32)

        # Generate labels for each sample in the query
        # Example: labels could be dependent on some specific features
        y_query = np.sum(X_query[:, :5], axis=1) + np.random.randn(num_samples)
        y_query = (y_query - y_query.min()) / (y_query.max() - y_query.min()) * 4

        # Append to the main list
        X.append(X_query)
        y.append(y_query)
        query_ids.append(np.full(num_samples, q_id, dtype=np.int32))

    # Convert lists to numpy arrays
    return np.vstack(X), np.hstack(y), np.hstack(query_ids)

# Generate the query-involved dataset
X_train_query, y_train_query, queries_train_query = generate_query_involved_data()
X_test_query, y_test_query, queries_test_query = generate_query_involved_data()


In [42]:
# Define the MSRankModel class
class MSRankModel(tfrs.Model):
    def __init__(self, loss):
        super().__init__()
        self.score_model = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation="tanh"),
            tf.keras.layers.Dense(64, activation="tanh"),
            tf.keras.layers.Dense(1, activation=None)
        ])
        self.task = tfrs.tasks.Ranking(
            loss=loss,
            metrics=[tfr.keras.metrics.NDCGMetric(name="ndcg_metric", topn=100),
                     tf.keras.metrics.RootMeanSquaredError()]
        )

    def call(self, features):
        return self.score_model(features)

    def compute_loss(self, data, training=False):
        features, labels = data
        scores = self(features)
        labels = tf.reshape(labels, (-1, 1))
        return self.task(labels=labels, predictions=scores)

In [43]:
# Test the model with different loss functions
loss_functions = [tf.keras.losses.MeanSquaredError(), tfr.keras.losses.PairwiseHingeLoss(), tfr.keras.losses.ListMLELoss()]
loss_results = {}

In [44]:
# Convert synthetic data to TensorFlow dataset
train_dataset_syn = tf.data.Dataset.from_tensor_slices((X_train_query, y_train_query)).batch(2048).cache()
test_dataset_syn = tf.data.Dataset.from_tensor_slices((X_test_query, y_test_query)).batch(2048).cache()

In [45]:
# ... [previous code for data generation and model definition]

# Use the legacy optimizer for compatibility with M1/M2 Macs
optimizer = tf.keras.optimizers.legacy.Adam(0.1)

for loss_function in loss_functions:
    model = MSRankModel(loss_function)
    model.compile(optimizer=optimizer)
    model.fit(train_dataset_syn, epochs=5, verbose=False)  # Set verbose to True to see progress
    results = model.evaluate(test_dataset_syn, return_dict=True)
    loss_results[str(loss_function)] = results["ndcg_metric"]

print(loss_results)


{'<keras.src.losses.MeanSquaredError object at 0x2ef7dcb50>': 0.8674443364143372, '<tensorflow_ranking.python.keras.losses.PairwiseHingeLoss object at 0x2f1425e90>': 0.8674443364143372, '<tensorflow_ranking.python.keras.losses.ListMLELoss object at 0x2efd22810>': 0.8674443364143372}


In [52]:
import numpy as np
import tensorflow as tf
import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs

import numpy as np

def generate_complex_random_data(num_samples=10000, num_features=136, num_queries=1000):
    """
    Generate a complex synthetic dataset with random features and labels.

    Args:
    - num_samples (int): Number of samples to generate.
    - num_features (int): Number of features in each sample.
    - num_queries (int): Number of unique queries.

    Returns:
    Tuple of Numpy arrays: (features, labels, query_ids)
    """
    # Random features
    X = np.random.randn(num_samples, num_features).astype(np.float32)

    # Random labels, but more complex than a simple uniform distribution
    # Using a combination of different distributions to add complexity
    y = np.random.choice([np.random.normal(), np.random.exponential(), np.random.uniform()], size=num_samples)
    y = (y - y.min()) / (y.max() - y.min()) * 4  # Normalize and scale labels

    # Generate query IDs to group features into queries
    query_ids = np.random.randint(0, num_queries, size=num_samples).astype(np.int32)

    return X, y, query_ids

# Generate the complex random dataset
X_train_complex, y_train_complex, queries_train_complex = generate_complex_random_data()
X_test_complex, y_test_complex, queries_test_complex = generate_complex_random_data()

# Output shapes as a basic validation of the data generation
(X_train_complex.shape, y_train_complex.shape, queries_train_complex.shape), (X_test_complex.shape, y_test_complex.shape, queries_test_complex.shape)



def generate_query_involved_data(num_queries=1000, num_samples_per_query=10, num_features=136):
    X = []
    y = []
    query_ids = []

    for q_id in range(num_queries):
        num_samples = np.random.randint(5, num_samples_per_query + 1)  # Random number of samples per query

        # Generate features for each sample in the query
        X_query = np.random.randn(num_samples, num_features).astype(np.float32)

        # Generate labels for each sample in the query
        # Example: labels could be dependent on some specific features
        y_query = np.sum(X_query[:, :5], axis=1) + np.random.randn(num_samples)
        y_query = (y_query - y_query.min()) / (y_query.max() - y_query.min()) * 4

        # Append to the main list
        X.append(X_query)
        y.append(y_query)
        query_ids.append(np.full(num_samples, q_id, dtype=np.int32))

    # Convert lists to numpy arrays
    return np.vstack(X), np.hstack(y), np.hstack(query_ids)

# Generate the query-involved dataset
X_train_query, y_train_query, queries_train_query = generate_query_involved_data()
X_test_query, y_test_query, queries_test_query = generate_query_involved_data()


# Define the MSRankModel class
class MSRankModel(tfrs.Model):
    def __init__(self, loss):
        super().__init__()
        self.score_model = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation="tanh"),
            tf.keras.layers.Dense(64, activation="tanh"),
            tf.keras.layers.Dense(1, activation=None)
        ])
        self.task = tfrs.tasks.Ranking(
            loss=loss,
            metrics=[tfr.keras.metrics.NDCGMetric(name="ndcg_metric", topn=100),
                     tf.keras.metrics.RootMeanSquaredError()]
        )

    def call(self, features):
        return self.score_model(features)

    def compute_loss(self, data, training=False):
        features, labels = data
        scores = self(features)
        labels = tf.reshape(labels, (-1, 1))
        return self.task(labels=labels, predictions=scores)


# Test the model with different loss functions
loss_functions = [tf.keras.losses.MeanSquaredError(), tfr.keras.losses.PairwiseHingeLoss(), tfr.keras.losses.ListMLELoss()]
loss_results = {}

# Convert synthetic data to TensorFlow dataset
train_dataset_syn = tf.data.Dataset.from_tensor_slices((X_train_query, y_train_query)).batch(2048).cache()
test_dataset_syn = tf.data.Dataset.from_tensor_slices((X_test_query, y_test_query)).batch(2048).cache()


# ... [previous code for data generation and model definition]

# Use the legacy optimizer for compatibility with M1/M2 Macs
optimizer = tf.keras.optimizers.legacy.Adam(0.1)

for loss_function in loss_functions:
    model = MSRankModel(loss_function)
    model.compile(optimizer=optimizer)
    model.fit(train_dataset_syn, epochs=5, verbose=False)  # Set verbose to True to see progress
    results = model.evaluate(test_dataset_syn, return_dict=True)
    loss_results[str(loss_function)] = results["ndcg_metric"]

print(loss_results)


{'<keras.src.losses.MeanSquaredError object at 0x2f2706990>': 0.867584764957428, '<tensorflow_ranking.python.keras.losses.PairwiseHingeLoss object at 0x2f3b8da10>': 0.867584764957428, '<tensorflow_ranking.python.keras.losses.ListMLELoss object at 0x2f2cb5890>': 0.867584764957428}


In [1]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs
# import tensorflow_datasets as tfds

In [2]:
loss_functions = [tf.keras.losses.MeanSquaredError(), tfr.keras.losses.PairwiseHingeLoss(), tfr.keras.losses.ListMLELoss()]

In [3]:
def generate_query_related_data(num_queries=1000, num_samples_per_query=10, num_features=136):
    X = []
    y = []
    query_ids = []

    for q_id in range(num_queries):
        num_samples = np.random.randint(5, num_samples_per_query + 1)

        # Generate features for each sample in the query
        X_query = np.random.randn(num_samples, num_features).astype(np.float32)

        # Generate labels for each sample in the query
        # Example: labels could be more influenced by certain features
        y_query = np.sum(X_query[:, :5], axis=1) + np.random.randint(num_samples)
        y_query = np.argsort(np.argsort(-y_query))  # Rank-based labels
        y_query = tf.cast(y_query, dtype=tf.float32)

        X.append(X_query)
        y.append(y_query)
        query_ids.append(np.full(num_samples, q_id, dtype=np.float32))

    return np.vstack(X), np.hstack(y), np.hstack(query_ids)


In [4]:
num_queries = 1000

In [5]:
X_train, y_train, queries_train = generate_query_related_data()

In [6]:
X_test, y_test, queries_test = generate_query_related_data()

In [7]:
X_train.shape

(7509, 136)

In [8]:
class QueryAwareRankModel(tfrs.Model):
    def __init__(self, loss, num_queries):
        super().__init__()
        self.query_embedding = tf.keras.layers.Embedding(input_dim=num_queries, output_dim=64)
        self.score_model = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation="tanh"),
            tf.keras.layers.Dense(64, activation="tanh"),
            tf.keras.layers.Dense(1, activation=None)
        ])
        self.task = tfrs.tasks.Ranking(loss=loss, metrics=[tfr.keras.metrics.NDCGMetric(name="ndcg_metric")])

    def call(self, inputs, training=False):
        features = inputs["features"]
        query_id = inputs["query_id"]
        query_embedding = self.query_embedding(query_id)
        combined_features = tf.concat([features, query_embedding], axis=1)
        return self.score_model(combined_features)


    
    def compute_loss(self, data, training=False):
        data_dict, labels = data
        features = data_dict["features"]
        query_id = data_dict["query_id"]
        scores = self(data_dict, training)
        labels = tf.reshape(labels, (-1, 1))
        return self.task(labels=labels, predictions=scores)




In [9]:
train_dataset = tf.data.Dataset.from_tensor_slices(({'features': X_train, 'query_id': queries_train}, y_train)).batch(2048)
test_dataset = tf.data.Dataset.from_tensor_slices(({'features': X_test, 'query_id': queries_test}, y_test)).batch(2048)

In [10]:
loss_results = {}

In [11]:
optimizer = tf.keras.optimizers.legacy.Adam(0.1)

for loss_function in loss_functions:
    model = QueryAwareRankModel(loss_function, num_queries)
    model.compile(optimizer=optimizer)
#     model.fit(train_dataset, epochs=5, verbose=False)
    results = model.evaluate(test_dataset, return_dict=True)
    loss_results[str(loss_function)] = results["ndcg_metric"]

print(loss_results)


{'<keras.src.losses.MeanSquaredError object at 0x2acfd6910>': 0.866950511932373, '<tensorflow_ranking.python.keras.losses.PairwiseHingeLoss object at 0x117a778d0>': 0.866950511932373, '<tensorflow_ranking.python.keras.losses.ListMLELoss object at 0x2ae4f2c10>': 0.866950511932373}
