# Learning-to-Rank Keras Example with Text Files

Adapted from the TensorFlow Ranking Keras example: https://github.com/tensorflow/ranking/blob/master/tensorflow_ranking/examples/keras/keras_dnn_tfrecord.py

In [1]:
import numpy as np
import tensorflow as tf
import tensorflow_ranking as tfr
import glob
from datetime import datetime

In [2]:
tf.__version__

'2.5.0'

In [3]:
tfr.__version__

'0.4.0.dev'

In [4]:
filename = "/data/example.txt"

In [5]:
# Sample ranking data, one document per line:
#   <relevance label> qid:<query id> <feature id>:<value> ...
# Feature ids are 1-based; the parser further down shifts them to 0-based
# positions and fills ids that are not listed with 0.
text = """1 qid:10 32:0.14 48:0.97 51:0.45
0 qid:10 1:0.15 31:0.75 32:0.24 49:0.6
2 qid:10 1:0.71 2:0.36 31:0.58 51:0.12
0 qid:20 4:0.79 31:0.01 33:0.05 35:0.27
3 qid:20 1:0.42 28:0.79 35:0.30 42:0.76"""

# Persist the sample so it can be read back as a text-line dataset.
with open(filename, 'w') as f:
    f.write(text)

In [6]:
!cat "$filename"

1 qid:10 32:0.14 48:0.97 51:0.45
0 qid:10 1:0.15 31:0.75 32:0.24 49:0.6
2 qid:10 1:0.71 2:0.36 31:0.58 51:0.12
0 qid:20 4:0.79 31:0.01 33:0.05 35:0.27
3 qid:20 1:0.42 28:0.79 35:0.30 42:0.76

### Keras example

https://github.com/tensorflow/ranking/blob/master/tensorflow_ranking/examples/keras/keras_dnn_tfrecord.py

In [7]:
from typing import Dict, Tuple
from absl import app
from absl import flags
import tensorflow as tf
import tensorflow_ranking as tfr

In [8]:
# Arguments
# Number of per-example features declared in the feature spec
# ("custom_features_1" .. "custom_features_<num_features>").
# NOTE(review): the sample file and `feature_len` further down use 51
# features, not 136 — confirm which value is intended.
num_features = 136

# The document relevance label.
_LABEL_FEATURE = "utility"

# Padding labels are set negative so that the corresponding examples can be
# ignored in loss and metrics.
_PADDING_LABEL = -1
# Name handed to the model and dataset builders as `mask_feature_name`.
_MASK = "example_list_mask"

def _create_feature_spec() -> Tuple[Dict[str, tf.io.FixedLenFeature], Dict[
    str, tf.io.FixedLenFeature], Tuple[str, tf.io.FixedLenFeature]]:
    """Build the feature specs used for data parsing.

    Returns:
    A 3-tuple ``(context_feature_spec, example_feature_spec, label_spec)``:
    an empty context spec, one float32 scalar spec per
    ``custom_features_<i>`` for ``i`` in ``1..num_features`` (default 0.0),
    and the ``(name, spec)`` pair for the int64 label, whose default is the
    padding label.
    """
    # No context features are declared.
    context_feature_spec = {}

    # One scalar float feature per 1-based feature index.
    example_feature_spec = {}
    for feature_index in range(1, num_features + 1):
        feature_name = f"custom_features_{feature_index}"
        example_feature_spec[feature_name] = tf.io.FixedLenFeature(
            shape=(1,), dtype=tf.float32, default_value=0.0)

    label_feature = tf.io.FixedLenFeature(
        shape=(1,), dtype=tf.int64, default_value=_PADDING_LABEL)
    label_spec = (_LABEL_FEATURE, label_feature)

    return context_feature_spec, example_feature_spec, label_spec

In [9]:
context_feature_spec, example_feature_spec, label_spec = _create_feature_spec()

In [10]:
context_feature_spec

{}

In [11]:
# example_feature_spec

In [12]:
example_feature_spec['custom_features_1']

FixedLenFeature(shape=(1,), dtype=tf.float32, default_value=0.0)

In [13]:
label_spec

('utility', FixedLenFeature(shape=(1,), dtype=tf.int64, default_value=-1))

In [14]:
from tensorflow_ranking.python.keras.pipeline import DatasetHparams

In [15]:
# Length of the dense feature vector produced for every input line.
feature_len = 51


def parsing():
    """Return the function that turns one text line into (features, label)."""

    def convert(line):
        """Parse ``<label> qid:<id> <feature id>:<value> ...``.

        Args:
            line: scalar string tensor holding one line of the input file.

        Returns:
            ``(dense_feature, labels)``: a float32 vector of length
            ``feature_len`` and an int32 tensor of shape ``[1]``.
        """
        # Splitting a scalar string yields a rank-1 tensor of tokens.
        tokens = tf.strings.split(line, ' ')

        label = tf.strings.to_number(tokens[0], out_type=tf.int32)
        label = tf.reshape(label, [-1])

        # tokens[1] is "qid:<number>" and is skipped.
        pairs = tf.strings.split(tokens[2:], ':')
        pair_matrix = tf.reshape(pairs.values, (pairs.shape[0], 2))

        # Column 0 holds the 1-based feature ids, column 1 the values.
        ids = tf.strings.to_number(pair_matrix[:, :1], out_type=tf.int64)
        values = tf.strings.to_number(pair_matrix[:, 1], out_type=tf.float32)

        # Shift ids to 0-based positions and fill unlisted features with 0.
        sparse = tf.SparseTensor(ids - 1, values, [feature_len])
        return tf.sparse.to_dense(sparse), label

    return convert


def get_dataset(filenames):
    """Read ranking text files into a dataset of dense (features, label) pairs.

    Args:
        filenames: file name(s) handed to ``tf.data.TextLineDataset``.

    Returns:
        A ``tf.data.Dataset`` whose elements are ``(features, label)``:
        ``features`` is float32 with shape ``(feature_len,)`` and ``label``
        is int32 with shape ``(1,)``.

    NOTE(review): the last cell of this notebook fails with
    "tf__parse_from_example_list() got multiple values for argument
    'list_size'" when this function is passed as ``dataset_reader``.
    Presumably the pipeline expects the reader to yield serialized records
    (as ``tf.data.TFRecordDataset`` does) rather than already-parsed
    ``(features, label)`` tuples — confirm against the TF-Ranking
    dataset-builder documentation.
    """
    lines = tf.data.TextLineDataset(filenames)
    # Trim surrounding whitespace and drop blank lines.
    lines = lines.map(tf.strings.strip).filter(
        lambda line: tf.strings.length(line) > 0)

    # Build the parser once instead of on every trace of the lambda.
    convert = parsing()

    # tf.py_function to be used as contents could not be read directly with
    # map and the function as argument.
    dataset = lines.map(
        lambda data: tf.py_function(
            convert,
            [data],
            (tf.float32, tf.int32),
            name="features_labels_parser")
    )

    # Shape is lost after py_function: https://github.com/tensorflow/tensorflow/issues/31373
    # Shape cannot be inferred: https://github.com/tensorflow/tensorflow/issues/16052
    # Use `feature_len` rather than a second hard-coded 51 so the restored
    # shape cannot drift from the width the parser actually produces.
    dataset = dataset.map(
        lambda x, y: (tf.reshape(x, (feature_len,)),
                      tf.reshape(y, (1,))))

    return dataset

In [16]:
# Arguments
# Training and validation both read the same sample file.
train_input_pattern = "/data/example.txt"
valid_input_pattern = "/data/example.txt"
train_batch_size = 1
valid_batch_size = 1
list_size = None
convert_labels_to_binary = False

# Get dataset hyperparams
# Use the public `tfr.keras.pipeline` path, matching how PipelineHparams,
# SimpleDatasetBuilder and SimplePipeline are referenced in this notebook,
# instead of reaching into the private `tfr.python` package.
dataset_hparams = tfr.keras.pipeline.DatasetHparams(
    train_input_pattern=train_input_pattern,
    valid_input_pattern=valid_input_pattern,
    train_batch_size=train_batch_size,
    valid_batch_size=valid_batch_size,
    list_size=list_size,
    dataset_reader=get_dataset,
    convert_labels_to_binary=convert_labels_to_binary)

In [17]:
dataset_hparams

DatasetHparams(train_input_pattern='/data/example.txt', valid_input_pattern='/data/example.txt', train_batch_size=1, valid_batch_size=1, list_size=None, valid_list_size=None, dataset_reader=<function get_dataset at 0x7f56ffe8adc0>, convert_labels_to_binary=False)

In [18]:
dataset_hparams.valid_list_size

In [19]:
dataset_hparams.dataset_reader

<function __main__.get_dataset(filenames)>

In [20]:
dataset_hparams.dataset_reader([ filename ])

<MapDataset shapes: ((51,), (1,)), types: (tf.float32, tf.int32)>

In [21]:
# dir(dataset_hparams)

In [22]:
# Arguments
# Timestamped directory so every run writes to its own model_dir.
model_dir = f"/tmp/tf-ranking-{datetime.now().strftime('%d-%m-%Y-%H-%M-%S')}"
num_epochs = 10
num_train_steps = 1000
num_valid_steps = 100
loss = "approx_ndcg_loss"
optimizer = "adagrad"
learning_rate = 0.005
steps_per_execution = 10
export_best_model = False
strategy = "MirroredStrategy"

# Training hyperparameters for the ranking pipeline. The total number of
# training steps is divided evenly across epochs (1000 // 10 = 100 per epoch).
pipeline_hparams = tfr.keras.pipeline.PipelineHparams(
      model_dir=model_dir,
      num_epochs=num_epochs,
      steps_per_epoch=(num_train_steps // num_epochs),
      validation_steps=num_valid_steps,
      loss=loss,
      loss_reduction=tf.losses.Reduction.AUTO,
      optimizer=optimizer,
      learning_rate=learning_rate,
      steps_per_execution=steps_per_execution,
      export_best_model=export_best_model,
      strategy=strategy)

In [23]:
pipeline_hparams

PipelineHparams(model_dir='/tmp/tf-ranking-22-06-2021-15-11-58', num_epochs=10, steps_per_epoch=100, validation_steps=100, learning_rate=0.005, loss='approx_ndcg_loss', loss_reduction='auto', optimizer='adagrad', loss_weights=None, steps_per_execution=10, automatic_reduce_lr=False, use_weighted_metrics=False, export_best_model=False, best_exporter_metric_higher_better=False, best_exporter_metric='loss', strategy='MirroredStrategy', tpu='')

In [24]:
# Arguments
use_log1p = False

# Optional per-feature preprocessing. When enabled, every example feature is
# mapped through a sign-preserving log1p: log1p(|t|) * sign(t), with |t|
# written as t * sign(t). When disabled, no feature is preprocessed.
preprocess_dict = (
    {
        fname: lambda t: tf.math.log1p(t * tf.sign(t)) * tf.sign(t)
        for fname in example_feature_spec.keys()
    }
    if use_log1p
    else {}
)

In [25]:
# Comma-separated widths of the hidden layers.
hidden_layer_dims = "64,32,16"
use_batch_norm = True
batch_norm_moment = 0.99
dropout = 0.4

# Feed-forward scorer that emits a single score per example.
dnn_scorer = tfr.keras.model.DNNScorer(
    # Pass a list, not a bare `map` object: a map is a one-shot iterator and
    # would be empty if the scorer iterates the widths more than once.
    hidden_layer_dims=[int(width) for width in hidden_layer_dims.split(",")],
    output_units=1,
    activation=tf.nn.relu,
    input_batch_norm=use_batch_norm,
    use_batch_norm=use_batch_norm,
    batch_norm_moment=batch_norm_moment,
    dropout=dropout)

In [26]:
dnn_scorer

<tensorflow_ranking.python.keras.model.DNNScorer at 0x7f56ffe6ecd0>

In [27]:
# Assemble the model builder from its three parts: an input creator driven by
# the feature specs, a preprocessor driven by `preprocess_dict`, and the
# DNN scorer.
model_builder = tfr.keras.model.ModelBuilder(
    input_creator=tfr.keras.model.FeatureSpecInputCreator(
        context_feature_spec,
        example_feature_spec,
    ),
    preprocessor=tfr.keras.model.PreprocessorWithSpec(preprocess_dict),
    scorer=dnn_scorer,
    mask_feature_name=_MASK,
    name="keras_dnn_model",
)

In [28]:
model_builder

<tensorflow_ranking.python.keras.model.ModelBuilder at 0x7f56ffe6e5b0>

In [29]:
# dir(model_builder)

In [30]:
# Tie the model builder, a dataset builder configured with the same feature
# specs and mask name, and the training hyperparameters into one pipeline.
ranking_pipeline = tfr.keras.pipeline.SimplePipeline(
    model_builder=model_builder,
    dataset_builder=tfr.keras.pipeline.SimpleDatasetBuilder(
        context_feature_spec=context_feature_spec,
        example_feature_spec=example_feature_spec,
        mask_feature_name=_MASK,
        label_spec=label_spec,
        hparams=dataset_hparams),
    hparams=pipeline_hparams)

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)


In [31]:
ranking_pipeline.train_and_validate(verbose=1)

TypeError: in user code:


    TypeError: tf__parse_from_example_list() got multiple values for argument 'list_size'
