# Write TFRecods

In [47]:
import numpy as np
import tensorflow as tf
import tensorflow_ranking as tfr
import glob
from datetime import datetime
import random

In [48]:
# Random
random.seed(42)

# Python
os.environ['PYTHONHASHSEED'] = str(42)

# Numpy
np.random.seed(42)

In [4]:
tf.__version__

'2.5.0'

In [5]:
tfr.__version__

'0.4.0.dev'

### TensorFlow Ranking dataset_reader example

**tfr.keras.pipeline.DatasetHparams.dataset_reader**: https://github.com/tensorflow/ranking/blob/master/tensorflow_ranking/g3doc/api_docs/python/tfr/keras/pipeline/DatasetHparams/dataset_reader.md

In [112]:
import os
import tempfile

example_path = os.path.join(tempfile.gettempdir(), "example.tfrecords")
print(example_path)

/tmp/example.tfrecords


In [127]:
# Write the records to a file.
with tf.io.TFRecordWriter(example_path) as file_writer:
    for _ in range(4):
        x, y = np.random.random(), np.random.random()
        record_bytes = tf.train.Example(features=tf.train.Features(feature={
            "x": tf.train.Feature(float_list=tf.train.FloatList(value=[x])),
            "y": tf.train.Feature(float_list=tf.train.FloatList(value=[y])),
        })).SerializeToString()
        file_writer.write(record_bytes)

In [128]:
!ls -l "$example_path"

-rw-r--r-- 1 root root 192 Jun 23 10:01 /tmp/example.tfrecords


In [129]:
# Read the data back out.
def decode_fn(record_bytes):
    return tf.io.parse_single_example(
        # Data
        record_bytes,
        # Schema
        {
            "x": tf.io.FixedLenFeature([], dtype=tf.float32),
            "y": tf.io.FixedLenFeature([], dtype=tf.float32)
        }
    )

In [130]:
for batch in tf.data.TFRecordDataset([example_path]).map(decode_fn):
    print("x = {x:.4f},  y = {y:.4f}".format(**batch))

x = 0.4561,  y = 0.7852
x = 0.1997,  y = 0.5142
x = 0.5924,  y = 0.0465
x = 0.6075,  y = 0.1705


### My own example

In [89]:
filename = "/data/example.txt"

In [90]:
text = """1 qid:10 32:0.14 48:0.97 51:0.45
0 qid:10 1:0.15 31:0.75 32:0.24 49:0.6
2 qid:10 1:0.71 2:0.36 31:0.58 51:0.12
0 qid:20 4:0.79 31:0.01 33:0.05 35:0.27
3 qid:20 1:0.42 28:0.79 35:0.30 42:0.76"""

with open(filename, 'w') as f:
    f.write(text)

In [91]:
!cat "$filename"

1 qid:10 32:0.14 48:0.97 51:0.45
0 qid:10 1:0.15 31:0.75 32:0.24 49:0.6
2 qid:10 1:0.71 2:0.36 31:0.58 51:0.12
0 qid:20 4:0.79 31:0.01 33:0.05 35:0.27
3 qid:20 1:0.42 28:0.79 35:0.30 42:0.76

In [92]:
example_path = "/data/example.tfrecords"
print(example_path)

/data/example.tfrecords


In [96]:
with open(filename) as f:
    lines = [ line.strip() for line in f.readlines() ]

In [97]:
lines

['1 qid:10 32:0.14 48:0.97 51:0.45',
 '0 qid:10 1:0.15 31:0.75 32:0.24 49:0.6',
 '2 qid:10 1:0.71 2:0.36 31:0.58 51:0.12',
 '0 qid:20 4:0.79 31:0.01 33:0.05 35:0.27',
 '3 qid:20 1:0.42 28:0.79 35:0.30 42:0.76']

In [134]:
# Arguments
num_features = 136

with tf.io.TFRecordWriter(example_path) as file_writer:

    with open(filename) as f:

        for line in f.readlines():

            # Read LibSVM line
            tokens = line.strip().split()
            label = int(tokens[0])
            qid = int(tokens[1].split(':')[1])
            kv_pairs = [ kv.split(":") for kv in tokens[2:] ]
            sparse_features = { int(k): float(v) for (k, v) in kv_pairs }
            
            # Create default values for sparse vector
            feature = {
                f"custom_features_{i}": tf.train.Feature(float_list=tf.train.FloatList(value=[0.0]))
                for i in range(1, num_features + 1)
            }
            # Apply non-zero values from sparse features
            for k, v in sparse_features.items():
                feature[f"custom_features_{k}"] = tf.train.Feature(float_list=tf.train.FloatList(value=[v]))
            # Add label
            feature["label"] = tf.train.Feature(int64_list=tf.train.Int64List(value=[label]))
            feature["qid"] = tf.train.Feature(int64_list=tf.train.Int64List(value=[qid]))

            record_bytes = tf.train.Example(
                features=tf.train.Features(
                    feature=feature
                )).SerializeToString()
            file_writer.write(record_bytes)

In [137]:
schema = {
    f"custom_features_{i}": tf.io.FixedLenFeature([], dtype=tf.float32)
    for i in range(1, num_features + 1)
}
schema["label"] = tf.io.FixedLenFeature([], dtype=tf.int64)
schema["qid"] = tf.io.FixedLenFeature([], dtype=tf.int64)

# Read the data back out.
def decode_fn(record_bytes):
    return tf.io.parse_single_example(
        # Data
        record_bytes,
        # Schema
        schema
    )

In [142]:
for batch in tf.data.TFRecordDataset([example_path]).map(decode_fn):
    print("label = {label},  qid = {qid},  custom_features_1 = {custom_features_1:.4f}".format(**batch))

label = 1,  qid = 10,  custom_features_1 = 0.0000
label = 0,  qid = 10,  custom_features_1 = 0.1500
label = 2,  qid = 10,  custom_features_1 = 0.7100
label = 0,  qid = 20,  custom_features_1 = 0.0000
label = 3,  qid = 20,  custom_features_1 = 0.4200


### TensorFlow Ranking Keras example

https://github.com/tensorflow/ranking/blob/master/tensorflow_ranking/examples/keras/keras_dnn_tfrecord.py

In [143]:
test_records = "/data/test_numerical_elwc.tfrecord"

In [144]:
!ls -l "$test_records"

-rw-r--r-- 1 root root 21788 Jun  8 12:21 /data/test_numerical_elwc.tfrecord


In [145]:
for batch in tf.data.TFRecordDataset(val_records).take(1):
    print(batch)

tf.Tensor(b'\n\xf5\x03\n\xf2\x03\n\x1e\n\x12custom_features_77\x12\x08\x12\x06\n\x04;:\x02\xbf\n\x1e\n\x12custom_features_54\x12\x08\x12\x06\n\x04%Z\x82>\n\x1e\n\x12custom_features_37\x12\x08\x12\x06\n\x04\xe7\x8bI?\n\x10\n\x07utility\x12\x05\x1a\x03\n\x01\x01\n\x1d\n\x11custom_features_2\x12\x08\x12\x06\n\x04\xee\x08\xff>\n\x1e\n\x12custom_features_90\x12\x08\x12\x06\n\x04c\x9bd\xbe\n\x1f\n\x13custom_features_131\x12\x08\x12\x06\n\x04\x02,.?\n\x1e\n\x12custom_features_46\x12\x08\x12\x06\n\x041\xee\x1a\xbf\n\x1d\n\x11custom_features_7\x12\x08\x12\x06\n\x04K\xe9i>\n\x1e\n\x12custom_features_43\x12\x08\x12\x06\n\x04Y\xdc\xcf>\n\x1e\n\x12custom_features_40\x12\x08\x12\x06\n\x04j1\xd8\xbd\n\x1e\n\x12custom_features_82\x12\x08\x12\x06\n\x04\xa1j`\xbf\n\x1e\n\x12custom_features_61\x12\x08\x12\x06\n\x04\xad\xc3a>\n\x1e\n\x12custom_features_93\x12\x08\x12\x06\n\x04\xaa\xee\x91\xbe\n\x1e\n\x12custom_features_81\x12\x08\x12\x06\n\x04\xfaE\xe9\xbe\n\x1f\n\x13custom_features_107\x12\x08\x12\x06\n\

In [146]:
# Arguments
num_features = 136

# The document relevance label.
_LABEL_FEATURE = "utility"

# Padding labels are set negative so that the corresponding examples can be
# ignored in loss and metrics.
_PADDING_LABEL = -1
_MASK = "example_list_mask"

# Read the data back out.
def decode_fn(record_bytes):
    
    schema = {
        f"custom_features_{i}": tf.io.FixedLenFeature(shape=(1,), dtype=tf.float32, default_value=0.0)
        for i in range(1, num_features + 1)
    }
    schema[_LABEL_FEATURE] = tf.io.FixedLenFeature(shape=(1,), dtype=tf.int64, default_value=_PADDING_LABEL)
    
    return tf.io.parse_single_example(
        # Data
        record_bytes,
        # Schema
        schema
    )

In [147]:
for batch in tf.data.TFRecordDataset([val_records]).map(decode_fn):
    print(batch)
    #print("x = {x:.4f},  y = {y:.4f}".format(**batch))

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf5 in position 40: invalid start byte