A feature converter to convert a prefix LM corpus to one suitable for RL finetuning. #488

Closed · wants to merge 1 commit
118 changes: 118 additions & 0 deletions seqio/feature_converters.py
@@ -1126,6 +1126,124 @@ def loss_on_targets_only(self) -> bool:
return self._loss_on_targets_only


class PrefixLM2RLFeatureConverter(FeatureConverter):
"""Feature converter for a prefix LM corpus to be used for RL training.

The input dataset must have both "inputs" and "targets" fields. For a
language modeling objective with a "targets"-only dataset, use
LMFeatureConverter.

For RL, only the "inputs" field is used, but the targets are kept so they
can serve as a length constraint.

See PrefixLMFeatureConverter for a description of the fields utilized.

"""

TASK_FEATURES = {
"inputs": FeatureConverter.FeatureSpec(dtype=tf.int32),
"targets": FeatureConverter.FeatureSpec(dtype=tf.int32),
}
MODEL_FEATURES = {
"decoder_target_tokens": FeatureConverter.FeatureSpec(dtype=tf.int32),
"decoder_input_tokens": FeatureConverter.FeatureSpec(dtype=tf.int32),
"decoder_loss_weights": FeatureConverter.FeatureSpec(dtype=tf.int32),
"decoder_causal_attention": FeatureConverter.FeatureSpec(dtype=tf.int32),
}
PACKING_FEATURE_DTYPES = {}

def __init__(self,
**kwargs) -> None:
super().__init__(**kwargs)

def _convert_example(
self, features: Mapping[str, tf.Tensor]
) -> Mapping[str, tf.Tensor]:
"""Convert a Prefix LM example into an example with RL model features.

Example:
```
Suppose the original dataset is

ds = [{"inputs": [9, 4, 6, 1], "targets": [3, 9, 1]}]

Then the input features to this method (after padding) are

features = {
"inputs" = [9, 4, 6, 1, 0, 0, 0, 0, 0]
"targets" = [3, 9, 1, 0, 0, 0, 0, 0, 0]
"inputs_positions" = [0, 1, 2, 3, 0, 0, 0, 0, 0]
}

where "inputs_positions" indicates valid input positions.

Then we compute "decoder_causal_attention". For an upacked dataset, we need
to define the decoder_causal_attention; we do this similar to the
LMFeatureConverter by tiling the length+1 and creating an array of the
positions and then the decoder causal attention is where the positions
are less than the inputs_width:

"inputs_width_add_pos" = [4, 4, 4, 4, 0, 0, 0, 0, 0]
"positions" = [0, 1, 2, 3, 4, 5, 6, 7, 8]
< ---------------------------
"decoder_causal_attention" = [1, 1, 1, 1, 0, 0, 0, 0, 0]
```

Args:
features: a mapping of task feature names to tensors for a single example.

Returns:
d: the converted features.
"""

inputs = features["inputs"]
targets = features["targets"]
d = {
"decoder_target_tokens": targets,
"decoder_input_tokens": inputs,
"decoder_loss_weights": non_padding_position(features["inputs"]),
}
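# "inputs_positions" holds consecutive indices over the valid inputs, so the
# index of its largest value (the last valid input position) plus one is the
# prefix width.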
width = tf.argmax(features["inputs_positions"]) + 1
inputs_size = tf.size(inputs)
positions = tf.range(inputs_size, dtype=tf.int64)
inputs_width = tf.fill([inputs_size], width)
# Binary mask where 1 represents a position in a non-causal attention region
d["decoder_causal_attention"] = tf.cast(
positions < inputs_width, dtype=features["targets"].dtype
)
return d

def _convert_features(
self, ds: tf.data.Dataset, task_feature_lengths: Mapping[str, int]
) -> tf.data.Dataset:
"""Convert the input dataset to an output dataset to be fed to the model.

Args:
ds: an input tf.data.Dataset to be converted.
task_feature_lengths: a mapping from task feature name to its length.

Returns:
ds: the converted dataset.
"""
ds = self._pack_or_pad(ds, task_feature_lengths)
return ds.map(
self._convert_example, num_parallel_calls=tf.data.experimental.AUTOTUNE
)

def get_model_feature_lengths(
self, task_feature_lengths: Mapping[str, int]
) -> Mapping[str, int]:
"""Define the length relationship between task and model features."""
decoder_length = task_feature_lengths["targets"]
inputs_length = task_feature_lengths["inputs"]
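# Note: the target tokens follow the "targets" length, while the remaining
# decoder features follow the "inputs" length.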
model_feature_lengths = {
"decoder_target_tokens": decoder_length,
"decoder_input_tokens": inputs_length,
"decoder_loss_weights": inputs_length,
"decoder_causal_attention": inputs_length,
}
return model_feature_lengths


class PrefixSuffixLMFeatureConverter(PrefixLMFeatureConverter):
"""Feature converter for a input + target + suffix language model.

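A minimal standalone sketch of the mask computation described in the docstring above, using the same example values (illustrative only, not taken from the diff):

```python
import tensorflow as tf

# Padded features from the docstring example.
inputs = tf.constant([9, 4, 6, 1, 0, 0, 0, 0, 0], dtype=tf.int32)
inputs_positions = tf.constant([0, 1, 2, 3, 0, 0, 0, 0, 0], dtype=tf.int32)

# Index of the last valid input position, plus one, gives the prefix width.
width = tf.argmax(inputs_positions) + 1                 # 4
positions = tf.range(tf.size(inputs), dtype=tf.int64)   # [0, 1, ..., 8]
inputs_width = tf.fill([tf.size(inputs)], width)        # [4, 4, ..., 4]

# 1 marks the non-causal (fully visible prefix) region.
decoder_causal_attention = tf.cast(positions < inputs_width, tf.int32)
print(decoder_causal_attention.numpy())  # [1 1 1 1 0 0 0 0 0]
```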
26 changes: 26 additions & 0 deletions seqio/feature_converters_test.py
@@ -1332,5 +1332,31 @@ def test_prefix_suffix_lm_packed_trivial_suffoxes(self):
assert_dataset(converted_ds, expected)


class PrefixLM2RLFeatureConverterTest(tf.test.TestCase):

def test_lm_unpacked(self):
x = [{
"inputs": tf.constant([6, 1, 2, 3, 4]),
"targets": tf.constant([3, 9, 1]),
}]
ds = create_default_dataset(
x, feature_names=["inputs", "targets"]
)
task_feature_lengths = {
"inputs": 10,
"targets": 5,
}

converter = feature_converters.PrefixLM2RLFeatureConverter()
converted_ds = converter(ds, task_feature_lengths)
expected = {
"decoder_target_tokens": [3, 9, 1, 0, 0],
"decoder_input_tokens": [6, 1, 2, 3, 4, 0, 0, 0, 0, 0],
"decoder_loss_weights": [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
"decoder_causal_attention": [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
}
assert_dataset(converted_ds, expected)


if __name__ == "__main__":
tf.test.main()
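A rough usage sketch mirroring the new unit test, but building the dataset with plain `tf.data` instead of the `create_default_dataset` test helper; the values and lengths are taken from the test, so treat this as an illustration rather than part of the change:

```python
import tensorflow as tf
from seqio import feature_converters

# Toy prefix-LM example, mirroring the new unit test.
ds = tf.data.Dataset.from_tensors({
    "inputs": tf.constant([6, 1, 2, 3, 4], dtype=tf.int32),
    "targets": tf.constant([3, 9, 1], dtype=tf.int32),
})
task_feature_lengths = {"inputs": 10, "targets": 5}

converter = feature_converters.PrefixLM2RLFeatureConverter()
converted_ds = converter(ds, task_feature_lengths)

for ex in converted_ds.as_numpy_iterator():
    print(ex["decoder_causal_attention"])  # expected: [1 1 1 1 1 0 0 0 0 0]
```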