Build your ops YAML config (`all_ops.yaml`) from Data-Juicer operators

In [1]:
from data_juicer.tools.op_search import OPSearcher

# Run an unfiltered search (no arguments are passed to search()). The cells
# below read each returned record's 'name' and 'signature' entries.
op_records = OPSearcher().search()

2025-07-10 14:00:08.127 | INFO     | data_juicer.ops:timing_context:12 - Importing operator modules took 10.40 seconds


In [2]:
import re
import sys
import inspect
from annotated_types import Gt, Lt

def build_arg_state_from_signature(sig: inspect.Signature):
    """Convert a callable signature into a per-argument spec dictionary.

    Parameters named ``self``, ``args`` and ``kwargs`` are skipped. Every other
    parameter is mapped to a dict with:

    * ``'type'``: ``'str'`` (also used when there is no annotation), ``'bool'``,
      ``'int'``, ``'float'`` or ``'list_str'``. When a non-list default's
      concrete type differs from the annotation (and the default is not
      ``sys.maxsize``), the type name of the default is used instead.
    * ``'default'``: the declared default, ``""`` when the default is missing or
      ``None``, and clamped to +/-9007199254740991 for oversized numbers.
    * ``'min'`` / ``'max'``: only for ``Annotated[int, ...]`` annotations that
      carry ``Gt`` / ``Lt`` metadata.

    Args:
        sig: The signature to inspect.

    Returns:
        dict mapping parameter name to its spec dict.

    Raises:
        ValueError: If a parameter annotation is not one of the supported forms
            (including list annotations whose element type is not ``str``).
    """
    # 9007199254740991 == 2**53 - 1.
    # NOTE(review): presumably chosen so clamped defaults stay exactly
    # representable as double-precision / JSON numbers -- confirm intent.
    max_value = 9007199254740991.0
    min_value = -9007199254740991.0
    empty = inspect.Parameter.empty

    arg_state = {}
    for name, param in sig.parameters.items():
        if name in ('self', 'args', 'kwargs'):
            continue
        param_dict = {}
        ann = param.annotation
        origin = getattr(ann, '__origin__', None)

        if ann is empty or ann is str:
            # Un-annotated parameters are treated as strings.
            param_dict['type'] = 'str'
        elif ann is bool:
            param_dict['type'] = 'bool'
        elif ann is int:
            param_dict['type'] = 'int'
        elif ann is float:
            param_dict['type'] = 'float'
        elif origin is list:
            element_types = getattr(ann, '__args__', None) or ()
            if element_types and element_types[0] is str:
                param_dict['type'] = 'list_str'
            else:
                # Only lists of strings are supported. Raise a real exception
                # (a bare ``raise`` here has no active exception to re-raise).
                raise ValueError(f"Unsupported type: {ann}")
        elif origin is int:
            # Annotated[int, ...]: bounds are taken from Gt / Lt metadata.
            param_dict['type'] = 'int'
            for meta in getattr(ann, '__metadata__', ()):
                if isinstance(meta, Gt):
                    param_dict['min'] = meta.gt
                if isinstance(meta, Lt):
                    param_dict['max'] = meta.lt
        else:
            raise ValueError(f"Unsupported type: {ann}")

        default = param.default
        if default is empty or default is None:
            param_dict['default'] = ""
        else:
            # When the default's concrete type disagrees with the annotation,
            # trust the default. Lists and the sys.maxsize sentinel keep the
            # annotated type.
            if type(default) != ann and not isinstance(default, list) and default != sys.maxsize:
                re_pattern = r"<class '(.*)'>"
                str_type = re.search(re_pattern, str(type(default))).group(1)
                param_dict['type'] = str_type
            if isinstance(default, (int, float)):
                # Clamp oversized numbers, keeping int defaults as ints.
                if default > max_value:
                    param_dict['default'] = max_value if isinstance(default, float) else int(max_value)
                elif default < min_value:
                    param_dict['default'] = min_value if isinstance(default, float) else int(min_value)
                else:
                    param_dict['default'] = default
            else:
                param_dict['default'] = default

        arg_state[name] = param_dict

    return arg_state

In [3]:
# Operators deliberately left out of the generated config.
EXCLUDED_OPS = {
    "general_field_filter",
    "download_file_mapper",
    "sdxl_prompt2prompt_mapper",
    "sentence_augmentation_mapper",
    "naive_grouper",
    "naive_reverse_grouper",
    "tags_specified_field_selector",
}

custom_ops = {}
# op name -> reason its signature could not be converted. Kept so dropped
# operators stay inspectable instead of vanishing silently.
skipped_ops = {}
for op_record in op_records:
    op_sig = op_record['signature']
    op_name = op_record["name"]
    # Check the exclusion list first so excluded operators are not converted.
    if op_name in EXCLUDED_OPS:
        continue
    try:
        arg_state = build_arg_state_from_signature(op_sig)
    except Exception as exc:
        # Best effort: operators with unsupported signatures are left out,
        # but the reason is recorded.
        skipped_ops[op_name] = f"{type(exc).__name__}: {exc}"
        continue
    custom_ops[op_name] = {"args": arg_state}

print(f"Converted {len(custom_ops)} operators; skipped {len(skipped_ops)} "
      f"with unsupported signatures (see `skipped_ops`).")

In [4]:
import yaml
import os

# Write the collected operator specs to the YAML file that the operator-pool
# cells load. Create the target directory first so this works on a fresh
# checkout, and pin the encoding instead of relying on the platform default.
os.makedirs("./configs", exist_ok=True)
with open("./configs/all_ops.yaml", "w", encoding="utf-8") as f:
    yaml.dump(custom_ops, f)

Test operator pool

In [5]:
from operator_pool import OperatorPool

# Build the pool from ./configs/all_ops.yaml. The commented-out line is an
# alternative that points at ./configs/default_ops.yaml instead.
# op_pool = OperatorPool(config_path="./configs/default_ops.yaml")
op_pool = OperatorPool(config_path="./configs/all_ops.yaml")

2025-07-10 14:00:23.398 | INFO     | operator_pool:__init__:225 - Operator: __init__: alphanumeric_filter args: {'max_ratio': {'default': 9007199254740991, 'type': 'float'}, 'min_ratio': {'default': 0.25, 'type': 'float'}, 'tokenization': {'default': False, 'type': 'bool'}}
2025-07-10 14:00:23.399 | INFO     | operator_pool:__init__:225 - Operator: __init__: audio_add_gaussian_noise_mapper args: {'max_amplitude': {'default': 0.015, 'type': 'float'}, 'min_amplitude': {'default': 0.001, 'type': 'float'}, 'p': {'default': 0.5, 'type': 'float'}}
2025-07-10 14:00:23.400 | INFO     | operator_pool:__init__:225 - Operator: __init__: audio_duration_filter args: {'any_or_all': {'default': 'any', 'type': 'str'}, 'max_duration': {'default': 9007199254740991, 'type': 'int'}, 'min_duration': {'default': 0, 'type': 'int'}}
2025-07-10 14:00:23.400 | INFO     | operator_pool:__init__:225 - Operator: __init__: audio_nmf_snr_filter args: {'any_or_all': {'default': 'any', 'type': 'str'}, 'max_snr': {'def

In [6]:
# Access an operator by integer index and show its name.
op_pool[0].name

'alphanumeric_filter'

In [7]:
# Access an operator by using its name as the key and show its description.
op_pool["alphanumeric_filter"].desc

'Filter to keep samples with alphabet/numeric ratio within a specific range.'

In [8]:
# Iterate over op_pool.pool; each item printed here is an operator name.
for op_name in op_pool.pool:
    print(op_name)

alphanumeric_filter
audio_add_gaussian_noise_mapper
audio_duration_filter
audio_nmf_snr_filter
audio_size_filter
average_line_length_filter
character_repetition_filter
chinese_convert_mapper
clean_copyright_mapper
clean_html_mapper
document_deduplicator
expand_macro_mapper
extract_tables_from_html_mapper
fix_unicode_mapper
human_preference_annotation_mapper
image_aesthetics_filter
image_aspect_ratio_filter
image_blur_mapper
image_deduplicator
image_face_count_filter
image_face_ratio_filter
image_nsfw_filter
image_segment_mapper
image_shape_filter
image_size_filter
image_tagging_mapper
image_text_matching_filter
image_text_similarity_filter
image_watermark_filter
maximum_line_length_filter
mllm_mapper
nlpaug_en_mapper
nlpcda_zh_mapper
perplexity_filter
phrase_grounding_recall_filter
punctuation_normalization_mapper
python_file_mapper
python_lambda_mapper
ray_document_deduplicator
ray_image_deduplicator
ray_video_deduplicator
remove_bibliography_mapper
remove_header_mapper
remove_long_wo

In [9]:
# Pool state, used for LLM queries, saving, and loading.
# The output is a dict keyed by operator name.
op_pool.state

{'alphanumeric_filter': {'name': 'alphanumeric_filter',
  'desc': 'Filter to keep samples with alphabet/numeric ratio within a specific range.',
  'enabled': True,
  'args': {'max_ratio': {'name': 'max_ratio',
    'desc': "max_ratio (<class 'float'>): The max filter ratio in alphanumeric op, samples will be filtered if their alphabet/numeric ratio exceeds this parameter.",
    'type': 'float',
    'default': 9007199254740991.0,
    'v': 9007199254740991.0,
    'options': None,
    'min': None,
    'max': None},
   'min_ratio': {'name': 'min_ratio',
    'desc': "min_ratio (<class 'float'>): The min filter ratio in alphanumeric op, samples will be filtered if their alphabet/numeric ratio is below this parameter.",
    'type': 'float',
    'default': 0.25,
    'v': 0.25,
    'options': None,
    'min': None,
    'max': None},
   'tokenization': {'name': 'tokenization',
    'desc': "tokenization (<class 'bool'>): Whether to count the ratio of alphanumeric to the total number of tokens. if 

In [10]:
# Export a recipe. The cell output shows the returned value, which matches
# the config_path argument.
op_pool.export_config(
    project_name="demo",
    dataset_path="./data/demo-dataset.jsonl",
    nproc=4,
    export_path="./outputs/processed_data.jsonl",
    config_path="./configs/demo-recipe.yaml"
)

'./configs/demo-recipe.yaml'

In [11]:
# Enable/disable an operator via act(); print `enabled` before and after each
# action to show the flag changing.
print(op_pool["alphanumeric_filter"].enabled)
op_pool.act(op_name="alphanumeric_filter", action_type="disable")
print(op_pool["alphanumeric_filter"].enabled)
op_pool.act(op_name="alphanumeric_filter", action_type="enable")
print(op_pool["alphanumeric_filter"].enabled)

True
False
True


In [12]:
# Set an argument to an explicit value (v=...), then read it back from the
# argument's `v` attribute.
op_pool.act(op_name="alphanumeric_filter", action_type="set_arg",
            arg_name="min_ratio", v=0.2)
op_pool["alphanumeric_filter"].args["min_ratio"].v

0.2

In [13]:
# Summary statistics for the operator; `quantiles` holds 101 evenly spaced
# values (index 0..100).
stats = {
    "mean": 0.3,
    "std": 0.04,
    "min": 0.12,
    "max": 0.89,
    "quantiles": [0.01 * pct for pct in range(101)],
}
op_pool["alphanumeric_filter"].update_with_stats(stats)
# Set the argument to the p% quantile (p=0.3 is also acceptable).
op_pool.act(
    op_name="alphanumeric_filter",
    action_type="set_arg",
    arg_name="min_ratio",
    p=30,
)
op_pool["alphanumeric_filter"].args["min_ratio"].v

0.3

In [14]:
# Set filter args as mean +/- k * std. No arg_name is passed; the output shows
# both min_ratio and max_ratio afterwards.
op_pool.act(op_name="alphanumeric_filter", action_type="set_arg",
            k=3)
op_pool["alphanumeric_filter"].args["min_ratio"].v, op_pool["alphanumeric_filter"].args["max_ratio"].v

(0.18, 0.42)