# Lucene regex filter
This presentation contains an example of a filter that uses a Lucene-conformant regular expression.
A concatenator that merges different fields from an event serves as the processor to demonstrate the filter function.

Until now, keys whose values contain a regular expression had to be flagged with `regex_fields`.

In [1]:
# Sample input event: carries the three data_stream parts but no "_index" yet.
document = {
    "data_stream": {
        "dataset": "windows",
        "namespace": "devopslab",
        "type": "logs",
    },
    "_op_type": "create",
}

# The event as it should look after processing: identical to the input plus
# an "_index" built from type, dataset and namespace joined with "-".
expected = {
    "data_stream": {
        "dataset": "windows",
        "namespace": "devopslab",
        "type": "logs",
    },
    "_op_type": "create",
    "_index": "logs-windows-devopslab",
}

### Define process

In [2]:
import sys
sys.path.insert(0,"../../../../../")
import tempfile
from copy import deepcopy
from pathlib import Path

from unittest import mock
from logprep.factory import Factory

# Scratch directory in the system temp location that holds the demo rule.
rule_path = Path(tempfile.gettempdir(), "concatenator")
rule_path.mkdir(exist_ok=True)

# Single rule file reused by every example; drop any leftover from an
# earlier run so the directory starts without it.
rule_file = rule_path.joinpath("data-stream.yml")
if rule_file.exists():
    rule_file.unlink()

# Configuration passed to Factory.create() in concat_with_rule().
# "specific_rules" points at the scratch directory above.
# NOTE(review): "/dev" presumably acts as a rule-free placeholder for
# "generic_rules" - confirm against the processor's configuration docs.
processor_config = {
    "myconcatenator": {
        "type": "concatenator",
        "specific_rules": [str(rule_path)],
        "generic_rules": ["/dev"],
    },
}

def concat_with_rule(rule_yaml):
    """Process a copy of ``document`` with a processor using ``rule_yaml``.

    The rule text replaces the content of ``rule_file``, a processor is
    created from ``processor_config`` and run on a deep copy of
    ``document``, so the module-level sample event is never modified.

    Prints the event before and after processing, followed by whether the
    processed event equals ``expected``.

    Args:
        rule_yaml: Rule definition as YAML text, written verbatim to
            ``rule_file``.
    """
    # Make sure only the rule under test is present in the rule file.
    if rule_file.exists():
        rule_file.unlink()
    rule_file.write_text(rule_yaml)

    # Work on a private copy so repeated calls start from the same input.
    event = deepcopy(document)
    processor = Factory.create(processor_config)

    print(f"before: {event}")
    processor.process(event)
    print(f"after: {event}")
    print(event == expected)
    

### regex_fields version

In [3]:
# Legacy rule style: the filter value is written as a plain quoted string and
# the key "data_stream.type" is listed under regex_fields to flag that its
# value is a regular expression.
rule_yaml = """---
filter: 'data_stream.type: ".*lo.*"'     
regex_fields:
  - "data_stream.type"
concatenator:
  source_fields:
    - data_stream.type
    - data_stream.dataset
    - data_stream.namespace
  target_field: _index
  separator: "-"
  overwrite_target: false
  delete_source_fields: false
"""

# Write the rule, run the processor on a copy of the sample event and print
# the event before/after plus the comparison with the expected result.
concat_with_rule(rule_yaml)


[Deprecated]: regex_fields are no longer necessary. Use Lucene regex annotation.




before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create'}
after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create', '_index': 'logs-windows-devopslab'}
True


### Lucene-conformant version without the need for regex_fields

In [4]:
# Lucene rule style: the regular expression is delimited by slashes inside
# the filter value itself, so this rule carries no regex_fields entry.
rule_yaml = """---
filter: 'data_stream.type: "/.*lo.*/"'    
concatenator:
  source_fields:
    - data_stream.type
    - data_stream.dataset
    - data_stream.namespace
  target_field: _index
  separator: "-"
  overwrite_target: false
  delete_source_fields: false
"""
# Write the rule, run the processor on a copy of the sample event and print
# the event before/after plus the comparison with the expected result.
concat_with_rule(rule_yaml)


before: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create'}
after: {'data_stream': {'dataset': 'windows', 'namespace': 'devopslab', 'type': 'logs'}, '_op_type': 'create', '_index': 'logs-windows-devopslab'}
True
