Skip to content

Commit

Permalink
Add HTML strip processor (#41888)
Browse files Browse the repository at this point in the history
This processor uses the Lucene HTMLStripCharFilter class to remove HTML
from a field. It complements the existing char filter, which only affects
analysis, by making it possible to store the stripped version as well.

Note that the character filter replaces tags with a newline, so the
produced text will look slightly different from the incoming HTML
with regard to newlines.
  • Loading branch information
spinscale committed May 9, 2019
1 parent 2592b49 commit 2a9da80
Show file tree
Hide file tree
Showing 8 changed files with 179 additions and 2 deletions.
1 change: 1 addition & 0 deletions docs/reference/ingest/ingest-node.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -864,6 +864,7 @@ include::processors/foreach.asciidoc[]
include::processors/geoip.asciidoc[]
include::processors/grok.asciidoc[]
include::processors/gsub.asciidoc[]
include::processors/html_strip.asciidoc[]
include::processors/join.asciidoc[]
include::processors/json.asciidoc[]
include::processors/kv.asciidoc[]
Expand Down
26 changes: 26 additions & 0 deletions docs/reference/ingest/processors/html_strip.asciidoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
[[htmlstrip-processor]]
=== HTML Strip Processor
Removes HTML tags from the field.

NOTE: Most HTML tags (for example `<p>`) are replaced with a `\n` character; inline tags such as `<b>` are removed without a replacement.

[[htmlstrip-options]]
.HTML Strip Options
[options="header"]
|======
| Name | Required | Default | Description
| `field` | yes | - | The string-valued field to remove HTML tags from
| `target_field` | no | `field` | The field to assign the value to, by default `field` is updated in-place
| `ignore_missing` | no | `false` | If `true` and `field` does not exist, the processor quietly exits without modifying the document
include::common-options.asciidoc[]
|======

[source,js]
--------------------------------------------------
{
"html_strip": {
"field": "foo"
}
}
--------------------------------------------------
// NOTCONSOLE
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.ingest.common;

import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import org.elasticsearch.ElasticsearchException;

import java.io.IOException;
import java.io.StringReader;
import java.util.Map;

/**
 * Ingest processor that removes HTML markup from a string value by reading it
 * through Lucene's {@link HTMLStripCharFilter}.
 */
public final class HtmlStripProcessor extends AbstractStringProcessor<String> {

    /** Processor type name under which the factory is registered. */
    public static final String TYPE = "html_strip";

    HtmlStripProcessor(String tag, String field, boolean ignoreMissing, String targetField) {
        super(tag, field, ignoreMissing, targetField);
    }

    /**
     * Strips HTML from {@code value}.
     *
     * @param value the raw field value
     * @return {@code value} itself when it lacks either a {@code <} or a {@code >};
     *         otherwise the characters produced by the HTML strip filter
     * @throws ElasticsearchException wrapping any {@link IOException} raised while
     *         reading from or closing the filter
     */
    @Override
    protected String process(String value) {
        // shortcut, no need to create a string builder and go through each char
        if (value.contains("<") == false || value.contains(">") == false) {
            return value;
        }

        StringBuilder builder = new StringBuilder();
        // The filter is a Reader: try-with-resources guarantees it (and the wrapped
        // StringReader) is closed even if read() fails part-way through.
        try (HTMLStripCharFilter filter = new HTMLStripCharFilter(new StringReader(value))) {
            int ch;
            while ((ch = filter.read()) != -1) {
                builder.append((char) ch);
            }
        } catch (IOException e) {
            throw new ElasticsearchException(e);
        }

        return builder.toString();
    }

    @Override
    public String getType() {
        return TYPE;
    }

    /** Factory that creates {@link HtmlStripProcessor} instances for the {@code html_strip} type. */
    public static final class Factory extends AbstractStringProcessor.Factory {

        public Factory() {
            super(TYPE);
        }

        @Override
        protected HtmlStripProcessor newProcessor(String tag, Map<String, Object> config, String field,
                                                  boolean ignoreMissing, String targetField) {
            return new HtmlStripProcessor(tag, field, ignoreMissing, targetField);
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@ public Map<String, Processor.Factory> getProcessors(Processor.Parameters paramet
entry(BytesProcessor.TYPE, new BytesProcessor.Factory()),
entry(PipelineProcessor.TYPE, new PipelineProcessor.Factory(parameters.ingestService)),
entry(DissectProcessor.TYPE, new DissectProcessor.Factory()),
entry(DropProcessor.TYPE, new DropProcessor.Factory()));
entry(DropProcessor.TYPE, new DropProcessor.Factory()),
entry(HtmlStripProcessor.TYPE, new HtmlStripProcessor.Factory()));
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.ingest.common;

/**
 * Plugs {@link HtmlStripProcessor.Factory} into the checks inherited from
 * {@link AbstractStringProcessorFactoryTestCase}.
 */
public class HtmlStripProcessorFactoryTests extends AbstractStringProcessorFactoryTestCase {

    /** Supplies the factory instance under test. */
    @Override
    protected AbstractStringProcessor.Factory newFactory() {
        final AbstractStringProcessor.Factory htmlStripFactory = new HtmlStripProcessor.Factory();
        return htmlStripFactory;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.ingest.common;

/**
 * Plugs {@link HtmlStripProcessor} into the checks inherited from
 * {@link AbstractStringProcessorTestCase}, wrapping the input in HTML markup
 * and expecting the stripped text back.
 */
public class HtmlStripProcessorTests extends AbstractStringProcessorTestCase<String> {

    /** Builds the processor under test with a random ten-character tag. */
    @Override
    protected AbstractStringProcessor<String> newProcessor(String field, boolean ignoreMissing, String targetField) {
        final String processorTag = randomAlphaOfLength(10);
        return new HtmlStripProcessor(processorTag, field, ignoreMissing, targetField);
    }

    /** Surrounds the input with the same HTML fragment on both sides. */
    @Override
    protected String modifyInput(String input) {
        final String markup = "<p><b>test</b>";
        return new StringBuilder(markup).append(input).append(markup).toString();
    }

    /** The paragraph tag becomes a newline while the bold tags vanish entirely. */
    @Override
    protected String expectedResult(String input) {
        final String stripped = "\ntest";
        return new StringBuilder(stripped).append(input).append(stripped).toString();
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
- contains: { nodes.$master.ingest.processors: { type: foreach } }
- contains: { nodes.$master.ingest.processors: { type: grok } }
- contains: { nodes.$master.ingest.processors: { type: gsub } }
- contains: { nodes.$master.ingest.processors: { type: html_strip } }
- contains: { nodes.$master.ingest.processors: { type: join } }
- contains: { nodes.$master.ingest.processors: { type: json } }
- contains: { nodes.$master.ingest.processors: { type: kv } }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,11 @@ teardown:
"pattern" : "-",
"replacement" : "."
}
},
{
"html_strip" : {
"field" : "field_to_html_strip"
}
}
]
}
Expand All @@ -96,7 +101,8 @@ teardown:
"field_to_split": "127-0-0-1",
"field_to_join": ["127","0","0","1"],
"field_to_convert": ["127","0","0","1"],
"field_to_gsub": "127-0-0-1"
"field_to_gsub": "127-0-0-1",
"field_to_html_strip": "<p>this <title>is</title> a <b>test</b>"
}
- do:
Expand All @@ -114,6 +120,7 @@ teardown:
- match: { _source.field_to_join: "127-0-0-1" }
- match: { _source.field_to_convert: [127,0,0,1] }
- match: { _source.field_to_gsub: "127.0.0.1" }
- match: { _source.field_to_html_strip: "\nthis \nis\n a test" }

---
"Test metadata":
Expand Down

0 comments on commit 2a9da80

Please sign in to comment.