-
Notifications
You must be signed in to change notification settings - Fork 24.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
This processor uses the Lucene HTMLStripCharFilter class to remove HTML entities from a field. It complements the existing char filter, so that it is possible to store the stripped version as well. Note that the character filter replaces tags with a newline, so the produced output will look slightly different from the incoming HTML with regard to newlines.
- Loading branch information
Showing
8 changed files
with
179 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
[[htmlstrip-processor]] | ||
=== HTML Strip Processor | ||
Removes HTML tags from the field. | ||
|
||
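NOTE: HTML tags are removed from the value. Block-level tags such as `<p>` are replaced with a `\n` character, while inline tags such as `<b>` are removed without a replacement. | ||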
|
||
[[htmlstrip-options]] | ||
.HTML Strip Options | ||
[options="header"] | ||
|====== | ||
| Name | Required | Default | Description | ||
| `field` | yes | - | The string-valued field to remove HTML tags from | ||
| `target_field` | no | `field` | The field to assign the value to, by default `field` is updated in-place | ||
| `ignore_missing` | no | `false` | If `true` and `field` does not exist, the processor quietly exits without modifying the document | ||
include::common-options.asciidoc[] | ||
|====== | ||
|
||
[source,js] | ||
-------------------------------------------------- | ||
{ | ||
"html_strip": { | ||
"field": "foo" | ||
} | ||
} | ||
-------------------------------------------------- | ||
// NOTCONSOLE |
76 changes: 76 additions & 0 deletions
76
modules/ingest-common/src/main/java/org/elasticsearch/ingest/common/HtmlStripProcessor.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
/* | ||
* Licensed to Elasticsearch under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.elasticsearch.ingest.common; | ||
|
||
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter; | ||
import org.elasticsearch.ElasticsearchException; | ||
|
||
import java.io.IOException; | ||
import java.io.StringReader; | ||
import java.util.Map; | ||
|
||
public final class HtmlStripProcessor extends AbstractStringProcessor<String> { | ||
|
||
public static final String TYPE = "html_strip"; | ||
|
||
HtmlStripProcessor(String tag, String field, boolean ignoreMissing, String targetField) { | ||
super(tag, field, ignoreMissing, targetField); | ||
} | ||
|
||
@Override | ||
protected String process(String value) { | ||
// shortcut, no need to create a string builder and go through each char | ||
if (value.contains("<") == false || value.contains(">") == false) { | ||
return value; | ||
} | ||
|
||
HTMLStripCharFilter filter = new HTMLStripCharFilter(new StringReader(value)); | ||
|
||
StringBuilder builder = new StringBuilder(); | ||
int ch; | ||
try { | ||
while ((ch = filter.read()) != -1) { | ||
builder.append((char)ch); | ||
} | ||
} catch (IOException e) { | ||
throw new ElasticsearchException(e); | ||
} | ||
|
||
return builder.toString(); | ||
} | ||
|
||
@Override | ||
public String getType() { | ||
return TYPE; | ||
} | ||
|
||
public static final class Factory extends AbstractStringProcessor.Factory { | ||
|
||
public Factory() { | ||
super(TYPE); | ||
} | ||
|
||
@Override | ||
protected HtmlStripProcessor newProcessor(String tag, Map<String, Object> config, String field, | ||
boolean ignoreMissing, String targetField) { | ||
return new HtmlStripProcessor(tag, field, ignoreMissing, targetField); | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
27 changes: 27 additions & 0 deletions
27
...-common/src/test/java/org/elasticsearch/ingest/common/HtmlStripProcessorFactoryTests.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
/* | ||
* Licensed to Elasticsearch under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.elasticsearch.ingest.common; | ||
|
||
public class HtmlStripProcessorFactoryTests extends AbstractStringProcessorFactoryTestCase { | ||
@Override | ||
protected AbstractStringProcessor.Factory newFactory() { | ||
return new HtmlStripProcessor.Factory(); | ||
} | ||
} |
38 changes: 38 additions & 0 deletions
38
.../ingest-common/src/test/java/org/elasticsearch/ingest/common/HtmlStripProcessorTests.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
/* | ||
* Licensed to Elasticsearch under one or more contributor | ||
* license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright | ||
* ownership. Elasticsearch licenses this file to you under | ||
* the Apache License, Version 2.0 (the "License"); you may | ||
* not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, | ||
* software distributed under the License is distributed on an | ||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
* KIND, either express or implied. See the License for the | ||
* specific language governing permissions and limitations | ||
* under the License. | ||
*/ | ||
|
||
package org.elasticsearch.ingest.common; | ||
|
||
public class HtmlStripProcessorTests extends AbstractStringProcessorTestCase<String> { | ||
|
||
@Override | ||
protected AbstractStringProcessor<String> newProcessor(String field, boolean ignoreMissing, String targetField) { | ||
return new HtmlStripProcessor(randomAlphaOfLength(10), field, ignoreMissing, targetField); | ||
} | ||
|
||
@Override | ||
protected String modifyInput(String input) { | ||
return "<p><b>test</b>" + input + "<p><b>test</b>"; | ||
} | ||
|
||
@Override | ||
protected String expectedResult(String input) { | ||
return "\ntest" + input + "\ntest"; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters