Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions docs/plugins/ingest-attachment.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,40 @@ The document's `attachment` object contains extracted properties for the file:
NOTE: Keeping the binary as a field within the document might consume a lot of resources. It is highly recommended
to remove that field from the document. Set `remove_binary` to `true` to automatically remove the field.

[[ingest-attachment-fields]]
==== Exported fields

The fields which might be extracted from a document are:

* `content`,
* `title`,
* `author`,
* `keywords`,
* `date`,
* `content_type`,
* `content_length`,
* `language`,
* `modified`,
* `format`,
* `identifier`,
* `contributor`,
* `coverage`,
* `modifier`,
* `creator_tool`,
* `publisher`,
* `relation`,
* `rights`,
* `source`,
* `type`,
* `description`,
* `print_date`,
* `metadata_date`,
* `latitude`,
* `longitude`,
* `altitude`,
* `rating`,
* `comments`

To extract only certain `attachment` fields, specify the `properties` array:

[source,console]
Expand Down
6 changes: 6 additions & 0 deletions plugins/ingest-attachment/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,12 @@ tasks.named("forbiddenPatterns").configure {
exclude '**/text-cjk-*.txt'
}

tasks.named("yamlRestTestV7CompatTransform").configure { task ->
// 2 new tika metadata fields are returned in v8
task.replaceValueInLength("_source.attachment", 8, "Test ingest attachment processor with .doc file")
task.replaceValueInLength("_source.attachment", 8, "Test ingest attachment processor with .docx file")
}

tasks.named("thirdPartyAudit").configure {
ignoreMissingClasses()
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.language.LanguageIdentifier;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.common.Strings;
Expand Down Expand Up @@ -132,40 +133,11 @@ public IngestDocument execute(IngestDocument ingestDocument) {
additionalFields.put(Property.LANGUAGE.toLowerCase(), language);
}

if (properties.contains(Property.DATE)) {
String createdDate = metadata.get(TikaCoreProperties.CREATED);
if (createdDate != null) {
additionalFields.put(Property.DATE.toLowerCase(), createdDate);
}
}

if (properties.contains(Property.TITLE)) {
String title = metadata.get(TikaCoreProperties.TITLE);
if (Strings.hasLength(title)) {
additionalFields.put(Property.TITLE.toLowerCase(), title);
}
}

if (properties.contains(Property.AUTHOR)) {
String author = metadata.get("Author");
if (Strings.hasLength(author)) {
additionalFields.put(Property.AUTHOR.toLowerCase(), author);
}
}

if (properties.contains(Property.KEYWORDS)) {
String keywords = metadata.get("Keywords");
if (Strings.hasLength(keywords)) {
additionalFields.put(Property.KEYWORDS.toLowerCase(), keywords);
}
}

if (properties.contains(Property.CONTENT_TYPE)) {
String contentType = metadata.get(Metadata.CONTENT_TYPE);
if (Strings.hasLength(contentType)) {
additionalFields.put(Property.CONTENT_TYPE.toLowerCase(), contentType);
}
}
addAdditionalField(additionalFields, Property.DATE, metadata.get(TikaCoreProperties.CREATED));
addAdditionalField(additionalFields, Property.TITLE, metadata.get(TikaCoreProperties.TITLE));
addAdditionalField(additionalFields, Property.AUTHOR, metadata.get("Author"));
addAdditionalField(additionalFields, Property.KEYWORDS, metadata.get("Keywords"));
addAdditionalField(additionalFields, Property.CONTENT_TYPE, metadata.get(Metadata.CONTENT_TYPE));

if (properties.contains(Property.CONTENT_LENGTH)) {
String contentLength = metadata.get(Metadata.CONTENT_LENGTH);
Expand All @@ -178,6 +150,30 @@ public IngestDocument execute(IngestDocument ingestDocument) {
additionalFields.put(Property.CONTENT_LENGTH.toLowerCase(), length);
}

addAdditionalField(additionalFields, Property.AUTHOR, metadata.get(TikaCoreProperties.CREATOR));
addAdditionalField(additionalFields, Property.KEYWORDS, metadata.get(Office.KEYWORDS));

addAdditionalField(additionalFields, Property.MODIFIED, metadata.get(TikaCoreProperties.MODIFIED));
addAdditionalField(additionalFields, Property.FORMAT, metadata.get(TikaCoreProperties.FORMAT));
addAdditionalField(additionalFields, Property.IDENTIFIER, metadata.get(TikaCoreProperties.IDENTIFIER));
addAdditionalField(additionalFields, Property.CONTRIBUTOR, metadata.get(TikaCoreProperties.CONTRIBUTOR));
addAdditionalField(additionalFields, Property.COVERAGE, metadata.get(TikaCoreProperties.COVERAGE));
addAdditionalField(additionalFields, Property.MODIFIER, metadata.get(TikaCoreProperties.MODIFIER));
addAdditionalField(additionalFields, Property.CREATOR_TOOL, metadata.get(TikaCoreProperties.CREATOR_TOOL));
addAdditionalField(additionalFields, Property.PUBLISHER, metadata.get(TikaCoreProperties.PUBLISHER));
addAdditionalField(additionalFields, Property.RELATION, metadata.get(TikaCoreProperties.RELATION));
addAdditionalField(additionalFields, Property.RIGHTS, metadata.get(TikaCoreProperties.RIGHTS));
addAdditionalField(additionalFields, Property.SOURCE, metadata.get(TikaCoreProperties.SOURCE));
addAdditionalField(additionalFields, Property.TYPE, metadata.get(TikaCoreProperties.TYPE));
addAdditionalField(additionalFields, Property.DESCRIPTION, metadata.get(TikaCoreProperties.DESCRIPTION));
addAdditionalField(additionalFields, Property.PRINT_DATE, metadata.get(TikaCoreProperties.PRINT_DATE));
addAdditionalField(additionalFields, Property.METADATA_DATE, metadata.get(TikaCoreProperties.METADATA_DATE));
addAdditionalField(additionalFields, Property.LATITUDE, metadata.get(TikaCoreProperties.LATITUDE));
addAdditionalField(additionalFields, Property.LONGITUDE, metadata.get(TikaCoreProperties.LONGITUDE));
addAdditionalField(additionalFields, Property.ALTITUDE, metadata.get(TikaCoreProperties.ALTITUDE));
addAdditionalField(additionalFields, Property.RATING, metadata.get(TikaCoreProperties.RATING));
addAdditionalField(additionalFields, Property.COMMENTS, metadata.get(TikaCoreProperties.COMMENTS));

ingestDocument.setFieldValue(targetField, additionalFields);

if (removeBinary) {
Expand All @@ -186,6 +182,18 @@ public IngestDocument execute(IngestDocument ingestDocument) {
return ingestDocument;
}

/**
 * Adds an extracted metadata value to the additional-fields map under the
 * property's lowercase name, but only when the property was requested via the
 * processor's {@code properties} setting and the value is non-null and non-empty.
 *
 * @param additionalFields map of extracted attachment fields being built
 * @param property the attachment {@link Property} under which to store the value
 * @param value the extracted metadata value; ignored when null or empty
 */
private void addAdditionalField(Map<String, Object> additionalFields, Property property, String value) {
    // Unused generic type parameter <T> removed: the method only ever handles String values.
    if (properties.contains(property) && Strings.hasLength(value)) {
        additionalFields.put(property.toLowerCase(), value);
    }
}

@Override
public String getType() {
return TYPE;
Expand Down Expand Up @@ -270,7 +278,27 @@ enum Property {
DATE,
CONTENT_TYPE,
CONTENT_LENGTH,
LANGUAGE;
LANGUAGE,
MODIFIED,
FORMAT,
IDENTIFIER,
CONTRIBUTOR,
COVERAGE,
MODIFIER,
CREATOR_TOOL,
PUBLISHER,
RELATION,
RIGHTS,
SOURCE,
TYPE,
DESCRIPTION,
PRINT_DATE,
METADATA_DATE,
LATITUDE,
LONGITUDE,
ALTITUDE,
RATING,
COMMENTS;

public static Property parse(String value) {
return valueOf(value.toUpperCase(Locale.ROOT));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,20 @@ public void testEnglishTextDocument() throws Exception {
}

public void testHtmlDocumentWithRandomFields() throws Exception {
// date is not present in the html doc
// some metadata fields are not present in the html doc:
// "date", "metadata_date", "comments", "modified", "modifier", "print_date", "relation", "creator_tool", "altitude"
// "identifier", "longitude", "publisher", "description", "latitude", "format", "source", "coverage"
// "rating", "type", "contributor", "rights"
// so we only test with content, title, author, keywords, content_type and content_length.
ArrayList<AttachmentProcessor.Property> fieldsList = new ArrayList<>(
EnumSet.complementOf(EnumSet.of(AttachmentProcessor.Property.DATE))
EnumSet.of(
AttachmentProcessor.Property.CONTENT,
AttachmentProcessor.Property.TITLE,
AttachmentProcessor.Property.AUTHOR,
AttachmentProcessor.Property.KEYWORDS,
AttachmentProcessor.Property.CONTENT_TYPE,
AttachmentProcessor.Property.CONTENT_LENGTH
)
);
Set<AttachmentProcessor.Property> selectedProperties = new HashSet<>();

Expand Down Expand Up @@ -128,7 +139,20 @@ public void testEmptyTextDocument() throws Exception {
public void testWordDocument() throws Exception {
Map<String, Object> attachmentData = parseDocument("issue-104.docx", processor);

assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length"));
assertThat(
attachmentData.keySet(),
containsInAnyOrder(
"content",
"language",
"date",
"author",
"content_type",
"content_length",
"modifier",
"modified",
"publisher"
)
);
assertThat(attachmentData.get("content"), is(notNullValue()));
assertThat(attachmentData.get("language"), is("en"));
assertThat(attachmentData.get("date"), is("2012-10-12T11:17:00Z"));
Expand All @@ -138,12 +162,28 @@ public void testWordDocument() throws Exception {
attachmentData.get("content_type").toString(),
is("application/vnd.openxmlformats-officedocument.wordprocessingml.document")
);
assertThat(attachmentData.get("modifier").toString(), is("Luka Lampret"));
assertThat(attachmentData.get("modified").toString(), is("2015-02-20T11:36:00Z"));
assertThat(attachmentData.get("publisher").toString(), is("JDI"));
}

public void testWordDocumentWithVisioSchema() throws Exception {
Map<String, Object> attachmentData = parseDocument("issue-22077.docx", processor);

assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length"));
assertThat(
attachmentData.keySet(),
containsInAnyOrder(
"content",
"language",
"date",
"author",
"content_type",
"content_length",
"modifier",
"modified",
"print_date"
)
);
assertThat(attachmentData.get("content").toString(), containsString("Table of Contents"));
assertThat(attachmentData.get("language"), is("en"));
assertThat(attachmentData.get("date"), is("2015-01-06T18:07:00Z"));
Expand All @@ -153,18 +193,37 @@ public void testWordDocumentWithVisioSchema() throws Exception {
attachmentData.get("content_type").toString(),
is("application/vnd.openxmlformats-officedocument.wordprocessingml.document")
);
assertThat(attachmentData.get("modifier").toString(), is("Chris Dufour"));
assertThat(attachmentData.get("modified").toString(), is("2016-12-04T16:58:00Z"));
assertThat(attachmentData.get("print_date").toString(), is("2015-01-05T19:12:00Z"));
}

public void testLegacyWordDocumentWithVisioSchema() throws Exception {
Map<String, Object> attachmentData = parseDocument("issue-22077.doc", processor);

assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length"));
assertThat(
attachmentData.keySet(),
containsInAnyOrder(
"content",
"language",
"date",
"author",
"content_type",
"content_length",
"modifier",
"modified",
"print_date"
)
);
assertThat(attachmentData.get("content").toString(), containsString("Table of Contents"));
assertThat(attachmentData.get("language"), is("en"));
assertThat(attachmentData.get("date"), is("2016-12-16T15:04:00Z"));
assertThat(attachmentData.get("author"), is(notNullValue()));
assertThat(attachmentData.get("content_length"), is(notNullValue()));
assertThat(attachmentData.get("content_type").toString(), is("application/msword"));
assertThat(attachmentData.get("modifier").toString(), is("David Pilato"));
assertThat(attachmentData.get("modified").toString(), is("2016-12-16T15:04:00Z"));
assertThat(attachmentData.get("print_date").toString(), is("2015-01-05T19:12:00Z"));
}

public void testPdf() throws Exception {
Expand Down Expand Up @@ -217,9 +276,26 @@ public void testEpubDocument() throws Exception {

assertThat(
attachmentData.keySet(),
containsInAnyOrder("language", "content", "author", "title", "content_type", "content_length", "date", "keywords")
containsInAnyOrder(
"language",
"content",
"author",
"title",
"content_type",
"content_length",
"date",
"keywords",
"identifier",
"contributor",
"publisher",
"description"
)
);
assertThat(attachmentData.get("content_type").toString(), containsString("application/epub+zip"));
assertThat(attachmentData.get("identifier").toString(), is("1234567890"));
assertThat(attachmentData.get("contributor").toString(), is("no-one"));
assertThat(attachmentData.get("publisher").toString(), is("Apache"));
assertThat(attachmentData.get("description").toString(), is("This is an ePub test publication for Tika."));
}

// no real detection, just rudimentary
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
---
"Test ingest attachment processor with .doc file":
- skip:
version: " - 7.99.99"
reason: "new fields added in 8.0.0"
- do:
ingest.put_pipeline:
id: "my_pipeline"
Expand Down Expand Up @@ -27,17 +30,22 @@
get:
index: test
id: 1
- length: { _source.attachment: 6 }
- length: { _source.attachment: 8 }
- match: { _source.attachment.content: "Test elasticsearch" }
- match: { _source.attachment.language: "et" }
- match: { _source.attachment.author: "David Pilato" }
- match: { _source.attachment.date: "2016-03-10T08:25:00Z" }
- match: { _source.attachment.content_length: 19 }
- match: { _source.attachment.content_type: "application/msword" }
- match: { _source.attachment.modifier: "David Pilato" }
- match: { _source.attachment.modified: "2016-03-10T08:25:00Z" }


---
"Test ingest attachment processor with .docx file":
- skip:
version: " - 7.99.99"
reason: "new fields added in 8.0.0"
- do:
ingest.put_pipeline:
id: "my_pipeline"
Expand Down Expand Up @@ -65,10 +73,12 @@
get:
index: test
id: 1
- length: { _source.attachment: 6 }
- length: { _source.attachment: 8 }
- match: { _source.attachment.content: "Test elasticsearch" }
- match: { _source.attachment.language: "et" }
- match: { _source.attachment.author: "David Pilato" }
- match: { _source.attachment.date: "2016-03-10T08:24:00Z" }
- match: { _source.attachment.content_length: 19 }
- match: { _source.attachment.content_type: "application/vnd.openxmlformats-officedocument.wordprocessingml.document" }
- match: { _source.attachment.modifier: "David Pilato" }
- match: { _source.attachment.modified: "2016-03-10T08:24:00Z" }