diff --git a/docs/plugins/ingest-attachment.asciidoc b/docs/plugins/ingest-attachment.asciidoc index 50711023f93ba..364e0cb9b5564 100644 --- a/docs/plugins/ingest-attachment.asciidoc +++ b/docs/plugins/ingest-attachment.asciidoc @@ -98,6 +98,40 @@ The document's `attachment` object contains extracted properties for the file: NOTE: Keeping the binary as a field within the document might consume a lot of resources. It is highly recommended to remove that field from the document. Set `remove_binary` to `true` to automatically remove the field. +[[ingest-attachment-fields]] +==== Exported fields + +The fields which might be extracted from a document are: + +* `content`, +* `title`, +* `author`, +* `keywords`, +* `date`, +* `content_type`, +* `content_length`, +* `language`, +* `modified`, +* `format`, +* `identifier`, +* `contributor`, +* `coverage`, +* `modifier`, +* `creator_tool`, +* `publisher`, +* `relation`, +* `rights`, +* `source`, +* `type`, +* `description`, +* `print_date`, +* `metadata_date`, +* `latitude`, +* `longitude`, +* `altitude`, +* `rating`, +* `comments` + To extract only certain `attachment` fields, specify the `properties` array: [source,console] diff --git a/plugins/ingest-attachment/build.gradle b/plugins/ingest-attachment/build.gradle index 94cbc91b49336..87afeae8fc4de 100644 --- a/plugins/ingest-attachment/build.gradle +++ b/plugins/ingest-attachment/build.gradle @@ -86,6 +86,12 @@ tasks.named("forbiddenPatterns").configure { exclude '**/text-cjk-*.txt' } +tasks.named("yamlRestTestV7CompatTransform").configure { task -> + // 2 new tika metadata fields are returned in v8 + task.replaceValueInLength("_source.attachment", 8, "Test ingest attachment processor with .doc file") + task.replaceValueInLength("_source.attachment", 8, "Test ingest attachment processor with .docx file") +} + tasks.named("thirdPartyAudit").configure { ignoreMissingClasses() } diff --git a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java index 44fc9a77ffd32..916b7502b1cdf 100644 --- a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java +++ b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java @@ -11,6 +11,7 @@ import org.apache.tika.exception.ZeroByteFileException; import org.apache.tika.language.LanguageIdentifier; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; import org.apache.tika.metadata.TikaCoreProperties; import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.common.Strings; @@ -132,40 +133,11 @@ public IngestDocument execute(IngestDocument ingestDocument) { additionalFields.put(Property.LANGUAGE.toLowerCase(), language); } - if (properties.contains(Property.DATE)) { - String createdDate = metadata.get(TikaCoreProperties.CREATED); - if (createdDate != null) { - additionalFields.put(Property.DATE.toLowerCase(), createdDate); - } - } - - if (properties.contains(Property.TITLE)) { - String title = metadata.get(TikaCoreProperties.TITLE); - if (Strings.hasLength(title)) { - additionalFields.put(Property.TITLE.toLowerCase(), title); - } - } - - if (properties.contains(Property.AUTHOR)) { - String author = metadata.get("Author"); - if (Strings.hasLength(author)) { - additionalFields.put(Property.AUTHOR.toLowerCase(), author); - } - } - - if (properties.contains(Property.KEYWORDS)) { - String keywords = metadata.get("Keywords"); - if (Strings.hasLength(keywords)) { - additionalFields.put(Property.KEYWORDS.toLowerCase(), keywords); - } - } - - if (properties.contains(Property.CONTENT_TYPE)) { - String contentType = metadata.get(Metadata.CONTENT_TYPE); - if (Strings.hasLength(contentType)) { - additionalFields.put(Property.CONTENT_TYPE.toLowerCase(), contentType); - } - } + addAdditionalField(additionalFields, Property.DATE, metadata.get(TikaCoreProperties.CREATED)); + addAdditionalField(additionalFields, Property.TITLE, metadata.get(TikaCoreProperties.TITLE)); + addAdditionalField(additionalFields, Property.AUTHOR, metadata.get("Author")); + addAdditionalField(additionalFields, Property.KEYWORDS, metadata.get("Keywords")); + addAdditionalField(additionalFields, Property.CONTENT_TYPE, metadata.get(Metadata.CONTENT_TYPE)); if (properties.contains(Property.CONTENT_LENGTH)) { String contentLength = metadata.get(Metadata.CONTENT_LENGTH); @@ -178,6 +150,30 @@ public IngestDocument execute(IngestDocument ingestDocument) { additionalFields.put(Property.CONTENT_LENGTH.toLowerCase(), length); } + addAdditionalField(additionalFields, Property.AUTHOR, metadata.get(TikaCoreProperties.CREATOR)); + addAdditionalField(additionalFields, Property.KEYWORDS, metadata.get(Office.KEYWORDS)); + + addAdditionalField(additionalFields, Property.MODIFIED, metadata.get(TikaCoreProperties.MODIFIED)); + addAdditionalField(additionalFields, Property.FORMAT, metadata.get(TikaCoreProperties.FORMAT)); + addAdditionalField(additionalFields, Property.IDENTIFIER, metadata.get(TikaCoreProperties.IDENTIFIER)); + addAdditionalField(additionalFields, Property.CONTRIBUTOR, metadata.get(TikaCoreProperties.CONTRIBUTOR)); + addAdditionalField(additionalFields, Property.COVERAGE, metadata.get(TikaCoreProperties.COVERAGE)); + addAdditionalField(additionalFields, Property.MODIFIER, metadata.get(TikaCoreProperties.MODIFIER)); + addAdditionalField(additionalFields, Property.CREATOR_TOOL, metadata.get(TikaCoreProperties.CREATOR_TOOL)); + addAdditionalField(additionalFields, Property.PUBLISHER, metadata.get(TikaCoreProperties.PUBLISHER)); + addAdditionalField(additionalFields, Property.RELATION, metadata.get(TikaCoreProperties.RELATION)); + addAdditionalField(additionalFields, Property.RIGHTS, metadata.get(TikaCoreProperties.RIGHTS)); + addAdditionalField(additionalFields, Property.SOURCE, metadata.get(TikaCoreProperties.SOURCE)); + addAdditionalField(additionalFields, Property.TYPE, metadata.get(TikaCoreProperties.TYPE)); + addAdditionalField(additionalFields, Property.DESCRIPTION, metadata.get(TikaCoreProperties.DESCRIPTION)); + addAdditionalField(additionalFields, Property.PRINT_DATE, metadata.get(TikaCoreProperties.PRINT_DATE)); + addAdditionalField(additionalFields, Property.METADATA_DATE, metadata.get(TikaCoreProperties.METADATA_DATE)); + addAdditionalField(additionalFields, Property.LATITUDE, metadata.get(TikaCoreProperties.LATITUDE)); + addAdditionalField(additionalFields, Property.LONGITUDE, metadata.get(TikaCoreProperties.LONGITUDE)); + addAdditionalField(additionalFields, Property.ALTITUDE, metadata.get(TikaCoreProperties.ALTITUDE)); + addAdditionalField(additionalFields, Property.RATING, metadata.get(TikaCoreProperties.RATING)); + addAdditionalField(additionalFields, Property.COMMENTS, metadata.get(TikaCoreProperties.COMMENTS)); + ingestDocument.setFieldValue(targetField, additionalFields); if (removeBinary) { @@ -186,6 +182,18 @@ public IngestDocument execute(IngestDocument ingestDocument) { return ingestDocument; } + /** + * Add an additional field if not null or empty + * @param additionalFields additional fields + * @param property property to add + * @param value value to add + */ + private void addAdditionalField(Map additionalFields, Property property, String value) { + if (properties.contains(property) && Strings.hasLength(value)) { + additionalFields.put(property.toLowerCase(), value); + } + } + @Override public String getType() { return TYPE; @@ -270,7 +278,27 @@ enum Property { DATE, CONTENT_TYPE, CONTENT_LENGTH, - LANGUAGE; + LANGUAGE, + MODIFIED, + FORMAT, + IDENTIFIER, + CONTRIBUTOR, + COVERAGE, + MODIFIER, + CREATOR_TOOL, + PUBLISHER, + RELATION, + RIGHTS, + SOURCE, + TYPE, + DESCRIPTION, + PRINT_DATE, + METADATA_DATE, + LATITUDE, + LONGITUDE, + ALTITUDE, + RATING, + COMMENTS; public static Property parse(String value) { return valueOf(value.toUpperCase(Locale.ROOT)); diff --git a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java index 1758396819822..1fead50a600e7 100644 --- a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java +++ b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java @@ -68,9 +68,20 @@ public void testEnglishTextDocument() throws Exception { } public void testHtmlDocumentWithRandomFields() throws Exception { - // date is not present in the html doc + // some metadata are not present in the html doc + // "date", "metadata_date", "comments", "modified", "modifier", "print_date", "relation", "creator_tool", "altitude" + // "identifier", "longitude", "publisher", "description", "latitude", "format", "source", "coverage" + // "rating", "type", "contributor", "rights" + // we are only trying with content, title, author, keywords, content_type and content_length. ArrayList fieldsList = new ArrayList<>( - EnumSet.complementOf(EnumSet.of(AttachmentProcessor.Property.DATE)) + EnumSet.of( + AttachmentProcessor.Property.CONTENT, + AttachmentProcessor.Property.TITLE, + AttachmentProcessor.Property.AUTHOR, + AttachmentProcessor.Property.KEYWORDS, + AttachmentProcessor.Property.CONTENT_TYPE, + AttachmentProcessor.Property.CONTENT_LENGTH + ) ); Set selectedProperties = new HashSet<>(); @@ -128,7 +139,20 @@ public void testEmptyTextDocument() throws Exception { public void testWordDocument() throws Exception { Map attachmentData = parseDocument("issue-104.docx", processor); - assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length")); + assertThat( + attachmentData.keySet(), + containsInAnyOrder( + "content", + "language", + "date", + "author", + "content_type", + "content_length", + "modifier", + "modified", + "publisher" + ) + ); assertThat(attachmentData.get("content"), is(notNullValue())); assertThat(attachmentData.get("language"), is("en")); assertThat(attachmentData.get("date"), is("2012-10-12T11:17:00Z")); @@ -138,12 +162,28 @@ public void testWordDocument() throws Exception { attachmentData.get("content_type").toString(), is("application/vnd.openxmlformats-officedocument.wordprocessingml.document") ); + assertThat(attachmentData.get("modifier").toString(), is("Luka Lampret")); + assertThat(attachmentData.get("modified").toString(), is("2015-02-20T11:36:00Z")); + assertThat(attachmentData.get("publisher").toString(), is("JDI")); } public void testWordDocumentWithVisioSchema() throws Exception { Map attachmentData = parseDocument("issue-22077.docx", processor); - assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length")); + assertThat( + attachmentData.keySet(), + containsInAnyOrder( + "content", + "language", + "date", + "author", + "content_type", + "content_length", + "modifier", + "modified", + "print_date" + ) + ); assertThat(attachmentData.get("content").toString(), containsString("Table of Contents")); assertThat(attachmentData.get("language"), is("en")); assertThat(attachmentData.get("date"), is("2015-01-06T18:07:00Z")); @@ -153,18 +193,37 @@ public void testWordDocumentWithVisioSchema() throws Exception { attachmentData.get("content_type").toString(), is("application/vnd.openxmlformats-officedocument.wordprocessingml.document") ); + assertThat(attachmentData.get("modifier").toString(), is("Chris Dufour")); + assertThat(attachmentData.get("modified").toString(), is("2016-12-04T16:58:00Z")); + assertThat(attachmentData.get("print_date").toString(), is("2015-01-05T19:12:00Z")); } public void testLegacyWordDocumentWithVisioSchema() throws Exception { Map attachmentData = parseDocument("issue-22077.doc", processor); - assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length")); + assertThat( + attachmentData.keySet(), + containsInAnyOrder( + "content", + "language", + "date", + "author", + "content_type", + "content_length", + "modifier", + "modified", + "print_date" + ) + ); assertThat(attachmentData.get("content").toString(), containsString("Table of Contents")); assertThat(attachmentData.get("language"), is("en")); assertThat(attachmentData.get("date"), is("2016-12-16T15:04:00Z")); assertThat(attachmentData.get("author"), is(notNullValue())); assertThat(attachmentData.get("content_length"), is(notNullValue())); assertThat(attachmentData.get("content_type").toString(), is("application/msword")); + assertThat(attachmentData.get("modifier").toString(), is("David Pilato")); + assertThat(attachmentData.get("modified").toString(), is("2016-12-16T15:04:00Z")); + assertThat(attachmentData.get("print_date").toString(), is("2015-01-05T19:12:00Z")); } public void testPdf() throws Exception { @@ -217,9 +276,26 @@ public void testEpubDocument() throws Exception { assertThat( attachmentData.keySet(), - containsInAnyOrder("language", "content", "author", "title", "content_type", "content_length", "date", "keywords") + containsInAnyOrder( + "language", + "content", + "author", + "title", + "content_type", + "content_length", + "date", + "keywords", + "identifier", + "contributor", + "publisher", + "description" + ) ); assertThat(attachmentData.get("content_type").toString(), containsString("application/epub+zip")); + assertThat(attachmentData.get("identifier").toString(), is("1234567890")); + assertThat(attachmentData.get("contributor").toString(), is("no-one")); + assertThat(attachmentData.get("publisher").toString(), is("Apache")); + assertThat(attachmentData.get("description").toString(), is("This is an ePub test publication for Tika.")); } // no real detection, just rudimentary diff --git a/plugins/ingest-attachment/src/yamlRestTest/resources/rest-api-spec/test/ingest_attachment/30_files_supported.yml b/plugins/ingest-attachment/src/yamlRestTest/resources/rest-api-spec/test/ingest_attachment/30_files_supported.yml index 543f394782da9..324776bc20f87 100644 --- a/plugins/ingest-attachment/src/yamlRestTest/resources/rest-api-spec/test/ingest_attachment/30_files_supported.yml +++ b/plugins/ingest-attachment/src/yamlRestTest/resources/rest-api-spec/test/ingest_attachment/30_files_supported.yml @@ -1,5 +1,8 @@ --- "Test ingest attachment processor with .doc file": + - skip: + version: " - 7.99.99" + reason: "new fields added in 8.0.0" - do: ingest.put_pipeline: id: "my_pipeline" @@ -27,17 +30,22 @@ get: index: test id: 1 - - length: { _source.attachment: 6 } + - length: { _source.attachment: 8 } - match: { _source.attachment.content: "Test elasticsearch" } - match: { _source.attachment.language: "et" } - match: { _source.attachment.author: "David Pilato" } - match: { _source.attachment.date: "2016-03-10T08:25:00Z" } - match: { _source.attachment.content_length: 19 } - match: { _source.attachment.content_type: "application/msword" } + - match: { _source.attachment.modifier: "David Pilato" } + - match: { _source.attachment.modified: "2016-03-10T08:25:00Z" } --- "Test ingest attachment processor with .docx file": + - skip: + version: " - 7.99.99" + reason: "new fields added in 8.0.0" - do: ingest.put_pipeline: id: "my_pipeline" @@ -65,10 +73,12 @@ get: index: test id: 1 - - length: { _source.attachment: 6 } + - length: { _source.attachment: 8 } - match: { _source.attachment.content: "Test elasticsearch" } - match: { _source.attachment.language: "et" } - match: { _source.attachment.author: "David Pilato" } - match: { _source.attachment.date: "2016-03-10T08:24:00Z" } - match: { _source.attachment.content_length: 19 } - match: { _source.attachment.content_type: "application/vnd.openxmlformats-officedocument.wordprocessingml.document" } + - match: { _source.attachment.modifier: "David Pilato" } + - match: { _source.attachment.modified: "2016-03-10T08:24:00Z" }