From 5cbed684e4e129e560ccedff530c66217d0ef523 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Wed, 6 Oct 2021 14:35:12 +0200 Subject: [PATCH 1/7] Extract more standard metadata from binary files Until now, we have extracted only a small number of fields from the binary files sent to the ingest attachment plugin: * `content`, * `title`, * `author`, * `keywords`, * `date`, * `content_type`, * `content_length`, * `language`. Tika defines many more standard properties which can be extracted: * `modified`, * `format`, * `identifier`, * `contributor`, * `coverage`, * `modifier`, * `creator_tool`, * `publisher`, * `relation`, * `rights`, * `source`, * `type`, * `description`, * `print_date`, * `metadata_date`, * `latitude`, * `longitude`, * `altitude`, * `rating`, * `comments` This commit exposes those new fields. Related to #22339. --- docs/plugins/ingest-attachment.asciidoc | 34 +++++++ .../attachment/AttachmentProcessor.java | 97 ++++++++++++------- .../attachment/AttachmentProcessorTests.java | 34 +++++-- 3 files changed, 124 insertions(+), 41 deletions(-) diff --git a/docs/plugins/ingest-attachment.asciidoc b/docs/plugins/ingest-attachment.asciidoc index 50711023f93ba..364e0cb9b5564 100644 --- a/docs/plugins/ingest-attachment.asciidoc +++ b/docs/plugins/ingest-attachment.asciidoc @@ -98,6 +98,40 @@ The document's `attachment` object contains extracted properties for the file: NOTE: Keeping the binary as a field within the document might consume a lot of resources. It is highly recommended to remove that field from the document. Set `remove_binary` to `true` to automatically remove the field.
+[[ingest-attachment-fields]] +==== Exported fields + +The fields which might be extracted from a document are: + +* `content`, +* `title`, +* `author`, +* `keywords`, +* `date`, +* `content_type`, +* `content_length`, +* `language`, +* `modified`, +* `format`, +* `identifier`, +* `contributor`, +* `coverage`, +* `modifier`, +* `creator_tool`, +* `publisher`, +* `relation`, +* `rights`, +* `source`, +* `type`, +* `description`, +* `print_date`, +* `metadata_date`, +* `latitude`, +* `longitude`, +* `altitude`, +* `rating`, +* `comments` + To extract only certain `attachment` fields, specify the `properties` array: [source,console] diff --git a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java index 22857331f896c..a8c95a4a94ea9 100644 --- a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java +++ b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java @@ -11,6 +11,7 @@ import org.apache.tika.exception.ZeroByteFileException; import org.apache.tika.language.LanguageIdentifier; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.Office; import org.apache.tika.metadata.TikaCoreProperties; import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.common.Strings; @@ -123,40 +124,12 @@ public IngestDocument execute(IngestDocument ingestDocument) { additionalFields.put(Property.LANGUAGE.toLowerCase(), language); } - if (properties.contains(Property.DATE)) { - String createdDate = metadata.get(TikaCoreProperties.CREATED); - if (createdDate != null) { - additionalFields.put(Property.DATE.toLowerCase(), createdDate); - } - } + addAdditionalField(additionalFields, Property.DATE, metadata.get(TikaCoreProperties.CREATED)); + addAdditionalField(additionalFields, Property.TITLE, 
metadata.get(TikaCoreProperties.TITLE)); + addAdditionalField(additionalFields, Property.AUTHOR, metadata.get("Author")); + addAdditionalField(additionalFields, Property.KEYWORDS, metadata.get("Keywords")); + addAdditionalField(additionalFields, Property.CONTENT_TYPE, metadata.get(Metadata.CONTENT_TYPE)); - if (properties.contains(Property.TITLE)) { - String title = metadata.get(TikaCoreProperties.TITLE); - if (Strings.hasLength(title)) { - additionalFields.put(Property.TITLE.toLowerCase(), title); - } - } - - if (properties.contains(Property.AUTHOR)) { - String author = metadata.get("Author"); - if (Strings.hasLength(author)) { - additionalFields.put(Property.AUTHOR.toLowerCase(), author); - } - } - - if (properties.contains(Property.KEYWORDS)) { - String keywords = metadata.get("Keywords"); - if (Strings.hasLength(keywords)) { - additionalFields.put(Property.KEYWORDS.toLowerCase(), keywords); - } - } - - if (properties.contains(Property.CONTENT_TYPE)) { - String contentType = metadata.get(Metadata.CONTENT_TYPE); - if (Strings.hasLength(contentType)) { - additionalFields.put(Property.CONTENT_TYPE.toLowerCase(), contentType); - } - } if (properties.contains(Property.CONTENT_LENGTH)) { String contentLength = metadata.get(Metadata.CONTENT_LENGTH); @@ -169,6 +142,30 @@ public IngestDocument execute(IngestDocument ingestDocument) { additionalFields.put(Property.CONTENT_LENGTH.toLowerCase(), length); } + addAdditionalField(additionalFields, Property.AUTHOR, metadata.get(TikaCoreProperties.CREATOR)); + addAdditionalField(additionalFields, Property.KEYWORDS, metadata.get(Office.KEYWORDS)); + + addAdditionalField(additionalFields, Property.MODIFIED, metadata.get(TikaCoreProperties.MODIFIED)); + addAdditionalField(additionalFields, Property.FORMAT, metadata.get(TikaCoreProperties.FORMAT)); + addAdditionalField(additionalFields, Property.IDENTIFIER, metadata.get(TikaCoreProperties.IDENTIFIER)); + addAdditionalField(additionalFields, Property.CONTRIBUTOR, 
metadata.get(TikaCoreProperties.CONTRIBUTOR)); + addAdditionalField(additionalFields, Property.COVERAGE, metadata.get(TikaCoreProperties.COVERAGE)); + addAdditionalField(additionalFields, Property.MODIFIER, metadata.get(TikaCoreProperties.MODIFIER)); + addAdditionalField(additionalFields, Property.CREATOR_TOOL, metadata.get(TikaCoreProperties.CREATOR_TOOL)); + addAdditionalField(additionalFields, Property.PUBLISHER, metadata.get(TikaCoreProperties.PUBLISHER)); + addAdditionalField(additionalFields, Property.RELATION, metadata.get(TikaCoreProperties.RELATION)); + addAdditionalField(additionalFields, Property.RIGHTS, metadata.get(TikaCoreProperties.RIGHTS)); + addAdditionalField(additionalFields, Property.SOURCE, metadata.get(TikaCoreProperties.SOURCE)); + addAdditionalField(additionalFields, Property.TYPE, metadata.get(TikaCoreProperties.TYPE)); + addAdditionalField(additionalFields, Property.DESCRIPTION, metadata.get(TikaCoreProperties.DESCRIPTION)); + addAdditionalField(additionalFields, Property.PRINT_DATE, metadata.get(TikaCoreProperties.PRINT_DATE)); + addAdditionalField(additionalFields, Property.METADATA_DATE, metadata.get(TikaCoreProperties.METADATA_DATE)); + addAdditionalField(additionalFields, Property.LATITUDE, metadata.get(TikaCoreProperties.LATITUDE)); + addAdditionalField(additionalFields, Property.LONGITUDE, metadata.get(TikaCoreProperties.LONGITUDE)); + addAdditionalField(additionalFields, Property.ALTITUDE, metadata.get(TikaCoreProperties.ALTITUDE)); + addAdditionalField(additionalFields, Property.RATING, metadata.get(TikaCoreProperties.RATING)); + addAdditionalField(additionalFields, Property.COMMENTS, metadata.get(TikaCoreProperties.COMMENTS)); + ingestDocument.setFieldValue(targetField, additionalFields); if (removeBinary) { @@ -177,6 +174,18 @@ public IngestDocument execute(IngestDocument ingestDocument) { return ingestDocument; } + /** + * Add an additional field if not null or empty + * @param additionalFields additional fields + * @param 
property property to add + * @param value value to add + */ + private void addAdditionalField(Map additionalFields, Property property, String value) { + if (properties.contains(property) && Strings.hasLength(value)) { + additionalFields.put(property.toLowerCase(), value); + } + } + @Override public String getType() { return TYPE; @@ -243,7 +252,27 @@ enum Property { DATE, CONTENT_TYPE, CONTENT_LENGTH, - LANGUAGE; + LANGUAGE, + MODIFIED, + FORMAT, + IDENTIFIER, + CONTRIBUTOR, + COVERAGE, + MODIFIER, + CREATOR_TOOL, + PUBLISHER, + RELATION, + RIGHTS, + SOURCE, + TYPE, + DESCRIPTION, + PRINT_DATE, + METADATA_DATE, + LATITUDE, + LONGITUDE, + ALTITUDE, + RATING, + COMMENTS; public static Property parse(String value) { return valueOf(value.toUpperCase(Locale.ROOT)); diff --git a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java index 63197de7f7981..cd6802cd13c1d 100644 --- a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java +++ b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java @@ -58,9 +58,14 @@ public void testEnglishTextDocument() throws Exception { } public void testHtmlDocumentWithRandomFields() throws Exception { - //date is not present in the html doc - ArrayList fieldsList = new ArrayList<>(EnumSet.complementOf(EnumSet.of - (AttachmentProcessor.Property.DATE))); + // some metadata are not present in the html doc + // "date", "metadata_date", "comments", "modified", "modifier", "print_date", "relation", "creator_tool", "altitude" + // "identifier", "longitude", "publisher", "description", "latitude", "format", "source", "coverage" + // "rating", "type", "contributor", "rights" + // we are only trying with content, title, author, keywords, content_type and content_length. 
+ ArrayList fieldsList = new ArrayList<>(EnumSet.of(AttachmentProcessor.Property.CONTENT, + AttachmentProcessor.Property.TITLE, AttachmentProcessor.Property.AUTHOR, AttachmentProcessor.Property.KEYWORDS, + AttachmentProcessor.Property.CONTENT_TYPE, AttachmentProcessor.Property.CONTENT_LENGTH)); Set selectedProperties = new HashSet<>(); int numFields = randomIntBetween(1, fieldsList.size()); @@ -108,7 +113,8 @@ public void testWordDocument() throws Exception { Map attachmentData = parseDocument("issue-104.docx", processor); assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", - "content_length")); + "content_length", "modifier", "modified", "publisher")); + assertThat(attachmentData.get("content"), is(notNullValue())); assertThat(attachmentData.get("language"), is("en")); assertThat(attachmentData.get("date"), is("2012-10-12T11:17:00Z")); @@ -116,13 +122,16 @@ public void testWordDocument() throws Exception { assertThat(attachmentData.get("content_length"), is(notNullValue())); assertThat(attachmentData.get("content_type").toString(), is("application/vnd.openxmlformats-officedocument.wordprocessingml.document")); + assertThat(attachmentData.get("modifier").toString(), is("Luka Lampret")); + assertThat(attachmentData.get("modified").toString(), is("2015-02-20T11:36:00Z")); + assertThat(attachmentData.get("publisher").toString(), is("JDI")); } public void testWordDocumentWithVisioSchema() throws Exception { Map attachmentData = parseDocument("issue-22077.docx", processor); assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", - "content_length")); + "content_length", "modifier", "modified", "print_date")); assertThat(attachmentData.get("content").toString(), containsString("Table of Contents")); assertThat(attachmentData.get("language"), is("en")); assertThat(attachmentData.get("date"), is("2015-01-06T18:07:00Z")); @@ -130,13 +139,16 @@ public void 
testWordDocumentWithVisioSchema() throws Exception { assertThat(attachmentData.get("content_length"), is(notNullValue())); assertThat(attachmentData.get("content_type").toString(), is("application/vnd.openxmlformats-officedocument.wordprocessingml.document")); + assertThat(attachmentData.get("modifier").toString(), is("Chris Dufour")); + assertThat(attachmentData.get("modified").toString(), is("2016-12-04T16:58:00Z")); + assertThat(attachmentData.get("print_date").toString(), is("2015-01-05T19:12:00Z")); } public void testLegacyWordDocumentWithVisioSchema() throws Exception { Map attachmentData = parseDocument("issue-22077.doc", processor); assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", - "content_length")); + "content_length", "modifier", "modified", "print_date")); assertThat(attachmentData.get("content").toString(), containsString("Table of Contents")); assertThat(attachmentData.get("language"), is("en")); assertThat(attachmentData.get("date"), is("2016-12-16T15:04:00Z")); @@ -144,6 +156,9 @@ public void testLegacyWordDocumentWithVisioSchema() throws Exception { assertThat(attachmentData.get("content_length"), is(notNullValue())); assertThat(attachmentData.get("content_type").toString(), is("application/msword")); + assertThat(attachmentData.get("modifier").toString(), is("David Pilato")); + assertThat(attachmentData.get("modified").toString(), is("2016-12-16T15:04:00Z")); + assertThat(attachmentData.get("print_date").toString(), is("2015-01-05T19:12:00Z")); } public void testPdf() throws Exception { @@ -191,8 +206,13 @@ public void testEpubDocument() throws Exception { Map attachmentData = parseDocument("testEPUB.epub", processor); assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "author", "title", "content_type", "content_length", - "date", "keywords")); + "date", "keywords", + "identifier", "contributor", "publisher", "description")); 
assertThat(attachmentData.get("content_type").toString(), containsString("application/epub+zip")); + assertThat(attachmentData.get("identifier").toString(), is("1234567890")); + assertThat(attachmentData.get("contributor").toString(), is("no-one")); + assertThat(attachmentData.get("publisher").toString(), is("Apache")); + assertThat(attachmentData.get("description").toString(), is("This is an ePub test publication for Tika.")); } // no real detection, just rudimentary From 2645084a3ecb72346797a8943ffe26c83bbbced6 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Thu, 21 Oct 2021 10:52:07 +0200 Subject: [PATCH 2/7] Fix integration Rest tests --- .../test/ingest_attachment/30_files_supported.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/plugins/ingest-attachment/src/yamlRestTest/resources/rest-api-spec/test/ingest_attachment/30_files_supported.yml b/plugins/ingest-attachment/src/yamlRestTest/resources/rest-api-spec/test/ingest_attachment/30_files_supported.yml index 543f394782da9..f9b50d1a3a6e3 100644 --- a/plugins/ingest-attachment/src/yamlRestTest/resources/rest-api-spec/test/ingest_attachment/30_files_supported.yml +++ b/plugins/ingest-attachment/src/yamlRestTest/resources/rest-api-spec/test/ingest_attachment/30_files_supported.yml @@ -27,13 +27,15 @@ get: index: test id: 1 - - length: { _source.attachment: 6 } + - length: { _source.attachment: 8 } - match: { _source.attachment.content: "Test elasticsearch" } - match: { _source.attachment.language: "et" } - match: { _source.attachment.author: "David Pilato" } - match: { _source.attachment.date: "2016-03-10T08:25:00Z" } - match: { _source.attachment.content_length: 19 } - match: { _source.attachment.content_type: "application/msword" } + - match: { _source.attachment.modifier: "David Pilato" } + - match: { _source.attachment.modified: "2016-03-10T08:25:00Z" } --- @@ -65,10 +67,12 @@ get: index: test id: 1 - - length: { _source.attachment: 6 } + - length: { _source.attachment: 8 } - 
match: { _source.attachment.content: "Test elasticsearch" } - match: { _source.attachment.language: "et" } - match: { _source.attachment.author: "David Pilato" } - match: { _source.attachment.date: "2016-03-10T08:24:00Z" } - match: { _source.attachment.content_length: 19 } - match: { _source.attachment.content_type: "application/vnd.openxmlformats-officedocument.wordprocessingml.document" } + - match: { _source.attachment.modifier: "David Pilato" } + - match: { _source.attachment.modified: "2016-03-10T08:24:00Z" } From abb81fe327d958c972733a7a1513c0231bba00b9 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Tue, 26 Oct 2021 11:41:43 +0200 Subject: [PATCH 3/7] Don't run bwc tests for 7.x As we added new fields, this test is failing the bwc tests. We do not have access to `RestApiVersion` in the `AttachmentProcessor`, so we can't decide whether or not to produce the fields depending on the version. As the change is trivial and does not remove any existing field, we can skip this regression test. --- .../test/ingest_attachment/30_files_supported.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/plugins/ingest-attachment/src/yamlRestTest/resources/rest-api-spec/test/ingest_attachment/30_files_supported.yml b/plugins/ingest-attachment/src/yamlRestTest/resources/rest-api-spec/test/ingest_attachment/30_files_supported.yml index f9b50d1a3a6e3..324776bc20f87 100644 --- a/plugins/ingest-attachment/src/yamlRestTest/resources/rest-api-spec/test/ingest_attachment/30_files_supported.yml +++ b/plugins/ingest-attachment/src/yamlRestTest/resources/rest-api-spec/test/ingest_attachment/30_files_supported.yml @@ -1,5 +1,8 @@ --- "Test ingest attachment processor with .doc file": + - skip: + version: " - 7.99.99" + reason: "new fields added in 8.0.0" - do: ingest.put_pipeline: id: "my_pipeline" @@ -40,6 +43,9 @@ --- "Test ingest attachment processor with .docx file": + - skip: + version: " - 7.99.99" + reason: "new fields added in 8.0.0" - do: ingest.put_pipeline: id: "my_pipeline"
From ad00ba5ba9da761bd7d1c7ccb7bb94828735c805 Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Mon, 22 Nov 2021 09:07:10 -0600 Subject: [PATCH 4/7] fixing checkstyle error --- .../ingest/attachment/AttachmentProcessorTests.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java index 99a3a78176cf8..7efc05ff8776d 100644 --- a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java +++ b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java @@ -133,7 +133,8 @@ public void testEmptyTextDocument() throws Exception { public void testWordDocument() throws Exception { Map attachmentData = parseDocument("issue-104.docx", processor); - assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length", "modifier", "modified", "publisher")); + assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length", + "modifier", "modified", "publisher")); assertThat(attachmentData.get("content"), is(notNullValue())); assertThat(attachmentData.get("language"), is("en")); assertThat(attachmentData.get("date"), is("2012-10-12T11:17:00Z")); @@ -151,7 +152,8 @@ public void testWordDocument() throws Exception { public void testWordDocumentWithVisioSchema() throws Exception { Map attachmentData = parseDocument("issue-22077.docx", processor); - assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length", "modifier", "modified", "print_date")); + assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", 
"content_length", + "modifier", "modified", "print_date")); assertThat(attachmentData.get("content").toString(), containsString("Table of Contents")); assertThat(attachmentData.get("language"), is("en")); assertThat(attachmentData.get("date"), is("2015-01-06T18:07:00Z")); @@ -169,7 +171,8 @@ public void testWordDocumentWithVisioSchema() throws Exception { public void testLegacyWordDocumentWithVisioSchema() throws Exception { Map attachmentData = parseDocument("issue-22077.doc", processor); - assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length", "modifier", "modified", "print_date")); + assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length", + "modifier", "modified", "print_date")); assertThat(attachmentData.get("content").toString(), containsString("Table of Contents")); assertThat(attachmentData.get("language"), is("en")); assertThat(attachmentData.get("date"), is("2016-12-16T15:04:00Z")); From 63ce39abeeb12114516ed62207a7f21f91433432 Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Mon, 22 Nov 2021 09:20:02 -0600 Subject: [PATCH 5/7] fixing checkstyle error --- .../attachment/AttachmentProcessor.java | 1 - .../attachment/AttachmentProcessorTests.java | 78 ++++++++++++++++--- 2 files changed, 66 insertions(+), 13 deletions(-) diff --git a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java index d80d3692375d7..35e5f3ac5fe98 100644 --- a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java +++ b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java @@ -139,7 +139,6 @@ public IngestDocument execute(IngestDocument ingestDocument) { addAdditionalField(additionalFields, 
Property.KEYWORDS, metadata.get("Keywords")); addAdditionalField(additionalFields, Property.CONTENT_TYPE, metadata.get(Metadata.CONTENT_TYPE)); - if (properties.contains(Property.CONTENT_LENGTH)) { String contentLength = metadata.get(Metadata.CONTENT_LENGTH); long length; diff --git a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java index 7efc05ff8776d..4a9eb3c04da6a 100644 --- a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java +++ b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java @@ -73,10 +73,16 @@ public void testHtmlDocumentWithRandomFields() throws Exception { // "identifier", "longitude", "publisher", "description", "latitude", "format", "source", "coverage" // "rating", "type", "contributor", "rights" // we are only trying with content, title, author, keywords, content_type and content_length. 
- ArrayList fieldsList = new ArrayList<>(EnumSet.of(AttachmentProcessor.Property.CONTENT, - AttachmentProcessor.Property.TITLE, AttachmentProcessor.Property.AUTHOR, AttachmentProcessor.Property.KEYWORDS, - AttachmentProcessor.Property.CONTENT_TYPE, AttachmentProcessor.Property.CONTENT_LENGTH)) - ; + ArrayList fieldsList = new ArrayList<>( + EnumSet.of( + AttachmentProcessor.Property.CONTENT, + AttachmentProcessor.Property.TITLE, + AttachmentProcessor.Property.AUTHOR, + AttachmentProcessor.Property.KEYWORDS, + AttachmentProcessor.Property.CONTENT_TYPE, + AttachmentProcessor.Property.CONTENT_LENGTH + ) + ); Set selectedProperties = new HashSet<>(); int numFields = randomIntBetween(1, fieldsList.size()); @@ -133,8 +139,20 @@ public void testEmptyTextDocument() throws Exception { public void testWordDocument() throws Exception { Map attachmentData = parseDocument("issue-104.docx", processor); - assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length", - "modifier", "modified", "publisher")); + assertThat( + attachmentData.keySet(), + containsInAnyOrder( + "content", + "language", + "date", + "author", + "content_type", + "content_length", + "modifier", + "modified", + "publisher" + ) + ); assertThat(attachmentData.get("content"), is(notNullValue())); assertThat(attachmentData.get("language"), is("en")); assertThat(attachmentData.get("date"), is("2012-10-12T11:17:00Z")); @@ -152,8 +170,20 @@ public void testWordDocument() throws Exception { public void testWordDocumentWithVisioSchema() throws Exception { Map attachmentData = parseDocument("issue-22077.docx", processor); - assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length", - "modifier", "modified", "print_date")); + assertThat( + attachmentData.keySet(), + containsInAnyOrder( + "content", + "language", + "date", + "author", + "content_type", + "content_length", + 
"modifier", + "modified", + "print_date" + ) + ); assertThat(attachmentData.get("content").toString(), containsString("Table of Contents")); assertThat(attachmentData.get("language"), is("en")); assertThat(attachmentData.get("date"), is("2015-01-06T18:07:00Z")); @@ -171,8 +201,20 @@ public void testWordDocumentWithVisioSchema() throws Exception { public void testLegacyWordDocumentWithVisioSchema() throws Exception { Map attachmentData = parseDocument("issue-22077.doc", processor); - assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length", - "modifier", "modified", "print_date")); + assertThat( + attachmentData.keySet(), + containsInAnyOrder( + "content", + "language", + "date", + "author", + "content_type", + "content_length", + "modifier", + "modified", + "print_date" + ) + ); assertThat(attachmentData.get("content").toString(), containsString("Table of Contents")); assertThat(attachmentData.get("language"), is("en")); assertThat(attachmentData.get("date"), is("2016-12-16T15:04:00Z")); @@ -234,8 +276,20 @@ public void testEpubDocument() throws Exception { assertThat( attachmentData.keySet(), - containsInAnyOrder("language", "content", "author", "title", "content_type", "content_length", "date", "keywords", - "identifier", "contributor", "publisher", "description") + containsInAnyOrder( + "language", + "content", + "author", + "title", + "content_type", + "content_length", + "date", + "keywords", + "identifier", + "contributor", + "publisher", + "description" + ) ); assertThat(attachmentData.get("content_type").toString(), containsString("application/epub+zip")); assertThat(attachmentData.get("identifier").toString(), is("1234567890")); From 6738faf722cdcefb61f4e114856841c1d96bfc2c Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Mon, 22 Nov 2021 11:51:15 -0600 Subject: [PATCH 6/7] making yamlRestTestV7CompatTransform pass --- plugins/ingest-attachment/build.gradle | 5 +++++ 1 file changed, 5 
insertions(+) diff --git a/plugins/ingest-attachment/build.gradle b/plugins/ingest-attachment/build.gradle index 94cbc91b49336..beaa83c2b998d 100644 --- a/plugins/ingest-attachment/build.gradle +++ b/plugins/ingest-attachment/build.gradle @@ -86,6 +86,11 @@ tasks.named("forbiddenPatterns").configure { exclude '**/text-cjk-*.txt' } +tasks.named("yamlRestTestV7CompatTransform").configure { task -> + // 2 new tika metadata fields are returned in v8 + task.replaceValueInLength("_source.attachment", 8) +} + tasks.named("thirdPartyAudit").configure { ignoreMissingClasses() } From 9b4e1e0e7ca2d392c6ea91275aa3d19c1e6b4b5c Mon Sep 17 00:00:00 2001 From: Keith Massey Date: Mon, 22 Nov 2021 14:28:29 -0600 Subject: [PATCH 7/7] making yamlRestTestV7CompatTransform pass --- plugins/ingest-attachment/build.gradle | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/plugins/ingest-attachment/build.gradle b/plugins/ingest-attachment/build.gradle index beaa83c2b998d..87afeae8fc4de 100644 --- a/plugins/ingest-attachment/build.gradle +++ b/plugins/ingest-attachment/build.gradle @@ -88,7 +88,8 @@ tasks.named("forbiddenPatterns").configure { tasks.named("yamlRestTestV7CompatTransform").configure { task -> // 2 new tika metadata fields are returned in v8 - task.replaceValueInLength("_source.attachment", 8) + task.replaceValueInLength("_source.attachment", 8, "Test ingest attachment processor with .doc file") + task.replaceValueInLength("_source.attachment", 8, "Test ingest attachment processor with .docx file") } tasks.named("thirdPartyAudit").configure {