Skip to content

Commit

Permalink
Remove support for Visio and potm files
Browse files Browse the repository at this point in the history
* Send a non supported document to an ingest pipeline using `ingest-attachment`
* If Tika is not able to parse the document because of a missing class (we are not importing all the jars needed by Tika), Tika throws a Throwable which is not caught.

This commit removes support for Visio and POTM office files.

As a result, Elasticsearch is no longer killed when you run a command like:

```
GET _ingest/pipeline/_simulate
{
  "pipeline" : {
    "processors" : [
      {
        "attachment" : {
          "field" : "file"
        }
      }
    ]
  },
  "docs" : [
    {
      "_source" : {
        "file" : "BASE64CONTENT"
      }
    }
  ]
}
```

The good news is that it no longer kills the node, and it still allows extracting the text contained in the Office document even when the document embeds Visio content (which is no longer extracted).

Related to #22077

Backport of #23214 in 5.3 branch

(cherry picked from commit 76a977a)
  • Loading branch information
dadoonet committed Apr 23, 2017
1 parent 9e7cb8b commit dc4888e
Show file tree
Hide file tree
Showing 7 changed files with 46 additions and 1 deletion.
2 changes: 2 additions & 0 deletions plugins/ingest-attachment/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,11 @@ dependencyLicenses {
}

// File globs skipped by the forbiddenPatterns check. Every entry is a document
// format extension (Word, PDF, EPUB, Visio); presumably these match the binary
// test fixtures shipped with this plugin -- confirm against the test resources.
forbiddenPatterns {
exclude '**/*.doc'
exclude '**/*.docx'
exclude '**/*.pdf'
exclude '**/*.epub'
exclude '**/*.vsdx'
}

thirdPartyAudit.excludes = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,10 @@
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.elasticsearch.SpecialPermission;
import org.elasticsearch.bootstrap.JarHell;
import org.elasticsearch.common.SuppressForbidden;
Expand All @@ -45,7 +47,9 @@
import java.security.PrivilegedExceptionAction;
import java.security.ProtectionDomain;
import java.security.SecurityPermission;
import java.util.Collections;
import java.util.PropertyPermission;
import java.util.Set;

/**
* Runs tika with limited parsers and limited permissions.
Expand All @@ -54,6 +58,9 @@
*/
final class TikaImpl {

/**
 * Media types that the OOXML parser must not handle: this set is handed to
 * {@code ParserDecorator.withoutTypes} when the OOXML parser is registered in the parser list.
 * It currently holds a single entry, the {@code x-tika-ooxml} application media type.
 */
private static final Set<MediaType> EXCLUDES = Collections.singleton(MediaType.application("x-tika-ooxml"));

/** subset of parsers for types we support */
private static final Parser PARSERS[] = new Parser[] {
// documents
Expand All @@ -63,7 +70,7 @@ final class TikaImpl {
new org.apache.tika.parser.txt.TXTParser(),
new org.apache.tika.parser.microsoft.OfficeParser(),
new org.apache.tika.parser.microsoft.OldExcelParser(),
new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(),
ParserDecorator.withoutTypes(new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), EXCLUDES),
new org.apache.tika.parser.odf.OpenDocumentParser(),
new org.apache.tika.parser.iwork.IWorkPackageParser(),
new org.apache.tika.parser.xml.DcXMLParser(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.not;
import static org.hamcrest.Matchers.notNullValue;
import static org.hamcrest.Matchers.nullValue;
import static org.hamcrest.core.IsCollectionContaining.hasItem;

public class AttachmentProcessorTests extends ESTestCase {
Expand Down Expand Up @@ -130,6 +131,34 @@ public void testWordDocument() throws Exception {
is("application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
}

/**
 * Parses the {@code issue-22077.docx} fixture and verifies that the usual attachment fields
 * (content, language, date, author, content type and content length) are all extracted.
 */
public void testWordDocumentWithVisioSchema() throws Exception {
    final Map<String, Object> fields = parseDocument("issue-22077.docx", processor);

    // Exactly these six fields are expected, in no particular order.
    assertThat(
        fields.keySet(),
        containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length"));

    final String extractedText = fields.get("content").toString();
    assertThat(extractedText, containsString("Table of Contents"));

    final Object language = fields.get("language");
    assertThat(language, is("en"));

    final Object date = fields.get("date");
    assertThat(date, is("2015-01-06T18:07:00Z"));

    final Object author = fields.get("author");
    assertThat(author, is(notNullValue()));

    final Object contentLength = fields.get("content_length");
    assertThat(contentLength, is(notNullValue()));

    final String contentType = fields.get("content_type").toString();
    assertThat(contentType, is("application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
}

/**
 * Parses the legacy {@code issue-22077.doc} fixture and verifies that the usual attachment
 * fields are extracted and that the content type is reported as {@code application/msword}.
 */
public void testLegacyWordDocumentWithVisioSchema() throws Exception {
    final Map<String, Object> fields = parseDocument("issue-22077.doc", processor);

    // Exactly these six fields are expected, in no particular order.
    assertThat(
        fields.keySet(),
        containsInAnyOrder("content", "language", "date", "author", "content_type", "content_length"));

    final String extractedText = fields.get("content").toString();
    assertThat(extractedText, containsString("Table of Contents"));

    final Object language = fields.get("language");
    assertThat(language, is("en"));

    final Object date = fields.get("date");
    assertThat(date, is("2016-12-16T15:04:00Z"));

    final Object author = fields.get("author");
    assertThat(author, is(notNullValue()));

    final Object contentLength = fields.get("content_length");
    assertThat(contentLength, is(notNullValue()));

    final String contentType = fields.get("content_type").toString();
    assertThat(contentType, is("application/msword"));
}

public void testPdf() throws Exception {
Map<String, Object> attachmentData = parseDocument("test.pdf", processor);
assertThat(attachmentData.get("content"),
Expand All @@ -138,6 +167,13 @@ public void testPdf() throws Exception {
assertThat(attachmentData.get("content_length"), is(notNullValue()));
}

/**
 * Parses the {@code issue-22077.vsdx} fixture and verifies that no content is extracted:
 * the content field is absent, the content length is zero, and only the Visio content type
 * is reported.
 */
public void testVisioIsExcluded() throws Exception {
    final Map<String, Object> fields = parseDocument("issue-22077.vsdx", processor);

    final Object extractedText = fields.get("content");
    assertThat(extractedText, nullValue());

    final Object contentType = fields.get("content_type");
    assertThat(contentType, is("application/vnd.ms-visio.drawing"));

    final Object contentLength = fields.get("content_length");
    assertThat(contentLength, is(0L));
}

public void testEncryptedPdf() throws Exception {
ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class, () -> parseDocument("encrypted.pdf", processor));
assertThat(e.getDetailedMessage(), containsString("document is encrypted"));
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 comments on commit dc4888e

Please sign in to comment.