Skip to content

Commit

Permalink
Remove support for Visio and potm files
Browse files Browse the repository at this point in the history
* Parse an unsupported document using `mapper-attachments`
* If Tika is not able to parse the document because of a missing class (we do not import all the jars Tika needs), Tika throws a Throwable which is not caught.

This commit removes support for Visio and POTM office files.

The good news is that this no longer kills the node, and the text in an Office document can still be extracted even when the document embeds Visio content (the Visio content itself is no longer extracted).

Related to #22077 and #22079 for mapper-attachments plugin

Backport of #23214 in 5.2 branch
  • Loading branch information
dadoonet committed Feb 20, 2017
1 parent 76a977a commit 07a9f29
Show file tree
Hide file tree
Showing 7 changed files with 46 additions and 1 deletion.
2 changes: 2 additions & 0 deletions plugins/mapper-attachments/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,11 @@ dependencyLicenses {
}

// File globs listed here are excluded from the forbiddenPatterns check.
// NOTE(review): presumably these extensions are the binary sample documents
// used by the plugin's tests — confirm against the test resources.
forbiddenPatterns {
exclude '**/*.doc'
exclude '**/*.docx'
exclude '**/*.pdf'
exclude '**/*.epub'
exclude '**/*.vsdx'
}

thirdPartyAudit.excludes = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,10 @@
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.elasticsearch.SpecialPermission;
import org.elasticsearch.bootstrap.JarHell;
import org.elasticsearch.common.SuppressForbidden;
Expand All @@ -45,7 +47,9 @@
import java.security.PrivilegedExceptionAction;
import java.security.ProtectionDomain;
import java.security.SecurityPermission;
import java.util.Collections;
import java.util.PropertyPermission;
import java.util.Set;

/**
* Runs tika with limited parsers and limited permissions.
Expand All @@ -54,6 +58,9 @@
*/
final class TikaImpl {

/**
 * Media types passed to {@code ParserDecorator.withoutTypes} when wrapping the OOXML parser
 * in the parser list below.
 * NOTE(review): the only entry is {@code application/x-tika-ooxml} — confirm that this is the
 * type covering the Visio and POTM documents this change intends to stop parsing.
 */
private static final Set<MediaType> EXCLUDES = Collections.singleton(MediaType.application("x-tika-ooxml"));

/** subset of parsers for types we support */
private static final Parser PARSERS[] = new Parser[] {
// documents
Expand All @@ -63,7 +70,7 @@ final class TikaImpl {
new org.apache.tika.parser.txt.TXTParser(),
new org.apache.tika.parser.microsoft.OfficeParser(),
new org.apache.tika.parser.microsoft.OldExcelParser(),
new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(),
ParserDecorator.withoutTypes(new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), EXCLUDES),
new org.apache.tika.parser.odf.OpenDocumentParser(),
new org.apache.tika.parser.iwork.IWorkPackageParser(),
new org.apache.tika.parser.xml.DcXMLParser(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@
import static org.elasticsearch.mapper.attachments.AttachmentMapper.FieldNames.TITLE;
import static org.elasticsearch.test.StreamsUtils.copyToBytesFromClasspath;
import static org.elasticsearch.test.StreamsUtils.copyToStringFromClasspath;
import static org.hamcrest.Matchers.is;
import static org.hamcrest.Matchers.isEmptyOrNullString;
import static org.hamcrest.Matchers.isEmptyString;
import static org.hamcrest.Matchers.not;

/**
Expand Down Expand Up @@ -121,6 +123,40 @@ public void testAsciidocDocument() throws Exception {
testMapper("asciidoc.asciidoc", false);
}

/**
 * Runs the issue-22077 {@code .docx} sample through {@code assertParseable} and then
 * through {@code testMapper}.
 */
public void testWordDocumentWithVisioSchema() throws Exception {
    final String sample = "issue-22077.docx";
    assertParseable(sample);
    testMapper(sample, false);
}

/**
 * Runs the issue-22077 legacy {@code .doc} sample through {@code assertParseable} and then
 * through {@code testMapper}.
 */
public void testLegacyWordDocumentWithVisioSchema() throws Exception {
    final String sample = "issue-22077.doc";
    assertParseable(sample);
    testMapper(sample, false);
}

/**
 * Verifies that the issue-22077 Visio ({@code .vsdx}) sample produces no extracted text,
 * both when handed directly to {@code TikaImpl.parse} and when indexed through the
 * attachment mapper, while the mapper still reports the content type and content length.
 *
 * @throws Exception if the sample cannot be read or parsing fails
 */
public void testVisioIsExcluded() throws Exception {
    String filename = "issue-22077.vsdx";

    // Read the sample once and reuse the same bytes for both checks below.
    byte[] content = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/attachment/test/sample-files/" + filename);

    // Direct Tika call: nothing should be extracted from the Visio file.
    String parsedContent = TikaImpl.parse(content, new Metadata(), -1);
    assertThat(parsedContent, isEmptyString());

    // Same bytes, this time wrapped in a document and run through the mapper.
    BytesReference json = jsonBuilder()
        .startObject()
            .startObject("file")
                .field("_name", filename)
                .field("_content", content)
            .endObject()
        .endObject().bytes();

    ParseContext.Document doc = docMapper.parse("person", "person", "1", json).rootDoc();
    assertThat(doc.get(docMapper.mappers().getMapper("file.content").fieldType().name()), isEmptyString());
    assertThat(doc.get(docMapper.mappers().getMapper("file.content_type").fieldType().name()), is("application/vnd.ms-visio.drawing"));
    assertThat(doc.get(docMapper.mappers().getMapper("file.content_length").fieldType().name()), is("210451"));
}

void assertException(String filename, String expectedMessage) throws Exception {
try (InputStream is = VariousDocTests.class.getResourceAsStream("/org/elasticsearch/index/mapper/attachment/test/sample-files/" + filename)) {
byte bytes[] = IOUtils.toByteArray(is);
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 comments on commit 07a9f29

Please sign in to comment.