Skip to content

Commit

Permalink
Merge pull request #4168 from evolvedbinary/feature/update-tika
Browse files Browse the repository at this point in the history
Update to Tika 2.2.1
  • Loading branch information
dizzzz committed Jan 6, 2022
2 parents 1a18922 + 7a61bfc commit a154289
Show file tree
Hide file tree
Showing 9 changed files with 329 additions and 187 deletions.
Expand Up @@ -119,10 +119,10 @@ private XPathException errorMapAsXPathException(final MapType errorMap) throws X
}

private static final Pattern PTN_CAUSED_BY = Pattern.compile("Caused by:\\s([a-zA-Z0-9_$\\.]+)(?::\\s(.+))?");
private static final Pattern PTN_AT = Pattern.compile("at\\s((?:[a-zA-Z0-9_$]+)(?:\\.[a-zA-Z0-9_$]+)*)\\.([a-zA-Z0-9_$-]+)\\(([a-zA-Z0-9_]+\\.java):([0-9]+)\\)");
private static final Pattern PTN_AT = Pattern.compile("at\\s((?:[a-zA-Z0-9_$]+)(?:\\.[a-zA-Z0-9_$]+)*)\\.((?:[a-zA-Z0-9_$-]+)|(?:<init>))\\(([a-zA-Z0-9_]+\\.java):([0-9]+)\\)");

protected StackTraceElement[] convertStackTraceElements(final Sequence seqJavaStackTrace) throws XPathException {
StackTraceElement[] traceElements = new StackTraceElement[seqJavaStackTrace.getItemCount() - 1];
StackTraceElement[] traceElements = null;

final Matcher matcherAt = PTN_AT.matcher("");

Expand All @@ -134,10 +134,14 @@ protected StackTraceElement[] convertStackTraceElements(final Sequence seqJavaSt
if (stackTraceElement == null) {
break;
}

if (traceElements == null) {
traceElements = new StackTraceElement[seqJavaStackTrace.getItemCount() - 1];
}
traceElements[i - 1] = stackTraceElement;
}

if (i + 1 < seqJavaStackTrace.getItemCount()) {
if (traceElements != null && i + 1 < seqJavaStackTrace.getItemCount()) {
traceElements = Arrays.copyOf(traceElements, i - 2);
}

Expand Down
45 changes: 41 additions & 4 deletions extensions/contentextraction/pom.xml
Expand Up @@ -46,7 +46,7 @@
</scm>

<properties>
<tika.version>1.28</tika.version>
<tika.version>2.2.1</tika.version>
</properties>

<dependencies>
Expand All @@ -66,10 +66,9 @@
<artifactId>tika-core</artifactId>
<version>${tika.version}</version>
</dependency>

<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<artifactId>tika-parsers-standard-package</artifactId>
<version>${tika.version}</version>
<scope>runtime</scope>
<exclusions>
Expand Down Expand Up @@ -97,6 +96,12 @@
<scope>test</scope>
</dependency>

<dependency>
<groupId>com.evolvedbinary.j8fu</groupId>
<artifactId>j8fu</artifactId>
<scope>test</scope>
</dependency>

</dependencies>

<build>
Expand All @@ -110,7 +115,39 @@
<filtering>true</filtering>
</testResource>
</testResources>

<plugins>
<plugin>
<groupId>com.mycila</groupId>
<artifactId>license-maven-plugin</artifactId>
<configuration>
<licenseSets>

<licenseSet>
<!--
eXist-db's License
-->
<header>${project.parent.relativePath}/LGPL-21-license.template.txt</header>
<excludes>
<exclude>src/test/java/org/exist/contentextraction/xquery/Util.java</exclude>
</excludes>
</licenseSet>

<licenseSet>
<!--
FDB backport to LGPL 2.1-only licensed code
-->
<header>${project.parent.relativePath}/FDB-backport-LGPL-21-ONLY-license.template.txt</header>
<includes>
<include>src/test/java/org/exist/contentextraction/xquery/Util.java</include>
</includes>

</licenseSet>

</licenseSets>
</configuration>
</plugin>

<plugin>
<groupId>org.owasp</groupId>
<artifactId>dependency-check-maven</artifactId>
Expand All @@ -131,7 +168,7 @@
<configuration>
<failOnWarning>true</failOnWarning>
<ignoredUnusedDeclaredDependencies>
<ignoredUnusedDeclaredDependency>org.apache.tika:tika-parsers</ignoredUnusedDeclaredDependency>
<ignoredUnusedDeclaredDependency>org.apache.tika:tika-parsers-standard-package</ignoredUnusedDeclaredDependency>
<ignoredUnusedDeclaredDependency>org.junit.vintage:junit-vintage-engine:jar:${junit.vintage.version}</ignoredUnusedDeclaredDependency>
</ignoredUnusedDeclaredDependencies>
</configuration>
Expand Down
Expand Up @@ -27,48 +27,38 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.exist.util.serializer.Receiver;
import org.exist.util.serializer.SAXToReceiver;
import org.exist.xquery.value.BinaryValue;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;



/**
* @author <a href="mailto:dulip.withanage@gmail.com">Dulip Withanage</a>
* @version 1.0
*/
public class ContentExtraction {
final Parser parser = new AutoDetectParser();
final ParseContext parseContext = new ParseContext();

public ContentExtraction() {
parseContext.set(Parser.class, parser);
}
final AutoDetectParser parser = new AutoDetectParser();

public Metadata extractContentAndMetadata(final BinaryValue binaryValue, final ContentHandler contentHandler) throws IOException, SAXException, ContentExtractionException {
try (final InputStream is = binaryValue.getInputStream()) {
final Metadata metadata = new Metadata();
parser.parse(is, contentHandler, metadata, parseContext);
parser.parse(is, contentHandler, metadata);
return metadata;
} catch (final TikaException e) {
throw new ContentExtractionException("Problem with content extraction library: " + e.getMessage(), e);
}
}

public void extractContentAndMetadata(BinaryValue binaryValue, Receiver receiver)
public void extractContentAndMetadata(final BinaryValue binaryValue, final Receiver receiver)
throws IOException, SAXException, ContentExtractionException {

extractContentAndMetadata(binaryValue, new SAXToReceiver(receiver, false));
}

public Metadata extractMetadata(final BinaryValue binaryValue) throws IOException, SAXException, ContentExtractionException {
try (final InputStream is = binaryValue.getInputStream()) {
final Metadata metadata = new Metadata();
parser.parse(is, null, metadata, parseContext);
parser.parse(is, null, metadata);
return metadata;
} catch (final TikaException e) {
throw new ContentExtractionException("Problem with content extraction library: " + e.getMessage(), e);
Expand Down
@@ -0,0 +1,182 @@
/*
* eXist-db Open Source Native XML Database
* Copyright (C) 2001 The eXist-db Authors
*
* info@exist-db.org
* http://www.exist-db.org
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
package org.exist.contentextraction.xquery;

import com.evolvedbinary.j8fu.tuple.Tuple2;
import org.exist.EXistException;
import org.exist.collections.Collection;
import org.exist.collections.triggers.TriggerException;
import org.exist.security.PermissionDeniedException;
import org.exist.source.Source;
import org.exist.source.StringSource;
import org.exist.storage.BrokerPool;
import org.exist.storage.DBBroker;
import org.exist.storage.lock.Lock;
import org.exist.storage.txn.Txn;
import org.exist.test.ExistEmbeddedServer;
import org.exist.util.LockException;
import org.exist.xmldb.XmldbURI;
import org.exist.xquery.XPathException;
import org.exist.xquery.value.Sequence;
import org.junit.*;

import java.io.IOException;
import java.io.InputStream;
import java.util.Optional;

import static com.evolvedbinary.j8fu.tuple.Tuple.Tuple;
import static org.exist.contentextraction.xquery.Util.executeQuery;
import static org.exist.contentextraction.xquery.Util.withCompiledQuery;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;


/**
 * Integration tests for the {@code contentextraction} XQuery module.
 *
 * <p>Before any test runs, two binary fixtures ({@code minimal.pdf} and {@code test.xlsx}) are
 * stored into a dedicated test collection of the embedded database; the collection is removed
 * again once all tests have finished. Each test compiles and executes an XQuery that reads one
 * of the stored binaries via {@code util:binary-doc} and passes it to a
 * {@code contentextraction:*} function.</p>
 */
public class ContentFunctionsTest {

    /** Database path of the collection that holds the binary fixtures. */
    private static final String TEST_COLLECTION_PATH = "/db/content-functions-test";

    /** Media type under which the XLSX fixture is stored. */
    private static final String XLSX_MEDIA_TYPE = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";

    /** Namespace declarations shared by every query in this class. */
    private static final String QUERY_PROLOG =
            "declare namespace html = \"http://www.w3.org/1999/xhtml\";\n" +
            "declare namespace contentextraction = \"http://exist-db.org/xquery/contentextraction\";\n" +
            "declare namespace util = \"http://exist-db.org/xquery/util\";\n";

    @ClassRule
    public static final ExistEmbeddedServer existEmbeddedServer = new ExistEmbeddedServer(true, true);

    /**
     * Creates the test collection and stores the binary fixtures in it.
     *
     * <p>The fixtures are loaded from the classpath relative to this class; a missing fixture
     * fails the setup via {@code assertNotNull}.</p>
     */
    @BeforeClass
    public static void setup() throws EXistException, PermissionDeniedException, IOException, TriggerException, LockException {
        final BrokerPool pool = existEmbeddedServer.getBrokerPool();
        try (final DBBroker broker = pool.get(Optional.of(pool.getSecurityManager().getSystemSubject()));
             final Txn transaction = pool.getTransactionManager().beginTransaction()) {

            try (final Collection collection = broker.getOrCreateCollection(transaction, XmldbURI.create(TEST_COLLECTION_PATH))) {

                try (final InputStream is = ContentFunctionsTest.class.getResourceAsStream("minimal.pdf")) {
                    assertNotNull(is);
                    collection.addBinaryResource(transaction, broker, XmldbURI.create("minimal.pdf"), is, "application/pdf", -1);
                }

                try (final InputStream is = ContentFunctionsTest.class.getResourceAsStream("test.xlsx")) {
                    assertNotNull(is);
                    collection.addBinaryResource(transaction, broker, XmldbURI.create("test.xlsx"), is, XLSX_MEDIA_TYPE, -1);
                }

            }

            transaction.commit();
        }
    }

    /**
     * Removes the test collection (if it exists) and commits the removal.
     */
    @AfterClass
    public static void teardown() throws EXistException, PermissionDeniedException, IOException, TriggerException {
        final BrokerPool pool = existEmbeddedServer.getBrokerPool();
        try (final DBBroker broker = pool.get(Optional.of(pool.getSecurityManager().getSystemSubject()));
             final Txn transaction = pool.getTransactionManager().beginTransaction()) {

            try (final Collection collection = broker.openCollection(XmldbURI.create(TEST_COLLECTION_PATH), Lock.LockMode.WRITE_LOCK)) {
                if (collection != null) {
                    broker.removeCollection(transaction, collection);
                }
            }

            // Without an explicit commit the removal is never committed; setup() and
            // all test methods commit their transactions in the same way.
            transaction.commit();
        }
    }

    /**
     * {@code contentextraction:get-metadata} on the PDF fixture must report a page count of 1
     * and the PDF media type.
     */
    @Test
    public void getMetadataFromPdf() throws EXistException, XPathException, PermissionDeniedException, IOException {
        final Tuple2<Integer, String> metadata = queryPageCountAndContentType("minimal.pdf");

        assertEquals(1, metadata._1.intValue());
        assertEquals("application/pdf", metadata._2);
    }

    /**
     * {@code contentextraction:get-metadata-and-content} on the PDF fixture must expose the
     * document text as the second {@code html:p} element.
     */
    @Test
    public void getMetadataAndContentFromPdf() throws EXistException, XPathException, PermissionDeniedException, IOException {
        final String mainQuery =
                QUERY_PROLOG +
                "let $bin := util:binary-doc(\"" + TEST_COLLECTION_PATH + "/minimal.pdf\")\n" +
                " return\n" +
                " contentextraction:get-metadata-and-content($bin)//html:p[2]/string()";

        final BrokerPool pool = existEmbeddedServer.getBrokerPool();
        final Source mainQuerySource = new StringSource(mainQuery);
        try (final DBBroker broker = pool.getBroker();
             final Txn transaction = pool.getTransactionManager().beginTransaction()) {

            final String content = withCompiledQuery(broker, mainQuerySource, mainCompiledQuery -> {
                final Sequence result = executeQuery(broker, mainCompiledQuery);
                assertEquals(1, result.getItemCount());

                return result.itemAt(0).getStringValue();
            });

            transaction.commit();

            assertEquals("Hello World", content);
        }
    }

    /**
     * {@code contentextraction:get-metadata} on the XLSX fixture.
     *
     * <p>Currently disabled, see the linked issue.</p>
     */
    @Ignore("see https://github.com/eXist-db/exist/issues/3835")
    @Test
    public void getMetadataFromXlsx() throws EXistException, XPathException, PermissionDeniedException, IOException {
        // NOTE(review): the query and the page-count expectation were copied from the PDF test;
        // it is unclear whether an "xmpTPg:NPages" entry is produced for a spreadsheet at all.
        // Confirm (and adjust the expected item count) when this test is re-enabled.
        final Tuple2<Integer, String> metadata = queryPageCountAndContentType("test.xlsx");

        assertEquals(1, metadata._1.intValue());
        // The fixture is an XLSX workbook, so the reported Content-Type must not be PDF.
        assertEquals(XLSX_MEDIA_TYPE, metadata._2);
    }

    /**
     * Runs {@code contentextraction:get-metadata} against a stored fixture and returns the
     * values of the {@code xmpTPg:NPages} and {@code Content-Type} meta entries.
     *
     * <p>Asserts that the query yields exactly two items; the first is converted to an
     * {@code int}, the second is returned as its string value.</p>
     *
     * @param resourceName name of the binary resource inside the test collection
     * @return tuple of (first result item as integer, second result item as string)
     */
    private static Tuple2<Integer, String> queryPageCountAndContentType(final String resourceName)
            throws EXistException, XPathException, PermissionDeniedException, IOException {
        final String mainQuery =
                QUERY_PROLOG +
                "let $bin := util:binary-doc(\"" + TEST_COLLECTION_PATH + "/" + resourceName + "\")\n" +
                " return\n" +
                " contentextraction:get-metadata($bin)//html:meta[@name = (\"xmpTPg:NPages\", \"Content-Type\")]/@content";

        final BrokerPool pool = existEmbeddedServer.getBrokerPool();
        final Source mainQuerySource = new StringSource(mainQuery);
        try (final DBBroker broker = pool.getBroker();
             final Txn transaction = pool.getTransactionManager().beginTransaction()) {

            final Tuple2<Integer, String> metadata = withCompiledQuery(broker, mainQuerySource, mainCompiledQuery -> {
                final Sequence result = executeQuery(broker, mainCompiledQuery);
                assertEquals(2, result.getItemCount());

                return Tuple(result.itemAt(0).toJavaObject(int.class), result.itemAt(1).getStringValue());
            });

            transaction.commit();

            return metadata;
        }
    }
}

0 comments on commit a154289

Please sign in to comment.