Skip to content

Commit a154289

Browse files
authored
Merge pull request #4168 from evolvedbinary/feature/update-tika
Update to Tika 2.2.1
2 parents 1a18922 + 7a61bfc commit a154289

File tree

9 files changed

+329
-187
lines changed

9 files changed

+329
-187
lines changed

exist-core/src/main/java/org/exist/test/runner/ExtTestErrorFunction.java

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -119,10 +119,10 @@ private XPathException errorMapAsXPathException(final MapType errorMap) throws X
119119
}
120120

121121
private static final Pattern PTN_CAUSED_BY = Pattern.compile("Caused by:\\s([a-zA-Z0-9_$\\.]+)(?::\\s(.+))?");
122-
private static final Pattern PTN_AT = Pattern.compile("at\\s((?:[a-zA-Z0-9_$]+)(?:\\.[a-zA-Z0-9_$]+)*)\\.([a-zA-Z0-9_$-]+)\\(([a-zA-Z0-9_]+\\.java):([0-9]+)\\)");
122+
private static final Pattern PTN_AT = Pattern.compile("at\\s((?:[a-zA-Z0-9_$]+)(?:\\.[a-zA-Z0-9_$]+)*)\\.((?:[a-zA-Z0-9_$-]+)|(?:<init>))\\(([a-zA-Z0-9_]+\\.java):([0-9]+)\\)");
123123

124124
protected StackTraceElement[] convertStackTraceElements(final Sequence seqJavaStackTrace) throws XPathException {
125-
StackTraceElement[] traceElements = new StackTraceElement[seqJavaStackTrace.getItemCount() - 1];
125+
StackTraceElement[] traceElements = null;
126126

127127
final Matcher matcherAt = PTN_AT.matcher("");
128128

@@ -134,10 +134,14 @@ protected StackTraceElement[] convertStackTraceElements(final Sequence seqJavaSt
134134
if (stackTraceElement == null) {
135135
break;
136136
}
137+
138+
if (traceElements == null) {
139+
traceElements = new StackTraceElement[seqJavaStackTrace.getItemCount() - 1];
140+
}
137141
traceElements[i - 1] = stackTraceElement;
138142
}
139143

140-
if (i + 1 < seqJavaStackTrace.getItemCount()) {
144+
if (traceElements != null && i + 1 < seqJavaStackTrace.getItemCount()) {
141145
traceElements = Arrays.copyOf(traceElements, i - 2);
142146
}
143147

extensions/contentextraction/pom.xml

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
</scm>
4747

4848
<properties>
49-
<tika.version>1.28</tika.version>
49+
<tika.version>2.2.1</tika.version>
5050
</properties>
5151

5252
<dependencies>
@@ -66,10 +66,9 @@
6666
<artifactId>tika-core</artifactId>
6767
<version>${tika.version}</version>
6868
</dependency>
69-
7069
<dependency>
7170
<groupId>org.apache.tika</groupId>
72-
<artifactId>tika-parsers</artifactId>
71+
<artifactId>tika-parsers-standard-package</artifactId>
7372
<version>${tika.version}</version>
7473
<scope>runtime</scope>
7574
<exclusions>
@@ -97,6 +96,12 @@
9796
<scope>test</scope>
9897
</dependency>
9998

99+
<dependency>
100+
<groupId>com.evolvedbinary.j8fu</groupId>
101+
<artifactId>j8fu</artifactId>
102+
<scope>test</scope>
103+
</dependency>
104+
100105
</dependencies>
101106

102107
<build>
@@ -110,7 +115,39 @@
110115
<filtering>true</filtering>
111116
</testResource>
112117
</testResources>
118+
113119
<plugins>
120+
<plugin>
121+
<groupId>com.mycila</groupId>
122+
<artifactId>license-maven-plugin</artifactId>
123+
<configuration>
124+
<licenseSets>
125+
126+
<licenseSet>
127+
<!--
128+
eXist-db's License
129+
-->
130+
<header>${project.parent.relativePath}/LGPL-21-license.template.txt</header>
131+
<excludes>
132+
<exclude>src/test/java/org/exist/contentextraction/xquery/Util.java</exclude>
133+
</excludes>
134+
</licenseSet>
135+
136+
<licenseSet>
137+
<!--
138+
FDB backport to LGPL 2.1-only licensed code
139+
-->
140+
<header>${project.parent.relativePath}/FDB-backport-LGPL-21-ONLY-license.template.txt</header>
141+
<includes>
142+
<include>src/test/java/org/exist/contentextraction/xquery/Util.java</include>
143+
</includes>
144+
145+
</licenseSet>
146+
147+
</licenseSets>
148+
</configuration>
149+
</plugin>
150+
114151
<plugin>
115152
<groupId>org.owasp</groupId>
116153
<artifactId>dependency-check-maven</artifactId>
@@ -131,7 +168,7 @@
131168
<configuration>
132169
<failOnWarning>true</failOnWarning>
133170
<ignoredUnusedDeclaredDependencies>
134-
<ignoredUnusedDeclaredDependency>org.apache.tika:tika-parsers</ignoredUnusedDeclaredDependency>
171+
<ignoredUnusedDeclaredDependency>org.apache.tika:tika-parsers-standard-package</ignoredUnusedDeclaredDependency>
135172
<ignoredUnusedDeclaredDependency>org.junit.vintage:junit-vintage-engine:jar:${junit.vintage.version}</ignoredUnusedDeclaredDependency>
136173
</ignoredUnusedDeclaredDependencies>
137174
</configuration>

extensions/contentextraction/src/main/java/org/exist/contentextraction/ContentExtraction.java

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -27,48 +27,38 @@
2727
import org.apache.tika.exception.TikaException;
2828
import org.apache.tika.metadata.Metadata;
2929
import org.apache.tika.parser.AutoDetectParser;
30-
import org.apache.tika.parser.ParseContext;
31-
import org.apache.tika.parser.Parser;
3230
import org.exist.util.serializer.Receiver;
3331
import org.exist.util.serializer.SAXToReceiver;
3432
import org.exist.xquery.value.BinaryValue;
3533
import org.xml.sax.ContentHandler;
3634
import org.xml.sax.SAXException;
3735

38-
39-
4036
/**
4137
* @author <a href="mailto:dulip.withanage@gmail.com">Dulip Withanage</a>
4238
* @version 1.0
4339
*/
4440
public class ContentExtraction {
45-
final Parser parser = new AutoDetectParser();
46-
final ParseContext parseContext = new ParseContext();
47-
48-
public ContentExtraction() {
49-
parseContext.set(Parser.class, parser);
50-
}
41+
final AutoDetectParser parser = new AutoDetectParser();
5142

5243
public Metadata extractContentAndMetadata(final BinaryValue binaryValue, final ContentHandler contentHandler) throws IOException, SAXException, ContentExtractionException {
5344
try (final InputStream is = binaryValue.getInputStream()) {
5445
final Metadata metadata = new Metadata();
55-
parser.parse(is, contentHandler, metadata, parseContext);
46+
parser.parse(is, contentHandler, metadata);
5647
return metadata;
5748
} catch (final TikaException e) {
5849
throw new ContentExtractionException("Problem with content extraction library: " + e.getMessage(), e);
5950
}
6051
}
6152

62-
public void extractContentAndMetadata(BinaryValue binaryValue, Receiver receiver)
53+
public void extractContentAndMetadata(final BinaryValue binaryValue, final Receiver receiver)
6354
throws IOException, SAXException, ContentExtractionException {
64-
6555
extractContentAndMetadata(binaryValue, new SAXToReceiver(receiver, false));
6656
}
6757

6858
public Metadata extractMetadata(final BinaryValue binaryValue) throws IOException, SAXException, ContentExtractionException {
6959
try (final InputStream is = binaryValue.getInputStream()) {
7060
final Metadata metadata = new Metadata();
71-
parser.parse(is, null, metadata, parseContext);
61+
parser.parse(is, null, metadata);
7262
return metadata;
7363
} catch (final TikaException e) {
7464
throw new ContentExtractionException("Problem with content extraction library: " + e.getMessage(), e);
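
extensions/contentextraction/src/test/java/org/exist/contentextraction/xquery/ContentFunctionsTest.java

Lines changed: 182 additions & 0 deletions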
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
/*
2+
* eXist-db Open Source Native XML Database
3+
* Copyright (C) 2001 The eXist-db Authors
4+
*
5+
* info@exist-db.org
6+
* http://www.exist-db.org
7+
*
8+
* This library is free software; you can redistribute it and/or
9+
* modify it under the terms of the GNU Lesser General Public
10+
* License as published by the Free Software Foundation; either
11+
* version 2.1 of the License, or (at your option) any later version.
12+
*
13+
* This library is distributed in the hope that it will be useful,
14+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
15+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16+
* Lesser General Public License for more details.
17+
*
18+
* You should have received a copy of the GNU Lesser General Public
19+
* License along with this library; if not, write to the Free Software
20+
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21+
*/
22+
package org.exist.contentextraction.xquery;
23+
24+
import com.evolvedbinary.j8fu.tuple.Tuple2;
25+
import org.exist.EXistException;
26+
import org.exist.collections.Collection;
27+
import org.exist.collections.triggers.TriggerException;
28+
import org.exist.security.PermissionDeniedException;
29+
import org.exist.source.Source;
30+
import org.exist.source.StringSource;
31+
import org.exist.storage.BrokerPool;
32+
import org.exist.storage.DBBroker;
33+
import org.exist.storage.lock.Lock;
34+
import org.exist.storage.txn.Txn;
35+
import org.exist.test.ExistEmbeddedServer;
36+
import org.exist.util.LockException;
37+
import org.exist.xmldb.XmldbURI;
38+
import org.exist.xquery.XPathException;
39+
import org.exist.xquery.value.Sequence;
40+
import org.junit.*;
41+
42+
import java.io.IOException;
43+
import java.io.InputStream;
44+
import java.util.Optional;
45+
46+
import static com.evolvedbinary.j8fu.tuple.Tuple.Tuple;
47+
import static org.exist.contentextraction.xquery.Util.executeQuery;
48+
import static org.exist.contentextraction.xquery.Util.withCompiledQuery;
49+
import static org.junit.Assert.assertEquals;
50+
import static org.junit.Assert.assertNotNull;
51+
52+
53+
public class ContentFunctionsTest {
54+
55+
@ClassRule
56+
public static final ExistEmbeddedServer existEmbeddedServer = new ExistEmbeddedServer(true, true);
57+
58+
@BeforeClass
59+
public static void setup() throws EXistException, PermissionDeniedException, IOException, TriggerException, LockException {
60+
final BrokerPool pool = existEmbeddedServer.getBrokerPool();
61+
try (final DBBroker broker = pool.get(Optional.of(pool.getSecurityManager().getSystemSubject()));
62+
final Txn transaction = pool.getTransactionManager().beginTransaction()) {
63+
64+
try (final Collection collection = broker.getOrCreateCollection(transaction, XmldbURI.create("/db/content-functions-test"))) {
65+
66+
try (final InputStream is = ContentFunctionsTest.class.getResourceAsStream("minimal.pdf")) {
67+
assertNotNull(is);
68+
collection.addBinaryResource(transaction, broker, XmldbURI.create("minimal.pdf"), is, "application/pdf", -1);
69+
}
70+
71+
try (final InputStream is = ContentFunctionsTest.class.getResourceAsStream("test.xlsx")) {
72+
assertNotNull(is);
73+
collection.addBinaryResource(transaction, broker, XmldbURI.create("test.xlsx"), is, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", -1);
74+
}
75+
76+
}
77+
78+
transaction.commit();
79+
}
80+
}
81+
82+
@AfterClass
83+
public static void teardown() throws EXistException, PermissionDeniedException, IOException, TriggerException {
84+
final BrokerPool pool = existEmbeddedServer.getBrokerPool();
85+
try (final DBBroker broker = pool.get(Optional.of(pool.getSecurityManager().getSystemSubject()));
86+
final Txn transaction = pool.getTransactionManager().beginTransaction()) {
87+
88+
try (final Collection collection = broker.openCollection(XmldbURI.create("/db/content-functions-test"), Lock.LockMode.WRITE_LOCK)) {
89+
if (collection != null) {
90+
broker.removeCollection(transaction, collection);
91+
}
92+
}
93+
}
94+
}
95+
96+
@Test
97+
public void getMetadataFromPdf() throws EXistException, XPathException, PermissionDeniedException, IOException {
98+
final String mainQuery =
99+
"declare namespace html = \"http://www.w3.org/1999/xhtml\";\n" +
100+
"declare namespace contentextraction = \"http://exist-db.org/xquery/contentextraction\";\n" +
101+
"declare namespace util = \"http://exist-db.org/xquery/util\";\n" +
102+
"let $bin := util:binary-doc(\"/db/content-functions-test/minimal.pdf\")\n" +
103+
" return\n" +
104+
" contentextraction:get-metadata($bin)//html:meta[@name = (\"xmpTPg:NPages\", \"Content-Type\")]/@content";
105+
106+
final BrokerPool pool = existEmbeddedServer.getBrokerPool();
107+
final Source mainQuerySource = new StringSource(mainQuery);
108+
try (final DBBroker broker = pool.getBroker();
109+
final Txn transaction = pool.getTransactionManager().beginTransaction()) {
110+
111+
final Tuple2<Integer, String> metadata = withCompiledQuery(broker, mainQuerySource, mainCompiledQuery -> {
112+
final Sequence result = executeQuery(broker, mainCompiledQuery);
113+
assertEquals(2, result.getItemCount());
114+
115+
return Tuple(result.itemAt(0).toJavaObject(int.class), result.itemAt(1).getStringValue());
116+
});
117+
118+
transaction.commit();
119+
120+
assertEquals(1, metadata._1.intValue());
121+
assertEquals("application/pdf", metadata._2);
122+
}
123+
}
124+
125+
@Test
126+
public void getMetadataAndContentFromPdf() throws EXistException, XPathException, PermissionDeniedException, IOException {
127+
final String mainQuery =
128+
"declare namespace html = \"http://www.w3.org/1999/xhtml\";\n" +
129+
"declare namespace contentextraction = \"http://exist-db.org/xquery/contentextraction\";\n" +
130+
"declare namespace util = \"http://exist-db.org/xquery/util\";\n" +
131+
"let $bin := util:binary-doc(\"/db/content-functions-test/minimal.pdf\")\n" +
132+
" return\n" +
133+
" contentextraction:get-metadata-and-content($bin)//html:p[2]/string()";
134+
135+
final BrokerPool pool = existEmbeddedServer.getBrokerPool();
136+
final Source mainQuerySource = new StringSource(mainQuery);
137+
try (final DBBroker broker = pool.getBroker();
138+
final Txn transaction = pool.getTransactionManager().beginTransaction()) {
139+
140+
final String content = withCompiledQuery(broker, mainQuerySource, mainCompiledQuery -> {
141+
final Sequence result = executeQuery(broker, mainCompiledQuery);
142+
assertEquals(1, result.getItemCount());
143+
144+
return result.itemAt(0).getStringValue();
145+
});
146+
147+
transaction.commit();
148+
149+
assertEquals("Hello World", content);
150+
}
151+
}
152+
153+
@Ignore("see https://github.com/eXist-db/exist/issues/3835")
154+
@Test
155+
public void getMetadataFromXlsx() throws EXistException, XPathException, PermissionDeniedException, IOException {
156+
final String mainQuery =
157+
"declare namespace html = \"http://www.w3.org/1999/xhtml\";\n" +
158+
"declare namespace contentextraction = \"http://exist-db.org/xquery/contentextraction\";\n" +
159+
"declare namespace util = \"http://exist-db.org/xquery/util\";\n" +
160+
"let $bin := util:binary-doc(\"/db/content-functions-test/test.xlsx\")\n" +
161+
" return\n" +
162+
" contentextraction:get-metadata($bin)//html:meta[@name = (\"xmpTPg:NPages\", \"Content-Type\")]/@content";
163+
164+
final BrokerPool pool = existEmbeddedServer.getBrokerPool();
165+
final Source mainQuerySource = new StringSource(mainQuery);
166+
try (final DBBroker broker = pool.getBroker();
167+
final Txn transaction = pool.getTransactionManager().beginTransaction()) {
168+
169+
final Tuple2<Integer, String> metadata = withCompiledQuery(broker, mainQuerySource, mainCompiledQuery -> {
170+
final Sequence result = executeQuery(broker, mainCompiledQuery);
171+
assertEquals(2, result.getItemCount());
172+
173+
return Tuple(result.itemAt(0).toJavaObject(int.class), result.itemAt(1).getStringValue());
174+
});
175+
176+
transaction.commit();
177+
178+
assertEquals(1, metadata._1.intValue());
179+
assertEquals("application/pdf", metadata._2);
180+
}
181+
}
182+
}

0 commit comments

Comments
 (0)