Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

<groupId>io.committed.krill</groupId>
<artifactId>krill</artifactId>
<version>1.0.3</version>
<version>1.0.4</version>
<name>krill</name>
<description>Uses Apache Tika (https://tika.apache.org/) and PDFBox (https://pdfbox.apache.org/) with subsequent post-processing to generate an HTML representation of a document (PDF, CSV, XLS, etc.) together with its metadata.</description>
<url>http://github.com/commitd/krill</url>
Expand Down Expand Up @@ -44,19 +44,19 @@
<java.version>1.8</java.version>
<dependency.locations.enabled>false</dependency.locations.enabled>

<commons-io.version>2.5</commons-io.version>
<commons-io.version>2.6</commons-io.version>
<commons-csv.version>1.5</commons-csv.version>
<slf4j.version>1.7.19</slf4j.version>
<tika.version>1.16</tika.version>
<jbig2-imageio.version>3.0.0</jbig2-imageio.version>
<guava.version>23.0</guava.version>
<jsoup.version>1.10.3</jsoup.version>
<slf4j.version>1.7.25</slf4j.version>
<tika.version>1.18</tika.version>
<jbig2-imageio.version>3.0.1</jbig2-imageio.version>
<guava.version>25.1-jre</guava.version>
<jsoup.version>1.11.3</jsoup.version>

<assertj.version>3.8.0</assertj.version>
<assertj.version>3.10.0</assertj.version>
<junit.version>4.12</junit.version>
<!-- there is something odd with versioning of the expected-failure plugin -->
<junit.expected-failure.version>0.0.9</junit.expected-failure.version>
<mockito.version>1.10.19</mockito.version>
<mockito.version>2.19.0</mockito.version>

<maven.compiler.plugin.version>2.3.2</maven.compiler.plugin.version>
<maven.checkstyle.plugin.version>2.17</maven.checkstyle.plugin.version>
Expand Down Expand Up @@ -151,7 +151,7 @@

<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-all</artifactId>
<artifactId>mockito-core</artifactId>
<version>${mockito.version}</version>
<scope>test</scope>
</dependency>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import java.util.Set;

import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.mime.MediaType;
Expand Down Expand Up @@ -153,6 +154,8 @@ public Extraction parse(final InputStream stream, final String source)
parser.parse(stream,
new XHTMLContentHandler(new NoHeadTagInBodyContentHandler(handler), tikaMetadata),
tikaMetadata, context);
} catch (ZeroByteFileException exception){
// A zero-byte file is not treated as an error: swallow the exception and continue with an empty HTML string.
} catch (IOException | SAXException | TikaException | NullPointerException exception) {
throw new ExtractionException("Failed to parse stream", exception);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ public class RendererPackageWorkaround {
*/
public RendererPackageWorkaround() throws RuntimeException {
try {
pageDrawer = new PageDrawer(new PageDrawerParameters(null, null));
pageDrawer = new PageDrawer(new PageDrawerParameters(null, null, true));
method = PageDrawer.class.getDeclaredMethod("createGlyph2D", PDFont.class);
} catch (NoSuchMethodException | SecurityException | IOException e) {
throw new RuntimeException(e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@ public CsvEmptyTikaFormatExtractorIT() {

@Test
public void testBody() {
assertBody("" + "<main class=\"SpreadSheet\"> \n" + " <article class=\"Sheet\"> \n"
+ " <table></table> \n" + " </article> \n" + "</main>");
assertBody("" );
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ public void testBody() {
+ " A div can be used to define a new block element without any specific semantics. \n"
+ " </div> \n"
+ " <p>Sometimes it is useful to include larger blocks of preformatted text, which is typically rendered using a monospace font. This is useful when attempting to create diagrams using text:</p> \n"
+ " <pre>\n" + " Top ---&amp;gt; +----+\n" + " | |\n"
+ " <pre>" + " Top ---&amp;gt; +----+\n" + " | |\n"
+ " Bottom ---&amp;gt; +----+\n" + "</pre> \n" + " </section> \n" + "</main>");
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,14 @@ public void testBody() {
// NOTE: We don't get lists; bullet points come through as plain <p> paragraphs.

assertBody("" + "<main class=\"SlideShow\"> \n" + " <article class=\"Slide\"> \n"
+ " <section class=\"\"> \n" + " <p>Title</p> \n" + " <p>Subtitle</p> \n"
+ " <section> \n" + " <p>Title</p> \n" + " <p>Subtitle</p> \n"
+ " </section> \n" + " <aside> \n" + " <p>Notes</p> \n" + " <p>1</p> \n"
+ " </aside> \n" + " </article> \n" + " <article class=\"Slide\"> \n"
+ " <section class=\"\"> \n" + " <p>Heading</p> \n" + " <p>First bullet point</p> \n"
+ " <section> \n" + " <p>Heading</p> \n" + " <p>First bullet point</p> \n"
+ " <p>Second bullet point</p> \n" + " <p>Nested bullet point</p> \n"
+ " </section> \n" + " <aside> \n" + " <p>More notes</p> \n" + " <p>2</p> \n"
+ " </aside> \n" + " </article> \n" + " <article class=\"Slide\"> \n"
+ " <section class=\"\"> \n" + " <p>Heading</p> \n"
+ " <section> \n" + " <p>Heading</p> \n"
+ " <p>First section, first bullet</p> \n" + " <p>First section, second bullet</p> \n"
+ " <p>Second section, first bullet</p> \n" + " <p>Second section, second bullet</p> \n"
+ " </section> \n" + " </article> \n" + "</main>");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ public PptxPlainSimpleTitleSlideTikaFormatExtractorIT() {
@Test
public void testBody() {
assertBody("" + "<main class=\"SlideShow\"> \n" + " <article class=\"Slide\"> \n"
+ " <section class=\"\"> \n" + " <p>Title</p> \n" + " <p>Subtitle</p> \n"
+ " <section> \n" + " <p>Title</p> \n" + " <p>Subtitle</p> \n"
+ " </section> \n" + " <aside> \n" + " <p>Notes</p> \n" + " <p>1</p> \n"
+ " </aside> \n" + " </article> \n" + "</main>");
}
Expand Down