/
pdffontsPdfTypeDetector.kt
56 lines (45 loc) · 1.77 KB
/
pdffontsPdfTypeDetector.kt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
package net.dankito.text.extraction.pdf
import net.dankito.text.extraction.model.PdfType
import net.dankito.utils.process.CommandExecutor
import net.dankito.utils.process.CommandlineProgram
import net.dankito.utils.process.ICommandExecutor
import org.slf4j.LoggerFactory
import java.io.File
/**
* Thanks so much Kurt Pfeifle for giving this superb hint (https://stackoverflow.com/a/3108531):
* If a PDF uses fonts, then it contains (searchable) text.
* If not, then the PDF contains only images.
*
* So we can use `pdffonts` of Poppler utils to check how many fonts a PDF uses.
*/
open class pdffontsPdfTypeDetector(
    protected val commandExecutor: ICommandExecutor = CommandExecutor()
) : IPdfTypeDetector {

    /** Handle for the `pdffonts` command line program, resolved through [commandExecutor]. */
    protected val commandlineProgram = CommandlineProgram("pdffonts", commandExecutor)

    /** Forwards the availability reported by [commandlineProgram]. */
    override val isAvailable: Boolean
        get() = commandlineProgram.isAvailable

    /**
     * Runs `pdffonts` on [file] and derives the PDF type from the number of output lines.
     *
     * @param file the file to inspect; its absolute path is passed to `pdffonts`.
     * @return [PdfType.SearchableTextPdf] if more than two lines were printed,
     * [PdfType.ImageOnlyPdf] if exactly two lines were printed, or `null` if fewer lines
     * were printed or executing the command threw an exception (which is logged).
     */
    override fun detectPdfType(file: File): PdfType? {
        return try {
            val result = commandExecutor.executeCommand(
                commandlineProgram.programExecutablePath,
                file.absolutePath
            )

            // The first two lines are the table header, so every additional line is a font entry.
            val lineCount = result.outputLines.size

            when {
                lineCount > 2 -> PdfType.SearchableTextPdf
                lineCount == 2 -> PdfType.ImageOnlyPdf
                else -> null // not even the header was printed -> not a PDF
            }
        } catch (e: Exception) {
            log.error("Could not get PDF type of file $file", e)
            null
        }
    }

    companion object {
        private val log = LoggerFactory.getLogger(pdffontsPdfTypeDetector::class.java)
    }
}