Skip to content

Commit

Permalink
Merge branch 'develop' into add-regex-extract-text-kotlin-sample
Browse files Browse the repository at this point in the history
  • Loading branch information
datalogics-saharay committed Feb 27, 2024
2 parents 4a4400e + 70f4c94 commit 3601efa
Show file tree
Hide file tree
Showing 7 changed files with 635 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .github/workflows/test-kotlin-samples.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
strategy:
matrix:
os: [windows-latest, ubuntu-latest]
dir: ['TextExtract', 'FlattenTransparency', 'SplitPDF', 'ConvertToOffice', 'MergePDF', 'ListWords', 'PDFOptimize', 'PDFAConverter', 'Redactions', 'RegexExtractText']
dir: ['TextExtract', 'FlattenTransparency', 'SplitPDF', 'ConvertToOffice', 'MergePDF', 'ListWords', 'PDFOptimize', 'PDFAConverter', 'Redactions', 'Watermark', 'RegexTextSearch', 'RegexExtractText']
steps:
- name: Checkout
uses: actions/checkout@v4
Expand Down
14 changes: 14 additions & 0 deletions RegexTextSearch/.idea/runConfigurations/RegexTextSearch.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

231 changes: 231 additions & 0 deletions RegexTextSearch/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.datalogics.pdfl.samples</groupId>
<artifactId>RegexTextSearch</artifactId>
<version>1.0-SNAPSHOT</version>

<repositories>
<repository>
<id>mavenCentral</id>
<url>https://repo1.maven.org/maven2/</url>
</repository>
</repositories>

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<kotlin.code.style>official</kotlin.code.style>
<kotlin.compiler.jvmTarget>1.8</kotlin.compiler.jvmTarget>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
</properties>

<profiles>
<!-- Each profile below activates automatically from the build machine's OS and CPU architecture and sets
the jni.classifier property, which the pdfl JNI dependency and the unpack-jni execution use to pick the
platform-specific native library zip.
NOTE(review): only these three platforms are covered; on any other OS/arch no profile activates and
jni.classifier stays undefined. -->
<profile>
<id>Windows64</id>
<activation>
<os>
<family>windows</family>
<arch>amd64</arch>
</os>
</activation>
<properties>
<jni.classifier>win-x86-64-jni</jni.classifier>
</properties>
</profile>
<profile>
<id>MacArm</id>
<activation>
<os>
<family>mac</family>
<arch>aarch64</arch>
</os>
</activation>
<properties>
<jni.classifier>mac-arm-64-jni</jni.classifier>
</properties>
</profile>
<profile>
<id>Linux64</id>
<activation>
<os>
<!-- Use OS <name> instead of <family> because the "unix" <family> also includes Mac -->
<name>Linux</name>
<arch>amd64</arch>
</os>
</activation>
<properties>
<jni.classifier>linux-x86-64-jni</jni.classifier>
</properties>
</profile>
</profiles>

<dependencies>
<!-- Kotlin standard library; version matches the kotlin-maven-plugin version used in the build section. -->
<dependency>
<groupId>org.jetbrains.kotlin</groupId>
<artifactId>kotlin-stdlib-jdk8</artifactId>
<version>1.9.21</version>
</dependency>
<!-- The pdfl artifact is declared several times, once per type/classifier variant that the build needs.
All variants must stay on the same version. -->
<!-- pdfl POM artifact. -->
<dependency>
<groupId>com.datalogics.pdfl</groupId>
<artifactId>pdfl</artifactId>
<version>18.31.0</version>
<type>pom</type>
</dependency>
<!-- pdfl main artifact (default jar type), providing the com.datalogics.PDFL classes the sample imports. -->
<dependency>
<groupId>com.datalogics.pdfl</groupId>
<artifactId>pdfl</artifactId>
<version>18.31.0</version>
</dependency>
<!-- Platform-specific JNI zip; the classifier comes from the OS profile that is active for this build.
Unpacked into the build directory by the unpack-jni execution. -->
<dependency>
<groupId>com.datalogics.pdfl</groupId>
<artifactId>pdfl</artifactId>
<version>18.31.0</version>
<type>zip</type>
<classifier>${jni.classifier}</classifier>
</dependency>
<!-- Resources zip; unpacked into the build directory by the unpack-resources execution. -->
<dependency>
<groupId>com.datalogics.pdfl</groupId>
<artifactId>pdfl</artifactId>
<version>18.31.0</version>
<type>zip</type>
<classifier>resources</classifier>
</dependency>
<!-- Javadoc artifact. NOTE(review): presumably here for IDE documentation lookup only; confirm it is needed
as a compile-scope dependency. -->
<dependency>
<groupId>com.datalogics.pdfl</groupId>
<artifactId>pdfl</artifactId>
<version>18.31.0</version>
<classifier>javadoc</classifier>
</dependency>
</dependencies>

<build>
<sourceDirectory>src/main/kotlin</sourceDirectory>
<plugins>
<plugin>
<groupId>org.jetbrains.kotlin</groupId>
<artifactId>kotlin-maven-plugin</artifactId>
<version>1.9.21</version>
<executions>
<execution>
<id>compile</id>
<phase>compile</phase>
<goals>
<goal>compile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.22.2</version>
</plugin>
<plugin>
<artifactId>maven-failsafe-plugin</artifactId>
<version>2.22.2</version>
</plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>exec-maven-plugin</artifactId>
<version>1.6.0</version>
<configuration>
<!-- The sample's main() is a top-level Kotlin function in package com.datalogics.pdfl.samples, so the
JVM entry point is the generated file-facade class (source file name plus "Kt"), not a class named
RegexTextSearch. This must match the mainClass in the maven-assembly-plugin manifest.
NOTE(review): assumes the source file is named RegexTextSearch.kt, as the assembly manifest implies. -->
<mainClass>com.datalogics.pdfl.samples.RegexTextSearchKt</mainClass>
</configuration>
</plugin>
<plugin>
<!-- Unpacks the pdfl zip artifacts into the build directory during generate-resources.
The plugin version is supplied by the pluginManagement section of this POM.
NOTE(review): the artifactItems below omit <version>; presumably it is resolved from the pdfl entries in
the dependencies section. No dependency with the "license" classifier is declared there, so verify that
the unpack-license execution resolves as intended. -->
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<executions>
<!-- Resources zip goes to ${project.build.directory}/lib/Resources. -->
<execution>
<id>unpack-resources</id>
<phase>generate-resources</phase>
<goals>
<goal>unpack</goal>
</goals>
<configuration>
<artifactItems>
<artifactItem>
<groupId>com.datalogics.pdfl</groupId>
<artifactId>pdfl</artifactId>
<classifier>resources</classifier>
<type>zip</type>
<outputDirectory>${project.build.directory}/lib/Resources</outputDirectory>
</artifactItem>
</artifactItems>
</configuration>
</execution>
<!-- Platform-specific JNI zip (classifier chosen by the active OS profile) goes to ${project.build.directory}/lib. -->
<execution>
<id>unpack-jni</id>
<phase>generate-resources</phase>
<goals>
<goal>unpack</goal>
</goals>
<configuration>
<artifactItems>
<artifactItem>
<groupId>com.datalogics.pdfl</groupId>
<artifactId>pdfl</artifactId>
<classifier>${jni.classifier}</classifier>
<type>zip</type>
<outputDirectory>${project.build.directory}/lib</outputDirectory>
</artifactItem>
</artifactItems>
</configuration>
</execution>
<!-- License zip goes to ${project.build.directory}/lib, alongside the JNI files. -->
<execution>
<id>unpack-license</id>
<phase>generate-resources</phase>
<goals>
<goal>unpack</goal>
</goals>
<configuration>
<artifactItems>
<artifactItem>
<groupId>com.datalogics.pdfl</groupId>
<artifactId>pdfl</artifactId>
<classifier>license</classifier>
<type>zip</type>
<outputDirectory>${project.build.directory}/lib</outputDirectory>
</artifactItem>
</artifactItems>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<!-- Runs the assembly "single" goal during the package phase using the built-in jar-with-dependencies
descriptor, and writes a manifest that names the sample's entry-point class. -->
<artifactId>maven-assembly-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
<configuration>
<archive>
<manifest>
<addClasspath>true</addClasspath>
<!-- Fully qualified class name that carries the sample's main() entry point. -->
<mainClass>com.datalogics.pdfl.samples.RegexTextSearchKt</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
</plugin>
</plugins>
<pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<version>3.0.2</version>
</plugin>
</plugins>
</pluginManagement>
</build>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
package com.datalogics.pdfl.samples

import com.datalogics.PDFL.*
import java.util.*

/*
*
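* This sample shows how to search a PDF document using regex pattern matching. The program opens an input PDF, searches for
* text that matches a regular expression using the DocTextFinder, prints each match to the console, highlights it in the
* document, and saves the result as RegexTextSearch-out.pdf.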
*
* Copyright (c) 2024, Datalogics, Inc. All rights reserved.
*
*/

/**
 * Entry point of the RegexTextSearch sample.
 *
 * Opens the input PDF, searches every page for text matching a regular expression, prints each
 * matching string, adds a highlight annotation over each match, and saves the annotated document
 * as `RegexTextSearch-out.pdf`.
 *
 * The document is closed and the library is released even when an exception is thrown part-way through.
 *
 * @param args optional command-line arguments; `args[0]`, when present, is the path of the PDF to
 * search. Otherwise the sample input under the library's resource directory is used.
 */
fun main(args: Array<String>) {
    println("RegexTextSearch sample:")
    val lib = Library()

    try {
        // Prefer a path supplied on the command line; fall back to the bundled sample input.
        val sInput: String = args.firstOrNull()
            ?: (Library.getResourceDirectory() + "Sample_Input/RegexTextSearch.pdf")

        val sOutput = "RegexTextSearch-out.pdf"

        // Highlight occurrences of the words that match this regular expression.
        // Phone numbers
        val sRegex = "((1-)?(\\()?\\d{3}(\\))?(\\s)?(-)?\\d{3}-\\d{4})"
        // Email addresses
        //val sRegex = "(\\b[\\w.!#$%&'*+\\/=?^`{|}~-]+@[\\w-]+(?:\\.[\\w-]+)*\\b)"
        // URLs
        //val sRegex = "((https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|www\\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9]+\\.[^\\s]{2,}|www\\.[a-zA-Z0-9]+\\.[^\\s]{2,}))"

        println("Reading $sInput")

        val doc = Document(sInput)
        try {
            val nPages = doc.numPages
            println("Opened document $sInput")

            // Need to set this to true so phrases will be concatenated properly
            val wordConfig = WordFinderConfig()
            wordConfig.noHyphenDetection = true

            // Search the whole document: pages 0 through nPages - 1.
            val docTextFinder = DocTextFinder(doc, wordConfig)
            val docMatches = docTextFinder.getMatchList(0, nPages - 1, sRegex)

            for (wInfo in docMatches) {
                // Show the matching phrase
                println(wInfo.matchString)

                // Each quad-info entry carries a page number and the quads to highlight on that page.
                for (qInfo in wInfo.quadInfo) {
                    val docPage = doc.getPage(qInfo.pageNum)
                    val highlight = HighlightAnnotation(docPage, qInfo.quads)
                    highlight.normalAppearance = highlight.generateAppearance()
                }
            }

            // Save the document with the highlighted matched strings
            doc.save(EnumSet.of(SaveFlags.FULL), sOutput)
        } finally {
            // Close the document on failure paths too, so it is not left open if the search,
            // annotation, or save step throws.
            doc.close()
        }
    } finally {
        lib.delete()
    }
}
14 changes: 14 additions & 0 deletions Watermark/.idea/runConfigurations/Watermark.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 3601efa

Please sign in to comment.