documentcloud · BrandonNoad · May 28, 2014 · May 29, 2014 · Jun 9, 2014 · Oct 21, 2014
diff --git a/lib/docsplit/command_line.rb b/lib/docsplit/command_line.rb
@@ -101,6 +101,9 @@ def parse_options
         opts.on('-r', '--rolling', 'generate images from each previous image') do |r|
           @options[:rolling] = true
         end
+        opts.on('--delimiter [DELIMITER]', 'set page number delimiter (eg: _, -, -Page-...)') do |d|
+          @options[:delimiter] = d.tr('^','')
+        end
         opts.on_tail('-v', '--version', 'display docsplit version') do
           puts "Docsplit version #{Docsplit::VERSION}"
           exit

diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
@@ -7,6 +7,7 @@ class ImageExtractor
     MEMORY_ARGS     = "-limit memory 256MiB -limit map 512MiB"
     DEFAULT_FORMAT  = :png
     DEFAULT_DENSITY = '150'
+    DEFAULT_PAGE_DELIMITER = "_"
 
     # Extract a list of PDFs as rasterized page images, according to the
     # configuration in options.
@@ -41,7 +42,7 @@ def convert(pdf, size, format, previous=nil)
         raise ExtractionFailed, result if $? != 0
       else
         page_list(pages).each do |page|
-          out_file  = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
+          out_file  = ESCAPE[File.join(directory, "#{basename}#{@delimiter}#{page}.#{format}")]
           cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
           result = `#{cmd}`.chomp
           raise ExtractionFailed, result if $? != 0
@@ -63,6 +64,7 @@ def extract_options(options)
       @sizes   = [options[:size]].flatten.compact
       @sizes   = [nil] if @sizes.empty?
       @rolling = !!options[:rolling]
+      @delimiter = options[:delimiter] || DEFAULT_PAGE_DELIMITER
     end
 
     # If there's only one size requested, generate the images directly into

diff --git a/lib/docsplit/pdf_extractor.rb b/lib/docsplit/pdf_extractor.rb
@@ -16,7 +16,7 @@ def osx?
     def linux?
       !!HOST_OS.match(/linux/i)
     end
-    
+
     # The first line of the help output holds the name and version number
     # of the office software to be used for extraction.
     def version_string
@@ -35,10 +35,10 @@ def libre_office?
     def open_office?
       !!version_string.match(/^OpenOffice.org/)
     end
-    
+
     # A set of default locations to search for office software
     # These have been extracted from JODConverter.  Each listed
-    # path should contain a directory "program" which in turn 
+    # path should contain a directory "program" which in turn
     # contains the "soffice" executable.
     # see: https://github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91
     def office_search_paths
@@ -56,6 +56,7 @@ def office_search_paths
         search_paths = %w(
           /usr/lib/libreoffice
           /usr/lib64/libreoffice
+          /usr/bin/libreoffice
           /opt/libreoffice
           /usr/lib/openoffice
           /usr/lib64/openoffice
@@ -65,7 +66,7 @@ def office_search_paths
       end
       search_paths
     end
-    
+
     # Identify the path to a working office executable.
     def office_executable
       paths = office_search_paths
@@ -77,7 +78,7 @@ def office_executable
         raise ArgumentError, "No such file or directory #{ENV['OFFICE_PATH']}" unless File.exists? ENV['OFFICE_PATH']
         paths.unshift(ENV['OFFICE_PATH'])
       end
-      
+
       # The location of the office executable is OS dependent
       path_pieces = ["soffice"]
       if windows?
@@ -87,7 +88,7 @@ def office_executable
       else
         path_pieces += [["program", "soffice"]]
       end
-      
+
       # Search for the first suitable office executable
       # and short circuit an executable is found.
       paths.each do |path|
@@ -103,12 +104,12 @@ def office_executable
       raise OfficeNotFound, "No office software found" unless @@executable
       @@executable
     end
-    
+
     # Used to specify the office location for JODConverter
     def office_path
       File.dirname(File.dirname(office_executable))
     end
-    
+
     # Convert documents to PDF.
     def extract(docs, opts)
       out = opts[:output] || '.'
@@ -124,7 +125,7 @@ def extract(docs, opts)
           if libre_office?
             # Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other.
             ENV['SYSUSERCONFIG']="file://#{File.expand_path(escaped_out)}"
-            
+
             options = "--headless --invisible  --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
             cmd = "#{office_executable} #{options} 2>&1"
             result = `#{cmd}`.chomp
@@ -143,9 +144,9 @@ def extract(docs, opts)
     LOGGING       = "-Djava.util.logging.config.file=#{ESCAPED_ROOT}/vendor/logging.properties"
 
     HEADLESS      = "-Djava.awt.headless=true"
-    
+
     private
-    
+
     # Runs a Java command, with quieted logging, and the classpath set properly.
     def run_jod(command, pdfs, opts, return_output=false)