Merge branch 'master' of github.com:documentcloud/docsplit

datadesk · Sep 13, 2011 · 945120e · 945120e
2 parents 94eed8f + 94e8c6d
commit 945120e
Show file tree

Hide file tree

Showing 13 changed files with 53 additions and 16 deletions.
diff --git a/index.html b/index.html
@@ -125,18 +125,18 @@ <h2 id="installation">Installation &amp; Dependencies</h2>
         <a href="http://sourceforge.net/projects/graphicsmagick/files/">source</a>,
         or use a package manager:
 <pre>
-[aptitude | port] install graphicsmagick</pre>
+[aptitude | port | brew] install graphicsmagick</pre>
       </li>
       <li>
         Install <a href="http://poppler.freedesktop.org/">Poppler</a>.
         On Linux, use <b>aptitude</b>, <b>apt-get</b> or <b>yum</b>:<br />
         <tt>aptitude install poppler-utils</tt><br />
         On the Mac, you can install from source or use <b>MacPorts</b>:<br />
-        <tt>sudo port install poppler</tt><br />
+        <tt>sudo port install poppler</tt> or <tt>brew install poppler</tt><br />
       </li>
       <li>
         (Optional) Install <a href="http://code.google.com/p/tesseract-ocr/">Tesseract</a>:<br />
-        <tt>[aptitude | port] install tesseract</tt><br />
+        <tt>[aptitude | port | brew] install tesseract</tt><br />
         Without Tesseract installed, you'll still be able to extract text from
         documents, but you won't be able to automatically OCR them.
       </li>

diff --git a/lib/docsplit.rb b/lib/docsplit.rb
@@ -19,6 +19,8 @@ module Docsplit
 
   DEPENDENCIES  = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}
 
+  ESCAPE        = lambda {|x| Shellwords.shellescape(x) }
+
   # Check for all dependencies, and warn of their absence.
   dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
   DEPENDENCIES.each_key do |dep|
@@ -62,11 +64,13 @@ def self.extract_pdf(docs, opts={})
     [docs].flatten.each do |doc|
       ext = File.extname(doc)
       basename = File.basename(doc, ext)
+      escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)
+
       if ext.length > 0 && GM_FORMATS.include?(ext.sub(/^\./, '').downcase.to_sym)
-        `gm convert "#{doc}" "#{out}/#{basename}.pdf"`
+        `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
       else
         options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
-        run "#{options} \"#{doc}\" \"#{out}/#{basename}.pdf\"", [], {}
+        run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
       end
     end
   end
@@ -113,6 +117,7 @@ def self.normalize_value(value)
 
 require 'tmpdir'
 require 'fileutils'
+require 'shellwords'
 require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
 require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
 require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"

diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb
@@ -32,6 +32,7 @@ def convert(pdf, size, format, previous=nil)
       basename  = File.basename(pdf, File.extname(pdf))
       directory = directory_for(size)
       pages     = @pages || '1-' + Docsplit.extract_length(pdf).to_s
+      escaped_pdf = ESCAPE[pdf]
       FileUtils.mkdir_p(directory) unless File.exists?(directory)
       common    = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
       if previous
@@ -40,8 +41,8 @@ def convert(pdf, size, format, previous=nil)
         raise ExtractionFailed, result if $? != 0
       else
         page_list(pages).each do |page|
-          out_file  = File.join(directory, "#{basename}_#{page}.#{format}")
-          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}[#{page - 1}]\" \"#{out_file}\" 2>&1".chomp
+          out_file  = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
+          cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
           result = `#{cmd}`.chomp
           raise ExtractionFailed, result if $? != 0
         end

diff --git a/lib/docsplit/info_extractor.rb b/lib/docsplit/info_extractor.rb
@@ -18,7 +18,7 @@ class InfoExtractor
     # Pull out a single datum from a pdf.
     def extract(key, pdfs, opts)
       pdf = [pdfs].flatten.first
-      cmd = "pdfinfo #{pdf} 2>&1"
+      cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
       result = `#{cmd}`.chomp
       raise ExtractionFailed, result if $? != 0
       match = result.match(MATCHERS[key])

diff --git a/lib/docsplit/page_extractor.rb b/lib/docsplit/page_extractor.rb
@@ -11,7 +11,7 @@ def extract(pdfs, opts)
         pdf_name = File.basename(pdf, File.extname(pdf))
         page_path = File.join(@output, "#{pdf_name}_%d.pdf")
         FileUtils.mkdir_p @output unless File.exists?(@output)
-        cmd = "pdftk #{pdf} burst output #{page_path} 2>&1"
+        cmd = "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
         result = `#{cmd}`.chomp
         FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
         raise ExtractionFailed, result if $? != 0

diff --git a/lib/docsplit/text_extractor.rb b/lib/docsplit/text_extractor.rb
@@ -45,7 +45,7 @@ def extract(pdfs, opts)
 
     # Does a PDF have any text embedded?
     def contains_text?(pdf)
-      fonts = `pdffonts #{pdf} 2>&1`
+      fonts = `pdffonts #{ESCAPE[pdf]} 2>&1`
       !fonts.match(NO_TEXT_DETECTED)
     end
 
@@ -59,19 +59,22 @@ def extract_from_pdf(pdf, pages)
     def extract_from_ocr(pdf, pages)
       tempdir = Dir.mktmpdir
       base_path = File.join(@output, @pdf_name)
+      escaped_pdf = ESCAPE[pdf]
       if pages
         pages.each do |page|
           tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
+          escaped_tiff = ESCAPE[tiff]
           file = "#{base_path}_#{page}"
-          run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
-          run "tesseract #{tiff} #{file} -l eng 2>&1"
+          run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
+          run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l eng 2>&1"
           clean_text(file + '.txt') if @clean_ocr
           FileUtils.remove_entry_secure tiff
         end
       else
         tiff = "#{tempdir}/#{@pdf_name}.tif"
-        run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
-        run "tesseract #{tiff} #{base_path} -l eng 2>&1"
+        escaped_tiff = ESCAPE[tiff]
+        run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
+        run "tesseract #{escaped_tiff} #{ESCAPE[base_path]} -l eng 2>&1"
         clean_text(base_path + '.txt') if @clean_ocr
       end
     ensure
@@ -100,14 +103,14 @@ def run(command)
     # Extract the full contents of a pdf as a single file, directly.
     def extract_full(pdf)
       text_path = File.join(@output, "#{@pdf_name}.txt")
-      run "pdftotext -enc UTF-8 #{pdf} #{text_path} 2>&1"
+      run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
     end
 
     # Extract the contents of a single page of text, directly, adding it to
     # the `@pages_to_ocr` list if the text length is inadequate.
     def extract_page(pdf, page)
       text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
-      run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{pdf} #{text_path} 2>&1"
+      run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
       unless @forbid_ocr
         @pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
       end

diff --git a/test/fixtures/PDF file with spaces 'single' and "double quotes".doc b/test/fixtures/PDF file with spaces 'single' and "double quotes".doc
diff --git a/test/fixtures/PDF file with spaces 'single' and "double quotes".pdf b/test/fixtures/PDF file with spaces 'single' and "double quotes".pdf
diff --git a/test/unit/test_convert_to_pdf.rb b/test/unit/test_convert_to_pdf.rb
@@ -12,6 +12,10 @@ def test_rtf_conversion
     assert Dir["#{OUTPUT}/*.pdf"] == ["#{OUTPUT}/obama_hopes.pdf"]
   end
 
+  def test_png_conversion
+    Docsplit.extract_pdf('test/fixtures/image.png', :output => OUTPUT)
+    assert Dir["#{OUTPUT}/*.pdf"] == ["#{OUTPUT}/image.pdf"]
+  end
   def test_png_conversion
     Docsplit.extract_pdf('test/fixtures/image.png', :output => OUTPUT)
     assert Dir["#{OUTPUT}/*.pdf"] == ["#{OUTPUT}/image.pdf"]
@@ -23,4 +27,9 @@ def test_conversion_then_page_extraction
     assert Dir["#{OUTPUT}/*.pdf"].length == 8
   end
 
+  def test_name_escaping_while_converting
+    Docsplit.extract_pdf('test/fixtures/PDF file with spaces \'single\' and "double quotes".doc', :output => OUTPUT)
+    assert Dir["#{OUTPUT}/*.pdf"] == ["#{OUTPUT}/PDF file with spaces 'single' and \"double quotes\".pdf"]
+  end
+
 end
diff --git a/test/unit/test_extract_images.rb b/test/unit/test_extract_images.rb
@@ -41,4 +41,9 @@ def test_repeated_extraction_in_the_same_directory
     assert Dir["#{OUTPUT}/*"] == ['test/output/obama_arts_1.gif', 'test/output/obama_arts_2.gif']
   end
 
+  def test_name_escaping_while_extracting_images
+    Docsplit.extract_images('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :format => :gif, :size => "250x", :output => OUTPUT)
+    assert Dir["#{OUTPUT}/*"] == ['test/output/PDF file with spaces \'single\' and "double quotes"_1.gif', 'test/output/PDF file with spaces \'single\' and "double quotes"_2.gif']
+  end
+
 end
diff --git a/test/unit/test_extract_info.rb b/test/unit/test_extract_info.rb
@@ -32,4 +32,8 @@ def test_password_protected
     end
   end
 
+  def test_name_escaping_while_extracting_info
+    assert 2 == Docsplit.extract_length('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf')
+  end
+
 end
diff --git a/test/unit/test_extract_pages.rb b/test/unit/test_extract_pages.rb
@@ -18,4 +18,9 @@ def test_doc_page_extraction
     assert Dir["#{OUTPUT}/*.pdf"].length == 7
   end
 
+  def test_name_escaping_while_extracting_pages
+    Docsplit.extract_pages('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :output => OUTPUT)
+    assert Dir["#{OUTPUT}/*.pdf"].length == 2
+  end
+
 end
diff --git a/test/unit/test_extract_text.rb b/test/unit/test_extract_text.rb
@@ -44,4 +44,9 @@ def test_password_protected
     end
   end
 
+  def test_name_escaping_while_extracting_text
+    Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :pages => 'all', :output => OUTPUT)
+    assert Dir["#{OUTPUT}/*.txt"].length == 2
+  end
+
 end