Skip to content
This repository has been archived by the owner on Aug 31, 2021. It is now read-only.

Commit

Permalink
Merge branch 'master' of github.com:documentcloud/docsplit
Browse files Browse the repository at this point in the history
  • Loading branch information
jashkenas committed Sep 13, 2011
2 parents 94eed8f + 94e8c6d commit 945120e
Show file tree
Hide file tree
Showing 13 changed files with 53 additions and 16 deletions.
6 changes: 3 additions & 3 deletions index.html
Expand Up @@ -125,18 +125,18 @@ <h2 id="installation">Installation &amp; Dependencies</h2>
<a href="http://sourceforge.net/projects/graphicsmagick/files/">source</a>,
or use a package manager:
<pre>
[aptitude | port] install graphicsmagick</pre>
[aptitude | port | brew] install graphicsmagick</pre>
</li>
<li>
Install <a href="http://poppler.freedesktop.org/">Poppler</a>.
On Linux, use <b>aptitude</b>, <b>apt-get</b> or <b>yum</b>:<br />
<tt>aptitude install poppler-utils</tt><br />
On the Mac, you can install from source or use <b>MacPorts</b> or <b>Homebrew</b>:<br />
<tt>sudo port install poppler</tt><br />
<tt>sudo port install poppler | brew install poppler</tt><br />
</li>
<li>
(Optional) Install <a href="http://code.google.com/p/tesseract-ocr/">Tesseract</a>:<br />
<tt>[aptitude | port] install tesseract</tt><br />
<tt>[aptitude | port | brew] install tesseract</tt><br />
Without Tesseract installed, you'll still be able to extract text from
documents, but you won't be able to automatically OCR them.
</li>
Expand Down
9 changes: 7 additions & 2 deletions lib/docsplit.rb
Expand Up @@ -19,6 +19,8 @@ module Docsplit

DEPENDENCIES = {:java => false, :gm => false, :pdftotext => false, :pdftk => false, :tesseract => false}

# Shell-escapes a single command-line argument via Shellwords.shellescape.
# Callable as `ESCAPE[path]` or usable as a block with `map(&ESCAPE)`.
# Kept as a lambda so the Shellwords constant is only looked up when ESCAPE is
# called: `require 'shellwords'` appears further down this file, after this
# constant has been defined.
ESCAPE = lambda {|x| Shellwords.shellescape(x) }

# Check for all dependencies, and warn of their absence.
dirs = ENV['PATH'].split(File::PATH_SEPARATOR)
DEPENDENCIES.each_key do |dep|
Expand Down Expand Up @@ -62,11 +64,13 @@ def self.extract_pdf(docs, opts={})
[docs].flatten.each do |doc|
ext = File.extname(doc)
basename = File.basename(doc, ext)
escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)

if ext.length > 0 && GM_FORMATS.include?(ext.sub(/^\./, '').downcase.to_sym)
`gm convert "#{doc}" "#{out}/#{basename}.pdf"`
`gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
else
options = "-jar #{ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-3.jar -r #{ROOT}/vendor/conf/document-formats.js"
run "#{options} \"#{doc}\" \"#{out}/#{basename}.pdf\"", [], {}
run "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
end
end
end
Expand Down Expand Up @@ -113,6 +117,7 @@ def self.normalize_value(value)

require 'tmpdir'
require 'fileutils'
require 'shellwords'
require "#{Docsplit::ROOT}/lib/docsplit/image_extractor"
require "#{Docsplit::ROOT}/lib/docsplit/transparent_pdfs"
require "#{Docsplit::ROOT}/lib/docsplit/text_extractor"
Expand Down
5 changes: 3 additions & 2 deletions lib/docsplit/image_extractor.rb
Expand Up @@ -32,6 +32,7 @@ def convert(pdf, size, format, previous=nil)
basename = File.basename(pdf, File.extname(pdf))
directory = directory_for(size)
pages = @pages || '1-' + Docsplit.extract_length(pdf).to_s
escaped_pdf = ESCAPE[pdf]
FileUtils.mkdir_p(directory) unless File.exists?(directory)
common = "#{MEMORY_ARGS} #{DENSITY_ARG} #{resize_arg(size)} #{quality_arg(format)}"
if previous
Expand All @@ -40,8 +41,8 @@ def convert(pdf, size, format, previous=nil)
raise ExtractionFailed, result if $? != 0
else
page_list(pages).each do |page|
out_file = File.join(directory, "#{basename}_#{page}.#{format}")
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} \"#{pdf}[#{page - 1}]\" \"#{out_file}\" 2>&1".chomp
out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")]
cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp
result = `#{cmd}`.chomp
raise ExtractionFailed, result if $? != 0
end
Expand Down
2 changes: 1 addition & 1 deletion lib/docsplit/info_extractor.rb
Expand Up @@ -18,7 +18,7 @@ class InfoExtractor
# Pull out a single datum from a pdf.
def extract(key, pdfs, opts)
pdf = [pdfs].flatten.first
cmd = "pdfinfo #{pdf} 2>&1"
cmd = "pdfinfo #{ESCAPE[pdf]} 2>&1"
result = `#{cmd}`.chomp
raise ExtractionFailed, result if $? != 0
match = result.match(MATCHERS[key])
Expand Down
2 changes: 1 addition & 1 deletion lib/docsplit/page_extractor.rb
Expand Up @@ -11,7 +11,7 @@ def extract(pdfs, opts)
pdf_name = File.basename(pdf, File.extname(pdf))
page_path = File.join(@output, "#{pdf_name}_%d.pdf")
FileUtils.mkdir_p @output unless File.exists?(@output)
cmd = "pdftk #{pdf} burst output #{page_path} 2>&1"
cmd = "pdftk #{ESCAPE[pdf]} burst output #{ESCAPE[page_path]} 2>&1"
result = `#{cmd}`.chomp
FileUtils.rm('doc_data.txt') if File.exists?('doc_data.txt')
raise ExtractionFailed, result if $? != 0
Expand Down
17 changes: 10 additions & 7 deletions lib/docsplit/text_extractor.rb
Expand Up @@ -45,7 +45,7 @@ def extract(pdfs, opts)

# Does a PDF have any text embedded?
#
# Runs `pdffonts` on the given path (stderr folded into stdout) and checks the
# captured output against NO_TEXT_DETECTED.
#
# pdf - path of the PDF to inspect; may contain spaces or quotes.
#
# Returns true when the output does not match NO_TEXT_DETECTED, false when it
# does.
def contains_text?(pdf)
  # The path is shell-escaped and pdffonts is invoked exactly once; the raw
  # path must never be interpolated into the command line.
  fonts = `pdffonts #{ESCAPE[pdf]} 2>&1`
  !fonts.match(NO_TEXT_DETECTED)
end

Expand All @@ -59,19 +59,22 @@ def extract_from_pdf(pdf, pages)
def extract_from_ocr(pdf, pages)
tempdir = Dir.mktmpdir
base_path = File.join(@output, @pdf_name)
escaped_pdf = ESCAPE[pdf]
if pages
pages.each do |page|
tiff = "#{tempdir}/#{@pdf_name}_#{page}.tif"
escaped_tiff = ESCAPE[tiff]
file = "#{base_path}_#{page}"
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf}[#{page - 1}] #{tiff} 2>&1"
run "tesseract #{tiff} #{file} -l eng 2>&1"
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l eng 2>&1"
clean_text(file + '.txt') if @clean_ocr
FileUtils.remove_entry_secure tiff
end
else
tiff = "#{tempdir}/#{@pdf_name}.tif"
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{pdf} #{tiff} 2>&1"
run "tesseract #{tiff} #{base_path} -l eng 2>&1"
escaped_tiff = ESCAPE[tiff]
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
run "tesseract #{escaped_tiff} #{base_path} -l eng 2>&1"
clean_text(base_path + '.txt') if @clean_ocr
end
ensure
Expand Down Expand Up @@ -100,14 +103,14 @@ def run(command)
# Extract the full contents of a pdf as a single file, directly.
#
# pdf - path of the source PDF; may contain spaces or quotes.
#
# The text is written by pdftotext to "<@output>/<@pdf_name>.txt". Both the
# input and the output path are shell-escaped, and the command is issued
# exactly once through `run`.
def extract_full(pdf)
  text_path = File.join(@output, "#{@pdf_name}.txt")
  run "pdftotext -enc UTF-8 #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
end

# Extract the contents of a single page of text, directly, adding it to
# the `@pages_to_ocr` list if the text length is inadequate.
def extract_page(pdf, page)
text_path = File.join(@output, "#{@pdf_name}_#{page}.txt")
run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{pdf} #{text_path} 2>&1"
run "pdftotext -enc UTF-8 -f #{page} -l #{page} #{ESCAPE[pdf]} #{ESCAPE[text_path]} 2>&1"
unless @forbid_ocr
@pages_to_ocr.push(page) if File.read(text_path).length < MIN_TEXT_PER_PAGE
end
Expand Down
Binary file not shown.
Binary file not shown.
9 changes: 9 additions & 0 deletions test/unit/test_convert_to_pdf.rb
Expand Up @@ -12,6 +12,10 @@ def test_rtf_conversion
assert Dir["#{OUTPUT}/*.pdf"] == ["#{OUTPUT}/obama_hopes.pdf"]
end

# Converts a PNG fixture and expects a single `image.pdf` in OUTPUT.
# NOTE(review): an identical `test_png_conversion` is defined again directly
# after this method. Ruby keeps only the last definition of a method name, so
# one of the two copies is redundant and should be removed.
def test_png_conversion
Docsplit.extract_pdf('test/fixtures/image.png', :output => OUTPUT)
assert Dir["#{OUTPUT}/*.pdf"] == ["#{OUTPUT}/image.pdf"]
end
def test_png_conversion
Docsplit.extract_pdf('test/fixtures/image.png', :output => OUTPUT)
assert Dir["#{OUTPUT}/*.pdf"] == ["#{OUTPUT}/image.pdf"]
Expand All @@ -23,4 +27,9 @@ def test_conversion_then_page_extraction
assert Dir["#{OUTPUT}/*.pdf"].length == 8
end

# A document whose name holds spaces plus single and double quotes must be
# converted into exactly one PDF that keeps the same base name.
def test_name_escaping_while_converting
  source = 'test/fixtures/PDF file with spaces \'single\' and "double quotes".doc'
  Docsplit.extract_pdf(source, :output => OUTPUT)
  produced = Dir["#{OUTPUT}/*.pdf"]
  expected = ["#{OUTPUT}/PDF file with spaces 'single' and \"double quotes\".pdf"]
  assert produced == expected
end

end
5 changes: 5 additions & 0 deletions test/unit/test_extract_images.rb
Expand Up @@ -41,4 +41,9 @@ def test_repeated_extraction_in_the_same_directory
assert Dir["#{OUTPUT}/*"] == ['test/output/obama_arts_1.gif', 'test/output/obama_arts_2.gif']
end

# Image extraction from a PDF whose name holds spaces and both quote styles
# must yield one GIF per page, each named after the untouched base name.
def test_name_escaping_while_extracting_images
  fixture = 'test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf'
  Docsplit.extract_images(fixture, :format => :gif, :size => "250x", :output => OUTPUT)
  expected = [
    'test/output/PDF file with spaces \'single\' and "double quotes"_1.gif',
    'test/output/PDF file with spaces \'single\' and "double quotes"_2.gif'
  ]
  assert Dir["#{OUTPUT}/*"] == expected
end

end
4 changes: 4 additions & 0 deletions test/unit/test_extract_info.rb
Expand Up @@ -32,4 +32,8 @@ def test_password_protected
end
end

# The page count must still be readable from a PDF whose name holds spaces
# and both quote styles; the fixture is expected to report two pages.
def test_name_escaping_while_extracting_info
  fixture = 'test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf'
  page_count = Docsplit.extract_length(fixture)
  assert 2 == page_count
end

end
5 changes: 5 additions & 0 deletions test/unit/test_extract_pages.rb
Expand Up @@ -18,4 +18,9 @@ def test_doc_page_extraction
assert Dir["#{OUTPUT}/*.pdf"].length == 7
end

# Bursting a PDF whose name holds spaces and both quote styles must leave
# two single-page PDFs in OUTPUT.
def test_name_escaping_while_extracting_pages
  fixture = 'test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf'
  Docsplit.extract_pages(fixture, :output => OUTPUT)
  page_files = Dir["#{OUTPUT}/*.pdf"]
  assert page_files.length == 2
end

end
5 changes: 5 additions & 0 deletions test/unit/test_extract_text.rb
Expand Up @@ -44,4 +44,9 @@ def test_password_protected
end
end

# Per-page text extraction from a PDF whose name holds spaces and both quote
# styles must leave two .txt files in OUTPUT.
def test_name_escaping_while_extracting_text
  fixture = 'test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf'
  Docsplit.extract_text(fixture, :pages => 'all', :output => OUTPUT)
  text_files = Dir["#{OUTPUT}/*.txt"]
  assert text_files.length == 2
end

end

0 comments on commit 945120e

Please sign in to comment.