Skip to content

Commit

Permalink
Add test for orientation-free OCRing.
Browse files Browse the repository at this point in the history
  • Loading branch information
knowtheory committed Nov 15, 2014
1 parent bb12800 commit 99aae9f
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 0 deletions.
Binary file not shown.
22 changes: 22 additions & 0 deletions test/unit/test_extract_text.rb
Expand Up @@ -53,5 +53,27 @@ def test_name_escaping_while_extracting_text
Docsplit.extract_text('test/fixtures/PDF file with spaces \'single\' and "double quotes".pdf', :pages => 'all', :output => OUTPUT)
assert Dir["#{OUTPUT}/*.txt"].length == 2
end

# Verifies that text extraction still produces readable OCR output when the
# source PDF is rotated sideways. The check is a heuristic: in the extracted
# text, ASCII letters must outnumber non-letter, non-whitespace characters by
# strictly more than 2:1. Skipped when Docsplit::DEPENDENCIES[:osd] is falsy.
def test_orientation_detected_ocr_extraction
  # `return skip(...)` keeps the rest of the test from running even if the
  # test framework's `skip` does not abort by raising.
  return skip("Orientation detection module (osd) for Tesseract isn't installed") unless Docsplit::DEPENDENCIES[:osd]

  Docsplit.extract_text('test/fixtures/president-obamas-long-form-birth-certificate.sideways.pdf', :output => OUTPUT)

  letters = 0
  nonletters = 0
  # Block form of File.open guarantees the handle is closed, even when an
  # error is raised while reading.
  File.open(File.join(OUTPUT, 'president-obamas-long-form-birth-certificate.sideways.txt')) do |file|
    file.each_char do |c|
      case c
      when /[A-Za-z]/
        letters += 1
      when /\s/
        # Whitespace counts toward neither total.
      else
        nonletters += 1
      end
    end
  end

  # There should be a ratio of better than 2:1 letters to non-letters.
  # Compare by multiplication: integer division (letters / 2) truncates and
  # would wrongly reject ratios such as 5:2.
  assert letters > 2 * nonletters,
         "expected letters (#{letters}) to outnumber non-letters (#{nonletters}) by more than 2:1"
end

end

0 comments on commit 99aae9f

Please sign in to comment.