From 708148f3d713a173987c965ad750b3a6a885001e Mon Sep 17 00:00:00 2001 From: Danilo Jeremias da Silva Date: Thu, 26 Mar 2020 23:03:53 -0300 Subject: [PATCH 1/4] Allow to receive a block on run command with the file_path when command runs with success --- lib/rtesseract/base.rb | 6 ++---- lib/rtesseract/box.rb | 6 +++--- lib/rtesseract/command.rb | 12 ++++++++---- lib/rtesseract/pdf.rb | 6 +++--- lib/rtesseract/tsv.rb | 6 +++--- 5 files changed, 19 insertions(+), 17 deletions(-) diff --git a/lib/rtesseract/base.rb b/lib/rtesseract/base.rb index 4f82e08..f5748f5 100644 --- a/lib/rtesseract/base.rb +++ b/lib/rtesseract/base.rb @@ -6,10 +6,8 @@ class RTesseract module Base - def temp_file(ext = '') - @rand_file ||= "rtesseract_#{SecureRandom.uuid}" - - Pathname.new(Dir.tmpdir).join("#{@rand_file}#{ext}").to_s + def temp_file_path + Pathname.new(Dir.tmpdir).join("rtesseract_#{SecureRandom.uuid}").to_s end end end diff --git a/lib/rtesseract/box.rb b/lib/rtesseract/box.rb index f18db05..02d724f 100644 --- a/lib/rtesseract/box.rb +++ b/lib/rtesseract/box.rb @@ -8,9 +8,9 @@ class << self def run(source, errors, options) options.tessedit_create_hocr = 1 - RTesseract::Command.new(source, temp_file, errors, options).run - - parse(File.read(temp_file('.hocr'))) + RTesseract::Command.new(source, temp_file_path, errors, options).run do |output_path| + parse(File.read("#{output_path}.hocr")) + end end def parse(content) diff --git a/lib/rtesseract/command.rb b/lib/rtesseract/command.rb index 1b80efe..41793d7 100644 --- a/lib/rtesseract/command.rb +++ b/lib/rtesseract/command.rb @@ -6,12 +6,12 @@ class Command attr_reader :options - def initialize(source, output, errors, options) + def initialize(source, output_path, errors, options) @source = source - @output = output + @output_path = output_path @options = options @errors = errors - @full_command = [options.command, @source, @output] + @full_command = [options.command, @source, @output_path] end def full_command @@ -48,7 +48,11 @@ 
def run @errors.push(error) - return output if status.success? + if status.success? + return yield(@output_path) if block_given? + + return output + end raise RTesseract::Error, error end diff --git a/lib/rtesseract/pdf.rb b/lib/rtesseract/pdf.rb index 241c480..4bfd7e1 100644 --- a/lib/rtesseract/pdf.rb +++ b/lib/rtesseract/pdf.rb @@ -7,9 +7,9 @@ module Pdf def self.run(source, errors, options) options.tessedit_create_pdf = 1 - RTesseract::Command.new(source, temp_file, errors, options).run - - File.open(temp_file('.pdf'), 'r') + RTesseract::Command.new(source, temp_file_path, errors, options).run do |output_path| + File.open("#{output_path}.pdf", 'r') + end end end end diff --git a/lib/rtesseract/tsv.rb b/lib/rtesseract/tsv.rb index 5e7959e..724aeca 100644 --- a/lib/rtesseract/tsv.rb +++ b/lib/rtesseract/tsv.rb @@ -7,9 +7,9 @@ module Tsv def self.run(source, errors, options) options.tessedit_create_tsv = 1 - RTesseract::Command.new(source, temp_file, errors, options).run - - File.open(temp_file('.tsv'), 'r') + RTesseract::Command.new(source, temp_file_path, errors, options).run do |output_path| + File.open("#{output_path}.tsv", 'r') + end end end end From 404e9716c923d8b3b1dc707582298613375b7fc4 Mon Sep 17 00:00:00 2001 From: Danilo Jeremias da Silva Date: Wed, 1 Apr 2020 23:27:14 -0300 Subject: [PATCH 2/4] Better tests --- .rubocop.yml | 3 ++ spec/rtesseract/box_spec.rb | 12 +++-- spec/rtesseract/configuration_spec.rb | 41 +++++++++++---- spec/rtesseract/pdf_spec.rb | 20 +++---- spec/rtesseract/text_spec.rb | 37 +++++++++++++ spec/rtesseract/tsv_spec.rb | 18 ++++--- spec/rtesseract_spec.rb | 75 +++++++++++++-------------- spec/spec_helper.rb | 2 +- 8 files changed, 136 insertions(+), 72 deletions(-) create mode 100644 spec/rtesseract/text_spec.rb diff --git a/.rubocop.yml b/.rubocop.yml index a44bc54..9801571 100644 --- a/.rubocop.yml +++ b/.rubocop.yml @@ -18,3 +18,6 @@ Style/HashTransformKeys: Style/HashTransformValues: Enabled: true + +RSpec/FilePath: + 
Enabled: false diff --git a/spec/rtesseract/box_spec.rb b/spec/rtesseract/box_spec.rb index 8551f9d..6e38dcc 100644 --- a/spec/rtesseract/box_spec.rb +++ b/spec/rtesseract/box_spec.rb @@ -3,12 +3,14 @@ RSpec.describe RTesseract::Box do let(:path) { Pathname.new(File.dirname(__FILE__)).join('..') } let(:words_image) { path.join('resources', 'test_words.png').to_s } + let(:words) { ['If', 'you', 'are', 'a', 'friend,', 'you', 'speak', 'the', 'password,', 'and', 'the', 'doors', 'will', 'open.'] } + let(:instance) { RTesseract.new(words_image) } - it 'bounding box' do - expect(RTesseract.new(words_image).to_s).to eql("If you are a friend,\nyou speak the password,\nand the doors will open.\n\f") - expect(RTesseract.new(words_image).to_box).to include(word: 'you', x_start: 69, y_start: 17, x_end: 100, y_end: 31) + it 'returns the list of words' do + expect(instance.words).to eql(words) + end - words = ['If', 'you', 'are', 'a', 'friend,', 'you', 'speak', 'the', 'password,', 'and', 'the', 'doors', 'will', 'open.'] - expect(RTesseract.new(words_image).words).to eql(words) + it 'bounding box' do + expect(instance.to_box).to include(word: 'you', x_start: 69, y_start: 17, x_end: 100, y_end: 31) end end diff --git a/spec/rtesseract/configuration_spec.rb b/spec/rtesseract/configuration_spec.rb index 3543005..056d49b 100644 --- a/spec/rtesseract/configuration_spec.rb +++ b/spec/rtesseract/configuration_spec.rb @@ -3,18 +3,39 @@ RSpec.describe RTesseract do let(:path) { Pathname.new(File.dirname(__FILE__)).join('..') } - it ' support default config' do - RTesseract.configure { |config| config.psm = 7 } + context 'with global spm' do + before { described_class.configure { |config| config.psm = 7 } } - expect(RTesseract.config.psm).to eql(7) - expect(RTesseract.new(path, psm: 2).config.psm).to eql(2) + it 'gets the global psm value' do + expect(described_class.config.psm).to be(7) + end - expect(RTesseract.config.command).to eql('tesseract') - expect(RTesseract.new(path, command: 
'/usr/bin/tesseract4').config.command).to eql('/usr/bin/tesseract4') + it 'gets instance psm value' do + expect(described_class.new(path, psm: 2).config.psm).to be(2) + end + end + + context 'with default command' do + it 'gets the global psm value' do + expect(described_class.config.command).to eql('tesseract') + end + + it 'gets instance command value' do + expect(described_class.new(path, command: '/usr/bin/tesseract4').config.command).to eql('/usr/bin/tesseract4') + end + end + + context 'with other options' do + it 'allows to setup oem' do + expect(described_class.new(path, oem: 1).config.oem).to be(1) + end + + it 'allows to setup lang' do + expect(described_class.new(path, lang: 'eng').config.lang).to eql('eng') + end - expect(RTesseract.new(path, psm: 2).config.psm).to eql(2) - expect(RTesseract.new(path, oem: 1).config.oem).to eql(1) - expect(RTesseract.new(path, lang: 'eng').config.lang).to eql('eng') - expect(RTesseract.new(path, lang: 'eng+por').config.lang).to eql('eng+por') + it 'allows to setup multiple langs' do + expect(described_class.new(path, lang: 'eng+por').config.lang).to eql('eng+por') + end end end diff --git a/spec/rtesseract/pdf_spec.rb b/spec/rtesseract/pdf_spec.rb index 0f541dc..b464c29 100644 --- a/spec/rtesseract/pdf_spec.rb +++ b/spec/rtesseract/pdf_spec.rb @@ -2,17 +2,19 @@ RSpec.describe RTesseract::Pdf do let(:path) { Pathname.new(File.dirname(__FILE__)).join('..') } + let(:words_image) { path.join('resources', 'test-pdf.png').to_s } + let(:file) { RTesseract.new(words_image).to_pdf } - let(:image_pdf_path) { path.join('resources', 'test-pdf.png').to_s } - - it ' support pdf output mode' do - pdf_ocr = RTesseract.new(image_pdf_path).to_pdf - - expect(File.extname(pdf_ocr.path)).to eql('.pdf') - expect(File.exist?(pdf_ocr.path)).to be_truthy + after do + file.close + File.unlink(file) + end - pdf_ocr.close + it 'returns a file with extension .pdf' do + expect(File.extname(file.path)).to eql('.pdf') + end - File.unlink(pdf_ocr) + it 
'checks if file pdf exisits' do + expect(File).to exist(file.path) end end diff --git a/spec/rtesseract/text_spec.rb b/spec/rtesseract/text_spec.rb new file mode 100644 index 0000000..f4a5640 --- /dev/null +++ b/spec/rtesseract/text_spec.rb @@ -0,0 +1,37 @@ +# frozen_string_literal: true + +RSpec.describe RTesseract::Text do + let(:path) { Pathname.new(File.dirname(__FILE__)).join('..') } + let(:image_path) { path.join('resources', 'test.tif').to_s } + let(:pdf_path) { path.join('resources', 'test.tif').to_s } + + let(:words_image) { path.join('resources', 'test_words.png').to_s } + + it 'translate image to text' do + expect(RTesseract.new(image_path).to_s_without_spaces).to eql('43XF') + end + + it 'translate tif image to text' do + expect(RTesseract.new(path.join('resources', 'test1.tif').to_s).to_s_without_spaces).to eql('V2V4') + end + + it 'translate tif image with spaces to text' do + expect(RTesseract.new(path.join('resources', 'test with spaces.tif').to_s).to_s_without_spaces).to eql('V2V4') + end + + it 'translate png image with spaces to text' do + expect(RTesseract.new(path.join('resources', 'test.png').to_s).to_s_without_spaces).to eql('HW9W') + end + + it 'translate jpg image with spaces to text' do + expect(RTesseract.new(path.join('resources', 'test.jpg').to_s).to_s_without_spaces).to eql('3R8F') + end + + it 'translate image to text with options' do + expect(RTesseract.new(image_path, psm: 7, oem: 1).to_s_without_spaces).to eql('43XF') + end + + it 'tests output text' do + expect(RTesseract.new(words_image).to_s).to eql("If you are a friend,\nyou speak the password,\nand the doors will open.\n\f") + end +end diff --git a/spec/rtesseract/tsv_spec.rb b/spec/rtesseract/tsv_spec.rb index f438250..980dfdb 100644 --- a/spec/rtesseract/tsv_spec.rb +++ b/spec/rtesseract/tsv_spec.rb @@ -1,18 +1,20 @@ # frozen_string_literal: true -require 'csv' - RSpec.describe RTesseract::Tsv do let(:path) { Pathname.new(File.dirname(__FILE__)).join('..') } 
let(:words_image) { path.join('resources', 'test_words.png').to_s } + let(:file) { RTesseract.new(words_image).to_tsv } - it ' support tsv output mode' do - tsv_ocr = RTesseract.new(words_image).to_tsv + after do + file.close + File.unlink(file) + end - expect(File.extname(tsv_ocr.path)).to eql('.tsv') - expect(tsv_ocr.read).to include('level page_num block_num par_num line_num word_num left top width height conf text') + it 'returns a file with extension .tsv' do + expect(File.extname(file.path)).to eql('.tsv') + end - tsv_ocr.close - File.unlink(tsv_ocr) + it ' support tsv output mode' do + expect(file.read).to include('level page_num block_num par_num line_num word_num left top width height conf text') end end diff --git a/spec/rtesseract_spec.rb b/spec/rtesseract_spec.rb index 521885f..9bf85b8 100644 --- a/spec/rtesseract_spec.rb +++ b/spec/rtesseract_spec.rb @@ -5,60 +5,57 @@ let(:image_path) { path.join('resources', 'test.tif').to_s } let(:pdf_path) { path.join('resources', 'test.tif').to_s } - it 'has a version number' do - expect(RTesseract::VERSION).not_to be nil - - expect(RTesseract.tesseract_version).to be > 3.05 + it 'returns the tesseract version' do + expect(described_class.tesseract_version).to be > 3.05 end - it 'be instantiable' do - expect(RTesseract.new.class).to eql(RTesseract) - expect(RTesseract.new('').class).to eql(RTesseract) - expect(RTesseract.new(image_path).class).to eql(RTesseract) + it 'be instantiable without path' do + expect(described_class.new.class).to eql(described_class) end - it 'translate image to text' do - expect(RTesseract.new(image_path).to_s_without_spaces).to eql('43XF') - { - 'test1.tif' => 'V2V4', - 'test with spaces.tif' => 'V2V4', - 'test.png' => 'HW9W', - 'test.jpg' => '3R8F' - }.each do |file, value| - expect(RTesseract.new(path.join('resources', file).to_s).to_s_without_spaces).to eql(value) - end + it 'be instantiable with blank string' do + expect(described_class.new('').class).to eql(described_class) end - it 
'translate image to text with options' do - expect(RTesseract.new(image_path, psm: 7, oem: 1).to_s_without_spaces).to eql('43XF') + it 'be instantiable with a path' do + expect(described_class.new(image_path).class).to eql(described_class) end - it 'get tesseract version' do - expect(RTesseract.tesseract_version).to be > 0 + context 'when tesseract not installed' do + before do + described_class.configure { |config| config.command = 'tesseract_not_installed' } + end - RTesseract.configure { |config| config.command = 'tesseract_not_installed' } + it 'returns zero on #tesseract_version' do + expect(described_class.tesseract_version).to be(0) + end - expect(RTesseract.tesseract_version).to eql(0) + it 'raise a error if tesseract version < 3.05' do + expect { described_class.check_version! }.to raise_error(RTesseract::Error) + end end - it 'raise a error if tesseract version < 3.05' do - RTesseract.configure { |config| config.command = 'tesseract_not_installed' } + context 'without source' do + let(:instance) { described_class.new } - expect { RTesseract.check_version! 
}.to raise_error(RTesseract::Error) - end - - it 'raise a error when tesseract raise a error' do - expect { RTesseract.new.to_s }.to raise_error(RTesseract::Error) + it 'raise an exception' do + aggregate_failures 'raise an exception' do + expect { instance.to_s }.to raise_error(RTesseract::Error) + expect(instance.errors.first).to include('Error during processing') + end + end end - it 'store the error on a variable to debug' do - instance = RTesseract.new - expect { instance.to_s }.to raise_error(RTesseract::Error) - expect(instance.errors.first).to include('Error during processing') - - error_intance = RTesseract.new(path.join('resources', 'image_with_error.png').to_s) + context 'with errors on image' do + let(:error_intance) do + described_class.new(path.join('resources', 'image_with_error.png').to_s) + end - expect(error_intance.to_s_without_spaces).to eql('RTX-0003-03-02-01PRE') - expect(error_intance.errors).to eql(["Error in boxClipToRectangle: box outside rectangle\nError in pixScanForForeground: invalid box\n"]) + it 'stores the error on a variable to debug' do + aggregate_failures 'stores the error on a variable to debug' do + expect(error_intance.to_s_without_spaces).to eql('RTX-0003-03-02-01PRE') + expect(error_intance.errors).to eql(["Error in boxClipToRectangle: box outside rectangle\nError in pixScanForForeground: invalid box\n"]) + end + end end end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 39b9127..b38965d 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -20,7 +20,7 @@ c.syntax = :expect end - config.before(:each) do + config.before do RTesseract.reset_config! 
end end From c915465ec7d9d08f2b653ba1ee2588c43591dcca Mon Sep 17 00:00:00 2001 From: Danilo Jeremias da Silva Date: Wed, 1 Apr 2020 23:44:47 -0300 Subject: [PATCH 3/4] Remove rubocop-rspec --- .rubocop.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.rubocop.yml b/.rubocop.yml index 9801571..a44bc54 100644 --- a/.rubocop.yml +++ b/.rubocop.yml @@ -18,6 +18,3 @@ Style/HashTransformKeys: Style/HashTransformValues: Enabled: true - -RSpec/FilePath: - Enabled: false From 5d6947596641337da394a01b46a02a2b418a9373 Mon Sep 17 00:00:00 2001 From: Danilo Jeremias da Silva Date: Sat, 4 Apr 2020 11:27:11 -0300 Subject: [PATCH 4/4] Start github action --- .github/workflows/ci.yml | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..19d08e8 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,32 @@ +name: CI +on: [push] +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + ruby: + - '2.5.x' + - '2.6.x' + - '2.7.x' + steps: + steps: + - uses: actions/checkout@v2 + - name: Install tesseract-ocr + run: | + sudo add-apt-repository ppa:alex-p/tesseract-ocr -y + sudo apt-get update -q + sudo apt-get install tesseract-ocr tesseract-ocr-eng ghostscript -y + - name: Setup Ruby + uses: actions/setup-ruby@v1 + with: + ruby-version: ${{ matrix.ruby }} + - name: Bundle + env: + MTSR_RAILS_VERSION: ${{ matrix.rails }} + run: | + gem uninstall -aIx bundler + gem install bundler + bundle install --jobs 4 --retry 3 + - name: Test + run: bundle exec rake