Skip to content

Commit

Permalink
Remove nokogiri (#72)
Browse files Browse the repository at this point in the history
* Remove nokogiri dependency

* Update Gemfile and bump version

* Update methods

* Update rubocop version on hound and a little refactoring.
  • Loading branch information
dannnylo committed Mar 20, 2020
1 parent c68edcd commit f50e774
Show file tree
Hide file tree
Showing 23 changed files with 111 additions and 57 deletions.
3 changes: 3 additions & 0 deletions .hound.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
rubocop:
config_file: .rubocop.yml
version: 0.80.0
18 changes: 15 additions & 3 deletions .rubocop.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,20 @@
Documentation:
Enabled: false

Metrics/LineLength:
Layout/LineLength:
Max: 150

Metrics/BlockLength:
Max: 50

Metrics/AbcSize:
Max: 30

Style/Documentation:
Enabled: false
Style/HashEachMethods:
Enabled: true

Style/HashTransformKeys:
Enabled: true

Style/HashTransformValues:
Enabled: true
2 changes: 2 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# frozen_string_literal: true

source 'https://rubygems.org'

git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
Expand Down
45 changes: 19 additions & 26 deletions Gemfile.lock
Original file line number Diff line number Diff line change
@@ -1,51 +1,45 @@
PATH
remote: .
specs:
rtesseract (3.0.5)
nokogiri
rtesseract (3.1.0)

GEM
remote: https://rubygems.org/
specs:
coveralls (0.7.2)
multi_json (~> 1.3)
rest-client (= 1.6.7)
simplecov (>= 0.7)
term-ansicolor (= 1.2.2)
thor (= 0.18.1)
coveralls (0.8.23)
json (>= 1.8, < 3)
simplecov (~> 0.16.1)
term-ansicolor (~> 1.3)
thor (>= 0.19.4, < 2.0)
tins (~> 1.6)
diff-lcs (1.3)
docile (1.3.2)
mime-types (3.3.1)
mime-types-data (~> 3.2015)
mime-types-data (3.2019.1009)
mini_portile2 (2.4.0)
multi_json (1.14.1)
nokogiri (1.10.9)
mini_portile2 (~> 2.4.0)
json (2.3.0)
rake (13.0.1)
rest-client (1.6.7)
mime-types (>= 1.16)
rspec (3.9.0)
rspec-core (~> 3.9.0)
rspec-expectations (~> 3.9.0)
rspec-mocks (~> 3.9.0)
rspec-core (3.9.1)
rspec-support (~> 3.9.1)
rspec-expectations (3.9.0)
rspec-expectations (3.9.1)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.9.0)
rspec-mocks (3.9.1)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.9.0)
rspec-support (3.9.2)
simplecov (0.18.5)
simplecov (0.16.1)
docile (~> 1.1)
simplecov-html (~> 0.11)
simplecov-html (0.12.2)
term-ansicolor (1.2.2)
tins (~> 0.8)
thor (0.18.1)
tins (0.13.2)
json (>= 1.8, < 3)
simplecov-html (~> 0.10.0)
simplecov-html (0.10.2)
sync (0.5.0)
term-ansicolor (1.7.1)
tins (~> 1.0)
thor (1.0.1)
tins (1.24.1)
sync

PLATFORMS
ruby
Expand All @@ -56,7 +50,6 @@ DEPENDENCIES
rake
rspec
rtesseract!
simplecov

BUNDLED WITH
2.1.4
2 changes: 2 additions & 0 deletions Rakefile
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# frozen_string_literal: true

require 'bundler/gem_tasks'
require 'rspec/core/rake_task'

Expand Down
1 change: 1 addition & 0 deletions bin/console
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

require 'bundler/setup'
require 'rtesseract'
Expand Down
2 changes: 2 additions & 0 deletions lib/rtesseract.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# frozen_string_literal: true

require 'rtesseract/check'
require 'rtesseract/configuration'
require 'rtesseract/command'
Expand Down
2 changes: 2 additions & 0 deletions lib/rtesseract/base.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# frozen_string_literal: true

require 'tmpdir'
require 'securerandom'
require 'pathname'
Expand Down
52 changes: 32 additions & 20 deletions lib/rtesseract/box.rb
Original file line number Diff line number Diff line change
@@ -1,33 +1,45 @@
require 'nokogiri'
# frozen_string_literal: true

class RTesseract
module Box
extend RTesseract::Base

def self.run(source, errors, options)
options.tessedit_create_hocr = 1
class << self
def run(source, errors, options)
options.tessedit_create_hocr = 1

RTesseract::Command.new(source, temp_file, errors, options).run
RTesseract::Command.new(source, temp_file, errors, options).run

parse(File.read(temp_file('.hocr')))
end
parse(File.read(temp_file('.hocr')))
end

def self.parse(content)
html = Nokogiri::HTML(content)
html.css('span.ocrx_word, span.ocr_word').map do |word|
attributes = word.attributes['title'].value.to_s.delete(';').split(' ')
word_info(word, attributes)
def parse(content)
content.lines.map { |line| parse_line(line) }.compact
end
end

def self.word_info(word, data)
{
word: word.text,
x_start: data[1].to_i,
y_start: data[2].to_i,
x_end: data[3].to_i,
y_end: data[4].to_i
}
# Builds the word hash for a single line of hOCR output.
#
# line - one line of the hOCR document (String).
#
# Returns the Hash produced by word_info, or nil when the line has no
# ocr_word/ocrx_word marker or carries no visible text.
def parse_line(line)
  return unless line.match?(/oc(rx|r)_word/)

  # The word may be wrapped in inline markup (e.g. <strong> or <em>), which
  # leaves the first ">...<" segment empty. Pick the first segment that
  # actually contains text instead of giving up on the first one.
  word = line.scan(/(?<=>)[^<]*(?=<)/).find { |text| !text.strip.empty? }

  return unless word

  # hOCR is HTML, so reserved characters in the recognised text arrive as
  # entities. Decode the basic ones in a single pass so the caller gets the
  # literal word (a single pass avoids double-decoding "&amp;lt;").
  entities = { '&amp;' => '&', '&lt;' => '<', '&gt;' => '>', '&quot;' => '"', '&#39;' => "'" }
  decoded = word.gsub(Regexp.union(entities.keys), entities)

  word_info(decoded, parse_position(line))
end

# Assembles the result hash for one recognised word.
#
# word      - the recognised text, stored unchanged under :word.
# positions - indexable collection; entries 1..4 are read as x_start,
#             y_start, x_end and y_end and coerced with to_i (entry 0 is
#             not used).
#
# Returns a Hash with the keys :word, :x_start, :y_start, :x_end, :y_end.
def word_info(word, positions)
  x_start, y_start, x_end, y_end = (1..4).map { |index| positions[index].to_i }

  { word: word, x_start: x_start, y_start: y_start, x_end: x_end, y_end: y_end }
end

# Extracts the bounding box of a word from its hOCR line.
#
# line - one line of the hOCR document (String).
#
# Returns an Array of Strings laid out as [label, x_start, y_start, x_end,
# y_end], i.e. the coordinates sit at indexes 1..4 as word_info expects.
# Returns an empty Array when the line has no bbox.
def parse_position(line)
  # Match the bbox numbers directly rather than "everything up to the first
  # ';'": a title that carries only the bbox has no ';' terminator, and the
  # previous pattern then produced no coordinates at all.
  bbox = line.match(/\bbbox\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)\s+(-?\d+)/)

  return [] unless bbox

  ['bbox', *bbox.captures]
end
end
end
end
2 changes: 2 additions & 0 deletions lib/rtesseract/check.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# frozen_string_literal: true

class RTesseract
class << self
def tesseract_version
Expand Down
2 changes: 2 additions & 0 deletions lib/rtesseract/command.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# frozen_string_literal: true

class RTesseract
class Command
FIXED = %i[command psm oem lang tessdata_dir user_words user_patterns config_file].freeze
Expand Down
2 changes: 2 additions & 0 deletions lib/rtesseract/configuration.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# frozen_string_literal: true

require 'ostruct'

class RTesseract
Expand Down
2 changes: 2 additions & 0 deletions lib/rtesseract/pdf.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# frozen_string_literal: true

class RTesseract
module Pdf
extend Base
Expand Down
2 changes: 2 additions & 0 deletions lib/rtesseract/text.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# frozen_string_literal: true

require 'open3'

class RTesseract
Expand Down
2 changes: 2 additions & 0 deletions lib/rtesseract/tsv.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# frozen_string_literal: true

class RTesseract
module Tsv
extend Base
Expand Down
4 changes: 3 additions & 1 deletion lib/rtesseract/version.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# frozen_string_literal: true

class RTesseract
VERSION = '3.0.5'.freeze
VERSION = '3.1.0'
end
11 changes: 5 additions & 6 deletions rtesseract.gemspec
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# frozen_string_literal: true

lib = File.expand_path('lib', __dir__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'rtesseract/version'
Expand All @@ -8,9 +10,9 @@ Gem::Specification.new do |spec|
spec.authors = ['Danilo Jeremias da Silva']
spec.email = ['dannnylo@gmail.com']

spec.summary = 'Ruby library for working with the Tesseract OCR.'.freeze
spec.description = 'Ruby library for working with the Tesseract OCR.'.freeze
spec.homepage = 'http://github.com/dannnylo/rtesseract'.freeze
spec.summary = 'Ruby library for working with the Tesseract OCR.'
spec.description = 'Ruby library for working with the Tesseract OCR.'
spec.homepage = 'http://github.com/dannnylo/rtesseract'
spec.license = 'MIT'

# Specify which files should be added to the gem when it is released.
Expand All @@ -26,7 +28,4 @@ Gem::Specification.new do |spec|
spec.add_development_dependency 'coveralls'
spec.add_development_dependency 'rake'
spec.add_development_dependency 'rspec'
spec.add_development_dependency 'simplecov'

spec.add_dependency 'nokogiri'
end
2 changes: 2 additions & 0 deletions spec/rtesseract/box_spec.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# frozen_string_literal: true

RSpec.describe RTesseract::Box do
let(:path) { Pathname.new(File.dirname(__FILE__)).join('..') }
let(:words_image) { path.join('resources', 'test_words.png').to_s }
Expand Down
2 changes: 2 additions & 0 deletions spec/rtesseract/configuration_spec.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# frozen_string_literal: true

RSpec.describe RTesseract do
let(:path) { Pathname.new(File.dirname(__FILE__)).join('..') }

Expand Down
2 changes: 2 additions & 0 deletions spec/rtesseract/pdf_spec.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# frozen_string_literal: true

RSpec.describe RTesseract::Pdf do
let(:path) { Pathname.new(File.dirname(__FILE__)).join('..') }

Expand Down
2 changes: 2 additions & 0 deletions spec/rtesseract/tsv_spec.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# frozen_string_literal: true

require 'csv'

RSpec.describe RTesseract::Tsv do
Expand Down
4 changes: 3 additions & 1 deletion spec/rtesseract_spec.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# frozen_string_literal: true

RSpec.describe RTesseract do
let(:path) { Pathname.new(__dir__) }
let(:image_path) { path.join('resources', 'test.tif').to_s }
Expand Down Expand Up @@ -52,7 +54,7 @@
it 'store the error on a variable to debug' do
instance = RTesseract.new
expect { instance.to_s }.to raise_error(RTesseract::Error)
expect(instance.errors.first).to include("Error during processing")
expect(instance.errors.first).to include('Error during processing')

error_intance = RTesseract.new(path.join('resources', 'image_with_error.png').to_s)

Expand Down
2 changes: 2 additions & 0 deletions spec/spec_helper.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# frozen_string_literal: true

require 'bundler/setup'
require 'coveralls'
require 'simplecov'
Expand Down

0 comments on commit f50e774

Please sign in to comment.