/
documentalist.rb
187 lines (153 loc) · 5.57 KB
/
documentalist.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
require 'rubygems'
require 'fileutils'
require 'kconv'
require 'logger'
require 'tmpdir'
require 'yaml'
require 'system_timer'
require File.join(File.dirname(__FILE__),'dependencies')
# Entry point of the library: holds the configuration and logger, picks the
# backend able to perform a given conversion and exposes the high level
# helpers (convert, extract_text, extract_images, timeout).
module Documentalist
  @@config = {}
  @@logger = nil

  # Returns the current configuration hash. When nothing has been configured
  # yet, the default configuration file is loaded first.
  def self.config
    default_config! unless config?
    @@config
  end

  # Replaces the configuration with +hash+, whose keys are symbolized
  # recursively.
  def self.config=(hash)
    # We want to symbolize keys ourselves since we're not depending on Active Support
    @@config = symbolize(hash)
  end

  # Tells whether a (non empty) configuration has been set.
  def self.config?
    @@config != {}
  end

  # Loads config/default.yml, located one directory above this file.
  def self.default_config!
    config_from_yaml! File.join(File.dirname(__FILE__), %w{.. config default.yml})
  end

  # Loads the configuration from a YAML file.
  #
  # file    - path of the YAML file
  # options - :section => name of a top level key; when given, only the hash
  #           stored under that key becomes the configuration
  #
  # Raises Documentalist::Error when the requested section is not a hash of
  # the file. Returns the new configuration hash.
  def self.config_from_yaml!(file, options = {})
    # Block form so the file handle is closed as soon as parsing is done.
    # An empty YAML document does not parse to a hash, hence the fallback.
    # NOTE(review): YAML.load should only ever be pointed at trusted files.
    loaded = symbolize(File.open(file) { |io| YAML.load(io) } || {})

    if options[:section]
      section = options[:section].to_sym
      unless loaded[section].is_a?(Hash)
        raise Documentalist::Error.new("No section #{section} in #{file}")
      end
      loaded = loaded[section]
    end

    self.config = loaded
  end

  # Backend module name => { source format(s) => destination format(s) }
  BACKENDS = {
    # Find a better pattern to pick backend, this one smells pretty bad
    :WkHtmlToPdf => {[:html, :htm] => :pdf},
    :OpenOffice => {[:odt, :doc, :rtf, :docx, :txt, :wps] => [:odt, :doc, :rtf, :pdf, :txt, :html, :htm, :wps]},
    :NetPBM => {:ppm => [:jpg, :jpeg]},
    :PdfTools => {:pdf => :txt},
  }

  # Finds the relevant backend to perform the conversion.
  #
  # origin, destination - formats, file names or paths; only what follows the
  #                       last dot is considered
  #
  # Returns the backend module, or nil when no backend handles the pair.
  def self.backend_for_conversion(origin, destination)
    origin      = origin.to_s.gsub(/.*\./, "").to_sym
    destination = destination.to_s.gsub(/.*\./, "").to_sym

    match = BACKENDS.detect do |_name, conversions|
      conversions.keys.flatten.include?(origin) && conversions.values.flatten.include?(destination)
    end

    # Only the selected backend constant is resolved, so a backend that is
    # not loaded cannot break lookups that do not concern it.
    match && const_get(match.first)
  end

  # Takes all conversion requests and dispatches them appropriately.
  #
  # file    - path of the document to convert; may be nil when the document
  #           is given through :input and :input_format
  # options - :input        => raw content of the document (used when file is nil)
  #           :input_format => format (extension) of :input
  #           :to_format    => destination format, the result is written next
  #                            to the source with that extension
  #           :to           => destination path, the format is its extension
  #           :stream       => destination format, the converted content is
  #                            returned instead of a path
  #           The hash is handed over to the backend, completed with :to,
  #           :to_format and :from_format.
  #
  # Yields and returns the converted content when :stream is given, the
  # destination path otherwise.
  #
  # Raises Documentalist::Error when there is no usable input, no
  # destination or no backend for the requested conversion.
  def self.convert(file=nil, options={})
    options    = options.dup # the caller's hash is left untouched
    temp_input = false

    if file.nil? && options[:input] && options[:input_format]
      file = File.join(Dir.tmpdir, "#{rand(10**9)}.#{options[:input_format]}")
      # Binary mode: the input may be any kind of document
      File.open(file, 'wb') { |f| f.write(options[:input]) }
      temp_input = true
    end

    begin
      raise Documentalist::Error.new("No input file or input data was given") if file.nil?
      raise Documentalist::Error.new("#{file} does not exist !") unless File.exist?(file)

      if options[:to_format]
        # Swap the extension without going through a regexp, which also
        # covers source files that have no extension at all
        extension = File.extname(file)
        stem = extension.empty? ? file : file[0...-extension.length]
        options[:to] = "#{stem}.#{options[:to_format]}"
      elsif options[:to]
        options[:to_format] = File.extname(options[:to]).delete(".").to_sym
      elsif options[:stream]
        options[:to_format] = options[:stream]
        options[:to] = File.join(Dir.tmpdir, "#{rand(10**9)}.#{options[:to_format]}")
      else
        raise Documentalist::Error.new("No destination, format, or stream format was given")
      end

      options[:from_format] = File.extname(file).delete(".").to_sym

      backend = backend_for_conversion(options[:from_format], options[:to_format])
      unless backend
        raise Documentalist::Error.new("No backend can convert #{options[:from_format]} to #{options[:to_format]}")
      end

      backend.convert(file, options)
    ensure
      # The temporary input is ours: remove it whatever the outcome
      FileUtils.rm_f(file) if temp_input
    end

    if options[:stream]
      data = File.read(options[:to])
      FileUtils.rm(options[:to])
      yield(data) if block_given?
      data
    else
      yield(options[:to]) if block_given?
      options[:to]
    end
  end

  # Converts +file+ to text and returns its content as UTF-8 (also yielded
  # when a block is given). Returns nil when the conversion produced no file.
  def self.extract_text(file)
    converted = convert(file, :to_format => :txt)
    return unless converted && File.exist?(converted)

    text = Kconv.toutf8(File.read(converted))
    # When the source already is a .txt file the destination is the source
    # itself: do not delete the caller's document.
    FileUtils.rm(converted) unless File.expand_path(converted) == File.expand_path(file)

    yield(text) if block_given?
    text
  end

  # Extracts the images of +file+ into a scratch directory created under the
  # system temporary directory and returns (and yields) their paths.
  #
  # PDF files go through the pdfimages command, whose ppm output is then
  # converted to jpeg; any other file is converted to HTML.
  #
  # Raises Documentalist::Error when +file+ does not exist.
  def self.extract_images(file)
    raise Documentalist::Error.new("#{file} does not exist !") unless File.exist?(file)

    temp_dir  = File.join(Dir.tmpdir, rand(10**9).to_s)
    temp_file = File.join(temp_dir, File.basename(file))
    FileUtils.mkdir_p temp_dir
    FileUtils.cp file, temp_file

    if File.extname(file) == '.pdf'
      # Argument list form: no shell involved, so spaces or quotes in the
      # file name cannot break (or inject into) the command
      system "pdfimages", temp_file, File.join(temp_dir, "img")
      Dir.glob(File.join(temp_dir, "*.ppm")).each do |ppm_image|
        Documentalist.convert(ppm_image, :to_format => :jpeg)
      end
    else
      # Convert the copy so the output lands in the scratch directory that
      # is globbed below.
      # NOTE(review): assumes the backend writes the images next to its HTML
      # output - confirm against the backend.
      Documentalist.convert temp_file, :to_format => :html
    end

    image_files = Dir.glob(File.join(temp_dir, "*.{jpg,jpeg,bmp,tif,tiff,gif,png}"))
    yield(image_files) if block_given?
    image_files
  end

  # Runs a block with a system-enforced timeout and optionally retry with an
  # optional sleep between attempts of running the given block.
  # All times are in seconds.
  #
  # time_limit - time given to each attempt
  # options    - :attempts => total number of attempts (1 when missing)
  #              :sleep    => pause between two attempts
  #
  # Returns the value of the block, nil when no block is given. Timeout::Error
  # is raised again once all the attempts are used.
  def self.timeout(time_limit = 0, options = {:attempts => 1, :sleep => nil})
    return unless block_given?

    attempts = options[:attempts] || 1
    begin
      SystemTimer.timeout(time_limit) { yield }
    rescue Timeout::Error
      attempts -= 1
      # "> 0" rather than "not zero": 0 or negative attempts must not retry forever
      raise unless attempts > 0
      # Only pause when another attempt follows
      sleep(options[:sleep]) if options[:sleep]
      retry
    end
  end

  # Returns the logger object used to log documentalist operations.
  # It is created on first use from the :log_file and :log_level settings;
  # the level is WARN when :log_level is not set.
  def self.logger
    unless @@logger
      Documentalist.config[:log_file] ||= File.join(File.dirname(File.expand_path(__FILE__)), %w{.. documentalist.log})
      @@logger = Logger.new(Documentalist.config[:log_file])
      # to_s so that the level can be configured as a string or as a symbol
      level_name = config[:log_level] ? config[:log_level].to_s.upcase : "WARN"
      @@logger.level = Logger.const_get(level_name)
    end
    @@logger
  end

  # Checks the dependencies for backends: every constant of this module that
  # responds to check_dependencies gets it called.
  def self.check_dependencies
    puts "Checking backends system dependencies"
    Documentalist.constants.each do |name|
      backend = Documentalist.const_get(name.to_sym)
      next unless backend.respond_to?(:check_dependencies)

      puts "Checking dependencies for #{backend.to_s}"
      backend.check_dependencies
    end
  end

  # Returns a new hash with recursively symbolized keys.
  # The given hash is not modified; keys that cannot be turned into symbols
  # are kept as they are.
  def self.symbolize(hash)
    # A new hash is built because inserting keys into a hash while
    # iterating over it raises on Ruby >= 1.9
    hash.inject({}) do |result, (key, value)|
      key = key.to_sym if key.respond_to?(:to_sym)
      result[key] = value.is_a?(Hash) ? symbolize(value) : value
      result
    end
  end

  class Error < RuntimeError; end
end
# Load every backend shipped in the backends/ directory that sits next to this file
Dir[File.join(File.dirname(__FILE__), 'backends', '*.rb')].each { |backend_file| require backend_file }