/
pull_hotlinked_images.rb
250 lines (197 loc) · 8.24 KB
/
pull_hotlinked_images.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
# frozen_string_literal: true
module Jobs
class PullHotlinkedImages < ::Jobs::Base
sidekiq_options queue: 'low'
def initialize
@max_size = SiteSetting.max_image_size_kb.kilobytes
end
def execute(args)
@post_id = args[:post_id]
raise Discourse::InvalidParameters.new(:post_id) if @post_id.blank?
post = Post.find_by(id: @post_id)
return if post.blank?
raw = post.raw.dup
start_raw = raw.dup
large_image_urls = post.custom_fields[Post::LARGE_IMAGES] || []
broken_image_urls = post.custom_fields[Post::BROKEN_IMAGES] || []
downloaded_image_ids = post.custom_fields[Post::DOWNLOADED_IMAGES] || {}
upload_records = Upload.where(id: downloaded_image_ids.values)
upload_records = Hash[upload_records.map { |u| [u.id, u] }]
downloaded_images = {}
downloaded_image_ids.each { |url, id| downloaded_images[url] = upload_records[id] }
extract_images_from(post.cooked).each do |node|
download_src = original_src = node['src'] || node['href']
download_src = "#{SiteSetting.force_https ? "https" : "http"}:#{original_src}" if original_src.start_with?("//")
normalized_src = normalize_src(download_src)
next if !should_download_image?(download_src, post)
begin
already_attempted_download = downloaded_images.include?(normalized_src) || large_image_urls.include?(normalized_src) || broken_image_urls.include?(normalized_src)
if !already_attempted_download
downloaded_images[normalized_src] = attempt_download(download_src, post.user_id)
end
rescue ImageTooLargeError
large_image_urls << normalized_src
rescue ImageBrokenError
broken_image_urls << normalized_src
end
# have we successfully downloaded that file?
if upload = downloaded_images[normalized_src]
raw = replace_in_raw(original_src: original_src, upload: upload, raw: raw)
end
rescue => e
raise e if Rails.env.test?
log(:error, "Failed to pull hotlinked image (#{download_src}) post: #{@post_id}\n" + e.message + "\n" + e.backtrace.join("\n"))
end
large_image_urls.uniq!
broken_image_urls.uniq!
downloaded_images.compact!
post.custom_fields[Post::LARGE_IMAGES] = large_image_urls
post.custom_fields[Post::BROKEN_IMAGES] = broken_image_urls
downloaded_image_ids = {}
downloaded_images.each { |url, upload| downloaded_image_ids[url] = upload.id }
post.custom_fields[Post::DOWNLOADED_IMAGES] = downloaded_image_ids
[Post::LARGE_IMAGES, Post::BROKEN_IMAGES, Post::DOWNLOADED_IMAGES].each do |key|
post.custom_fields.delete(key) if !post.custom_fields[key].present?
end
custom_fields_updated = !post.custom_fields_clean?
# only save custom fields if they changed
post.save_custom_fields if custom_fields_updated
# If post changed while we were downloading images, never apply edits
post.reload
post_changed_elsewhere = (start_raw != post.raw)
raw_changed_here = (raw != post.raw)
if !post_changed_elsewhere && raw_changed_here
changes = { raw: raw, edit_reason: I18n.t("upload.edit_reason") }
post.revise(Discourse.system_user, changes, bypass_bump: true, skip_staff_log: true)
elsif custom_fields_updated
post.trigger_post_process(
bypass_bump: true,
skip_pull_hotlinked_images: true # Avoid an infinite loop of job scheduling
)
end
end
def download(src)
downloaded = nil
begin
retries ||= 3
downloaded = FileHelper.download(
src,
max_file_size: @max_size,
retain_on_max_file_size_exceeded: true,
tmp_file_name: "discourse-hotlinked",
follow_redirect: true
)
rescue
if (retries -= 1) > 0 && !Rails.env.test?
sleep 1
retry
end
end
downloaded
end
class ImageTooLargeError < StandardError; end
class ImageBrokenError < StandardError; end
def attempt_download(src, user_id)
# secure-media-uploads endpoint prevents anonymous downloads, so we
# need the presigned S3 URL here
src = Upload.signed_url_from_secure_media_url(src) if Upload.secure_media_url?(src)
hotlinked = download(src)
raise ImageBrokenError if !hotlinked
raise ImageTooLargeError if File.size(hotlinked.path) > @max_size
filename = File.basename(URI.parse(src).path)
filename << File.extname(hotlinked.path) unless filename["."]
upload = UploadCreator.new(hotlinked, filename, origin: src).create_for(user_id)
if upload.persisted?
upload
else
log(:info, "Failed to persist downloaded hotlinked image for post: #{@post_id}: #{src} - #{upload.errors.full_messages.join("\n")}")
nil
end
end
def replace_in_raw(original_src:, raw:, upload:)
raw = raw.dup
escaped_src = Regexp.escape(original_src)
replace_raw = ->(match, match_src, replacement, _index) {
if normalize_src(original_src) == normalize_src(match_src)
replacement =
if replacement.include?(InlineUploads::PLACEHOLDER)
replacement.sub(InlineUploads::PLACEHOLDER, upload.short_url)
elsif replacement.include?(InlineUploads::PATH_PLACEHOLDER)
replacement.sub(InlineUploads::PATH_PLACEHOLDER, upload.short_path)
end
raw = raw.gsub(
match,
replacement
)
end
}
# there are 6 ways to insert an image in a post
# HTML tag - <img src="http://...">
InlineUploads.match_img(raw, external_src: true, &replace_raw)
# BBCode tag - [img]http://...[/img]
InlineUploads.match_bbcode_img(raw, external_src: true, &replace_raw)
# Markdown linked image - [![alt](http://...)](http://...)
# Markdown inline - ![alt](http://...)
# Markdown inline - ![](http://... "image title")
# Markdown inline - ![alt](http://... "image title")
InlineUploads.match_md_inline_img(raw, external_src: true, &replace_raw)
# Direct link
raw.gsub!(/^#{escaped_src}(\s?)$/) { "![](#{upload.short_url})#{$1}" }
raw
end
def extract_images_from(html)
doc = Nokogiri::HTML5::fragment(html)
doc.css("img[src], a.lightbox[href]") -
doc.css("img.avatar") -
doc.css(".lightbox img[src]")
end
def should_download_image?(src, post = nil)
# make sure we actually have a url
return false unless src.present?
local_bases = [
Discourse.base_url,
Discourse.asset_host,
].compact.map { |s| normalize_src(s) }
if Discourse.store.has_been_uploaded?(src) || normalize_src(src).start_with?(*local_bases) || src =~ /\A\/[^\/]/i
return false if !(src =~ /\/uploads\// || Upload.secure_media_url?(src))
# Someone could hotlink a file from a different site on the same CDN,
# so check whether we have it in this database
#
# if the upload already exists and is attached to a different post,
# or the original_sha1 is missing meaning it was created before secure
# media was enabled, then we definitely want to redownload again otherwise
# we end up reusing existing uploads which may be linked to many posts
# already.
upload = Upload.consider_for_reuse(Upload.get_from_url(src), post)
return !upload.present?
end
# Don't download non-local images unless site setting enabled
return false unless SiteSetting.download_remote_images_to_local?
# parse the src
begin
uri = URI.parse(src)
rescue URI::Error
return false
end
hostname = uri.hostname
return false unless hostname
# check the domains blocklist
SiteSetting.should_download_images?(src)
end
def log(log_level, message)
Rails.logger.public_send(
log_level,
"#{RailsMultisite::ConnectionManagement.current_db}: #{message}"
)
end
private
def normalize_src(src)
uri = Addressable::URI.heuristic_parse(src)
uri.normalize!
uri.scheme = nil
uri.to_s
rescue URI::Error, Addressable::URI::InvalidURIError
src
end
end
end