-
Notifications
You must be signed in to change notification settings - Fork 8.3k
/
pull_hotlinked_images.rb
253 lines (205 loc) · 7.74 KB
/
pull_hotlinked_images.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
# frozen_string_literal: true
module Jobs
class PullHotlinkedImages < ::Jobs::Base
sidekiq_options queue: "low"
# Job entry point: pulls the hotlinked images of a single post.
#
# Runs the low-disk-space check first, then stores args[:post_id] in @post_id
# and delegates to #pull_hotlinked_images.
#
# @param args [Hash] job arguments; :post_id is required
# @raise [Discourse::InvalidParameters] when args[:post_id] is blank
def execute(args)
  disable_if_low_on_disk_space

  @post_id = args[:post_id]
  raise Discourse::InvalidParameters.new(:post_id) if @post_id.blank?

  # When jobs run immediately (as they do in test) the lock is skipped on
  # purpose: taking it there could cause a deadlock.
  return pull_hotlinked_images if Jobs.run_immediately?

  lock_name = "pull_hotlinked_images_#{@post_id}"
  DistributedMutex.synchronize(lock_name, validity: 2.minutes) { pull_hotlinked_images }
end
# Looks up the post for @post_id and, for every candidate node in its cooked
# HTML, makes sure a PostHotlinkedMedia record exists for the node's normalized
# URL, attempting the download when no record exists yet.
#
# Does nothing when the post or its topic cannot be found. A failure while
# handling one node is logged and the remaining nodes are still processed
# (in the test environment the error is re-raised instead).
#
# Afterwards:
# * if any record was saved, post processing is triggered again;
# * if at least one record is known, the :update_hotlinked_raw job is
#   rescheduled to run after SiteSetting.editing_grace_period + 1.
def pull_hotlinked_images
  post = Post.find_by(id: @post_id)
  return if post.nil? || post.topic.nil?

  # Existing records keyed by their (normalized) URL.
  records_by_url = post.post_hotlinked_media.to_h { |record| [record.url, record] }
  any_record_changed = false

  extract_images_from(post.cooked).each do |element|
    download_src =
      original_src =
        element["src"] || element[PrettyText::BLOCKED_HOTLINKED_SRC_ATTR] || element["href"]
    download_src = replace_encoded_src(download_src)

    # Protocol-relative sources get an explicit scheme; this is built from the
    # untouched original_src, not from the value computed just above.
    if original_src.start_with?("//")
      scheme = SiteSetting.force_https ? "https" : "http"
      download_src = "#{scheme}:#{original_src}"
    end

    normalized_src = normalize_src(download_src)

    next unless should_download_image?(download_src, post)

    record = records_by_url[normalized_src]

    if record.nil?
      record = PostHotlinkedMedia.new(post: post, url: normalized_src)
      records_by_url[normalized_src] = record

      # Known download failures are recorded as a status rather than raised.
      record.status =
        begin
          record.upload = attempt_download(download_src, post.user_id)
          :downloaded
        rescue ImageTooLargeError
          :too_large
        rescue ImageBrokenError
          :download_failed
        rescue UploadCreateError
          :upload_create_failed
        end
    end

    if record.changed?
      any_record_changed = true
      record.save!
    end
  rescue => error
    raise if Rails.env.test?

    log(
      :error,
      "Failed to pull hotlinked image (#{download_src}) post: #{@post_id}\n#{error.message}\n#{error.backtrace.join("\n")}",
    )
  end

  if any_record_changed
    # skip_pull_hotlinked_images avoids an infinite loop of job scheduling.
    post.trigger_post_process(bypass_bump: true, skip_pull_hotlinked_images: true)
  end

  return if records_by_url.empty?

  Jobs.cancel_scheduled_job(:update_hotlinked_raw, post_id: post.id)
  Jobs.enqueue_in(SiteSetting.editing_grace_period + 1, :update_hotlinked_raw, post_id: post.id)
end
# Downloads src through FileHelper.download, limited to
# SiteSetting.max_image_size_kb.
#
# Any StandardError raised during an attempt is swallowed; up to 3 attempts are
# made in total, sleeping 1 second between them. In the test environment no
# retry is performed. Attempts and errors are logged when
# SiteSetting.verbose_upload_logging is on.
#
# @param src [String] URL to download
# @return the value returned by FileHelper.download, or nil when every attempt
#   raised
def download(src)
  attempts_left = 3

  begin
    if SiteSetting.verbose_upload_logging
      Rails.logger.warn("Verbose Upload Logging: Downloading hotlinked image from #{src}")
    end

    FileHelper.download(
      src,
      max_file_size: SiteSetting.max_image_size_kb.kilobytes,
      # presumably keeps the oversized file around so the caller can check its
      # size — confirm against FileHelper.download
      retain_on_max_file_size_exceeded: true,
      tmp_file_name: "discourse-hotlinked",
      follow_redirect: true,
      read_timeout: 15,
    )
  rescue => error
    if SiteSetting.verbose_upload_logging
      Rails.logger.warn("Verbose Upload Logging: Error '#{error.message}' while downloading #{src}")
    end

    attempts_left -= 1
    if attempts_left > 0 && !Rails.env.test?
      sleep 1
      retry
    end

    nil
  end
end
# Raised by #attempt_download when the downloaded file is larger than
# SiteSetting.max_image_size_kb; recorded as the :too_large status.
class ImageTooLargeError < StandardError; end
# Raised by #attempt_download when #download returned nothing; recorded as the
# :download_failed status.
class ImageBrokenError < StandardError; end
# Raised by #attempt_download when the created upload was not persisted;
# recorded as the :upload_create_failed status.
class UploadCreateError < StandardError; end
# Downloads src and turns it into an upload owned by user_id.
#
# @param src [String] URL of the image
# @param user_id the id passed to UploadCreator#create_for
# @return the persisted upload
# @raise [ImageBrokenError] when #download returned nothing
# @raise [ImageTooLargeError] when the file exceeds SiteSetting.max_image_size_kb
# @raise [UploadCreateError] when the created upload was not persisted
def attempt_download(src, user_id)
  # The secure-uploads endpoint prevents anonymous downloads, so the presigned
  # S3 URL is needed here.
  src = Upload.signed_url_from_secure_uploads_url(src) if Upload.secure_uploads_url?(src)

  hotlinked = download(src)
  raise ImageBrokenError unless hotlinked
  raise ImageTooLargeError if File.size(hotlinked.path) > SiteSetting.max_image_size_kb.kilobytes

  # Name the upload after the last path segment of the URL, borrowing the
  # downloaded file's extension when that segment contains no dot.
  filename = File.basename(URI.parse(src).path)
  filename += File.extname(hotlinked.path) unless filename.include?(".")

  upload = UploadCreator.new(hotlinked, filename, origin: src).create_for(user_id)
  return upload if upload.persisted?

  log(
    :info,
    "Failed to persist downloaded hotlinked image for post: #{@post_id}: #{src} - #{upload.errors.full_messages.join("\n")}",
  )
  raise UploadCreateError
end
# Parses html as an HTML5 fragment and returns the nodes that may reference an
# image to pull: <img> with a src, any element carrying the blocked-hotlink
# attribute, and lightbox links with an href. Avatar images and images nested
# inside a lightbox are excluded.
#
# @param html [String] HTML to scan
# @return the matching nodes
def extract_images_from(html)
  fragment = Nokogiri::HTML5.fragment(html)

  candidates =
    fragment.css("img[src], [#{PrettyText::BLOCKED_HOTLINKED_SRC_ATTR}], a.lightbox[href]")
  avatars = fragment.css("img.avatar")
  lightboxed_images = fragment.css(".lightbox img[src]")

  candidates - avatars - lightboxed_images
end
# Decides whether the image at src should be downloaded for post.
#
# @param src [String, nil] image URL
# @param post the post the image belongs to; handed to Upload.consider_for_reuse
# @return false for a blank src; for local sources, true only when src looks
#   like an upload and no reusable upload is found; for remote sources, false
#   unless remote downloads are enabled and src parses to a URL with a
#   hostname, otherwise the result of SiteSetting.should_download_images?
def should_download_image?(src, post = nil)
  # Make sure we actually have a URL.
  return false if src.blank?

  local_bases =
    [Discourse.base_url, Discourse.asset_host, SiteSetting.external_emoji_url.presence]
      .compact
      .map { |base| normalize_src(base) }

  # Local means: already in our store, under one of our base URLs, or a
  # root-relative path.
  local =
    Discourse.store.has_been_uploaded?(src) || normalize_src(src).start_with?(*local_bases) ||
      src.match?(%r{\A/[^/]}i)

  if local
    return false unless src.match?(%r{/uploads/}) || Upload.secure_uploads_url?(src)

    # Someone could hotlink a file from a different site on the same CDN, so
    # check whether we have it in this database.
    #
    # If the upload already exists and is attached to a different post, or its
    # original_sha1 is missing (meaning it was created before secure media was
    # enabled), we want to download it again; otherwise we would end up reusing
    # an existing upload that may already be linked to many posts.
    return Upload.consider_for_reuse(Upload.get_from_url(src), post).blank?
  end

  # Don't download non-local images unless the site setting is enabled.
  return false unless SiteSetting.download_remote_images_to_local?

  uri =
    begin
      URI.parse(src)
    rescue URI::Error
      nil
    end
  return false if uri.nil? || uri.hostname.nil?

  # Check the domains blocklist.
  SiteSetting.should_download_images?(src)
end
# Writes message to the Rails logger, prefixed with the name of the current
# multisite database.
#
# @param log_level [Symbol] name of the logger method to call (e.g. :info, :error)
# @param message [String] text to log
def log(log_level, message)
  db_name = RailsMultisite::ConnectionManagement.current_db
  Rails.logger.public_send(log_level, "#{db_name}: #{message}")
end
protected
# Passes src through PostHotlinkedMedia.normalize_src with reset_scheme: false.
# NOTE(review): the name and the flag suggest this normalizes the URL's
# encoding while keeping its scheme — confirm against
# PostHotlinkedMedia.normalize_src.
#
# @param src [String] image URL
# @return whatever PostHotlinkedMedia.normalize_src returns for src
def replace_encoded_src(src)
PostHotlinkedMedia.normalize_src(src, reset_scheme: false)
end
# Passes src through PostHotlinkedMedia.normalize_src with its default options.
# The result is what this job uses as the lookup key for PostHotlinkedMedia
# records and for comparing a URL against the site's own base URLs.
#
# @param src [String] image or base URL
# @return whatever PostHotlinkedMedia.normalize_src returns for src
def normalize_src(src)
PostHotlinkedMedia.normalize_src(src)
end
# Turns off the download_remote_images_to_local site setting when the local
# disk is running low.
#
# Nothing happens when the store is external, when the setting is already off,
# or when #available_disk_space is at least
# SiteSetting.download_remote_images_threshold. Otherwise the setting is
# disabled, the change is recorded as a staff action performed by the system
# user, and the site contact user is notified.
def disable_if_low_on_disk_space
  nothing_to_do =
    Discourse.store.external? || !SiteSetting.download_remote_images_to_local ||
      available_disk_space >= SiteSetting.download_remote_images_threshold
  return if nothing_to_do

  SiteSetting.download_remote_images_to_local = false

  # Log the site setting change.
  reason = I18n.t("disable_remote_images_download_reason")
  StaffActionLogger.new(Discourse.system_user).log_site_setting_change(
    "download_remote_images_to_local",
    true,
    false,
    details: reason,
  )

  # Also send a private message to the site contact user.
  notify_about_low_disk_space
end
# Sends the :download_remote_images_disabled system message to the site
# contact user.
#
# @return whatever SystemMessage.create_from_system_user returns
def notify_about_low_disk_space
  recipient = Discourse.site_contact_user
  SystemMessage.create_from_system_user(recipient, :download_remote_images_disabled)
end
# Returns 100 minus DiskSpace.percent_free for the local uploads directory
# (<Rails.root>/public/uploads).
#
# NOTE(review): as written this is "100 - percent free", which reads like the
# used share rather than the available one — confirm what
# DiskSpace.percent_free actually reports before relying on the name.
def available_disk_space
  uploads_dir = "#{Rails.root}/public/uploads"
  100 - DiskSpace.percent_free(uploads_dir)
end
end
end