Skip to content

Commit

Permalink
FIX: broken onebox images due to url normalization bugs
Browse files Browse the repository at this point in the history
normalized_encode in addressable has a number of issues, including sporkmonger/addressable#472

To temporaily work around those issues for the majority of cases, we try parsing with `::URI`. If that fails (e.g. due to non-ascii characters) then we will fall back to addressable.

Hopefully we can simplify this back to `Addressable::URI.normalized_encode` in the future.

This commit also adds support for unicode domain names and emoji domain names with escape_uri.

This removes an unneeded hack checking for pre-signed urls, which are now handled by the general case due to starting off valid and only being minimally normalized. Previous test case continues to pass.

UrlHelper.s3_presigned_url? which was somewhat wide was removed.
  • Loading branch information
SamSaffron authored and davidtaylorhq committed Aug 9, 2022
1 parent 3755bad commit f0a0252
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 30 deletions.
60 changes: 54 additions & 6 deletions lib/url_helper.rb
Expand Up @@ -60,9 +60,35 @@ def self.secure_proxy_without_cdn(url)
self.absolute(Upload.secure_media_url_from_upload_url(url), nil)
end

# This is a poorly named method. In reality, it **normalizes** the given URL,
# and does not escape it. Therefore it's idempotent, and can be used on user
# input which includes a mix of escaped and unescaped characters
def self.escape_uri(uri)
return uri if s3_presigned_url?(uri)
Addressable::URI.normalized_encode(uri)
validated = nil
url = uri.to_s

# Ideally we will jump straight to `Addressable::URI.normalized_encode`. However,
# that implementation has some edge-case issues like https://github.com/sporkmonger/addressable/issues/472.
# To temporaily work around those issues for the majority of cases, we try parsing with `::URI`.
# If that fails (e.g. due to non-ascii characters) then we will fall back to addressable.
# Hopefully we can simplify this back to `Addressable::URI.normalized_encode` in the future.

# edge case where we expect mailto:test%40test.com to normalize to mailto:test@test.com
if url.match(/\Amailto:/)
return normalize_with_addressable(url)
end

# If it doesn't pass the regexp, it's definitely not gonna parse with URI.parse. Skip
# to addressable
if !url.match?(/\A#{URI::regexp}\z/)
return normalize_with_addressable(url)
end

begin
normalize_with_ruby_uri(url)
rescue URI::Error
normalize_with_addressable(url)
end
end

def self.rails_route_from_url(url)
Expand All @@ -72,10 +98,6 @@ def self.rails_route_from_url(url)
nil
end

def self.s3_presigned_url?(url)
url[/x-amz-(algorithm|credential)/i].present?
end

def self.cook_url(url, secure: false, local: nil)
is_secure = SiteSetting.secure_media && secure
local = is_local(url) if local.nil?
Expand Down Expand Up @@ -120,4 +142,30 @@ def self.local_cdn_url(url)
end
end

private

def self.normalize_with_addressable(url)
u = Addressable::URI.normalized_encode(url, Addressable::URI)

if u.host && !u.host.ascii_only?
u.host = ::Addressable::IDNA.to_ascii(u.host)
end

u.to_s
end

def self.normalize_with_ruby_uri(url)
u = URI.parse(url)

if u.scheme && u.scheme != u.scheme.downcase
u.scheme = u.scheme.downcase
end

if u.host && u.host != u.host.downcase
u.host = u.host.downcase
end

u.to_s
end

end
2 changes: 1 addition & 1 deletion spec/lib/final_destination_spec.rb
Expand Up @@ -26,7 +26,7 @@
when 'any-subdomain.ihaveawildcard.com' then '104.25.152.11'
when 'wikipedia.com' then '1.2.3.4'
else
as_ip = IPAddr.new(host)
_as_ip = IPAddr.new(host)
host
end
end
Expand Down
43 changes: 41 additions & 2 deletions spec/lib/url_helper_spec.rb
Expand Up @@ -86,6 +86,16 @@
end

describe "#escape_uri" do
it "does not double escape %3A (:)" do
url = "http://discourse.org/%3A/test"
expect(UrlHelper.escape_uri(url)).to eq(url)
end

it "does not double escape %2F (/)" do
url = "http://discourse.org/%2F/test"
expect(UrlHelper.escape_uri(url)).to eq(url)
end

it "doesn't escape simple URL" do
url = UrlHelper.escape_uri('http://example.com/foo/bar')
expect(url).to eq('http://example.com/foo/bar')
Expand All @@ -107,8 +117,37 @@
end

it "doesn't escape already escaped chars (hash)" do
url = UrlHelper.escape_uri('https://calendar.google.com/calendar/embed?src=en.uk%23holiday%40group.v.calendar.google.com&ctz=Europe%2FLondon')
expect(url).to eq('https://calendar.google.com/calendar/embed?src=en.uk%23holiday@group.v.calendar.google.com&ctz=Europe/London')
url = 'https://calendar.google.com/calendar/embed?src=en.uk%23holiday@group.v.calendar.google.com&ctz=Europe%2FLondon'
escaped = UrlHelper.escape_uri(url)
expect(escaped).to eq(url)
end

it "leaves reserved chars alone in edge cases" do
skip "see: https://github.com/sporkmonger/addressable/issues/472"
url = "https://example.com/ article/id%3A1.2%2F1/bar"
expected = "https://example.com/%20article/id%3A1.2%2F1/bar"
escaped = UrlHelper.escape_uri(url)
expect(escaped).to eq(expected)
end

it "handles emoji domain names" do
url = "https://💻.example/💻?computer=💻"
expected = "https://xn--3s8h.example/%F0%9F%92%BB?computer=%F0%9F%92%BB"
escaped = UrlHelper.escape_uri(url)
expect(escaped).to eq(expected)
end

it "handles special-character domain names" do
url = "https://éxample.com/test"
expected = "https://xn--xample-9ua.com/test"
escaped = UrlHelper.escape_uri(url)
expect(escaped).to eq(expected)
end

it "performs basic normalization" do
url = "http://EXAMPLE.com/a"
escaped = UrlHelper.escape_uri(url)
expect(escaped).to eq("http://example.com/a")
end

it "doesn't escape S3 presigned URLs" do
Expand Down
21 changes: 0 additions & 21 deletions spec/models/upload_spec.rb
Expand Up @@ -568,27 +568,6 @@ def enable_secure_media
end
end

describe ".signed_url_from_secure_media_url" do
before do
# must be done so signed_url_for_path exists
enable_secure_media
end

it "correctly gives back a signed url from a path only" do
secure_url = "/secure-media-uploads/original/1X/c5a2c4ba0fa390f5aac5c2c1a12416791ebdd9e9.png"
signed_url = Upload.signed_url_from_secure_media_url(secure_url)
expect(signed_url).not_to include("secure-media-uploads")
expect(UrlHelper.s3_presigned_url?(signed_url)).to eq(true)
end

it "correctly gives back a signed url from a full url" do
secure_url = "http://localhost:3000/secure-media-uploads/original/1X/c5a2c4ba0fa390f5aac5c2c1a12416791ebdd9e9.png"
signed_url = Upload.signed_url_from_secure_media_url(secure_url)
expect(signed_url).not_to include(Discourse.base_url)
expect(UrlHelper.s3_presigned_url?(signed_url)).to eq(true)
end
end

describe ".secure_media_url_from_upload_url" do
before do
# must be done so signed_url_for_path exists
Expand Down

1 comment on commit f0a0252

@discoursebot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This commit has been mentioned on Discourse Meta. There might be relevant details there:

https://meta.discourse.org/t/oneboxing-changes-link-url/182545/9

Please sign in to comment.