Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEV: Small Amazon fixes #463

Merged
merged 7 commits into from Apr 2, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
33 changes: 23 additions & 10 deletions lib/onebox/engine/amazon_onebox.rb
Expand Up @@ -14,17 +14,20 @@ class AmazonOnebox
matches_regexp(/^https?:\/\/(?:www\.)?(?:smile\.)?(amazon|amzn)\.(?<tld>com|ca|de|it|es|fr|co\.jp|co\.uk|cn|in|com\.br|com\.mx|nl|pl|sa|sg|se|com\.tr|ae)\//)

def url
# Have we cached the HTML body of the requested URL?
# If so, try to grab the canonical URL from that document,
# If possible, fetch the cached HTML body immediately so we can
# try to grab the canonical URL from that document,
# rather than guess at the best URL structure to use
if @body_cacher && @body_cacher.respond_to?('cache_response_body?')
jbrw marked this conversation as resolved.
Show resolved Hide resolved
if @body_cacher.cached_response_body_exists?(uri.to_s)
if @body_cacher.cache_response_body?(uri.to_s) && @body_cacher.cached_response_body_exists?(uri.to_s)
@raw ||= Onebox::Helpers.fetch_html_doc(@url, http_params, @body_cacher)
canonical_link = @raw.at('//link[@rel="canonical"]/@href')
return canonical_link.to_s if canonical_link
end
end

if @raw
canonical_link = @raw.at('//link[@rel="canonical"]/@href')
return canonical_link.to_s if canonical_link
end

if match && match[:id]
return "https://www.amazon.#{tld}/dp/#{Onebox::Helpers.uri_encode(match[:id])}"
end
Expand All @@ -45,7 +48,7 @@ def http_params
private

def match
@match ||= @url.match(/(?:d|g)p\/(?:product\/|video\/detail\/)?(?<id>[^\/]+)(?:\/|$)/mi)
@match ||= @url.match(/(?:d|g)p\/(?:product\/|video\/detail\/)?(?<id>[A-Z0-9]+)(?:\/|\?|$)/mi)
CvX marked this conversation as resolved.
Show resolved Hide resolved
end

def image
Expand All @@ -60,6 +63,10 @@ def image
end

if (landing_image = raw.css("#landingImage")) && landing_image.any?
attributes = landing_image.first.attributes

return attributes["data-old-hires"].to_s if attributes["data-old-hires"]

landing_image.first["src"].to_s
end

Expand Down Expand Up @@ -110,7 +117,7 @@ def data
end

result = {
link: link,
link: url,
title: title,
by_info: authors,
image: og.image || image,
Expand Down Expand Up @@ -141,7 +148,7 @@ def data
end

result = {
link: link,
link: url,
title: title,
by_info: authors,
image: og.image || image,
Expand All @@ -157,7 +164,7 @@ def data
else
title = og.title || CGI.unescapeHTML(raw.css("title").inner_text)
result = {
link: link,
link: url,
title: title,
image: og.image || image,
price: price
Expand All @@ -167,7 +174,13 @@ def data
result[:by_info] = Onebox::Helpers.clean(result[:by_info].inner_html) if result[:by_info]

summary = raw.at("#productDescription")
result[:description] = og.description || (summary && summary.inner_text) || CGI.unescapeHTML(Onebox::Helpers.truncate(raw.css("meta[name=description]").first["content"], 250))

description = og.description || (summary && summary.inner_text)
jbrw marked this conversation as resolved.
Show resolved Hide resolved
description ||= begin
meta_description = raw.css("meta[name=description]")
meta_description.first["content"] if meta_description.length > 0
end
jbrw marked this conversation as resolved.
Show resolved Hide resolved
result[:description] = CGI.unescapeHTML(Onebox::Helpers.truncate(description, 250)) if description
end

result[:price] = nil if result[:price].start_with?("$0") || result[:price] == 0
Expand Down
4 changes: 2 additions & 2 deletions lib/onebox/engine/html.rb
Expand Up @@ -11,8 +11,8 @@ def http_params
end

def raw
body_cacher = self.options[:body_cacher] if self.options
@raw ||= Onebox::Helpers.fetch_html_doc(url, http_params, body_cacher)
@body_cacher = self.options[:body_cacher] if self.options
jbrw marked this conversation as resolved.
Show resolved Hide resolved
@raw ||= Onebox::Helpers.fetch_html_doc(url, http_params, @body_cacher)
end

def html?
Expand Down
2 changes: 1 addition & 1 deletion lib/onebox/version.rb
@@ -1,5 +1,5 @@
# frozen_string_literal: true

module Onebox
VERSION = "2.2.9"
VERSION = "2.2.10"
end
7 changes: 7 additions & 0 deletions spec/lib/onebox/engine/amazon_onebox_spec.rb
Expand Up @@ -73,6 +73,13 @@ def check_link(tdl, link)
expect(described_class.new("http://www.amazon.fr/gp/product/B01BYD0TZM").url)
.to eq("https://www.amazon.fr/dp/B01BYD0TZM")
end

let(:long_url) { "https://www.amazon.ca/gp/product/B087Z3N428?pf_rd_r=SXABADD0ZZ3NF9Q5F8TW&pf_rd_p=05378fd5-c43e-4948-99b1-a65b129fdd73&pd_rd_r=0237fb28-7f47-49f4-986a-be0c78e52863&pd_rd_w=FfIoI&pd_rd_wg=Hw4qq&ref_=pd_gw_unk" }

it "removes parameters from the URL" do
expect(described_class.new(long_url).url)
.not_to include("?pf_rd_r")
end
end

describe "#to_html" do
Expand Down