Skip to content

Commit

Permalink
weibo: fix multiple issues with extracting videos.
Browse files Browse the repository at this point in the history
* Add support for posts with both images and videos.
* Fix bug where we didn't grab 1080p videos. Caused by getting videos from the mobile page, which
  doesn't return 1080p videos. The fix is to use the `https://www.weibo.com/ajax/statuses/show?id=:id`
  API. We can't use this API for everything because it doesn't work for posts that can only be opened
  on the mobile site, so we only use it to grab videos. This API also requires a session cookie.
* Update the hashtag search link to `https://s.weibo.com/weibo?q=%23fgo%23`.
* Fix commentaries to strip `https://weibo.cn/sinaurl?u=` from outgoing links, to not output image
  links for emoticons, and to output desktop links instead of mobile links.

Fixes #5726: Weibo video samples are fetched instead of fullsize.
  • Loading branch information
evazion committed May 22, 2024
1 parent 6a34465 commit b10b21c
Show file tree
Hide file tree
Showing 3 changed files with 240 additions and 88 deletions.
133 changes: 98 additions & 35 deletions app/logical/source/extractor/weibo.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,41 +5,54 @@ module Source
class Extractor
class Weibo < Source::Extractor
def image_urls
if parsed_url.image_url?
if parsed_url.full_image_url.present?
[parsed_url.full_image_url]
elsif api_response.present?
if api_response["pics"].present?
api_response["pics"].pluck("url").map { |url| Source::URL.parse(url).full_image_url }
elsif api_response.dig("page_info", "type") == "video"
variants = api_response["page_info"]["media_info"].to_h.values + api_response["page_info"]["urls"].to_h.values
largest_video = variants.max_by do |variant|
if /template=(?<width>\d+)x(?<height>\d+)/ =~ variant.to_s
width.to_i * height.to_i
else
0
end
end
[largest_video]
elsif parsed_url.image_url?
[parsed_url.to_s]
elsif page_json.dig(:page_info, :type) == "video"
image_urls_for_video
else
page_json[:pics].to_a.pluck(:url).map do |image_url|
Source::URL.parse(image_url).try(:full_image_url) || image_url
end
end
end

# Returns the media URLs of a video post, read from the `post` API response.
#
# Mixed-media posts list their media under `mix_media_info.items`; otherwise the single
# `page_info.media_info` entry (if any) is used. For each item, the first available of these is taken:
# the `playback_list` entry with the highest `quality_index`, `stream_url_hd`, `stream_url`, or `largest.url`.
# `filter_map` discards items for which no URL results.
def image_urls_for_video
  media_items =
    if post[:mix_media_info].present?
      # https://weibo.com/2427303621/MxojLlLgQ (mixed videos and images)
      post.dig(:mix_media_info, :items).to_a.pluck(:data).map do |item|
        item[:media_info] || item
      end
    else
      # https://www.weibo.com/7817290049/N62KL5MpJ (single video)
      [post.dig(:page_info, :media_info)].compact
    end

  media_items.filter_map do |item|
    # https://weibo.com/2427303621/MxojLlLgQ (mixed videos and images)
    # https://www.weibo.com/7817290049/N62KL5MpJ (video with playback_list)
    # https://m.weibo.cn/detail/4142890299009993 (video with empty playback_list)
    media_url = [
      item[:playback_list]&.max_by { |video| video.dig(:meta, :quality_index) }&.dig(:play_info, :url),
      item[:stream_url_hd],
      item[:stream_url],
      item.dig(:largest, :url),
    ].compact.first

    # Use the parsed URL's full-size form when it provides one; otherwise keep the URL unchanged.
    Source::URL.parse(media_url).try(:full_image_url) || media_url
  end
end

def page_url
return nil unless api_response.present?

artist_id = api_response["user"]["id"]
illust_base62_id = api_response["bid"]
"https://www.weibo.com/#{artist_id}/#{illust_base62_id}"
"https://www.weibo.com/#{artist_id}/#{illust_base62_id}" if artist_id.present? && illust_base62_id.present?
end

def tags
return [] if api_response.blank?

matches = api_response["text"]&.scan(/surl-text">#(.*?)#</).to_a.map { |m| m[0] }
matches.map do |match|
[match, "https://s.weibo.com/weibo/#{match}"]
tags = page_json[:text]&.parse_html&.css(".surl-text").to_a.map(&:text).select { |text| text&.match?(/^\#.*\#$/) }
tags.map do |tag|
tag = tag.delete_prefix("#").delete_suffix("#")
[tag, "https://s.weibo.com/weibo?q=#{Danbooru::URL.escape("##{tag}#")}"]
end
end

Expand All @@ -52,25 +65,42 @@ def tag_name
end

def display_name
api_response&.dig("user", "screen_name")
page_json.dig(:user, :screen_name)
end

def artist_id
parsed_url.artist_short_id || parsed_referer&.artist_short_id || api_response&.dig("user", "id")
parsed_url.artist_short_id || parsed_referer&.artist_short_id || page_json.dig(:user, :id)
end

def artist_commentary_desc
return if api_response.blank?

api_response["text"]
page_json[:text]
end

def dtext_artist_commentary_desc
DText.from_html(artist_commentary_desc, base_url: "https://www.weibo.com") do |element|
if element["src"].present?
src = Addressable::URI.heuristic_parse(element["src"])
src.scheme ||= "https"
element["src"] = src.to_s
case element.name

# Fix hashtag links to use desktop instead of mobile version.
in "a" if element[:href]&.starts_with?("https://m.weibo.cn/search")
element[:href] = "https://s.weibo.com/weibo?q=#{Danbooru::URL.escape(element.text)}"

# Fix user profile links to use desktop instead of mobile version.
in "a" if element[:href]&.starts_with?("https://m.weibo.cn/p/index")
id = Danbooru::URL.parse(element[:href]).params[:containerid]
element[:href] = "https://weibo.com/p/#{id}"

# Fix external links.
in "a" if element[:href]&.starts_with?("https://weibo.cn/sinaurl?u=")
element[:href] = Danbooru::URL.parse(element[:href]).params[:u]

# Remove emoticon images.
# <span class="url-icon"> <img alt="[舔屏]" src="https://h5.sinaimg.cn/m/emoticon/icon/default/d_tian-3b1ce0a112.png" style="width:1em; height:1em;" /></span>
in "img" if element[:src]&.starts_with?("https://h5.sinaimg.cn/m/emoticon")
element.name = "span"
element.content = element[:alt]

else
nil
end
end
end
Expand All @@ -79,10 +109,43 @@ def mobile_page_url
parsed_url.mobile_url || parsed_referer&.mobile_url
end

memoize def api_response
# The post ID reported by `parsed_url`, or by `parsed_referer` (when there is one) if the URL has none.
def illust_id
  id_from_url = parsed_url.illust_id
  return id_from_url if id_from_url

  parsed_referer&.illust_id
end

# The base62 post ID: taken from `parsed_url`, then from `parsed_referer` (when there is one), and finally from
# the `bid` field of `page_json`.
def illust_base62_id
  id_from_urls = parsed_url.illust_base62_id || parsed_referer&.illust_base62_id
  return id_from_urls if id_from_urls

  page_json[:bid]
end

# Fetches `mobile_page_url` (cached for one minute) and pulls the post data out of the page: the text inside the
# brackets of `var $render_data = [...][0]` is parsed as JSON and its "status" entry is returned. Returns an empty
# hash when any of those steps yields nothing.
memoize def page_json
  page = http.cache(1.minute).parsed_get(mobile_page_url)
  render_data = page.to_s[/var \$render_data = \[(.*)\]\[0\]/m, 1]
  status = render_data&.parse_json&.dig("status")
  status || {}
end

# This API doesn't work for certain posts that can only be opened on the mobile site. It's only used to grab
# videos, since the mobile page doesn't return 1080p videos.
#
# Returns the parsed response of `https://www.weibo.com/ajax/statuses/show`, or an empty hash when there is no
# post ID or the request yields nothing.
memoize def post
  # Without an ID there is nothing to look up. Returning early also avoids evaluating `sub_cookie`, which would
  # otherwise perform the visitor cookie requests for no reason.
  return {} if illust_id.blank?

  url = "https://www.weibo.com/ajax/statuses/show?id=#{illust_id}"
  http.no_follow.cookies(SUB: sub_cookie).cache(1.minute).parsed_get(url) || {}
end

# This `tid` value is tied to your IP and user agent.
#
# POSTs to the `genvisitor` endpoint, parses the `{...}` portion of the response body as JSON, and returns the
# `tid` found under its `data` entry (nil when absent).
memoize def tid
  body = http.post("https://passport.weibo.com/visitor/genvisitor?cb=gen_callback").to_s
  payload = body[/({.*})/]&.parse_json
  visitor_data = payload&.dig(:data) || {}
  visitor_data[:tid]
end

# Requests `https://passport.weibo.com/visitor/visitor` with `a=incarnate` and the `tid`, and returns the
# response's cookies as a name => value hash. Returns an empty hash when no `tid` is available.
memoize def visitor_cookies
  if tid.present?
    visitor_response = http.get("https://passport.weibo.com/visitor/visitor", params: { a: "incarnate", t: tid })
    visitor_response.cookies.to_h { |cookie| [cookie.name, cookie.value] }
  else
    {}
  end
end

# The value of the `SUB` cookie from `visitor_cookies`, or nil when that cookie wasn't returned.
def sub_cookie
  cookies = visitor_cookies
  cookies["SUB"]
end
end
end
end
47 changes: 28 additions & 19 deletions app/logical/source/url/weibo.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,94 +11,99 @@
class Source::URL::Weibo < Source::URL
RESERVED_USERNAMES = %w[u n p profile status detail]

attr_reader :full_image_url, :artist_short_id, :artist_long_id, :display_name, :username
attr_reader :full_image_url, :artist_short_id, :artist_long_id, :illust_long_id, :illust_base62_id, :display_name, :username

def self.match?(url)
url.domain.in?(["weibo.com", "weibo.cn", "sinaimg.cn"])
url.domain.in?(%w[weibo.com weibo.cn sinaimg.cn weibocdn.com])
end

def parse
case [host, *path_segments]
case [subdomain, domain, *path_segments]

# https://f.video.weibocdn.com/o0/wPhyi3dIlx086mr8Md3y01041200xT4N0E010.mp4?label=mp4_1080p&template=1080x1920.24.0&media_id=4914351942074379&tp=8x8A3El:YTkl0eM8&us=0&ori=1&bf=4&ot=v&ps=3lckmu&uid=3ZoTIp&ab=,3601-g32,8143-g0,8013-g0,3601-g32,3601-g37&Expires=1716316057&ssig=uW43Bg6Lo1&KID=unistore,video
# https://f.us.sinaimg.cn/003K8vB7lx07rz92ubWg010412002UHB0E010.mp4?label=mp4_1080p&template=1920x1080.20.0&media_id=4339747921802209&tp=8x8A3El:YTkl0eM8&us=0&ori=1&bf=4&ot=h&lp=00002g58dE&ps=mZ6WB&uid=zszavag&ab=13038-g0,&Expires=1716411960&ssig=qmkXwFd%2B1m&KID=unistore,video
# https://g.us.sinaimg.cn/o0/qNZcaAAglx07Wuf921CM0104120005tc0E010.mp4?label=gif_mp4
in _, ("weibocdn.com" | "sinaimg.cn"), *rest if file_ext == "mp4"
@full_image_url = with(params: params.slice(:Expires, :ssig, :KID)).to_s

# http://ww1.sinaimg.cn/large/69917555gw1f6ggdghk28j20c87lbhdt.jpg
# https://wx1.sinaimg.cn/large/002NQ2vhly1gqzqfk1agfj62981aw4qr02.jpg
# http://ww4.sinaimg.cn/mw690/77a2d531gw1f4u411ws3aj20m816fagg.jpg (sample)
# https://wx4.sinaimg.cn/orj360/e3930166gy1g546bz86cij20u00u040y.jpg (sample)
# http://ww3.sinaimg.cn/mw1024/0065kjmOgw1fabcanrzx6j30f00lcjwv.jpg (sample)
# https://wx1.sinaimg.cn/original/7004ec1cly1ge9dcbsw4lj20jg2ir7wh.jpg
in /\w+\.sinaimg\.cn$/ => host, size, file
in _, "sinaimg.cn", size, file
@full_image_url = "https://#{host}/large/#{file}"

# http://tw.weibo.com/1300957955/3786333853668537
in "tw.weibo.com", /^\w+$/, /^\d+$/ => illust_long_id
in "tw", "weibo.com", /^\w+$/, /^\d+$/ => illust_long_id
@illust_long_id = illust_long_id

# http://weibo.com/3357910224/EEHA1AyJP
# https://www.weibo.com/5501756072/IF9fugHzj?from=page_1005055501756072_profile&wvr=6&mod=weibotime
in /weibo\.(com|cn)$/, /^\d+$/ => artist_short_id, /^\w+$/ => illust_base62_id
in _, ("weibo.com" | "weibo.cn"), /^\d+$/ => artist_short_id, /^\w+$/ => illust_base62_id
@artist_short_id = artist_short_id
@illust_base62_id = illust_base62_id

# http://photo.weibo.com/2125874520/wbphotos/large/mid/4194742441135220/pid/7eb64558gy1fnbryb5nzoj20dw10419t
# http://photo.weibo.com/5732523783/talbum/detail/photo_id/4029784374069389?prel=p6_3
in "photo.weibo.com", /^\d+$/ => artist_short_id, _, _, _, /^\d+$/ => illust_long_id, *rest
in "photo", "weibo.com", /^\d+$/ => artist_short_id, _, _, _, /^\d+$/ => illust_long_id, *rest
@artist_short_id = artist_short_id
@illust_long_id = illust_long_id

# https://m.weibo.cn/detail/4506950043618873
# https://www.weibo.com/detail/4676597657371957
in _, "detail", /^\d+$/ => illust_long_id
in _, ("weibo.cn" | "weibo.com"), "detail", /^\d+$/ => illust_long_id
@illust_long_id = illust_long_id

# https://share.api.weibo.cn/share/304950356,4767694689143828.html
# https://share.api.weibo.cn/share/304950356,4767694689143828
in "share.api.weibo.cn", "share", /^(\d+),(\d+)/
in "share.api", "weibo.cn", "share", /^(\d+),(\d+)/
@illust_long_id = $2

# https://m.weibo.cn/status/J33G4tH1B
in "m.weibo.cn", "status", /^\w+$/ => illust_base62_id
in "m", "weibo.cn", "status", /^\w+$/ => illust_base62_id
@illust_base62_id = illust_base62_id

# https://www.weibo.com/u/5501756072
# https://www.weibo.com/u/5957640693/home?wvr=5
# https://m.weibo.cn/profile/5501756072
# https://m.weibo.cn/u/5501756072
in _, ("u" | "profile"), /^\d+$/ => artist_short_id, *rest
in _, _, ("u" | "profile"), /^\d+$/ => artist_short_id, *rest
@artist_short_id = artist_short_id

# https://www.weibo.com/p/1005055399876326 (short id: https://www.weibo.com/u/5399876326; username: https://www.weibo.com/chengziyou666)
# https://www.weibo.com/p/1005055399876326/home?from=page_100505&mod=TAB&is_hot=1
# https://www.weibo.cn/p/1005055399876326
# https://m.weibo.com/p/1005055399876326
in _, "p", /^\d+$/ => artist_long_id, *rest
in _, _, "p", /^\d+$/ => artist_long_id, *rest
@artist_long_id = artist_long_id

# https://www.weibo.com/5501756072
# https://www.weibo.cn/5501756072
# https://weibo.com/1843267214/profile
in _, /^\d+$/ => artist_short_id, *rest
in _, _, /^\d+$/ => artist_short_id, *rest
@artist_short_id = artist_short_id

# https://weibo.com/n/肆巳4
# https://www.weibo.com/n/小小男爵不要坑
in _, "n", display_name, *rest
in _, _, "n", display_name, *rest
@display_name = display_name

# https://www.weibo.com/endlessnsmt (short id: https://www.weibo.com/u/1879370780)
# https://www.weibo.cn/endlessnsmt
# https://www.weibo.com/lvxiuzi0/home
in _, /^\w+$/ => username, *rest unless username.in?(RESERVED_USERNAMES)
in _, _, /^\w+$/ => username, *rest unless username.in?(RESERVED_USERNAMES)
@username = username

# https://weibo.com/tv/show/1034:4914351942074379?from=old_pc_videoshow
# https://video.weibo.com/show?fid=1034:4914351942074379
# https://weibo.com/ajax/common/download?pid=7eb64558gy1fnbry9mgx0j20dw104qjd
else
nil
end
end

# True when this URL has a non-blank `full_image_url`.
def image_url?
  !full_image_url.blank?
end

def profile_url
if artist_short_id.present?
"https://www.weibo.com/u/#{artist_short_id}"
Expand All @@ -111,6 +116,10 @@ def profile_url
end
end

# The post ID to use for this URL: `illust_long_id` when set, otherwise `illust_base62_id`.
def illust_id
  return illust_long_id if illust_long_id

  illust_base62_id
end

def mobile_url
if @illust_long_id.present?
"https://m.weibo.cn/detail/#{@illust_long_id}"
Expand Down
Loading

0 comments on commit b10b21c

Please sign in to comment.