weibo: fix multiple issues with extracting videos.

* Add support for posts with both images and videos.
* Fix a bug where we didn't grab 1080p videos. This was caused by getting videos from the mobile page, which doesn't return 1080p videos. The fix is to use the `https://www.weibo.com/ajax/statuses/show?id=:id` API. We can't use this API for everything because it doesn't work for posts that can only be opened on the mobile site, so we only use it to grab videos. This API also requires a session cookie.
* Update the hashtag search link to `https://s.weibo.com/weibo?q=%23fgo%23`.
* Fix commentaries to strip `https://weibo.cn/sinaurl?u=` from outgoing links, to not output image links for emoticons, and to output desktop links instead of mobile links.

Fixes #5726: Weibo video samples are fetched instead of fullsize.
danbooru · May 22, 2024 · b10b21c · b10b21c
1 parent 6a34465
commit b10b21c
Show file tree

Hide file tree

Showing 3 changed files with 240 additions and 88 deletions.
diff --git a/app/logical/source/extractor/weibo.rb b/app/logical/source/extractor/weibo.rb
@@ -5,41 +5,54 @@ module Source
   class Extractor
     class Weibo < Source::Extractor
       def image_urls
-        if parsed_url.image_url?
+        if parsed_url.full_image_url.present?
           [parsed_url.full_image_url]
-        elsif api_response.present?
-          if api_response["pics"].present?
-            api_response["pics"].pluck("url").map { |url| Source::URL.parse(url).full_image_url }
-          elsif api_response.dig("page_info", "type") == "video"
-            variants = api_response["page_info"]["media_info"].to_h.values + api_response["page_info"]["urls"].to_h.values
-            largest_video = variants.max_by do |variant|
-              if /template=(?<width>\d+)x(?<height>\d+)/ =~ variant.to_s
-                width.to_i * height.to_i
-              else
-                0
-              end
-            end
-            [largest_video]
+        elsif parsed_url.image_url?
+          [parsed_url.to_s]
+        elsif page_json.dig(:page_info, :type) == "video"
+          image_urls_for_video
+        else
+          page_json[:pics].to_a.pluck(:url).map do |image_url|
+            Source::URL.parse(image_url).try(:full_image_url) || image_url
+          end
+        end
+      end
+
+      def image_urls_for_video
+        # https://weibo.com/2427303621/MxojLlLgQ (mixed videos and images)
+        if post[:mix_media_info].present?
+          media_items = post.dig(:mix_media_info, :items).to_a.pluck(:data).map do |item|
+            item[:media_info] || item
           end
+        # https://www.weibo.com/7817290049/N62KL5MpJ (single video)
         else
-          []
+          media_items = [post.dig(:page_info, :media_info)].compact
+        end
+
+        media_items.filter_map do |item|
+          # https://weibo.com/2427303621/MxojLlLgQ (mixed videos and images)
+          # https://www.weibo.com/7817290049/N62KL5MpJ (video with playback_list)
+          # https://m.weibo.cn/detail/4142890299009993 (video with empty playback_list)
+          media_url = [
+            item[:playback_list]&.max_by { |video| video.dig(:meta, :quality_index) }&.dig(:play_info, :url),
+            item[:stream_url_hd],
+            item[:stream_url],
+            item.dig(:largest, :url),
+          ].compact.first
+
+          Source::URL.parse(media_url).try(:full_image_url) || media_url
         end
       end
 
       def page_url
-        return nil unless api_response.present?
-
-        artist_id = api_response["user"]["id"]
-        illust_base62_id = api_response["bid"]
-        "https://www.weibo.com/#{artist_id}/#{illust_base62_id}"
+        "https://www.weibo.com/#{artist_id}/#{illust_base62_id}" if artist_id.present? && illust_base62_id.present?
       end
 
       def tags
-        return [] if api_response.blank?
-
-        matches = api_response["text"]&.scan(/surl-text">#(.*?)#</).to_a.map { |m| m[0] }
-        matches.map do |match|
-          [match, "https://s.weibo.com/weibo/#{match}"]
+        tags = page_json[:text]&.parse_html&.css(".surl-text").to_a.map(&:text).select { |text| text&.match?(/^\#.*\#$/) }
+        tags.map do |tag|
+          tag = tag.delete_prefix("#").delete_suffix("#")
+          [tag, "https://s.weibo.com/weibo?q=#{Danbooru::URL.escape("##{tag}#")}"]
         end
       end
 
@@ -52,25 +65,42 @@ def tag_name
       end
 
       def display_name
-        api_response&.dig("user", "screen_name")
+        page_json.dig(:user, :screen_name)
       end
 
       def artist_id
-        parsed_url.artist_short_id || parsed_referer&.artist_short_id || api_response&.dig("user", "id")
+        parsed_url.artist_short_id || parsed_referer&.artist_short_id || page_json.dig(:user, :id)
       end
 
       def artist_commentary_desc
-        return if api_response.blank?
-
-        api_response["text"]
+        page_json[:text]
       end
 
       def dtext_artist_commentary_desc
         DText.from_html(artist_commentary_desc, base_url: "https://www.weibo.com") do |element|
-          if element["src"].present?
-            src = Addressable::URI.heuristic_parse(element["src"])
-            src.scheme ||= "https"
-            element["src"] = src.to_s
+          case element.name
+
+          # Fix hashtag links to use desktop instead of mobile version.
+          in "a" if element[:href]&.starts_with?("https://m.weibo.cn/search")
+            element[:href] = "https://s.weibo.com/weibo?q=#{Danbooru::URL.escape(element.text)}"
+
+          # Fix user profile links to use desktop instead of mobile version.
+          in "a" if element[:href]&.starts_with?("https://m.weibo.cn/p/index")
+            id = Danbooru::URL.parse(element[:href]).params[:containerid]
+            element[:href] = "https://weibo.com/p/#{id}"
+
+          # Fix external links.
+          in "a" if element[:href]&.starts_with?("https://weibo.cn/sinaurl?u=")
+            element[:href] = Danbooru::URL.parse(element[:href]).params[:u]
+
+          # Remove emoticon images.
+          # <span class="url-icon"> <img alt="[舔屏]" src="https://h5.sinaimg.cn/m/emoticon/icon/default/d_tian-3b1ce0a112.png" style="width:1em; height:1em;" /></span>
+          in "img" if element[:src]&.starts_with?("https://h5.sinaimg.cn/m/emoticon")
+            element.name = "span"
+            element.content = element[:alt]
+
+          else
+            nil
           end
         end
       end
@@ -79,10 +109,43 @@ def mobile_page_url
         parsed_url.mobile_url || parsed_referer&.mobile_url
       end
 
-      memoize def api_response
+      def illust_id
+        parsed_url.illust_id || parsed_referer&.illust_id
+      end
+
+      def illust_base62_id
+        parsed_url.illust_base62_id || parsed_referer&.illust_base62_id || page_json[:bid]
+      end
+
+      memoize def page_json
         html = http.cache(1.minute).parsed_get(mobile_page_url)
         html.to_s[/var \$render_data = \[(.*)\]\[0\]/m, 1]&.parse_json&.dig("status") || {}
       end
+
+      # This API doesn't work for certain posts that can only be opened on the mobile site. It's only used to grab
+      # videos, since the mobile page doesn't return 1080p videos.
+      memoize def post
+        url = "https://www.weibo.com/ajax/statuses/show?id=#{illust_id}" if illust_id.present?
+        http.no_follow.cookies(SUB: sub_cookie).cache(1.minute).parsed_get(url) || {}
+      end
+
+      # This `tid` value is tied to your IP and user agent.
+      memoize def tid
+        response = http.post("https://passport.weibo.com/visitor/genvisitor?cb=gen_callback")
+        data = response.to_s[/({.*})/]&.parse_json&.dig(:data) || {}
+        data[:tid]
+      end
+
+      memoize def visitor_cookies
+        return {} unless tid.present?
+
+        response = http.get("https://passport.weibo.com/visitor/visitor", params: { a: "incarnate", t: tid })
+        response.cookies.to_h { |c| [c.name, c.value] }
+      end
+
+      def sub_cookie
+        visitor_cookies["SUB"]
+      end
     end
   end
 end
diff --git a/app/logical/source/url/weibo.rb b/app/logical/source/url/weibo.rb
@@ -11,94 +11,99 @@
 class Source::URL::Weibo < Source::URL
   RESERVED_USERNAMES = %w[u n p profile status detail]
 
-  attr_reader :full_image_url, :artist_short_id, :artist_long_id, :display_name, :username
+  attr_reader :full_image_url, :artist_short_id, :artist_long_id, :illust_long_id, :illust_base62_id, :display_name, :username
 
   def self.match?(url)
-    url.domain.in?(["weibo.com", "weibo.cn", "sinaimg.cn"])
+    url.domain.in?(%w[weibo.com weibo.cn sinaimg.cn weibocdn.com])
   end
 
   def parse
-    case [host, *path_segments]
+    case [subdomain, domain, *path_segments]
+
+    # https://f.video.weibocdn.com/o0/wPhyi3dIlx086mr8Md3y01041200xT4N0E010.mp4?label=mp4_1080p&template=1080x1920.24.0&media_id=4914351942074379&tp=8x8A3El:YTkl0eM8&us=0&ori=1&bf=4&ot=v&ps=3lckmu&uid=3ZoTIp&ab=,3601-g32,8143-g0,8013-g0,3601-g32,3601-g37&Expires=1716316057&ssig=uW43Bg6Lo1&KID=unistore,video
+    # https://f.us.sinaimg.cn/003K8vB7lx07rz92ubWg010412002UHB0E010.mp4?label=mp4_1080p&template=1920x1080.20.0&media_id=4339747921802209&tp=8x8A3El:YTkl0eM8&us=0&ori=1&bf=4&ot=h&lp=00002g58dE&ps=mZ6WB&uid=zszavag&ab=13038-g0,&Expires=1716411960&ssig=qmkXwFd%2B1m&KID=unistore,video
+    # https://g.us.sinaimg.cn/o0/qNZcaAAglx07Wuf921CM0104120005tc0E010.mp4?label=gif_mp4
+    in _, ("weibocdn.com" | "sinaimg.cn"), *rest if file_ext == "mp4"
+      @full_image_url = with(params: params.slice(:Expires, :ssig, :KID)).to_s
 
     # http://ww1.sinaimg.cn/large/69917555gw1f6ggdghk28j20c87lbhdt.jpg
     # https://wx1.sinaimg.cn/large/002NQ2vhly1gqzqfk1agfj62981aw4qr02.jpg
     # http://ww4.sinaimg.cn/mw690/77a2d531gw1f4u411ws3aj20m816fagg.jpg (sample)
     # https://wx4.sinaimg.cn/orj360/e3930166gy1g546bz86cij20u00u040y.jpg (sample)
     # http://ww3.sinaimg.cn/mw1024/0065kjmOgw1fabcanrzx6j30f00lcjwv.jpg (sample)
     # https://wx1.sinaimg.cn/original/7004ec1cly1ge9dcbsw4lj20jg2ir7wh.jpg
-    in /\w+\.sinaimg\.cn$/ => host, size, file
+    in _, "sinaimg.cn", size, file
       @full_image_url = "https://#{host}/large/#{file}"
 
     # http://tw.weibo.com/1300957955/3786333853668537
-    in "tw.weibo.com", /^\w+$/, /^\d+$/ => illust_long_id
+    in "tw", "weibo.com", /^\w+$/, /^\d+$/ => illust_long_id
       @illust_long_id = illust_long_id
 
     # http://weibo.com/3357910224/EEHA1AyJP
     # https://www.weibo.com/5501756072/IF9fugHzj?from=page_1005055501756072_profile&wvr=6&mod=weibotime
-    in /weibo\.(com|cn)$/, /^\d+$/ => artist_short_id, /^\w+$/ => illust_base62_id
+    in _, ("weibo.com" | "weibo.cn"), /^\d+$/ => artist_short_id, /^\w+$/ => illust_base62_id
       @artist_short_id = artist_short_id
       @illust_base62_id = illust_base62_id
 
     # http://photo.weibo.com/2125874520/wbphotos/large/mid/4194742441135220/pid/7eb64558gy1fnbryb5nzoj20dw10419t
     # http://photo.weibo.com/5732523783/talbum/detail/photo_id/4029784374069389?prel=p6_3
-    in "photo.weibo.com", /^\d+$/ => artist_short_id, _, _, _, /^\d+$/ => illust_long_id, *rest
+    in "photo", "weibo.com", /^\d+$/ => artist_short_id, _, _, _, /^\d+$/ => illust_long_id, *rest
       @artist_short_id = artist_short_id
       @illust_long_id = illust_long_id
 
     # https://m.weibo.cn/detail/4506950043618873
     # https://www.weibo.com/detail/4676597657371957
-    in _, "detail", /^\d+$/ => illust_long_id
+    in _, ("weibo.cn" | "weibo.com"), "detail", /^\d+$/ => illust_long_id
       @illust_long_id = illust_long_id
 
     # https://share.api.weibo.cn/share/304950356,4767694689143828.html
     # https://share.api.weibo.cn/share/304950356,4767694689143828
-    in "share.api.weibo.cn", "share", /^(\d+),(\d+)/
+    in "share.api", "weibo.cn", "share", /^(\d+),(\d+)/
       @illust_long_id = $2
 
     # https://m.weibo.cn/status/J33G4tH1B
-    in "m.weibo.cn", "status", /^\w+$/ => illust_base62_id
+    in "m", "weibo.cn", "status", /^\w+$/ => illust_base62_id
       @illust_base62_id = illust_base62_id
 
     # https://www.weibo.com/u/5501756072
     # https://www.weibo.com/u/5957640693/home?wvr=5
     # https://m.weibo.cn/profile/5501756072
     # https://m.weibo.cn/u/5501756072
-    in _, ("u" | "profile"), /^\d+$/ => artist_short_id, *rest
+    in _, _, ("u" | "profile"), /^\d+$/ => artist_short_id, *rest
       @artist_short_id = artist_short_id
 
     # https://www.weibo.com/p/1005055399876326 (short id: https://www.weibo.com/u/5399876326; username: https://www.weibo.com/chengziyou666)
     # https://www.weibo.com/p/1005055399876326/home?from=page_100505&mod=TAB&is_hot=1
     # https://www.weibo.cn/p/1005055399876326
     # https://m.weibo.com/p/1005055399876326
-    in _, "p", /^\d+$/ => artist_long_id, *rest
+    in _, _, "p", /^\d+$/ => artist_long_id, *rest
       @artist_long_id = artist_long_id
 
     # https://www.weibo.com/5501756072
     # https://www.weibo.cn/5501756072
     # https://weibo.com/1843267214/profile
-    in _, /^\d+$/ => artist_short_id, *rest
+    in _, _, /^\d+$/ => artist_short_id, *rest
       @artist_short_id = artist_short_id
 
     # https://weibo.com/n/肆巳4
     # https://www.weibo.com/n/小小男爵不要坑
-    in _, "n", display_name, *rest
+    in _, _, "n", display_name, *rest
       @display_name = display_name
 
     # https://www.weibo.com/endlessnsmt (short id: https://www.weibo.com/u/1879370780)
     # https://www.weibo.cn/endlessnsmt
     # https://www.weibo.com/lvxiuzi0/home
-    in _, /^\w+$/ => username, *rest unless username.in?(RESERVED_USERNAMES)
+    in _, _, /^\w+$/ => username, *rest unless username.in?(RESERVED_USERNAMES)
       @username = username
 
+    # https://weibo.com/tv/show/1034:4914351942074379?from=old_pc_videoshow
+    # https://video.weibo.com/show?fid=1034:4914351942074379
+    # https://weibo.com/ajax/common/download?pid=7eb64558gy1fnbry9mgx0j20dw104qjd
     else
       nil
     end
   end
 
-  def image_url?
-    full_image_url.present?
-  end
-
   def profile_url
     if artist_short_id.present?
       "https://www.weibo.com/u/#{artist_short_id}"
@@ -111,6 +116,10 @@ def profile_url
     end
   end
 
+  def illust_id
+    illust_long_id || illust_base62_id
+  end
+
   def mobile_url
     if @illust_long_id.present?
       "https://m.weibo.cn/detail/#{@illust_long_id}"