[CI] auto update yt_dlp to upstream commit 12d8ea8246fa901de302ff5cc7…

…48caddadc82f41
firsttris · May 17, 2024 · 5439057 · 5439057
1 parent e48cb8e
commit 5439057
Show file tree

Hide file tree

Showing 8 changed files with 481 additions and 197 deletions.
diff --git a/lib/yt_dlp/cookies.py b/lib/yt_dlp/cookies.py
@@ -46,7 +46,7 @@
 from .utils._utils import _YDLLogger
 from .utils.networking import normalize_url
 
-CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'}
+CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi', 'whale'}
 SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'}
 
 
@@ -219,6 +219,7 @@ def _get_chromium_based_browser_settings(browser_name):
             'edge': os.path.join(appdata_local, R'Microsoft\Edge\User Data'),
             'opera': os.path.join(appdata_roaming, R'Opera Software\Opera Stable'),
             'vivaldi': os.path.join(appdata_local, R'Vivaldi\User Data'),
+            'whale': os.path.join(appdata_local, R'Naver\Naver Whale\User Data'),
         }[browser_name]
 
     elif sys.platform == 'darwin':
@@ -230,6 +231,7 @@ def _get_chromium_based_browser_settings(browser_name):
             'edge': os.path.join(appdata, 'Microsoft Edge'),
             'opera': os.path.join(appdata, 'com.operasoftware.Opera'),
             'vivaldi': os.path.join(appdata, 'Vivaldi'),
+            'whale': os.path.join(appdata, 'Naver/Whale'),
         }[browser_name]
 
     else:
@@ -241,6 +243,7 @@ def _get_chromium_based_browser_settings(browser_name):
             'edge': os.path.join(config, 'microsoft-edge'),
             'opera': os.path.join(config, 'opera'),
             'vivaldi': os.path.join(config, 'vivaldi'),
+            'whale': os.path.join(config, 'naver-whale'),
         }[browser_name]
 
     # Linux keyring names can be determined by snooping on dbus while opening the browser in KDE:
@@ -252,6 +255,7 @@ def _get_chromium_based_browser_settings(browser_name):
         'edge': 'Microsoft Edge' if sys.platform == 'darwin' else 'Chromium',
         'opera': 'Opera' if sys.platform == 'darwin' else 'Chromium',
         'vivaldi': 'Vivaldi' if sys.platform == 'darwin' else 'Chrome',
+        'whale': 'Whale',
     }[browser_name]
 
     browsers_without_profiles = {'opera'}

diff --git a/lib/yt_dlp/extractor/bbc.py b/lib/yt_dlp/extractor/bbc.py
diff --git a/lib/yt_dlp/extractor/cda.py b/lib/yt_dlp/extractor/cda.py
@@ -16,7 +16,6 @@
     merge_dicts,
     multipart_encode,
     parse_duration,
-    random_birthday,
     traverse_obj,
     try_call,
     try_get,
@@ -63,38 +62,57 @@ class CDAIE(InfoExtractor):
             'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a',
             'thumbnail': r're:^https?://.*\.jpg$',
             'uploader': 'crash404',
-            'view_count': int,
             'average_rating': float,
             'duration': 137,
             'age_limit': 0,
+            'upload_date': '20160220',
+            'timestamp': 1455968218,
         }
     }, {
-        # Age-restricted
-        'url': 'http://www.cda.pl/video/1273454c4',
+        # Age-restricted with vfilm redirection
+        'url': 'https://www.cda.pl/video/8753244c4',
+        'md5': 'd8eeb83d63611289507010d3df3bb8b3',
         'info_dict': {
-            'id': '1273454c4',
+            'id': '8753244c4',
             'ext': 'mp4',
-            'title': 'Bronson (2008) napisy HD 1080p',
-            'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c',
+            'title': '[18+] Bez Filtra: Rezerwowe Psy czyli...  najwulgarniejsza polska gra?',
+            'description': 'md5:ae80bac31bd6a9f077a6cce03c7c077e',
             'height': 1080,
-            'uploader': 'boniek61',
+            'uploader': 'arhn eu',
             'thumbnail': r're:^https?://.*\.jpg$',
-            'duration': 5554,
+            'duration': 991,
             'age_limit': 18,
-            'view_count': int,
             'average_rating': float,
-        },
+            'timestamp': 1633888264,
+            'upload_date': '20211010',
+        }
+    }, {
+        # Age-restricted without vfilm redirection
+        'url': 'https://www.cda.pl/video/17028157b8',
+        'md5': 'c1fe5ff4582bace95d4f0ce0fbd0f992',
+        'info_dict': {
+            'id': '17028157b8',
+            'ext': 'mp4',
+            'title': 'STENDUPY MICHAŁ OGIŃSKI',
+            'description': 'md5:5851f3272bfc31f762d616040a1d609a',
+            'height': 480,
+            'uploader': 'oginski',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 18855,
+            'age_limit': 18,
+            'average_rating': float,
+            'timestamp': 1699705901,
+            'upload_date': '20231111',
+        }
     }, {
         'url': 'http://ebd.cda.pl/0x0/5749950c',
         'only_matching': True,
     }]
 
     def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
-        form_data = random_birthday('rok', 'miesiac', 'dzien')
-        form_data.update({'return': url, 'module': 'video', 'module_id': video_id})
-        data, content_type = multipart_encode(form_data)
+        data, content_type = multipart_encode({'age_confirm': ''})
         return self._download_webpage(
-            urljoin(url, '/a/validatebirth'), video_id, *args,
+            url, video_id, *args,
             data=data, headers={
                 'Referer': url,
                 'Content-Type': content_type,
@@ -164,7 +182,7 @@ def _real_extract(self, url):
         if 'Authorization' in self._API_HEADERS:
             return self._api_extract(video_id)
         else:
-            return self._web_extract(video_id, url)
+            return self._web_extract(video_id)
 
     def _api_extract(self, video_id):
         meta = self._download_json(
@@ -197,9 +215,9 @@ def _api_extract(self, video_id):
             'view_count': meta.get('views'),
         }
 
-    def _web_extract(self, video_id, url):
+    def _web_extract(self, video_id):
         self._set_cookie('cda.pl', 'cda.player', 'html5')
-        webpage = self._download_webpage(
+        webpage, urlh = self._download_webpage_handle(
             f'{self._BASE_URL}/video/{video_id}/vfilm', video_id)
 
         if 'Ten film jest dostępny dla użytkowników premium' in webpage:
@@ -209,10 +227,10 @@ def _web_extract(self, video_id, url):
             self.raise_geo_restricted()
 
         need_confirm_age = False
-        if self._html_search_regex(r'(<form[^>]+action="[^"]*/a/validatebirth[^"]*")',
+        if self._html_search_regex(r'(<button[^>]+name="[^"]*age_confirm[^"]*")',
                                    webpage, 'birthday validate form', default=None):
             webpage = self._download_age_confirm_page(
-                url, video_id, note='Confirming age')
+                urlh.url, video_id, note='Confirming age')
             need_confirm_age = True
 
         formats = []
@@ -222,9 +240,6 @@ def _web_extract(self, video_id, url):
             (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*?
             <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3>
         ''', webpage, 'uploader', default=None, group='uploader')
-        view_count = self._search_regex(
-            r'Odsłony:(?:\s|&nbsp;)*([0-9]+)', webpage,
-            'view_count', default=None)
         average_rating = self._search_regex(
             (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)',
              r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False,
@@ -235,7 +250,6 @@ def _web_extract(self, video_id, url):
             'title': self._og_search_title(webpage),
             'description': self._og_search_description(webpage),
             'uploader': uploader,
-            'view_count': int_or_none(view_count),
             'average_rating': float_or_none(average_rating),
             'thumbnail': self._og_search_thumbnail(webpage),
             'formats': formats,

diff --git a/lib/yt_dlp/extractor/common.py b/lib/yt_dlp/extractor/common.py
@@ -957,7 +957,8 @@ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=
         if urlh is False:
             assert not fatal
             return False
-        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
+        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal,
+                                             encoding=encoding, data=data)
         return (content, urlh)
 
     @staticmethod
@@ -1005,8 +1006,10 @@ def __check_blocked(self, content):
                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
                 expected=True)
 
-    def _request_dump_filename(self, url, video_id):
-        basen = f'{video_id}_{url}'
+    def _request_dump_filename(self, url, video_id, data=None):
+        if data is not None:
+            data = hashlib.md5(data).hexdigest()
+        basen = join_nonempty(video_id, data, url, delim='_')
         trim_length = self.get_param('trim_file_name') or 240
         if len(basen) > trim_length:
             h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
@@ -1028,16 +1031,18 @@ def __decode_webpage(self, webpage_bytes, encoding, headers):
         except LookupError:
             return webpage_bytes.decode('utf-8', 'replace')
 
-    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
+    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True,
+                              prefix=None, encoding=None, data=None):
         webpage_bytes = urlh.read()
         if prefix is not None:
             webpage_bytes = prefix + webpage_bytes
+        url_or_request = self._create_request(url_or_request, data)
         if self.get_param('dump_intermediate_pages', False):
             self.to_screen('Dumping request to ' + urlh.url)
             dump = base64.b64encode(webpage_bytes).decode('ascii')
             self._downloader.to_screen(dump)
         if self.get_param('write_pages'):
-            filename = self._request_dump_filename(urlh.url, video_id)
+            filename = self._request_dump_filename(urlh.url, video_id, url_or_request.data)
             self.to_screen(f'Saving request to {filename}')
             with open(filename, 'wb') as outf:
                 outf.write(webpage_bytes)
@@ -1098,7 +1103,7 @@ def download_content(self, url_or_request, video_id, note=note, errnote=errnote,
                              impersonate=None, require_impersonation=False):
             if self.get_param('load_pages'):
                 url_or_request = self._create_request(url_or_request, data, headers, query)
-                filename = self._request_dump_filename(url_or_request.url, video_id)
+                filename = self._request_dump_filename(url_or_request.url, video_id, url_or_request.data)
                 self.to_screen(f'Loading request from {filename}')
                 try:
                     with open(filename, 'rb') as dumpf:

diff --git a/lib/yt_dlp/extractor/tiktok.py b/lib/yt_dlp/extractor/tiktok.py
@@ -45,19 +45,18 @@ class TikTokBaseIE(InfoExtractor):
         # "app id": aweme = 1128, trill = 1180, musical_ly = 1233, universal = 0
         'aid': '0',
     }
-    _KNOWN_APP_INFO = [
-        '7351144126450059040',
-        '7351149742343391009',
-        '7351153174894626592',
-    ]
     _APP_INFO_POOL = None
     _APP_INFO = None
     _APP_USER_AGENT = None
 
+    @property
+    def _KNOWN_APP_INFO(self):
+        return self._configuration_arg('app_info', ie_key=TikTokIE)
+
     @property
     def _API_HOSTNAME(self):
         return self._configuration_arg(
-            'api_hostname', ['api22-normal-c-useast2a.tiktokv.com'], ie_key=TikTokIE)[0]
+            'api_hostname', ['api16-normal-c-useast1a.tiktokv.com'], ie_key=TikTokIE)[0]
 
     def _get_next_app_info(self):
         if self._APP_INFO_POOL is None:
@@ -66,13 +65,10 @@ def _get_next_app_info(self):
                 for key, default in self._APP_INFO_DEFAULTS.items()
                 if key != 'iid'
             }
-            app_info_list = (
-                self._configuration_arg('app_info', ie_key=TikTokIE)
-                or random.sample(self._KNOWN_APP_INFO, len(self._KNOWN_APP_INFO)))
             self._APP_INFO_POOL = [
                 {**defaults, **dict(
                     (k, v) for k, v in zip(self._APP_INFO_DEFAULTS, app_info.split('/')) if v
-                )} for app_info in app_info_list
+                )} for app_info in self._KNOWN_APP_INFO
             ]
 
         if not self._APP_INFO_POOL:
@@ -757,11 +753,13 @@ class TikTokIE(TikTokBaseIE):
 
     def _real_extract(self, url):
         video_id, user_id = self._match_valid_url(url).group('id', 'user_id')
-        try:
-            return self._extract_aweme_app(video_id)
-        except ExtractorError as e:
-            e.expected = True
-            self.report_warning(f'{e}; trying with webpage')
+
+        if self._KNOWN_APP_INFO:
+            try:
+                return self._extract_aweme_app(video_id)
+            except ExtractorError as e:
+                e.expected = True
+                self.report_warning(f'{e}; trying with webpage')
 
         url = self._create_url(user_id, video_id)
         webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'Mozilla/5.0'})

diff --git a/lib/yt_dlp/extractor/twitter.py b/lib/yt_dlp/extractor/twitter.py
@@ -36,7 +36,7 @@ class TwitterBaseIE(InfoExtractor):
     _NETRC_MACHINE = 'twitter'
     _API_BASE = 'https://api.twitter.com/1.1/'
     _GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/'
-    _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:twitter\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/'
+    _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:(?:twitter|x)\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/'
     _AUTH = 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
     _LEGACY_AUTH = 'AAAAAAAAAAAAAAAAAAAAAIK1zgAAAAAA2tUWuhGZ2JceoId5GwYWU5GspY4%3DUq7gzFoCZs1QfwGoVdvSac3IniczZEYXIcDyumCauIXpcAPorE'
     _flow_token = None
@@ -1191,6 +1191,31 @@ class TwitterIE(TwitterBaseIE):
             'age_limit': 0,
             '_old_archive_ids': ['twitter 1724884212803834154'],
         },
+    }, {
+        # x.com
+        'url': 'https://x.com/historyinmemes/status/1790637656616943991',
+        'md5': 'daca3952ba0defe2cfafb1276d4c1ea5',
+        'info_dict': {
+            'id': '1790637589910654976',
+            'ext': 'mp4',
+            'title': 'Historic Vids - One of the most intense moments in history',
+            'description': 'One of the most intense moments in history https://t.co/Zgzhvix8ES',
+            'display_id': '1790637656616943991',
+            'uploader': 'Historic Vids',
+            'uploader_id': 'historyinmemes',
+            'uploader_url': 'https://twitter.com/historyinmemes',
+            'channel_id': '855481986290524160',
+            'upload_date': '20240515',
+            'timestamp': 1715756260.0,
+            'duration': 15.488,
+            'tags': [],
+            'comment_count': int,
+            'repost_count': int,
+            'like_count': int,
+            'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+',
+            'age_limit': 0,
+            '_old_archive_ids': ['twitter 1790637656616943991'],
+        }
     }, {
         # onion route
         'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273',