From f6fcb7d99bd82cfbfdd3c3212c42436b4a83571b Mon Sep 17 00:00:00 2001 From: Zhengyang Song Date: Sun, 27 Aug 2023 15:11:12 +0800 Subject: [PATCH] Fix the crawling of toutiao article urls. --- weibo_spider/parser/page_parser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/weibo_spider/parser/page_parser.py b/weibo_spider/parser/page_parser.py index a2c5900..fcd1714 100644 --- a/weibo_spider/parser/page_parser.py +++ b/weibo_spider/parser/page_parser.py @@ -75,7 +75,7 @@ def get_one_page(self, weibo_id_list): publish_time = datetime_util.str_to_time( weibo.publish_time) - if publish_time < since_date: + if publish_time < since_date: # As of 2023.05, there can be at most 2 pinned weibo. # We will continue for at most 2 times before return. if self.page == 1 and cur_pinned_count < MAX_PINNED_COUNT: @@ -158,9 +158,9 @@ def get_article_url(self, info): """获取微博头条文章的url""" article_url = '' text = handle_garbled(info) - if text.startswith(u'发布了头条文章'): + if text.startswith(u'发布了头条文章') or text.startswith(u'我发表了头条文章'): url = info.xpath('.//a/@href') - if url and url[0].startswith('https://weibo.cn/sinaurl'): + if url and url[0].startswith('https://weibo.com/ttarticle'): article_url = url[0] return article_url