Skip to content

Commit

Permalink
Merge pull request #575 from myshero/issues_bug_574
Browse files (browse the repository at this point in the history)
issues_bug_574 优化获取微博长文
  • Loading branch information
dataabc committed Apr 27, 2024
2 parents 4b9d66a + 241d109 commit d7de931
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 8 deletions.
21 changes: 15 additions & 6 deletions weibo_spider/parser/comment_parser.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import logging
import random
import requests
import re
from time import sleep

from lxml.html import tostring
from lxml.html import fromstring
from lxml import etree
from .parser import Parser
from .util import handle_garbled, handle_html

Expand All
@@ -21,11 +24,17 @@ def get_long_weibo(self):
for i in range(5):
self.selector = handle_html(self.cookie, self.url)
if self.selector is not None:
info = self.selector.xpath("//div[@class='c']")[1]
wb_content = handle_garbled(info)
wb_time = info.xpath("//span[@class='ct']/text()")[0]
weibo_content = wb_content[wb_content.find(':') +
1:wb_content.rfind(wb_time)]
info_div = self.selector.xpath("//div[@class='c' and @id='M_']")[0]
info_span = info_div.xpath("//span[@class='ctt']")[0]
# 1. 获取 info_span 中的所有 HTML 代码作为字符串
html_string = etree.tostring(info_span, encoding='unicode', method='html')
# 2. 将 <br> 替换为 \n
html_string = html_string.replace('<br>', '\n')
# 3. 去掉所有 HTML 标签,但保留标签内的有效文本
new_content = fromstring(html_string).text_content()
# 4. 替换多个连续的 \n 为一个 \n
new_content = re.sub(r'\n+', '\n', new_content)
weibo_content = handle_garbled(new_content)
if weibo_content is not None:
return weibo_content
sleep(random.randint(6, 10))
Expand Down
9 changes: 7 additions & 2 deletions weibo_spider/parser/util.py
Original file line number Diff line number Diff line change
Expand Up
@@ -48,8 +48,13 @@ def handle_html(cookie, url):
def handle_garbled(info):
"""处理乱码"""
try:
info = (info.xpath('string(.)').replace(u'\u200b', '').encode(
sys.stdout.encoding, 'ignore').decode(sys.stdout.encoding))
if hasattr(info, 'xpath'): # 检查 info 是否具有 xpath 方法
info_str = info.xpath('string(.)') # 提取字符串内容
else:
info_str = str(info) # 若不支持 xpath,将其转换为字符串

info = info_str.replace(u'\u200b', '').encode(
sys.stdout.encoding, 'ignore').decode(sys.stdout.encoding)
return info
except Exception as e:
logger.exception(e)
Expand Down

0 comments on commit d7de931

Please sign in to comment.