From 047cbb39987d4fd5d51202ce6b35ff26cdb7e5ad Mon Sep 17 00:00:00 2001 From: junbaor Date: Sun, 2 Jan 2022 11:18:49 +0800 Subject: [PATCH 1/2] =?UTF-8?q?refactor:=20=E8=8E=B7=E5=8F=96=E5=8E=9F?= =?UTF-8?q?=E5=88=9B=E5=BE=AE=E5=8D=9A=E9=95=BF=E5=86=85=E5=AE=B9=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E4=B8=BA=E6=8E=A5=E5=8F=A3=E8=B0=83=E7=94=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/parser/comment_parser.py | 16 +++++----------- weibo_spider/parser/util.py | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/weibo_spider/parser/comment_parser.py b/weibo_spider/parser/comment_parser.py index 28e75e3..a2774b3 100644 --- a/weibo_spider/parser/comment_parser.py +++ b/weibo_spider/parser/comment_parser.py @@ -1,10 +1,9 @@ import logging import random -import requests from time import sleep from .parser import Parser -from .util import handle_garbled, handle_html +from .util import handle_html, get_long_weibo_detail logger = logging.getLogger('spider.comment_parser') @@ -12,6 +11,7 @@ class CommentParser(Parser): def __init__(self, cookie, weibo_id): self.cookie = cookie + self.weibo_id = weibo_id self.url = 'https://weibo.cn/comment/' + weibo_id self.selector = handle_html(self.cookie, self.url) @@ -19,15 +19,9 @@ def get_long_weibo(self): """获取长原创微博""" try: for i in range(5): - self.selector = handle_html(self.cookie, self.url) - if self.selector is not None: - info = self.selector.xpath("//div[@class='c']")[1] - wb_content = handle_garbled(info) - wb_time = info.xpath("//span[@class='ct']/text()")[0] - weibo_content = wb_content[wb_content.find(':') + - 1:wb_content.rfind(wb_time)] - if weibo_content is not None: - return weibo_content + weibo_content = get_long_weibo_detail(self.cookie, self.weibo_id) + if weibo_content is not None: + return weibo_content sleep(random.randint(6, 10)) except Exception: logger.exception(u'网络出错') diff --git a/weibo_spider/parser/util.py b/weibo_spider/parser/util.py index 3169f24..b357e08 100644 --- a/weibo_spider/parser/util.py +++ b/weibo_spider/parser/util.py @@ -1,6 +1,7 @@ import hashlib import json import logging +import re import sys import requests @@ -118,3 +119,16 @@ def string_to_int(string): elif string.endswith(u'亿'): string = float(string[:-1]) * 100000000 return int(string) + + +def get_long_weibo_detail(cookie, id): + """获取长微博详情""" + try: + user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36' + headers = {'User_Agent': user_agent, 'Cookie': cookie} + resp = requests.get("https://m.weibo.cn/statuses/show?id=" + id, headers=headers) + if resp.status_code == 200: + content = resp.json()['data']['text'].replace("
", "\n") + return re.sub("]+>", "", content) + except Exception as e: + logger.exception(e) From 307a1b8b78497870e193201e3a61de2907422e4c Mon Sep 17 00:00:00 2001 From: junbaor Date: Mon, 3 Jan 2022 10:51:53 +0800 Subject: [PATCH 2/2] =?UTF-8?q?fix:=20=E5=88=A0=E9=99=A4=E6=97=A0=E7=94=A8?= =?UTF-8?q?=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- weibo_spider/parser/comment_parser.py | 1 - 1 file changed, 1 deletion(-) diff --git a/weibo_spider/parser/comment_parser.py b/weibo_spider/parser/comment_parser.py index a2774b3..e1131ad 100644 --- a/weibo_spider/parser/comment_parser.py +++ b/weibo_spider/parser/comment_parser.py @@ -13,7 +13,6 @@ def __init__(self, cookie, weibo_id): self.cookie = cookie self.weibo_id = weibo_id self.url = 'https://weibo.cn/comment/' + weibo_id - self.selector = handle_html(self.cookie, self.url) def get_long_weibo(self): """获取长原创微博"""