refactor: switch fetching of long original weibo content to an API call #406

Open
wants to merge 3 commits into base: master
Changes from 1 commit
16 changes: 5 additions & 11 deletions weibo_spider/parser/comment_parser.py
@@ -1,33 +1,27 @@
import logging
import random
import requests
from time import sleep

from .parser import Parser
from .util import handle_garbled, handle_html
from .util import handle_html, get_long_weibo_detail

logger = logging.getLogger('spider.comment_parser')


class CommentParser(Parser):
    def __init__(self, cookie, weibo_id):
        self.cookie = cookie
        self.weibo_id = weibo_id
        self.url = 'https://weibo.cn/comment/' + weibo_id
        self.selector = handle_html(self.cookie, self.url)
Owner
Thanks for the contribution. This `selector` line isn't used anywhere; I'd suggest deleting it.

Author
Could you check whether line 22 uses self.weibo_id? My IDE reported an error when I left it out. I haven't written much Python; I put this together by imitating the existing code.

Owner
I meant the selector line at line 16; it isn't used, so it can be removed.

Author
Understood, it's been deleted.


    def get_long_weibo(self):
        """Fetch the full text of a long original weibo."""
        try:
            for i in range(5):
                self.selector = handle_html(self.cookie, self.url)
                if self.selector is not None:
                    info = self.selector.xpath("//div[@class='c']")[1]
                    wb_content = handle_garbled(info)
                    wb_time = info.xpath("//span[@class='ct']/text()")[0]
                    weibo_content = wb_content[wb_content.find(':') +
                                               1:wb_content.rfind(wb_time)]
                    if weibo_content is not None:
                        return weibo_content
                weibo_content = get_long_weibo_detail(self.cookie, self.weibo_id)
                if weibo_content is not None:
                    return weibo_content
                sleep(random.randint(6, 10))
        except Exception:
            logger.exception(u'Network error')
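The refactored get_long_weibo reduces to a simple retry loop around the API helper: try, check for a non-None result, sleep, and try again. A minimal standalone sketch of that pattern (fetch_with_retry and flaky are hypothetical names for illustration, not part of this PR; the PR sleeps 6-10 seconds between attempts, shortened here):

```python
import random
from time import sleep


def fetch_with_retry(fetch, attempts=5):
    """Call ``fetch`` up to ``attempts`` times, sleeping between tries.

    ``fetch`` is any zero-argument callable returning the content or None;
    in the PR it corresponds to get_long_weibo_detail(cookie, weibo_id).
    """
    for _ in range(attempts):
        content = fetch()
        if content is not None:
            return content
        sleep(random.uniform(0.1, 0.3))  # the PR uses randint(6, 10) seconds
    return None


# Example with a fetcher that only succeeds on the third call:
calls = {'n': 0}

def flaky():
    calls['n'] += 1
    return 'full text' if calls['n'] == 3 else None

print(fetch_with_retry(flaky))  # -> full text
```

Returning None after the loop makes the "all attempts failed" case explicit for the caller, instead of relying on the implicit None from falling off the end of the function.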
14 changes: 14 additions & 0 deletions weibo_spider/parser/util.py
@@ -1,6 +1,7 @@
import hashlib
import json
import logging
import re
import sys

import requests
@@ -118,3 +119,16 @@ def string_to_int(string):
elif string.endswith(u'亿'):
string = float(string[:-1]) * 100000000
return int(string)


def get_long_weibo_detail(cookie, id):
    """Fetch the full text of a long weibo from the m.weibo.cn API."""
    try:
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
        # The standard header name is 'User-Agent' (hyphen, not underscore)
        headers = {'User-Agent': user_agent, 'Cookie': cookie}
        resp = requests.get("https://m.weibo.cn/statuses/show?id=" + id, headers=headers)
        if resp.status_code == 200:
            content = resp.json()['data']['text'].replace("<br />", "\n")
            # Strip any HTML tags remaining in the text field
            return re.sub("</?[^>]+>", "", content)
    except Exception as e:
        logger.exception(e)
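The text cleanup in get_long_weibo_detail (convert `<br />` to newlines, then strip any remaining HTML tags) can be exercised offline without hitting the API. A minimal sketch with a fabricated sample of the kind of HTML the response's text field contains (clean_weibo_text is a hypothetical helper name, not part of this PR):

```python
import re


def clean_weibo_text(text):
    """Mirror the cleanup in get_long_weibo_detail: <br /> becomes a
    newline, then every remaining HTML tag is removed."""
    content = text.replace("<br />", "\n")
    return re.sub("</?[^>]+>", "", content)


# Fabricated example payload: links and icon spans are common in the field.
raw = ('line one<br />line two '
       '<a href="https://weibo.cn">link</a><span class="url-icon"></span>')
print(clean_weibo_text(raw))
# line one
# line two link
```

Note that the regex drops tags but keeps their inner text, which is why the anchor's label "link" survives while the empty span disappears entirely.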