<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [6]:
import requests
import csv
import time
import re
from datetime import datetime, timedelta
from urllib.parse import quote
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def get_cookie_manually():
    """Prompt the user to paste a Weibo session cookie and return it.

    Prints step-by-step instructions for copying the ``cookie`` request
    header out of the browser developer tools, then reads one line from
    standard input.

    Returns:
        The pasted cookie string with surrounding whitespace removed.
    """
    print("请手动登录微博并获取目标页面的Cookie：")
    print("1. 打开Chrome浏览器，登录你的微博账号。")
    print("2. 访问微博高级搜索页面：https://s.weibo.com/")
    print("3. 按下F12打开开发者工具，切换到Network面板。")
    print("4. 刷新页面，在左侧列表中找到任意一个请求，点击它，在右侧的Headers选项卡中找到Request Headers下的cookie，复制其值。")
    # Copy/paste frequently drags along leading or trailing blanks; such
    # whitespace makes the value unusable as an HTTP header, so trim it here.
    return input("请粘贴获取到的Cookie：").strip()

def get_weibo_session(cookie):
    """Build a ``requests`` session that retries GETs and sends Weibo headers.

    Args:
        cookie: Raw value for the ``cookie`` request header.

    Returns:
        A ``requests.Session`` whose http/https adapters retry up to five
        times on 429/5xx responses and whose default headers carry the
        cookie, a desktop Chrome user agent and the search-page referer.
    """
    retrying_adapter = HTTPAdapter(
        max_retries=Retry(
            total=5,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["GET"],
        )
    )

    weibo_session = requests.Session()
    # One adapter instance serves both schemes.
    for scheme in ("https://", "http://"):
        weibo_session.mount(scheme, retrying_adapter)

    weibo_session.headers.update({
        'cookie': cookie,
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
        'referer': 'https://s.weibo.com/',
    })
    return weibo_session

_WEIBO_SEARCH_HOST = "https://s.weibo.com"


def _absolute_weibo_url(href):
    """Turn an href taken from a search-result card into an absolute URL."""
    if href.startswith('http'):
        return href
    if href.startswith('//'):
        # Protocol-relative link: only the scheme is missing. Prefixing the
        # search host here would yield "https://s.weibo.com//weibo.com/...".
        return "https:" + href
    return _WEIBO_SEARCH_HOST + href


def _extract_user(card):
    """Return ``(user_name, user_id)`` read from the card's author anchor."""
    name_anchor = card.find('a', class_='name')
    if name_anchor is None:
        return "未知用户", ""

    user_id = ""
    profile_href = name_anchor.get('href') or ""
    if profile_href:
        # The numeric id is the first all-digit path segment of the link.
        id_match = re.search(r'/(\d+)(\?|\b)', profile_href)
        if id_match:
            user_id = id_match.group(1)
    return name_anchor.text.strip(), user_id


def _extract_article_link(card, time_anchor):
    """Return the post permalink found in the card, or "" when none exists."""
    # Preferred source: the anchor wrapping the timestamp.
    if time_anchor is not None and 'href' in time_anchor.attrs:
        return _absolute_weibo_url(time_anchor['href'])

    # Second choice: an anchor with class "t" inside the card.
    content_anchor = card.find('a', class_='t')
    if content_anchor is not None and 'href' in content_anchor.attrs:
        return _absolute_weibo_url(content_anchor['href'])

    # Last resort: any anchor whose target looks like a post page.
    for anchor in card.find_all('a', href=True):
        href = anchor['href']
        if '/detail/' in href or '/status/' in href:
            return _absolute_weibo_url(href)
    return ""


def _extract_interaction_counts(card):
    """Return ``(comment_count, repost_count)`` from the card's action bar."""
    comment_count = 0
    repost_count = 0
    action_bar = card.find('div', class_='card-act')
    if action_bar is None:
        return comment_count, repost_count

    for item in action_bar.find_all('li'):
        anchor = item.find('a')
        if anchor is None:
            continue
        label = anchor.text.strip()
        digits = re.sub(r'\D', '', label)
        if not digits:
            continue
        # The label text tells which counter the number belongs to; the
        # like counter is intentionally ignored.
        if '转发' in label:
            repost_count = int(digits)
        elif '评论' in label:
            comment_count = int(digits)
    return comment_count, repost_count


def _extract_mid_from_link(article_link):
    """Return the post id embedded in *article_link*, or "" if not found."""
    if not article_link:
        return ""
    # "/<digits>/<id>" style link.
    match = re.search(r'/(\d+)/(\w+)', article_link)
    if match:
        return match.group(2)
    # "/detail/<id>" style link.
    match = re.search(r'/detail/(\w+)', article_link)
    return match.group(1) if match else ""


def _parse_card(card, sequence):
    """Convert one result card into an article dict.

    *sequence* is the number of articles collected so far; it is only used
    to keep placeholder ids unique.
    """
    user_name, user_id = _extract_user(card)

    content_tag = card.find('p', class_='txt')
    content_text = content_tag.text.strip() if content_tag is not None else ""

    # The first anchor inside <p class="from"> carries both the timestamp
    # text and the permalink.
    time_tag = card.find('p', class_='from')
    time_anchor = time_tag.find('a') if time_tag is not None else None
    post_time = time_anchor.text.strip() if time_anchor is not None else "未知时间"

    article_link = _extract_article_link(card, time_anchor)
    comment_count, repost_count = _extract_interaction_counts(card)

    mid = card.get('mid', '') or _extract_mid_from_link(article_link)
    if not mid:
        # Placeholder id. The sequence suffix keeps ids distinct when several
        # cards without an id are parsed within the same second.
        mid = f"temp_{int(time.time())}_{sequence}"

    return {
        'mid': mid,
        'user_id': user_id,
        'user_name': user_name,
        'content': content_text,
        'post_time': post_time,
        'article_link': article_link,
        'comment_count': comment_count,
        'repost_count': repost_count
    }


def search_weibo(session, keyword, max_pages=50):
    """Search Weibo for *keyword*, sorted by hotness, and collect the posts.

    Result pages are requested one after another until ``max_pages`` is
    reached, a page has no result cards, no "next" link is present, a
    non-200 response arrives, or an exception occurs. When a page looks like
    a security-verification page the user is asked to solve it and the same
    page is requested again.

    Args:
        session: Object with a ``requests``-style ``get(url, params=...,
            timeout=...)`` method.
        keyword: Search query text.
        max_pages: Maximum number of result pages to fetch.

    Returns:
        A list of dicts with the keys ``mid``, ``user_id``, ``user_name``,
        ``content``, ``post_time``, ``article_link``, ``comment_count`` and
        ``repost_count``. Whatever was collected before an error is returned.
    """
    base_url = "https://s.weibo.com/weibo"

    params = {
        'q': keyword,
        'xsort': 'hot',  # sort by hotness
        'suball': 1,     # include all sub-types
        'Refer': 'g',
        'page': 1
    }

    articles = []
    page_count = 0

    while page_count < max_pages:
        page_count += 1
        print(f"正在搜索第 {page_count} 页...")
        params['page'] = page_count

        try:
            response = session.get(base_url, params=params, timeout=30)
            if response.status_code != 200:
                print(f"搜索请求失败，状态码: {response.status_code}")
                print(f"响应内容: {response.text[:200]}...")
                break

            # A verification page replaced the results.
            if "安全验证" in response.text or "验证码" in response.text:
                print("⚠️ 遇到安全验证，请手动解决验证问题后再继续")
                input("按回车键继续...")
                # Undo the increment so this page is fetched again instead
                # of being silently skipped.
                page_count -= 1
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            cards = soup.find_all('div', class_='card-wrap')

            if not cards:
                print("没有找到更多文章，停止搜索")
                break

            for card in cards:
                # Cards with a "card-top" banner are ads/recommendations.
                if card.find('div', class_='card-top'):
                    continue

                article = _parse_card(card, len(articles))
                articles.append(article)
                print(f"找到文章: {article['user_name']} - {article['content'][:30]}... (评论: {article['comment_count']}, 转发: {article['repost_count']}, 发布时间: {article['post_time']})")

            print(f"第 {page_count} 页找到 {len(cards)} 篇文章，已获取 {len(articles)} 篇文章")

            # Stop when the pager offers no further page.
            if not soup.find('a', class_='next'):
                print("已到达最后一页")
                break

            # Throttle between pages.
            time.sleep(5 + page_count % 3)

        except Exception as e:
            # Best effort: report the problem and return what was collected.
            print(f"搜索过程中发生错误: {str(e)}")
            break

    return articles

def get_comments(session, article):
    """Download the top-level comments of one post.

    Pages through the ``buildComments`` endpoint using the ``max_id`` cursor
    returned by each response. Paging stops when a response has no data, the
    cursor is 0/missing or does not advance, the number of collected comments
    reaches ``article['comment_count']`` (when positive), or three
    consecutive attempts have failed.

    Args:
        session: Object with a ``requests``-style ``get(url, timeout=...)``
            method.
        article: Dict describing the post. ``mid`` is required; ``user_id``,
            ``article_link``, ``content``, ``user_name`` and
            ``comment_count`` are optional. The dict is not modified.

    Returns:
        A list of dicts with the keys ``article_mid``, ``article_content``,
        ``article_user``, ``comment_id``, ``user_id``, ``user_name``,
        ``content``, ``like_counts`` and ``created_at``. Empty when the post
        id is missing or nothing could be fetched.
    """
    mid = article.get('mid')
    article_link = article.get('article_link')
    if not mid:
        # The endpoint is addressed by post id, so a link alone is not enough.
        if article_link:
            print("文章ID缺失，无法获取评论")
        else:
            print("文章链接和ID都缺失，无法获取评论")
        return []

    # Resolve the author id locally rather than writing a guessed value back
    # into the caller's dict.
    user_id = article.get('user_id')
    if not user_id and article_link:
        user_id_match = re.search(r'/(\d+)/', article_link)
        if user_id_match:
            user_id = user_id_match.group(1)
    if not user_id:
        print("⚠️ 无法提取用户ID，使用默认值")
        user_id = "0"

    article_content = article['content'][:100] if article.get('content') else ""
    article_user = article.get('user_name', "")
    expected_total = article.get('comment_count') or 0

    comments = []
    next_param = 'count=20'
    page_count = 0
    max_retries = 3
    retry_count = 0

    print(f"开始获取文章 {mid} 的评论 (预计: {expected_total})...")

    while True:
        page_count += 1
        url = f'https://weibo.com/ajax/statuses/buildComments?is_reload=1&id={mid}&is_show_bulletin=2&is_mix=0&{next_param}&uid={user_id}&fetch_level=0&locale=zh-CN'

        try:
            response = session.get(url, timeout=30)
            if response.status_code != 200:
                print(f"评论请求失败，状态码: {response.status_code}")
                if retry_count < max_retries:
                    retry_count += 1
                    print(f"第 {retry_count} 次重试...")
                    time.sleep(10)
                    continue
                print("重试次数已达上限，停止获取")
                break

            json_data = response.json()
            data_list = json_data.get('data', [])
            max_id = json_data.get('max_id', 0)

            if not data_list:
                print("没有评论数据")
                break

            for comment in data_list:
                # "user" may be present but null; treat that like a missing
                # author instead of failing the whole page.
                author = comment.get('user') or {}
                comments.append({
                    'article_mid': mid,
                    'article_content': article_content,
                    'article_user': article_user,
                    'comment_id': comment.get('id', ''),
                    'user_id': author.get('id', ''),
                    'user_name': author.get('screen_name', ''),
                    'content': comment.get('text_raw', ''),
                    'like_counts': comment.get('like_counts', 0),
                    'created_at': comment.get('created_at', '')
                })

            print(f"已获取 {len(data_list)} 条评论，总计: {len(comments)}/{expected_total}")

            if not max_id:
                print("已到达最后一页评论")
                break

            following_param = f'max_id={max_id}&count=20'
            if following_param == next_param:
                # Same cursor as the request just made: the next request
                # would be identical, so stop instead of looping on it.
                print("评论游标未前进，停止获取")
                break
            next_param = following_param

            # Stop once the advertised number of comments has been reached.
            if expected_total > 0 and len(comments) >= expected_total:
                print(f"已获取所有 {expected_total} 条评论")
                break

            # Throttle between pages.
            time.sleep(3 + page_count % 2)
            retry_count = 0  # a successful page resets the retry budget

        except Exception as e:
            print(f"获取评论时发生错误: {str(e)}")
            if retry_count < max_retries:
                retry_count += 1
                print(f"第 {retry_count} 次重试...")
                time.sleep(10)
            else:
                print("重试次数已达上限，停止获取")
                break

    return comments

def save_to_csv(data, filename, fieldnames):
    """Append rows to a CSV file, writing the header if the file is empty.

    Args:
        data: Iterable of dicts, one per row.
        filename: Path of the CSV file; opened in append mode as UTF-8 with
            BOM.
        fieldnames: Column names, in output order.

    Any exception raised while writing is caught and reported on stdout.
    """
    try:
        with open(filename, mode='a', newline='', encoding='utf-8-sig') as out_file:
            # In append mode the position equals the current file size, so
            # zero means nothing has been written to this file yet.
            header_needed = out_file.tell() == 0
            csv_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
            if header_needed:
                csv_writer.writeheader()
            csv_writer.writerows(data)
        print(f"数据已保存到 {filename}")
    except Exception as err:
        print(f"保存文件时出错: {str(err)}")
def main():
    """Run the interactive scrape: search hot posts, then fetch their comments.

    Asks for a cookie, a keyword and a page limit, writes the matching posts
    to ``微博热门文章.csv`` and, for every post whose comment count is not
    zero, appends its comments to ``微博热门评论.csv`` right after they are
    fetched.
    """
    cookie = get_cookie_manually()
    session = get_weibo_session(cookie)

    keyword = input("请输入搜索关键词: ")
    pages_text = input("请输入最大搜索页数 (默认50): ").strip()
    try:
        max_pages = int(pages_text) if pages_text else 50
    except ValueError:
        # A typo here should not abort the run before it starts.
        print(f"无效的页数 '{pages_text}'，使用默认值 50")
        max_pages = 50

    # Hot posts only (search_weibo sorts by hotness).
    articles = search_weibo(session, keyword, max_pages)
    print(f"共找到 {len(articles)} 篇热门文章")

    if articles:
        save_to_csv(articles, '微博热门文章.csv', ['mid', 'user_id', 'user_name', 'content', 'post_time', 'article_link', 'comment_count', 'repost_count'])

    # Comments are written to disk per post, so only a running total is kept
    # in memory.
    total_comments = 0
    for i, article in enumerate(articles):
        print(f"\n处理文章 {i+1}/{len(articles)}: {article.get('content', '')[:50]}...")

        # Nothing to fetch for posts that show zero comments.
        if article.get('comment_count', 0) == 0:
            print(f"评论数为0，跳过 (实际评论数: {article.get('comment_count', 0)})")
            continue

        comments = get_comments(session, article)
        total_comments += len(comments)

        # Save after every post so an interrupted run keeps its progress.
        if comments:
            save_to_csv(comments, '微博热门评论.csv', ['article_mid', 'article_content', 'article_user', 'comment_id', 'user_id', 'user_name', 'content', 'like_counts', 'created_at'])
        else:
            print(f"未获取到评论，但文章显示有 {article.get('comment_count', 0)} 条评论")

        # Throttle between posts.
        time.sleep(5)

    print(f"\n所有处理完成! 共获取 {len(articles)} 篇热门文章，{total_comments} 条评论")
    print("热门文章数据已保存到: 微博热门文章.csv")
    print("热门评论数据已保存到: 微博热门评论.csv")

if __name__ == "__main__":
    # Time the whole run and report it in minutes.
    run_started = time.time()
    main()
    elapsed_minutes = (time.time() - run_started) / 60
    print(f"程序运行总时长: {elapsed_minutes:.2f} 分钟")

请手动登录微博并获取目标页面的Cookie：
1. 打开Chrome浏览器，登录你的微博账号。
2. 访问微博高级搜索页面：https://s.weibo.com/
3. 按下F12打开开发者工具，切换到Network面板。
4. 刷新页面，在左侧列表中找到任意一个请求，点击它，在右侧的Headers选项卡中找到Request Headers下的cookie，复制其值。
请粘贴获取到的Cookie：<REDACTED: session cookie removed from saved output — treat the original value as leaked and log out to invalidate it>
请输入搜索关键词: 黑龙江 北黑线 列车 脱轨
请输入最大搜索页数 (默认50): 2
正在搜索第 1 页...
找到文章: 头条新闻 - 【“11·1”#黑河列车脱轨20名相关责任人被处